In [1]:
import os
import sys

In [2]:
sys.path.append("../")

In [27]:
from data.creators.trace_dataset_creator import TraceDatasetCreator
from data.creators.multi_trace_dataset_creator import MultiTraceDatasetCreator
from data.datasets.trace_dataset import TraceDataset
from data.readers.hub_project_reader import HubProjectReader

from sklearn.metrics import pairwise_distances
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from util.logging.logger_manager import LoggerManager, logger
from util.logging.logger_config import LoggerConfig

import pandas as pd

In [4]:
log_path = os.path.expanduser("~/desktop/safa/logs")
LoggerManager.configure_logger(LoggerConfig(output_dir=log_path))

<TGenLogger tgen (INFO)>

In [5]:
def calculate_similarity_matrix_from_term_frequencies(tf_source, tf_target):
    """
    Calculates the similarity matrix used for predicting traces from the term frequencies of the sources and targets
    :param tf_source: The term frequencies of the sources
    :param tf_target: The term frequencies of the targets
    :return: The similarity matrix where each cell contains the similarity of the corresponding source (row) and target (col)
    """
    return 1 - pairwise_distances(tf_source, Y=tf_target, metric="cosine", n_jobs=-1)

def create_term_frequency_matrices(model, raw_sources: pd.Series, raw_targets: pd.Series):
    """
    Creates 2 TermFrequencyMatrices (one for A another for B) where the weight of
    each (row, col) pair is calculated via TF-IDF
    :param raw_sources : The source documents whose matrix is the first element
    :param raw_targets : The target documents whose matrix is the second element
    :return: CountMatrix for raw_sources and raw_targets, and also the trained model
    """
    set_source: csr_matrix = model.transform(raw_sources)
    set_target: csr_matrix = model.transform(raw_targets)
    return set_source, set_target

# Read Project

In [28]:
project_readers = [HubProjectReader("cm1"), HubProjectReader("drone"), HubProjectReader("traincontroller"),
                  HubProjectReader("itrust"), HubProjectReader("mip"), HubProjectReader("cchit")]
dataset_creator = MultiTraceDatasetCreator(project_readers)
dataset = dataset_creator.create()
trace_df = dataset.to_dataframe()
print(len(trace_df))

Downloading dataset from hub: https://safa-datasets-open.s3.amazonaws.com/datasets/open-source/cm1.zip
/Users/albertorodriguez/desktop/safa/datasets/HuggingFace/extracted/4719a95a7911878e5dd403ca7f5b5600d50ea6eb74744ceb527265f0637985ad/CM1/CM1-sourceArtifacts.xml:22
/Users/albertorodriguez/desktop/safa/datasets/HuggingFace/extracted/4719a95a7911878e5dd403ca7f5b5600d50ea6eb74744ceb527265f0637985ad/CM1/CM1-targetArtifacts.xml:53
/Users/albertorodriguez/desktop/safa/datasets/HuggingFace/extracted/4719a95a7911878e5dd403ca7f5b5600d50ea6eb74744ceb527265f0637985ad/CM1/CM1-answerSet.xml:45
Artifacts: 75 Traces: 45 Queries: 1
Number of orphan artifacts (26)
No missing source artifacts. (0)
No missing target artifacts. (0)


Cleaning artifacts...: 100%|████████████████| 75/75 [00:00<00:00, 927943.36it/s]
Generating negative links between High Level Requirements -> Low Level Requireme

[High Level Requirements: 22],[Low Level Requirements: 53]





Trace dataset(+45, -(1121) = 1166)
Downloading dataset from hub: https://safa-datasets-open.s3.amazonaws.com/datasets/open-source/drone.zip
/Users/albertorodriguez/desktop/safa/datasets/HuggingFace/extracted/30e2f908ec8ca05678e1daf3aab6d65f5b26e4424fc3af099482cba26f2fcdc6/drone_tasks/base/Code.csv:458
/Users/albertorodriguez/desktop/safa/datasets/HuggingFace/extracted/30e2f908ec8ca05678e1daf3aab6d65f5b26e4424fc3af099482cba26f2fcdc6/drone_tasks/base/Design Definitions.csv:99
/Users/albertorodriguez/desktop/safa/datasets/HuggingFace/extracted/30e2f908ec8ca05678e1daf3aab6d65f5b26e4424fc3af099482cba26f2fcdc6/drone_tasks/base/Requirements.csv:55
/Users/albertorodriguez/desktop/safa/datasets/HuggingFace/extracted/30e2f908ec8ca05678e1daf3aab6d65f5b26e4424fc3af099482cba26f2fcdc6/drone_tasks/base/Requirements2Design Definitions.csv:58
/Users/albertorodriguez/desktop/safa/datasets/HuggingFace/extracted/30e2f908ec8ca05678e1daf3aab6d65f5b26e4424fc3af099482cba26f2fcdc6/drone_tasks/base/Design Defin

Cleaning artifacts...: 100%|█████████████| 612/612 [00:00<00:00, 2234041.82it/s]
Generating negative links between Requirements -> Design Definitions: 100%|█| 55
Generating negative links between Design Definitions -> Code: 100%|█| 99/99 [00:


[Code: 458],[Design Definitions: 99],[Requirements: 55]
Trace dataset(+280, -(50507) = 50787)


  data1 = dataframe1.to_dict(orient=orient)
  data2 = dataframe2.to_dict(orient=orient)


Downloading dataset from hub: https://safa-datasets-open.s3.amazonaws.com/datasets/open-source/traincontroller.zip


Downloading data:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

/Users/albertorodriguez/desktop/safa/datasets/HuggingFace/extracted/b7d23709b95609c6d8f17529bb93ae67f92b248a0826edd4a00db0104d900df7/TrainController/Originals/Source_1/SRS.csv:219
/Users/albertorodriguez/desktop/safa/datasets/HuggingFace/extracted/b7d23709b95609c6d8f17529bb93ae67f92b248a0826edd4a00db0104d900df7/TrainController/Originals/Source_1/SDD.csv:534
/Users/albertorodriguez/desktop/safa/datasets/HuggingFace/extracted/b7d23709b95609c6d8f17529bb93ae67f92b248a0826edd4a00db0104d900df7/TrainController/Originals/Source_1/SSRS.csv:583
/Users/albertorodriguez/desktop/safa/datasets/HuggingFace/extracted/b7d23709b95609c6d8f17529bb93ae67f92b248a0826edd4a00db0104d900df7/TrainController/Originals/Source_1/SDD2SRS.txt:581
/Users/albertorodriguez/desktop/safa/datasets/HuggingFace/extracted/b7d23709b95609c6d8f17529bb93ae67f92b248a0826edd4a00db0104d900df7/TrainController/Originals/Source_1/SSRS2SDD.txt:700
Artifacts: 1336 Traces: 1280 Queries: 2
Number of orphan artifacts (150)
No missing source

Cleaning artifacts...: 100%|███████████| 1336/1336 [00:00<00:00, 1461933.25it/s]
Generating negative links between SDD -> SRS: 100%|█| 534/534 [00:41<00:00, 12.7
Generating negative links between SSRS -> SDD: 100%|█| 583/583 [01:48<00:00,  5.


[Srs: 219],[Sdd: 534],[Ssrs: 583]
Trace dataset(+1259, -(427009) = 428268)
Downloading dataset from hub: https://safa-datasets-open.s3.amazonaws.com/datasets/open-source/itrust.zip


Downloading data:   0%|          | 0.00/1.17M [00:00<?, ?B/s]

/Users/albertorodriguez/desktop/safa/datasets/HuggingFace/extracted/32a72197dd693cf25ecbc63b850ba3e68df3151db71f040a488b7ed73b847b8c/itrust_tasks/base/Use Cases.csv:131
/Users/albertorodriguez/desktop/safa/datasets/HuggingFace/extracted/32a72197dd693cf25ecbc63b850ba3e68df3151db71f040a488b7ed73b847b8c/itrust_tasks/base/JSP Code.csv:165
/Users/albertorodriguez/desktop/safa/datasets/HuggingFace/extracted/32a72197dd693cf25ecbc63b850ba3e68df3151db71f040a488b7ed73b847b8c/itrust_tasks/base/Java Code.csv:226
/Users/albertorodriguez/desktop/safa/datasets/HuggingFace/extracted/32a72197dd693cf25ecbc63b850ba3e68df3151db71f040a488b7ed73b847b8c/itrust_tasks/base/Use Cases2Java Code.csv:286
/Users/albertorodriguez/desktop/safa/datasets/HuggingFace/extracted/32a72197dd693cf25ecbc63b850ba3e68df3151db71f040a488b7ed73b847b8c/itrust_tasks/base/Use Cases2JSP Code.csv:113
Artifacts: 522 Traces: 399 Queries: 2
No missing source artifacts. (0)
No missing target artifacts. (0)


Cleaning artifacts...: 100%|█████████████| 522/522 [00:00<00:00, 1897250.16it/s]
Generating negative links between Use Cases -> Java Code: 100%|█| 131/131 [00:10
Generating negative links between Use Cases -> JSP Code: 100%|█| 131/131 [00:07<


[Use Cases: 131],[Jsp Code: 165],[Java Code: 226]
Trace dataset(+399, -(50822) = 51221)
Downloading dataset from hub: https://safa-datasets-open.s3.amazonaws.com/datasets/open-source/mip.zip


Downloading data:   0%|          | 0.00/100k [00:00<?, ?B/s]

/Users/albertorodriguez/desktop/safa/datasets/HuggingFace/extracted/4129319c0de134940aa6e16baa1bb36832b3782a54ecaa7bf1ebb836d33cc03f/mip/clean/components.csv:21
/Users/albertorodriguez/desktop/safa/datasets/HuggingFace/extracted/4129319c0de134940aa6e16baa1bb36832b3782a54ecaa7bf1ebb836d33cc03f/mip/clean/requirements.csv:126
/Users/albertorodriguez/desktop/safa/datasets/HuggingFace/extracted/4129319c0de134940aa6e16baa1bb36832b3782a54ecaa7bf1ebb836d33cc03f/mip/AnswerMatrix.csv:132
Artifacts: 147 Traces: 132 Queries: 1
Number of orphan artifacts (22)
No missing source artifacts. (0)
No missing target artifacts. (0)


Cleaning artifacts...: 100%|█████████████| 147/147 [00:00<00:00, 1529932.23it/s]
Generating negative links between Requirements -> Components: 100%|█| 126/126 [0


[Components: 21],[Requirements: 126]
Trace dataset(+132, -(2646) = 2778)
Downloading dataset from hub: https://safa-datasets-open.s3.amazonaws.com/datasets/open-source/cchit.zip


Downloading data:   0%|          | 0.00/161k [00:00<?, ?B/s]

/Users/albertorodriguez/desktop/safa/datasets/HuggingFace/extracted/de7d81e78e8cd5bd1c84b62f339304a970f4f4819b0978cf3cbdfcf9c7748617/cchit/source.xml:118
/Users/albertorodriguez/desktop/safa/datasets/HuggingFace/extracted/de7d81e78e8cd5bd1c84b62f339304a970f4f4819b0978cf3cbdfcf9c7748617/cchit/target.xml:1066
/Users/albertorodriguez/desktop/safa/datasets/HuggingFace/extracted/de7d81e78e8cd5bd1c84b62f339304a970f4f4819b0978cf3cbdfcf9c7748617/cchit/answer.txt:587
Artifacts: 1180 Traces: 587 Queries: 1
Number of orphan artifacts (693)
No missing source artifacts. (0)
No missing target artifacts. (0)


Cleaning artifacts...: 100%|███████████| 1180/1180 [00:00<00:00, 2259944.62it/s]
Generating negative links between Requirements -> Regulatory Codes: 100%|█| 116/


[Requirements: 116],[Regulatory Codes: 1064]
Trace dataset(+587, -(122837) = 123424)
657644


In [29]:
trace_df.to_csv(os.path.expanduser("~/desktop/safa/datasets/openai/all.csv"))

# Generate Scores

In [31]:
model = TfidfVectorizer()
sources = list(set(trace_df["source"]))
targets = list(set(trace_df["target"]))
artifacts = sources + targets
print(len(artifacts))
model.fit(artifacts)

4618


TfidfVectorizer()

In [32]:
def get_hardness_label(hardness):
    if hardness < 0.3:
        return "easy"
    if hardness < 0.66:
        return "medium"
    else:
        return "hard"

In [33]:
scores = []
hardnesses = []
hardness_labels = []
for i, row in trace_df.iterrows():
    source = row["source"]
    target = row["target"]
    label = row["label"]
    
    s_matrix, t_matrix = create_term_frequency_matrices(model, [source], [target])
    sim_matrix = calculate_similarity_matrix_from_term_frequencies(s_matrix, t_matrix)
    sim_score = sim_matrix[0][0]
    hardness = abs(label - sim_score)
    hardness_label = get_hardness_label(hardness)
    
    scores.append(sim_score)
    hardnesses.append(hardness)
    hardness_labels.append(hardness_label)
trace_df["similarity"] = scores
trace_df["hardness"] = hardnesses
trace_df["hardness_label"] = hardness_labels

KeyboardInterrupt: 

In [35]:
print(len(trace_df))
trace_df.head()

657644


Unnamed: 0,source_id,source,target_id,target,label
0,OBU.SSRS.1059,The OBU shall take a configurable brake build-...,PTC.SDD.2915,The OBU calculates the permitted speed from a ...,1
1,1715,The system shall include the ability to print ...,262,The system shall provide the ability to receiv...,1
2,OBU.SSRS.627,The OBU software shall be developed to use an ...,PTC.SDD.1052,The TG-PTC system shall abstract hardware from...,1
3,UC31S1,The expired prescription report list is titled...,ViewExpiredPrescriptionsAction,package edu.ncsu.csc.itrust.action; import jav...,1
4,1714,The system shall include Standard Laboratory a...,565,The system shall relate medication allergies t...,1


# Selection

In [40]:
n_top = 100
hardness_lower = 0
hardness_upper = 0.9
strat = "sample"

selection_strats = {
    "top": lambda df: df[:n_top],
    "sample": lambda df: df.sample(n=n_top)
}

In [41]:
def select_from(df):
    pos_df = df[df["label"] == 1]
    neg_df = df[df["label"] == 0]

    selection_strat = selection_strats[strat]
    selected_df = pd.concat([selection_strat(pos_df), selection_strat(neg_df)])
    return selected_df

selected_df = select_from(trace_df)

unselected_indices = set(range(len(trace_df))) - set(selected_df.index)
unselected_df = trace_df.iloc[list(unselected_indices)]

print(f"Selected: {len(selected_df)}")
selected_df.head()

Selected: 200


Unnamed: 0,source_id,source,target_id,target,label
102,OBU.SSRS.1595,The OBU shall handle train crew acknowledment ...,PTC.SDD.4104,Transitioning between PTC territory and Non-PT...,1
1024,DD-213,Log all GCS related events All commands sent t...,ReadDispatcher.java,package edu.nd.dronology.gstation.connector.di...,1
284,OBU.SSRS.1957,The OBU shall be able to load an ITCSM Agent s...,PTC.SDD.7415,Each TG-PTC subsystem shall adhere to the agen...,1
803,PTC.SDD.6661,Centralized Logging is the ability of the syst...,PTC.SRS.226,6.6.2 Centralized Logging (REQ1185) The TG-PTC...,1
531,OBU.SSRS.717,The OBU shall react to any user request in a m...,PTC.SDD.7200,The TG-PTC system shall always keep users info...,1


# Splitting: Train / Val

In [42]:
from sklearn.model_selection import train_test_split
train_df, val_df = train_test_split(selected_df, test_size=0.5, stratify=selected_df["label"])
print(f"Train: {len(train_df)}")

Train: 100


# Formatting

In [44]:
query_format = "1. {}\n2. {}\n\n###\n\n"
export_dir = os.path.expanduser("~/desktop/safa/datasets/openai/cm1")

def export_as_formatted(df, stage_name, dir_path):
    entries = []
    queries = []
    responses = []

    for i, row in df.iterrows():
        row_query = query_format.format(row["source"], row["target"], row["label"])
        row_response = " yes###" if row["label"] == 1 else " no###"
        entries.append({
            "prompt": row_query,
            "completion": row_response
        })

    openai_df = pd.DataFrame(entries)
    export_path = os.path.join(dir_path, f"{stage_name}.jsonl")
    openai_df.to_json(export_path,orient='records', lines=True)
    print(f"Exported: {export_path} ({len(openai_df)})")
    
export_as_formatted(train_df, "train", export_dir)
export_as_formatted(val_df, "val", export_dir)
export_as_formatted(unselected_df, "test", export_dir)

Exported: /Users/albertorodriguez/desktop/safa/datasets/openai/cm1/train.jsonl (100)
Exported: /Users/albertorodriguez/desktop/safa/datasets/openai/cm1/val.jsonl (100)
Exported: /Users/albertorodriguez/desktop/safa/datasets/openai/cm1/test.jsonl (657444)


In [25]:
row = train_df.sample(n=1).iloc[0]
print(len(row["source"]) + len(row["target"]))

1163
