In [1]:
import os
import sys

In [2]:
# Put the parent directory on the import path. The path is relative, so it is resolved
# against the kernel's current working directory.
# NOTE(review): presumably this is what makes the `data.*` / `util.*` imports below resolve - confirm.
sys.path.append("../")

In [3]:
from data.creators.trace_dataset_creator import TraceDatasetCreator
from data.creators.multi_trace_dataset_creator import MultiTraceDatasetCreator
from data.datasets.trace_dataset import TraceDataset
from data.readers.hub_project_reader import HubProjectReader

from sklearn.metrics import pairwise_distances
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from util.logging.logger_manager import LoggerManager, logger
from util.logging.logger_config import LoggerConfig

import pandas as pd

In [4]:
# Send logger output to a folder under the user's home directory.
log_path = os.path.expanduser("~/desktop/safa/logs")
logger_config = LoggerConfig(output_dir=log_path)
# Left as the last expression so the configured logger is still displayed as the cell output.
LoggerManager.configure_logger(logger_config)

<TGenLogger tgen (INFO)>

In [5]:
def calculate_similarity_matrix_from_term_frequencies(tf_source, tf_target):
    """
    Builds the source-by-target similarity matrix from two term frequency matrices.
    Similarity is computed as 1 minus the cosine distance between each source/target pair.
    :param tf_source: The term frequencies of the sources (one row per source)
    :param tf_target: The term frequencies of the targets (one row per target)
    :return: The similarity matrix where cell (row, col) holds the similarity of source `row` and target `col`
    """
    cosine_distances = pairwise_distances(tf_source, Y=tf_target, metric="cosine", n_jobs=-1)
    similarity_matrix = 1 - cosine_distances
    return similarity_matrix

def create_term_frequency_matrices(model, raw_sources: pd.Series, raw_targets: pd.Series):
    """
    Transforms the source and target documents into term frequency matrices using the given model.
    The cell weights are whatever the model produces (e.g. TF-IDF when given a TfidfVectorizer).
    :param model: The vectorizer used for the transformation; only transform() is called, so it is expected to be fitted already
    :param raw_sources: The source documents whose matrix is the first element
    :param raw_targets: The target documents whose matrix is the second element
    :return: Tuple containing the matrix for raw_sources followed by the matrix for raw_targets
    """
    # Sources are transformed before targets, in the same order as they are returned.
    return tuple(model.transform(documents) for documents in (raw_sources, raw_targets))

# Read Project

In [8]:
# Name of the hub project to load.
project_name = "cm1"

# Reader -> creator -> dataset -> dataframe with one row per candidate trace link.
project_reader = HubProjectReader(project_name)
dataset_creator = TraceDatasetCreator(project_reader)
dataset = dataset_creator.create()
trace_df = dataset.to_dataframe()

# Report how many candidate links were produced.
n_trace_links = len(trace_df)
print(n_trace_links)

Downloading dataset from hub: https://safa-datasets-open.s3.amazonaws.com/datasets/open-source/cm1.zip
/Users/albertorodriguez/desktop/safa/datasets/HuggingFace/extracted/4719a95a7911878e5dd403ca7f5b5600d50ea6eb74744ceb527265f0637985ad/CM1/CM1-sourceArtifacts.xml:22
/Users/albertorodriguez/desktop/safa/datasets/HuggingFace/extracted/4719a95a7911878e5dd403ca7f5b5600d50ea6eb74744ceb527265f0637985ad/CM1/CM1-targetArtifacts.xml:53
/Users/albertorodriguez/desktop/safa/datasets/HuggingFace/extracted/4719a95a7911878e5dd403ca7f5b5600d50ea6eb74744ceb527265f0637985ad/CM1/CM1-answerSet.xml:45
Artifacts: 75 Traces: 45 Queries: 1
Number of orphan artifacts (26)
No missing source artifacts. (0)
No missing target artifacts. (0)


Cleaning artifacts...: 100%|████████████████| 75/75 [00:00<00:00, 998643.81it/s]
Generating negative links between High Level Requirements -> Low Level Requireme

[High Level Requirements: 22],[Low Level Requirements: 53]





Trace dataset(+45, -(1121) = 1166)
1166


# Generate Scores

In [9]:
model = TfidfVectorizer()
# drop_duplicates() keeps first-seen order, so `sources` / `targets` are identical on every
# run. list(set(...)) ordered the strings by their randomized hashes, which differs
# between kernels.
sources = trace_df["source"].drop_duplicates().tolist()
targets = trace_df["target"].drop_duplicates().tolist()
# Fit on every distinct artifact body so sources and targets share one vocabulary.
artifacts = sources + targets
model.fit(artifacts)

TfidfVectorizer()

In [10]:
def get_hardness_label(hardness, easy_upper=0.3, medium_upper=0.66):
    """
    Buckets a hardness score into a coarse difficulty label.
    :param hardness: The hardness score to bucket
    :param easy_upper: Scores strictly below this value are labeled "easy"
    :param medium_upper: Scores at or above easy_upper and strictly below this value are labeled "medium"
    :return: "easy", "medium" or "hard" (anything not below medium_upper is "hard")
    """
    if hardness < easy_upper:
        return "easy"
    if hardness < medium_upper:
        return "medium"
    return "hard"

In [11]:
# Vectorize every distinct artifact body once and build a single source x target
# similarity matrix, instead of running the vectorizer and a parallel distance
# computation once per trace pair.
unique_sources = trace_df["source"].drop_duplicates().tolist()
unique_targets = trace_df["target"].drop_duplicates().tolist()

scores = []
if len(trace_df) > 0:  # nothing to vectorize or compare when there are no trace pairs
    s_matrix, t_matrix = create_term_frequency_matrices(model, unique_sources, unique_targets)
    sim_matrix = calculate_similarity_matrix_from_term_frequencies(s_matrix, t_matrix)

    # Map each row's source / target text to its row / column in the similarity matrix.
    source_rows = pd.Index(unique_sources).get_indexer(trace_df["source"])
    target_cols = pd.Index(unique_targets).get_indexer(trace_df["target"])
    scores = sim_matrix[source_rows, target_cols].tolist()

# Hardness is the gap between the label and the similarity score.
hardnesses = [abs(label - sim_score) for label, sim_score in zip(trace_df["label"], scores)]
hardness_labels = [get_hardness_label(hardness) for hardness in hardnesses]

trace_df["similarity"] = scores
trace_df["hardness"] = hardnesses
trace_df["hardness_label"] = hardness_labels

In [12]:
# Row count, followed by how many links fall into each hardness bucket.
n_links = len(trace_df)
print(n_links)
hardness_label_counts = trace_df["hardness_label"].value_counts()
print(hardness_label_counts)
# Last expression: preview of the scored links.
trace_df.head()

1166
easy      1120
hard        41
medium       5
Name: hardness_label, dtype: int64


Unnamed: 0,source_id,source,target_id,target,label,similarity,hardness,hardness_label
0,SRS5.12.3.2,The DPU-CCM shall provide a mechanism for othe...,DPUSDS5.12.1.5.1,Error Collection and Reporting The DPU-CCM CSC...,1,0.20648,0.79352,hard
1,SRS5.12.2.1,The DPU-CCM shall implement a mechanism whereb...,DPUSDS5.12.1.4.3,Memory Upload and Download Handling If more th...,1,0.105381,0.894619,hard
2,SRS5.12.3.2,The DPU-CCM shall provide a mechanism for othe...,DPUSDS5.12.2.2,Public Functions This routine is called by any...,1,0.082703,0.917297,hard
3,SRS5.13.1.3,The DPU-TMALI shall provide a function which s...,DPUSDS5.13.0.2,Telescope Module Access Library and Interface ...,1,0.220246,0.779754,hard
4,SRS5.12.2.1,The DPU-CCM shall implement a mechanism whereb...,DPUSDS5.12.1.2.4,Control and Monitoring the CCM Control Task al...,1,0.104114,0.895886,hard


# Selection

In [13]:
# Number of links to select per label (positive / negative).
n_top = 20
# NOTE(review): hardness_lower / hardness_upper are not referenced by any cell visible here -
# presumably intended as a hardness filter for the selection; confirm or remove.
hardness_lower = 0
hardness_upper = 0.9
# Key into selection_strats below.
strat = "sample"
# Fixed seed so the "sample" strategy selects the same rows on every run.
random_seed = 42

selection_strats = {
    # First n_top rows in the frame's current order (fewer if the frame is shorter).
    "top": lambda df: df[:n_top],
    # Random n_top rows; capped at len(df) so a small frame is truncated like "top"
    # instead of raising.
    "sample": lambda df: df.sample(n=min(n_top, len(df)), random_state=random_seed)
}

In [14]:
def select_from(df):
    """
    Selects links from the positive and the negative class separately and stacks the results.
    The strategy is looked up in `selection_strats` using the notebook-level `strat` setting.
    :param df: The trace dataframe; must contain a "label" column with 1 (positive) / 0 (negative)
    :return: Dataframe with the selected positive rows followed by the selected negative rows
    """
    pos_df = df[df["label"] == 1]
    neg_df = df[df["label"] == 0]

    selection_strat = selection_strats[strat]
    return pd.concat([selection_strat(pos_df), selection_strat(neg_df)])

selected_df = select_from(trace_df)

# Compute the complement by index label. The previous version mixed labels
# (selected_df.index) with positions (range(len(...)) and .iloc), which is only
# correct when trace_df has a default 0..n-1 index. drop() also keeps the original row order.
unselected_indices = set(trace_df.index) - set(selected_df.index)
unselected_df = trace_df.drop(index=selected_df.index)

print(f"Selected: {len(selected_df)}")
selected_df.head()

Selected: 40


Unnamed: 0,source_id,source,target_id,target,label,similarity,hardness,hardness_label
31,SRS5.12.2.1,The DPU-CCM shall implement a mechanism whereb...,DPUSDS5.12.1.4.1,Memory Upload and Download Handling There are ...,1,0.141467,0.858533,hard
21,SRS5.12.2.1,The DPU-CCM shall implement a mechanism whereb...,DPUSDS5.12.1.4.4,Memory Upload and Download Handling Data can b...,1,0.092119,0.907881,hard
8,SRS5.13.2.3,The DPU-TMALI shall configure the DCI interfac...,DPUSDS5.13.1.6.4,Setting Data Timeout in DCI During nominal ope...,1,0.19082,0.80918,hard
3,SRS5.13.1.3,The DPU-TMALI shall provide a function which s...,DPUSDS5.13.0.2,Telescope Module Access Library and Interface ...,1,0.220246,0.779754,hard
37,SRS5.12.2.1,The DPU-CCM shall implement a mechanism whereb...,DPUSDS5.12.1.4.5,Memory Upload and Download Handling The comman...,1,0.061458,0.938542,hard


# Splitting: Train / Val

In [15]:
from sklearn.model_selection import train_test_split
# Stratified 50/50 split of the selected links. random_state is fixed so the train / val
# membership is the same on every run.
train_df, val_df = train_test_split(
    selected_df,
    test_size=0.5,
    stratify=selected_df["label"],
    random_state=42
)

# Formatting

In [17]:
# Prompt template: the two placeholders are filled with the source and target artifact bodies.
query_format = "Are these two artifacts related?\n1. {}\n2. {}\n\n###\n\n"
export_dir = os.path.expanduser("~/desktop/safa/datasets/openai/cm1")

def export_as_formatted(df, stage_name, dir_path):
    """
    Writes the links in df as a JSON-lines file of prompt / completion pairs.
    Each prompt is `query_format` filled with the source and target text; the completion is
    " yes" when the label equals 1 and " no" otherwise.
    :param df: Dataframe with "source", "target" and "label" columns
    :param stage_name: Name of the split (e.g. "train"); used as the file name without extension
    :param dir_path: Directory to write the file to; created if it does not exist
    :return: None (the file is written and its path and row count are printed)
    """
    # Only the two artifact bodies go into the prompt; the label is used for the completion only.
    entries = [
        {
            "prompt": query_format.format(source, target),
            "completion": " yes" if label == 1 else " no"
        }
        for source, target, label in zip(df["source"], df["target"], df["label"])
    ]

    # Create the export directory up front so the write does not fail on a fresh machine.
    if dir_path:
        os.makedirs(dir_path, exist_ok=True)

    openai_df = pd.DataFrame(entries, columns=["prompt", "completion"])
    export_path = os.path.join(dir_path, f"{stage_name}.jsonl")
    openai_df.to_json(export_path, orient='records', lines=True)
    print(f"Exported: {export_path} ({len(openai_df)})")
    
# Write one JSON-lines file per split: train, val, then test (all links that were not selected).
for stage_name, stage_df in (("train", train_df), ("val", val_df), ("test", unselected_df)):
    export_as_formatted(stage_df, stage_name, export_dir)

Exported: /Users/albertorodriguez/desktop/safa/datasets/openai/cm1/train.jsonl (20)
Exported: /Users/albertorodriguez/desktop/safa/datasets/openai/cm1/val.jsonl (20)
Exported: /Users/albertorodriguez/desktop/safa/datasets/openai/cm1/test.jsonl (1126)
