In [1]:
import os
import sys

import pandas as pd

from openai.embeddings_utils import cosine_similarity

from sklearn.metrics import pairwise_distances
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split

In [2]:
# Add the parent of the current working directory to the import search path.
# NOTE(review): presumably that directory is the project root holding the `data` and
# `util` packages imported in the next cell — confirm the notebook is launched from
# its own folder, since "../" is resolved relative to the kernel's working directory.
sys.path.append("../")

In [3]:
from data.creators.trace_dataset_creator import TraceDatasetCreator
from data.creators.multi_trace_dataset_creator import MultiTraceDatasetCreator
from data.datasets.trace_dataset import TraceDataset
from data.readers.hub_project_reader import HubProjectReader

from util.logging.logger_manager import LoggerManager, logger
from util.logging.logger_config import LoggerConfig

In [4]:
# Directory that receives the log files; "~" is expanded to the current user's home.
log_path = os.path.expanduser("~/desktop/safa/logs")
# Configure the project logger to write into that directory.
# NOTE(review): the call's return value is the last expression of the cell, so it is
# echoed as the cell output — append ";" if that echo is not wanted.
LoggerManager.configure_logger(LoggerConfig(output_dir=log_path))

<TGenLogger tgen (INFO)>

# Read Projects, Score Trace Links, and Export Train/Test Splits

In [5]:
def add_vsm_scores(dataset, model):
    """
    Score every trace link with the cosine similarity of its two artifacts' vectors.

    The vectorizer is fitted on all artifact bodies, then every row of
    ``dataset.trace_df`` is scored by comparing the vector of its source artifact
    with the vector of its target artifact. The scores are stored in place as
    ``dataset.trace_df["score"]``.

    An artifact's vector does not change after fitting, so each referenced artifact
    is transformed once and its dense vector is reused for every link that mentions
    it, instead of being re-transformed for every trace row.

    :param dataset: Object exposing ``artifact_df`` (indexed by artifact id, with a
        ``content`` column) and ``trace_df`` (with ``source`` and ``target`` columns
        holding artifact ids).
    :param model: Vectorizer offering ``fit(texts)`` and ``transform(texts)``, where the
        transform result provides ``toarray()`` (e.g. ``TfidfVectorizer``).
    :return: None. ``dataset.trace_df`` gains / overwrites the ``score`` column.
    """
    contents = dataset.artifact_df["content"]
    model.fit(contents)

    # artifact id -> dense vector; filled lazily so only artifacts that actually
    # appear in a trace link are transformed.
    vector_by_id = {}

    def embed(artifact_id):
        if artifact_id not in vector_by_id:
            body = contents.loc[artifact_id]
            vector_by_id[artifact_id] = model.transform([body]).toarray()[0]
        return vector_by_id[artifact_id]

    trace_df = dataset.trace_df
    trace_df["score"] = [
        cosine_similarity(embed(source_id), embed(target_id))
        for source_id, target_id in zip(trace_df["source"], trace_df["target"])
    ]
        
def undersample(df, n_needed):
    """
    Keep the ``n_needed`` rows from the middle of the score distribution.

    Rows are sorted by ascending ``score`` and a window of ``n_needed`` rows centred
    in that ordering is returned, which drops the lowest- and highest-scoring rows in
    equal measure.

    If more rows are requested than exist, all rows are returned (sorted). Without
    this clamp the window offset would be negative and the slice would silently
    return the wrong rows. A non-positive request yields an empty frame.

    :param df: DataFrame with a ``score`` column.
    :param n_needed: Number of rows to keep.
    :return: The selected rows, ordered by ascending ``score``.
    """
    ordered = df.sort_values("score")
    n_selected = max(0, min(n_needed, len(ordered)))
    offset = (len(ordered) - n_selected) // 2
    return ordered.iloc[offset:offset + n_selected]


def to_csv_dataset(dataset, trace_df):
    """
    Flatten trace links into a table that carries the artifact text alongside the ids.

    :param dataset: Object exposing ``artifact_df``, indexed by artifact id and holding
        a ``content`` column.
    :param trace_df: Trace links with ``source``, ``target`` and ``label`` columns.
    :return: DataFrame with columns ``source_id``, ``target_id``, ``source``,
        ``target`` and ``label`` (the ``source`` / ``target`` columns hold artifact
        bodies), sharing the index of ``trace_df``.
    """
    artifact_bodies = dataset.artifact_df["content"]

    # Resolve each link's ids to the corresponding artifact text, in row order.
    source_bodies = [artifact_bodies.loc[artifact_id] for artifact_id in trace_df["source"]]
    target_bodies = [artifact_bodies.loc[artifact_id] for artifact_id in trace_df["target"]]

    return pd.DataFrame(
        {
            "source_id": trace_df["source"],
            "target_id": trace_df["target"],
            "source": source_bodies,
            "target": target_bodies,
            "label": trace_df["label"],
        }
    )

# Root folder for the exported datasets; each project gets its own sub-folder.
export_dir_path = os.path.expanduser("~/desktop/safa/datasets/openai")

dataset_names = ["cm1", "mip", "traincontroller", "itrust", "cchit", "drone"]

# Fraction of each project's balanced links that is held out for testing.
TEST_SIZE = 0.25
# Fixed seed so that regenerating a project's split yields the same rows every run.
SPLIT_SEED = 42

# Per-project splits, collected in `dataset_names` order.
train_dataframes = []
test_dataframes = []

for dataset_name in dataset_names:
    print(dataset_name)
    export_path = os.path.join(export_dir_path, dataset_name)
    os.makedirs(export_path, exist_ok=True)
    train_data_path = os.path.join(export_path, "train.csv")
    test_data_path = os.path.join(export_path, "test.csv")

    # The exported CSVs double as a cache: rebuild only if either file is missing.
    if not os.path.exists(train_data_path) or not os.path.exists(test_data_path):
        # Add VSM scores
        tfidf_model = TfidfVectorizer()
        project_reader = HubProjectReader(dataset_name)
        trace_dataset_creator = TraceDatasetCreator(project_reader)
        dataset = trace_dataset_creator.create()
        add_vsm_scores(dataset, tfidf_model)

        # Balance the classes: keep every positive link and an equal number of
        # negative links taken from the middle of the score ordering.
        pos_df = dataset.trace_df[dataset.trace_df["label"] == 1]
        neg_df = dataset.trace_df[dataset.trace_df["label"] == 0]
        selected_df = pd.concat([pos_df, undersample(neg_df, len(pos_df))])

        # Create splits (stratified so both splits keep the same label ratio)
        csv_dataset = to_csv_dataset(dataset, selected_df)
        train_df, test_df = train_test_split(
            csv_dataset,
            test_size=TEST_SIZE,
            stratify=csv_dataset["label"],
            random_state=SPLIT_SEED,
        )

        # Export
        train_df.to_csv(train_data_path, index=False)
        test_df.to_csv(test_data_path, index=False)
    else:
        train_df = pd.read_csv(train_data_path)
        test_df = pd.read_csv(test_data_path)

    train_dataframes.append(train_df)
    test_dataframes.append(test_df)
    print("done.")

cm1
done.
mip
done.
traincontroller
done.
itrust
done.
cchit
done.
drone
Downloading dataset from hub: https://safa-datasets-open.s3.amazonaws.com/datasets/open-source/drone.zip
/Users/albertorodriguez/desktop/safa/datasets/HuggingFace/extracted/30e2f908ec8ca05678e1daf3aab6d65f5b26e4424fc3af099482cba26f2fcdc6/drone_tasks/base/Code.csv:458
/Users/albertorodriguez/desktop/safa/datasets/HuggingFace/extracted/30e2f908ec8ca05678e1daf3aab6d65f5b26e4424fc3af099482cba26f2fcdc6/drone_tasks/base/Design Definitions.csv:99
/Users/albertorodriguez/desktop/safa/datasets/HuggingFace/extracted/30e2f908ec8ca05678e1daf3aab6d65f5b26e4424fc3af099482cba26f2fcdc6/drone_tasks/base/Requirements.csv:55
/Users/albertorodriguez/desktop/safa/datasets/HuggingFace/extracted/30e2f908ec8ca05678e1daf3aab6d65f5b26e4424fc3af099482cba26f2fcdc6/drone_tasks/base/Requirements2Design Definitions.csv:58
/Users/albertorodriguez/desktop/safa/datasets/HuggingFace/extracted/30e2f908ec8ca05678e1daf3aab6d65f5b26e4424fc3af099482cba2

Cleaning artifacts...: 100%|█████████████| 612/612 [00:00<00:00, 1229364.97it/s]
Generating negative links between Requirements -> Design Definitions: 100%|█| 55
Generating negative links between Design Definitions -> Code: 100%|█| 99/99 [00:


[Code: 458],[Design Definitions: 99],[Requirements: 55]
Trace dataset(+280, -(50507) = 50787)
done.


In [7]:
# Merge the per-project splits into one training file and one test file.
# Rows are shuffled so projects are interleaved; the shuffle is seeded so that
# re-running the cell writes identical files.
SHUFFLE_SEED = 42

for split_file_name, split_frames in (("train.csv", train_dataframes), ("test.csv", test_dataframes)):
    combined_df = pd.concat(split_frames).sample(frac=1, random_state=SHUFFLE_SEED)
    combined_df.to_csv(os.path.join(export_dir_path, split_file_name), index=False)