In [2]:
import os
import sys

In [3]:
# Make the parent directory importable so the project packages imported in the
# next cell resolve (presumably the repository root — the relative entry is
# resolved against the kernel's working directory at import time).
# Guarded so that re-running this cell does not keep appending duplicates.
if "../" not in sys.path:
    sys.path.append("../")

In [23]:
from data.creators.trace_dataset_creator import TraceDatasetCreator
from data.creators.multi_trace_dataset_creator import MultiTraceDatasetCreator
from data.datasets.trace_dataset import TraceDataset
from data.readers.hub_project_reader import HubProjectReader

from sklearn.metrics import pairwise_distances
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from util.logging.logger_manager import LoggerManager, logger
from util.logging.logger_config import LoggerConfig
from sklearn.model_selection import train_test_split

import pandas as pd

In [5]:
# Logs go under the current user's home directory.
log_path = os.path.expanduser("~/desktop/safa/logs")
logger_config = LoggerConfig(output_dir=log_path)
# Left as the last expression so the configured logger is displayed by the cell.
LoggerManager.configure_logger(logger_config)

<TGenLogger tgen (INFO)>

# Read Project

In [6]:
# Point a project reader at the "drone" project and hand it to the dataset creator.
project_reader = HubProjectReader("drone")
dataset_creator = TraceDatasetCreator(project_reader)
# `dataset` is the object the later cells work from: its `artifact_df` and
# `trace_df` attributes are read when the CSV frame is built.
dataset = dataset_creator.create()
# Tabular view of the dataset; only its row count is reported here.
trace_df = dataset.to_dataframe()
print(len(trace_df))

Downloading dataset from hub: https://safa-datasets-open.s3.amazonaws.com/datasets/open-source/drone.zip
/Users/albertorodriguez/desktop/safa/datasets/HuggingFace/extracted/30e2f908ec8ca05678e1daf3aab6d65f5b26e4424fc3af099482cba26f2fcdc6/drone_tasks/base/Code.csv:458
/Users/albertorodriguez/desktop/safa/datasets/HuggingFace/extracted/30e2f908ec8ca05678e1daf3aab6d65f5b26e4424fc3af099482cba26f2fcdc6/drone_tasks/base/Design Definitions.csv:99
/Users/albertorodriguez/desktop/safa/datasets/HuggingFace/extracted/30e2f908ec8ca05678e1daf3aab6d65f5b26e4424fc3af099482cba26f2fcdc6/drone_tasks/base/Requirements.csv:55
/Users/albertorodriguez/desktop/safa/datasets/HuggingFace/extracted/30e2f908ec8ca05678e1daf3aab6d65f5b26e4424fc3af099482cba26f2fcdc6/drone_tasks/base/Requirements2Design Definitions.csv:58
/Users/albertorodriguez/desktop/safa/datasets/HuggingFace/extracted/30e2f908ec8ca05678e1daf3aab6d65f5b26e4424fc3af099482cba26f2fcdc6/drone_tasks/base/Design Definitions2Code.csv:222
Artifacts: 612 

Cleaning artifacts...: 100%|█████████████| 612/612 [00:00<00:00, 2273617.40it/s]
Generating negative links between Requirements -> Design Definitions: 100%|█| 55
Generating negative links between Design Definitions -> Code: 100%|█| 99/99 [00:


[Code: 458],[Design Definitions: 99],[Requirements: 55]
Trace dataset(+280, -(50507) = 50787)
50787


In [14]:
# Reads the project again through the reader and keeps only the middle element
# of the returned 3-tuple.
# NOTE(review): this rebinds `trace_df`, replacing the frame obtained from
# `dataset.to_dataframe()` in the earlier cell, and no later cell in view reads
# `trace_df` (the CSV frame is built from `dataset.trace_df`) — confirm whether
# this cell is still needed.
_, trace_df, _ = project_reader.read_project()

Downloading dataset from hub: https://safa-datasets-open.s3.amazonaws.com/datasets/open-source/drone.zip
/Users/albertorodriguez/desktop/safa/datasets/HuggingFace/extracted/30e2f908ec8ca05678e1daf3aab6d65f5b26e4424fc3af099482cba26f2fcdc6/drone_tasks/base/Code.csv:458
/Users/albertorodriguez/desktop/safa/datasets/HuggingFace/extracted/30e2f908ec8ca05678e1daf3aab6d65f5b26e4424fc3af099482cba26f2fcdc6/drone_tasks/base/Design Definitions.csv:99
/Users/albertorodriguez/desktop/safa/datasets/HuggingFace/extracted/30e2f908ec8ca05678e1daf3aab6d65f5b26e4424fc3af099482cba26f2fcdc6/drone_tasks/base/Requirements.csv:55
/Users/albertorodriguez/desktop/safa/datasets/HuggingFace/extracted/30e2f908ec8ca05678e1daf3aab6d65f5b26e4424fc3af099482cba26f2fcdc6/drone_tasks/base/Requirements2Design Definitions.csv:58
/Users/albertorodriguez/desktop/safa/datasets/HuggingFace/extracted/30e2f908ec8ca05678e1daf3aab6d65f5b26e4424fc3af099482cba26f2fcdc6/drone_tasks/base/Design Definitions2Code.csv:222
Artifacts: 612 

In [7]:
def to_csv_dataset(dataset, trace_df):
    """
    Builds a flat frame that pairs every trace link with the text of its two artifacts.

    :param dataset: Object exposing `artifact_df`, a frame indexed by artifact id that has a "content" column.
    :param trace_df: Frame with "source", "target" and "label" columns; "source" and "target" hold artifact ids.
    :return: DataFrame carrying the index of `trace_df` with the columns
             source_id, target_id, source, target, label (in that order).
    :raises KeyError: If a source or target id is not present in the index of `dataset.artifact_df`.
    """
    # Select the text column once and resolve all ids with a single label-list
    # lookup per side, instead of two chained `.loc` lookups for every row.
    # Artifact ids are assumed to be unique in the artifact index; duplicates
    # would make a lookup return more rows than there are links.
    artifact_content = dataset.artifact_df["content"]
    sources = artifact_content.loc[trace_df["source"].tolist()].tolist()
    targets = artifact_content.loc[trace_df["target"].tolist()].tolist()

    df = pd.DataFrame()
    # The first Series assignment gives the new frame the index of trace_df;
    # the looked-up bodies are plain lists and are therefore assigned by position.
    df["source_id"] = trace_df["source"]
    df["target_id"] = trace_df["target"]
    df["source"] = sources
    df["target"] = targets
    df["label"] = trace_df["label"]
    return df

In [11]:
# Pair every link of the dataset's own trace frame with its artifact texts.
drone_df = to_csv_dataset(dataset=dataset, trace_df=dataset.trace_df)
# Preview the first five rows.
drone_df.head(5)

Unnamed: 0_level_0,source_id,target_id,source,target,label
link_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
-7852514118434646260,RE-8,DD-26,UAV State transitions When requested the _Vehi...,Activation state is ON_GROUND When a UAV is in...,1
-3696236521568399644,RE-8,DD-30,UAV State transitions When requested the _Vehi...,Transition from ON_GROUND to AWAITING_TAKEOFF_...,1
-2714593483190114934,RE-8,DD-32,UAV State transitions When requested the _Vehi...,Transition from TAKING_OFF to FLYING When a UA...,1
-3965084921625618917,RE-8,DD-579,UAV State transitions When requested the _Vehi...,Transition from IN_AIR to LANDING When a UAV i...,1
5991805353892408792,RE-9,DD-11,ListenerNotifications If a client registers fo...,Client registers for flight route events A cli...,1


In [29]:
# Few-shot split: keep a fixed number of rows per label for training and send
# every remaining row to the test set.
TRAIN_SIZE_PER_LABEL = 10  # an int train_size is an absolute row count, not a fraction
SPLIT_SEED = 42  # fixed seed so a fresh kernel reproduces the same split

pos_df = drone_df[drone_df["label"] == 1]
neg_df = drone_df[drone_df["label"] == 0]

train_pos_df, test_pos_df = train_test_split(
    pos_df, train_size=TRAIN_SIZE_PER_LABEL, random_state=SPLIT_SEED
)
train_neg_df, test_neg_df = train_test_split(
    neg_df, train_size=TRAIN_SIZE_PER_LABEL, random_state=SPLIT_SEED
)
train_df = pd.concat([train_pos_df, train_neg_df])
test_df = pd.concat([test_pos_df, test_neg_df])
print(f"Train: {len(train_df)}")
# This line previously printed the test-set size under the label "Train:".
print(f"Test: {len(test_df)}")

Train: 20
Train: 50767


In [30]:
# Export location under the current user's home directory; created on demand.
export_dir_path = os.path.expanduser("~/desktop/safa/datasets/openai")
drone_dir_path = os.path.join(export_dir_path, "drone")
os.makedirs(drone_dir_path, exist_ok=True)

# Write the full frame first, then both splits; the frame index is left out of each CSV.
for file_name, frame in (
    ("drone.csv", drone_df),
    ("train.csv", train_df),
    ("test.csv", test_df),
):
    frame.to_csv(os.path.join(drone_dir_path, file_name), index=False)