In [17]:
import os
import openai
import sys
import json
import pandas as pd
import openai

sys.path.append("../")

from data.creators.trace_dataset_creator import TraceDatasetCreator
from data.creators.multi_trace_dataset_creator import MultiTraceDatasetCreator
from data.datasets.trace_dataset import TraceDataset
from data.readers.hub_project_reader import HubProjectReader

from sklearn.metrics import pairwise_distances
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from util.logging.logger_manager import LoggerManager, logger
from util.logging.logger_config import LoggerConfig

from openai.embeddings_utils import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Credentials are read from the environment so that no secret lives in the notebook.
# NOTE: the key that used to be hardcoded here must be treated as leaked and revoked.
# OPENAI_ORGANIZATION is optional; a missing value leaves the organization unset (None).
openai.organization = os.environ.get("OPENAI_ORGANIZATION")
if "OPENAI_API_KEY" not in os.environ:
    # Fail in the config cell rather than later, after the dataset has been downloaded.
    raise RuntimeError("Set the OPENAI_API_KEY environment variable before running this notebook.")
openai.api_key = os.environ["OPENAI_API_KEY"]

# Send log output to a directory under the current user's home.
log_path = os.path.expanduser("~/desktop/safa/logs")
LoggerManager.configure_logger(LoggerConfig(output_dir=log_path))

<TGenLogger tgen (INFO)>

# Read Project

In [18]:
# Read the "cchit" project through the hub reader and build a trace dataset from it.
project_reader = HubProjectReader("cchit")
dataset_creator = TraceDatasetCreator(project_reader)
dataset = dataset_creator.create()
# Flatten the dataset into a DataFrame and print its row count
# (presumably one row per candidate source/target link -- confirm against TraceDataset).
trace_df = dataset.to_dataframe()
print(len(trace_df))

Downloading dataset from hub: https://safa-datasets-open.s3.amazonaws.com/datasets/open-source/cchit.zip
/Users/albertorodriguez/desktop/safa/datasets/HuggingFace/extracted/de7d81e78e8cd5bd1c84b62f339304a970f4f4819b0978cf3cbdfcf9c7748617/cchit/source.xml:118
/Users/albertorodriguez/desktop/safa/datasets/HuggingFace/extracted/de7d81e78e8cd5bd1c84b62f339304a970f4f4819b0978cf3cbdfcf9c7748617/cchit/target.xml:1066
/Users/albertorodriguez/desktop/safa/datasets/HuggingFace/extracted/de7d81e78e8cd5bd1c84b62f339304a970f4f4819b0978cf3cbdfcf9c7748617/cchit/answer.txt:587
Artifacts: 1180 Traces: 587 Queries: 1
Number of orphan artifacts (693)
No missing source artifacts. (0)
No missing target artifacts. (0)


Cleaning artifacts...: 100%|███████████| 1180/1180 [00:00<00:00, 2059624.94it/s]
Generating negative links between Requirements -> Regulatory Codes: 100%|█| 116/


[Requirements: 116],[Regulatory Codes: 1064]
Trace dataset(+587, -(122837) = 123424)
123424


In [19]:
def create_map(embeddings):
    """
    Pair every artifact id with its embedding vector.

    Reads the notebook-level `dataset`: the n-th row of `dataset.artifact_df`
    is matched with the n-th entry of `embeddings["data"]`, so the embeddings
    must have been requested in artifact_df row order.

    :param embeddings: Embedding response exposing `["data"][n]["embedding"]`.
    :return: Dictionary from artifact id (the artifact_df index label) to its embedding.
    """
    return {
        artifact_id: embeddings["data"][position]["embedding"]
        for position, artifact_id in enumerate(dataset.artifact_df.index)
    }

def create_vsm_map(model):
    """
    Vectorize the content of every artifact with an already fitted model.

    Reads the notebook-level `dataset`; each artifact's "content" value is
    transformed on its own and the resulting single row is stored densely.

    :param model: Object whose `transform([text])` result supports `.toarray()`.
    :return: Dictionary from artifact id (the artifact_df index label) to its dense vector.
    """
    return {
        artifact_id: model.transform([text]).toarray()[0]
        for artifact_id, text in dataset.artifact_df["content"].items()
    }

def calculate_scores(e_map):
    """
    Score every trace row by the cosine similarity of its two artifacts.

    Reads the notebook-level `dataset`. The "source" and "target" columns of
    `dataset.trace_df` are iterated directly instead of via `iterrows()`:
    `iterrows()` materializes a Series for each row (slow for the full set of
    candidate links) and may upcast values when the row mixes dtypes.

    :param e_map: Mapping from artifact id to its vector, looked up with `e_map[id]`
                  for both ends of each trace row (a missing id raises, as with any mapping).
    :return: List of similarity scores, one per trace row, in trace_df row order.
    """
    links = dataset.trace_df
    return [
        cosine_similarity(e_map[source_id], e_map[target_id])
        for source_id, target_id in zip(links["source"], links["target"])
    ]

def calculate_metrics(labels, scores, threshold):
    """
    Evaluate similarity scores against the true labels.

    Two groups of metrics are produced:
    - "sklearn": scores are binarized (1 when score >= threshold, else 0) and
      compared with the labels; average precision uses the raw scores.
    - "tgen": whatever the project's precision-at-recall and F metrics return
      from `_compute(scores, labels)`, merged into one dictionary (keys of the
      F metric win on collision).

    :param labels: True label of each link.
    :param scores: Similarity score of each link, aligned with `labels`.
    :param threshold: Cut-off used to turn a score into a predicted label.
    :return: Dictionary with the "sklearn" and "tgen" metric groups.
    """
    from sklearn.metrics import average_precision_score, fbeta_score, precision_score, recall_score
    from train.metrics.precision_at_recall_metric import PrecisionAtRecallMetric
    from train.metrics.f1_metric import FMetric

    f_metric = FMetric()
    p_at_r = PrecisionAtRecallMetric()
    pred_labels = [1 if score >= threshold else 0 for score in scores]

    # Threshold-dependent metrics first, then the ranking-based average precision.
    sklearn_metrics = {"threshold": threshold}
    sklearn_metrics["precision"] = precision_score(labels, pred_labels)
    sklearn_metrics["recall"] = recall_score(labels, pred_labels)
    sklearn_metrics["f1"] = fbeta_score(labels, pred_labels, beta=1)
    sklearn_metrics["f2"] = fbeta_score(labels, pred_labels, beta=2)
    sklearn_metrics["ap"] = average_precision_score(labels, scores)

    # Project metrics receive the raw scores, not the binarized predictions.
    precision_at_recall = p_at_r._compute(scores, labels)
    f_scores = f_metric._compute(scores, labels)
    tgen_metrics = {**precision_at_recall, **f_scores}

    return {"sklearn": sklearn_metrics, "tgen": tgen_metrics}

In [20]:
# Results keyed by model name; later cells add one entry per evaluated model.
metric_map = {}
# Text of every artifact, in artifact_df row order (create_map / create_vsm_map iterate the same frame).
artifact_bodies = list(dataset.artifact_df["content"])
# Label of every trace row, in trace_df row order (calculate_scores iterates the same frame).
labels = list(dataset.trace_df["label"])

In [21]:
# TF-IDF (VSM) baseline: fit the vectorizer on every artifact text.
tfidf_model = TfidfVectorizer()
tfidf_model.fit(artifact_bodies)

# Vectorize each artifact, then score every trace row by cosine similarity.
vsm_map = create_vsm_map(tfidf_model)
vsm_scores = calculate_scores(vsm_map)

# Classification cut-off: midpoint between the lowest and highest observed score.
threshold = (max(vsm_scores) + min(vsm_scores)) / 2.0
vsm_metrics = calculate_metrics(labels, vsm_scores, threshold)
metric_map["vsm"] = vsm_metrics

In [23]:
# Embedding models to evaluate; the commented-out entries below are currently disabled.
models = ["text-embedding-ada-002"]#,
#           "text-similarity-curie-001", 
#           "text-search-ada-doc-001", # "text-search-ada-query-001"
#           "text-similarity-ada-001",
#           "text-similarity-davinci-001"]

for model in models:
    # Skip models already present in metric_map so re-running the cell does not repeat the API call.
    if model in metric_map:
        continue
    # A single request embeds every artifact text.
    embeddings = openai.Embedding.create(input=artifact_bodies, model=model)

    openai_map = create_map(embeddings)
    openai_scores = calculate_scores(openai_map)

    # Same midpoint threshold rule as the VSM cell: halfway between the lowest and highest score.
    threshold = (max(openai_scores) + min(openai_scores)) / 2.0
    metrics = calculate_metrics(labels, openai_scores, threshold)
    metric_map[model] = metrics
    
# Dump the metrics of every evaluated model (including earlier entries such as "vsm").
print(json.dumps(metric_map, indent=4))

{
    "vsm": {
        "sklearn": {
            "threshold": 0.28421813471297364,
            "precision": 0.3063063063063063,
            "recall": 0.05792163543441227,
            "f1": 0.09742120343839543,
            "f2": 0.06913379422529484,
            "ap": 0.08488282102308545
        },
        "tgen": {
            "precision_at_recall_95": 0.005636078985909803,
            "best_threshold": 0.02044291460897167,
            "f1": 0.16363636363636364,
            "f2": 0.21614118580210193
        }
    },
    "text-embedding-ada-002": {
        "sklearn": {
            "threshold": 0.8253527613877167,
            "precision": 0.014154800507542647,
            "recall": 0.8551959114139693,
            "f1": 0.027848663042272268,
            "f2": 0.06637928754661096,
            "ap": 0.14888420952302583
        },
        "tgen": {
            "precision_at_recall_95": 0.00854085990234644,
            "best_threshold": 0.804586649179889,
            "f1": 0.22376409366869038,


In [None]:
# Spot-check: display the text of one randomly sampled artifact (no random_state, so it varies per run).
dataset.artifact_df.sample(n=1).iloc[0]["content"]