In [None]:
import pandas as pd
import numpy as np

import os, sys
from pathlib import Path

import nltk
from nltk import word_tokenize
import contractions
import spacy

nltk.download("wordnet")

sys.path.append(os.path.abspath(".."))

from config import DATASET_NAME, EXPERIMENT_NAME, GENERATION_TECHNIQUE

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from sklearn.metrics import precision_recall_fscore_support # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_recall_fscore_support.html

In [None]:
# Directory holding the similarity score CSVs for the dataset, experiment and
# generation technique selected in config (path is relative to the working directory)
score_files_path = Path(f"../../data/{DATASET_NAME}/experiment_outputs/{EXPERIMENT_NAME}/{GENERATION_TECHNIQUE}/similarity_scores/")

In [None]:
# Read every CSV file found in score_files_path into a single DataFrame.
# Only regular files with a .csv suffix are loaded, so other directory entries
# (hidden files, checkpoint folders, ...) are skipped instead of being passed to read_csv.
# Names are sorted so the row order of df does not depend on directory listing order.
score_files = sorted(
    entry.name
    for entry in score_files_path.iterdir()
    if entry.is_file() and entry.suffix.lower() == ".csv"
)

# Collect the frames first and concatenate once, rather than re-copying a growing df per file
score_frames = [pd.read_csv(score_files_path / file_name) for file_name in score_files]

# pd.concat raises on an empty list, so fall back to an empty frame when no CSVs are present
df = pd.concat(score_frames, ignore_index=True) if score_frames else pd.DataFrame()

In [None]:
# Preview the first rows of the combined scores frame
df.head()

In [None]:
# Sanity check: count the rows loaded per similarity metric to confirm every metric is present in df
df["metric"].value_counts()

In [None]:
def parse_root_us(scenario_id):
    """Return the second underscore-separated field of ``scenario_id``.

    The notebook uses this field as the id of the root user story of a
    scenario. The value is returned as a string; an ``IndexError`` is raised
    when ``scenario_id`` contains no underscore.
    """
    # Only the second field is needed, so stop splitting once it is isolated
    fields = scenario_id.split("_", 2)
    return fields[1]

In [None]:
# Root (ground-truth) user story of every scenario, parsed from scenario_id and stored as an integer
root_us_ids = df["scenario_id"].apply(parse_root_us)
df["true_us"] = root_us_ids.astype(str).astype(int)

# True on rows where the user story being scored is the scenario's root user story
df["us_match"] = df["us_id"].eq(df["true_us"])

In [None]:
# Preview df again, now including the true_us and us_match columns
df.head()

In [None]:
# NOTE: metrics are not directly comparable so viewing raw data has limited value
# df.groupby(["metric", "model", "us_match"])["similarity_score"].describe()

#### Analysis

For each `(similarity metric, model)` grouping, the predicted user story for a scenario is the user story that achieves the highest similarity score with that scenario.

In [None]:
# For every (metric, model, scenario) grouping, locate the row with the highest similarity score;
# the user story on that row is the prediction for the scenario
scores_by_scenario = df.groupby(["metric", "model", "scenario_id"])["similarity_score"]
best_row_labels = scores_by_scenario.idxmax()

predicted_matches = df.loc[best_row_labels].rename(columns={"us_id" : "predicted_us"})

predicted_matches.head()

In [None]:
# Size check: one row per (metric, model, scenario_id) grouping
predicted_matches.shape

In [None]:
# For each (metric, model) grouping, list the user stories that are never predicted for any scenario
all_us_ids = set(df["us_id"].unique())

for (metric, model), group_matches in predicted_matches.groupby(["metric", "model"]):
    print(f"Metric: {metric} \nModel: {model}")

    # User stories chosen at least once within this grouping
    predicted_us_ids = set(group_matches["predicted_us"].unique())

    # An empty set means every user story was predicted at least once
    missing_us_ids = all_us_ids.difference(predicted_us_ids)
    print(missing_us_ids, "\n")

In [None]:
# Across all groupings combined: which user stories are never the top prediction for any scenario?
all_us_ids = set(df["us_id"].unique())
predicted_us_ids = set(predicted_matches["predicted_us"].unique())

# An empty set means every user story is predicted at least once somewhere
missing_us_ids = all_us_ids.difference(predicted_us_ids)
missing_us_ids

Accuracy is calculated for each `(similarity metric, model)` grouping as the proportion of scenarios for which the predicted user story (the one with the highest similarity score) matches the true user story.

In [None]:
# us_match is boolean, so its mean within a grouping is the share of scenarios whose
# top-scoring user story is the true one, i.e. the accuracy of that (metric, model) pair
match_rate = predicted_matches.groupby(["metric", "model"])["us_match"].mean()
accuracy = match_rate.rename("accuracy").reset_index()

accuracy


Precision, recall, and F1 are calculated per user story based on the scenarios predicted to belong to that story. Macro and weighted averages summarise performance across user stories within each `(similarity metric, model)` grouping.

In [None]:
per_us_results = []  # one per-US metrics DataFrame per (metric, model) grouping
agg_results = []  # one single-row aggregate DataFrame per (metric, model) grouping

for (metric, model), group in predicted_matches.groupby(["metric", "model"]):
    y_true = group["true_us"]
    y_pred = group["predicted_us"]

    # Per-US rows are reported for every US that is the ground truth of at least one scenario in the group
    us_labels = y_true.unique()

    # As us_match is boolean, its mean is the proportion of scenarios where the top predicted US
    # matches the true US, i.e. the accuracy. It is stored under its own name so that the
    # `accuracy` DataFrame built in the earlier cell is not overwritten by a scalar.
    group_accuracy = group["us_match"].mean()

    # Compute precision, recall, f1-score for each US (including support, which is the number of
    # ground truth instances for each US); zero_division=0 scores undefined ratios as 0
    precision, recall, f1, support = precision_recall_fscore_support(
        y_true, y_pred, labels=us_labels, zero_division=0
    )

    # Count of correctly predicted scenarios per US
    true_positives = group.loc[group["us_match"], "true_us"].value_counts().to_dict()
    # Count of scenarios predicted to belong to each US (correct or not)
    predicted_positives = y_pred.value_counts().to_dict()

    per_us = pd.DataFrame({
        "metric": metric,
        "model": model,
        "us_id": us_labels,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "support": support,
    })

    # A US missing from either dict has a count of zero
    per_us["TP_count"] = per_us["us_id"].map(true_positives).fillna(0).astype(int)
    per_us["PP_count"] = per_us["us_id"].map(predicted_positives).fillna(0).astype(int)

    # Compute aggregated metrics -- macro, which gives equal weight to each US, and weighted, which weights by support.
    # NOTE(review): no `labels` argument is passed here, so scikit-learn uses every label found in
    # y_true or y_pred; a US that is only ever predicted would count towards the macro average
    # while being absent from the per-US table above -- confirm this is intended.
    precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(
        y_true, y_pred, average="macro", zero_division=0
    )

    precision_weighted, recall_weighted, f1_weighted, _ = precision_recall_fscore_support(
        y_true, y_pred, average="weighted", zero_division=0
    )

    agg = pd.DataFrame({
        "metric": [metric],
        "model": [model],
        "accuracy": [group_accuracy],
        "precision_macro": [precision_macro],
        "recall_macro": [recall_macro],
        "f1_macro": [f1_macro],
        "precision_weighted": [precision_weighted],
        "recall_weighted": [recall_weighted],
        "f1_weighted": [f1_weighted]
    })

    # Store the per-US and aggregated metrics for this grouping
    per_us_results.append(per_us)
    agg_results.append(agg)

# Stack the per-grouping frames; per-US rows are ordered by metric, model and US id
per_us_df = pd.concat(per_us_results, ignore_index=True)
per_us_df = per_us_df.sort_values(by=["metric", "model", "us_id"]).reset_index(drop=True)

agg_df = pd.concat(agg_results, ignore_index=True)

In [None]:
# Preview the per-user-story precision / recall / F1 table
per_us_df.head()

In [None]:
# Aggregate metrics: one row per (metric, model) grouping
agg_df

In [None]:
# Compare how often each US is predicted (PP_count) with how often it is the true US (support):
# a positive diff / ratio above 1 means the US is predicted more often than it actually occurs
predicted_counts = per_us_df["PP_count"]
true_counts = per_us_df["support"]

per_us_df["diff"] = predicted_counts.sub(true_counts)
per_us_df["ratio"] = predicted_counts.div(true_counts)

per_us_df.head()

In [None]:
# TODO: guiraud's index for lexical richness