In [1]:
import pandas as pd

import os, sys
from pathlib import Path

import nltk
from nltk import word_tokenize
import contractions
import spacy

# Fetch the WordNet corpus for NLTK (nothing is re-downloaded when the local
# copy is already up to date).
nltk.download("wordnet")

# Put the parent directory on the import path so the `config` import below can
# be resolved (presumably config.py lives one level up — confirm).
sys.path.append(os.path.abspath(".."))

from config import DATASET_NAME, EXPERIMENT_NAME, GENERATION_TECHNIQUE

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\JaneSlevin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Set up paths and directories
# Directory holding the similarity-score files for the dataset, experiment and
# generation technique selected in config. The path is relative, so it is
# resolved against the notebook's working directory.
score_files_path = Path(f"../../data/{DATASET_NAME}/experiment_outputs/{EXPERIMENT_NAME}/{GENERATION_TECHNIQUE}/similarity_scores/")

In [3]:
# List the CSV score files found in the scores directory.
# Only regular "*.csv" files are kept so that stray entries (other file types,
# sub-folders such as notebook checkpoints) are never handed to the CSV reader,
# and the names are sorted because directory listing order is not guaranteed.
score_files = sorted(
    entry.name for entry in score_files_path.glob("*.csv") if entry.is_file()
)

score_files

['embedding_cosine_similarity_scores.csv',
 'meteor_scores.csv',
 'tfidf_cosine_similarity_scores.csv']

In [4]:
# Read every score file and stack the results into a single frame.
# The frames are collected first and concatenated once: calling pd.concat
# inside the loop re-copies all previously loaded rows on every iteration.
score_frames = [pd.read_csv(score_files_path / file_name) for file_name in score_files]

# pd.concat raises on an empty list, so keep the previous behaviour of ending
# up with an empty frame when no score files were found.
df = pd.concat(score_frames, ignore_index=True) if score_frames else pd.DataFrame()

In [7]:
# Preview the first rows of the combined score table
df.head()

Unnamed: 0,model,us_id,scenario_id,metric,similarity_score,scenario_text,us_text,us_text_preprocessed,scenario_text_preprocessed
0,google-gemini-2.0-flash-001,1,google-gemini-2.0-flash-001_1_1,all-MiniLM-L6-v2_embedding_cosine-sim,0.840304,Given I am on the website When I click the add...,"As a user, I want to click on the address, so ...",,
1,google-gemini-2.0-flash-001,1,google-gemini-2.0-flash-001_1_2,all-MiniLM-L6-v2_embedding_cosine-sim,0.712999,Given I am on the website When I click the add...,"As a user, I want to click on the address, so ...",,
2,google-gemini-2.0-flash-001,1,google-gemini-2.0-flash-001_1_3,all-MiniLM-L6-v2_embedding_cosine-sim,0.659918,Given I am on the website When I click the add...,"As a user, I want to click on the address, so ...",,
3,google-gemini-2.0-flash-001,1,google-gemini-2.0-flash-001_1_4,all-MiniLM-L6-v2_embedding_cosine-sim,0.526674,Given I am on the website Then the address lin...,"As a user, I want to click on the address, so ...",,
4,google-gemini-2.0-flash-001,1,google-gemini-2.0-flash-001_2_1,all-MiniLM-L6-v2_embedding_cosine-sim,0.20697,Given I am an anonymous user When I visit the ...,"As a user, I want to click on the address, so ...",,


In [8]:
# Number of rows contributed by each similarity metric
df["metric"].value_counts()

metric
all-MiniLM-L6-v2_embedding_cosine-sim    25041
meteor                                   25041
tfidf_cosine-sim                         25041
Name: count, dtype: int64

In [12]:
#  Function to extract root us_id from scenario_id
def parse_root_us(scenario_id):
    """Return the root user-story id embedded in a scenario id.

    The scenario ids in the score files look like
    ``<model>_<us_id>_<scenario_number>`` (e.g.
    ``google-gemini-2.0-flash-001_1_4``). The id is split from the right so
    that an underscore inside the model name cannot shift the fields.

    Args:
        scenario_id: Scenario identifier string.

    Returns:
        The user-story id as a string; callers convert it to an integer.

    Raises:
        IndexError: If ``scenario_id`` contains no underscore.
    """
    # At most two splits from the right: [model, us_id, scenario_number].
    return scenario_id.rsplit("_", 2)[1]

In [31]:
# Tag every row with the user story its scenario was generated from ("true_us"),
# then flag the rows where the compared user story is that same one ("us_match").
df["true_us"] = df["scenario_id"].map(parse_root_us).astype(str).astype(int)

df["us_match"] = df["us_id"].eq(df["true_us"])

In [32]:
# Preview the table again, now including the true_us and us_match columns
df.head()

Unnamed: 0,model,us_id,scenario_id,metric,similarity_score,scenario_text,us_text,us_text_preprocessed,scenario_text_preprocessed,true_us,us_match
0,google-gemini-2.0-flash-001,1,google-gemini-2.0-flash-001_1_1,all-MiniLM-L6-v2_embedding_cosine-sim,0.840304,Given I am on the website When I click the add...,"As a user, I want to click on the address, so ...",,,1,True
1,google-gemini-2.0-flash-001,1,google-gemini-2.0-flash-001_1_2,all-MiniLM-L6-v2_embedding_cosine-sim,0.712999,Given I am on the website When I click the add...,"As a user, I want to click on the address, so ...",,,1,True
2,google-gemini-2.0-flash-001,1,google-gemini-2.0-flash-001_1_3,all-MiniLM-L6-v2_embedding_cosine-sim,0.659918,Given I am on the website When I click the add...,"As a user, I want to click on the address, so ...",,,1,True
3,google-gemini-2.0-flash-001,1,google-gemini-2.0-flash-001_1_4,all-MiniLM-L6-v2_embedding_cosine-sim,0.526674,Given I am on the website Then the address lin...,"As a user, I want to click on the address, so ...",,,1,True
4,google-gemini-2.0-flash-001,1,google-gemini-2.0-flash-001_2_1,all-MiniLM-L6-v2_embedding_cosine-sim,0.20697,Given I am an anonymous user When I visit the ...,"As a user, I want to click on the address, so ...",,,2,False


In [33]:
# Check column dtypes (us_id and true_us must both be integers to be compared)
df.dtypes

model                          object
us_id                           int64
scenario_id                    object
metric                         object
similarity_score              float64
scenario_text                  object
us_text                        object
us_text_preprocessed           object
scenario_text_preprocessed     object
true_us                         int32
us_match                         bool
dtype: object

In [34]:
# Summary statistics of the similarity scores, split by metric, model and
# whether the compared user story is the scenario's root user story.
# The metrics have different score distributions, so compare within a metric only.
(
    df
    .groupby(["metric", "model", "us_match"])["similarity_score"]
    .describe()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count,mean,std,min,25%,50%,75%,max
metric,model,us_match,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
all-MiniLM-L6-v2_embedding_cosine-sim,google-gemini-2.0-flash-001,False,11800.0,0.117421,0.135131,-0.201243,0.021464,0.091873,0.188137,0.731707
all-MiniLM-L6-v2_embedding_cosine-sim,google-gemini-2.0-flash-001,True,236.0,0.509733,0.161526,-0.046604,0.405142,0.528659,0.624561,0.840304
all-MiniLM-L6-v2_embedding_cosine-sim,openai-gpt-4o-mini,False,12750.0,0.138456,0.139028,-0.193813,0.039047,0.112368,0.213015,0.767539
all-MiniLM-L6-v2_embedding_cosine-sim,openai-gpt-4o-mini,True,255.0,0.551325,0.152865,0.061425,0.44603,0.565446,0.658342,0.861947
meteor,google-gemini-2.0-flash-001,False,11800.0,0.16407,0.050186,0.031646,0.130011,0.15625,0.188863,0.432039
meteor,google-gemini-2.0-flash-001,True,236.0,0.250944,0.080823,0.083612,0.182612,0.242789,0.317969,0.455409
meteor,openai-gpt-4o-mini,False,12750.0,0.151251,0.054745,0.017361,0.11236,0.143443,0.180723,0.457519
meteor,openai-gpt-4o-mini,True,255.0,0.264981,0.091914,0.088652,0.192679,0.255,0.322987,0.540941
tfidf_cosine-sim,google-gemini-2.0-flash-001,False,11800.0,0.020643,0.042555,0.0,0.0,0.0,0.023877,0.545449
tfidf_cosine-sim,google-gemini-2.0-flash-001,True,236.0,0.230692,0.15895,0.0,0.101138,0.209666,0.350346,0.672916


In [35]:
# For every scenario (within each metric and model) keep only the row of the
# user story with the highest similarity score; that user story becomes the
# scenario's prediction, so its id column is renamed to "predicted_us".
top_us_per_scenario = (
    df.loc[
        df.groupby(["metric", "model", "scenario_id"])["similarity_score"].idxmax()
    ]
    .rename(columns={"us_id": "predicted_us"})
)

top_us_per_scenario.head()

Unnamed: 0,model,predicted_us,scenario_id,metric,similarity_score,scenario_text,us_text,us_text_preprocessed,scenario_text_preprocessed,true_us,us_match
2165,google-gemini-2.0-flash-001,10,google-gemini-2.0-flash-001_10_1,all-MiniLM-L6-v2_embedding_cosine-sim,0.763982,Given I am on the special waste drop-off site ...,"As a user, I want to be able to view a map dis...",,,10,True
2166,google-gemini-2.0-flash-001,10,google-gemini-2.0-flash-001_10_2,all-MiniLM-L6-v2_embedding_cosine-sim,0.623315,Given I am on the special waste drop-off site ...,"As a user, I want to be able to view a map dis...",,,10,True
2167,google-gemini-2.0-flash-001,10,google-gemini-2.0-flash-001_10_3,all-MiniLM-L6-v2_embedding_cosine-sim,0.666842,Given I am on the special waste drop-off site ...,"As a user, I want to be able to view a map dis...",,,10,True
2168,google-gemini-2.0-flash-001,10,google-gemini-2.0-flash-001_10_4,all-MiniLM-L6-v2_embedding_cosine-sim,0.603735,Given I am on the special waste drop-off site ...,"As a user, I want to be able to view a map dis...",,,10,True
2169,google-gemini-2.0-flash-001,10,google-gemini-2.0-flash-001_10_5,all-MiniLM-L6-v2_embedding_cosine-sim,0.707083,Given I am on the special waste drop-off site ...,"As a user, I want to be able to view a map dis...",,,10,True


In [36]:
# Accuracy per metric/model: share of scenarios whose predicted (highest-scoring)
# user story is the root user story.
# TODO update to consider all correctly classified
accuracy = (
    top_us_per_scenario
    .groupby(["metric", "model"])["us_match"]
    .mean()
    .reset_index()
)

accuracy

Unnamed: 0,metric,model,us_match
0,all-MiniLM-L6-v2_embedding_cosine-sim,google-gemini-2.0-flash-001,0.766949
1,all-MiniLM-L6-v2_embedding_cosine-sim,openai-gpt-4o-mini,0.788235
2,meteor,google-gemini-2.0-flash-001,0.351695
3,meteor,openai-gpt-4o-mini,0.427451
4,tfidf_cosine-sim,google-gemini-2.0-flash-001,0.59322
5,tfidf_cosine-sim,openai-gpt-4o-mini,0.713725


In [37]:
# Check column dtypes of the per-scenario prediction table
top_us_per_scenario.dtypes

model                          object
predicted_us                    int64
scenario_id                    object
metric                         object
similarity_score              float64
scenario_text                  object
us_text                        object
us_text_preprocessed           object
scenario_text_preprocessed     object
true_us                         int32
us_match                         bool
dtype: object

In [38]:
# Precision, recall and F1-score for each user story within every
# (metric, model) pair, treating a scenario's top-scoring user story as its
# predicted class and its root user story as the true class.
prf = []

for (metric, model), group in top_us_per_scenario.groupby(["metric", "model"]):
    predicted = group["predicted_us"]
    actual = group["true_us"]

    for us in actual.unique():
        predicted_as_us = predicted == us
        actually_us = actual == us

        true_pos = (predicted_as_us & actually_us).sum()
        false_pos = (predicted_as_us & ~actually_us).sum()
        false_neg = (~predicted_as_us & actually_us).sum()

        # Every ratio falls back to 0 when its denominator is empty.
        predicted_count = true_pos + false_pos
        actual_count = true_pos + false_neg
        precision = true_pos / predicted_count if predicted_count > 0 else 0
        recall = true_pos / actual_count if actual_count > 0 else 0

        pr_sum = precision + recall
        f1 = 2 * (precision * recall) / pr_sum if pr_sum > 0 else 0

        prf.append(
            dict(
                metric=metric,
                model=model,
                us_id=us,
                precision=precision,
                recall=recall,
                f1_score=f1,
            )
        )


In [39]:
# One row per (metric, model, user story) with its precision, recall and F1-score
prf_df = pd.DataFrame(prf)

prf_df.head(20)

Unnamed: 0,metric,model,us_id,precision,recall,f1_score
0,all-MiniLM-L6-v2_embedding_cosine-sim,google-gemini-2.0-flash-001,10,1.0,1.0,1.0
1,all-MiniLM-L6-v2_embedding_cosine-sim,google-gemini-2.0-flash-001,11,0.833333,1.0,0.909091
2,all-MiniLM-L6-v2_embedding_cosine-sim,google-gemini-2.0-flash-001,12,0.375,0.75,0.5
3,all-MiniLM-L6-v2_embedding_cosine-sim,google-gemini-2.0-flash-001,13,0.8,0.8,0.8
4,all-MiniLM-L6-v2_embedding_cosine-sim,google-gemini-2.0-flash-001,14,0.454545,1.0,0.625
5,all-MiniLM-L6-v2_embedding_cosine-sim,google-gemini-2.0-flash-001,15,0.0,0.0,0.0
6,all-MiniLM-L6-v2_embedding_cosine-sim,google-gemini-2.0-flash-001,16,0.8,1.0,0.888889
7,all-MiniLM-L6-v2_embedding_cosine-sim,google-gemini-2.0-flash-001,17,1.0,1.0,1.0
8,all-MiniLM-L6-v2_embedding_cosine-sim,google-gemini-2.0-flash-001,18,1.0,1.0,1.0
9,all-MiniLM-L6-v2_embedding_cosine-sim,google-gemini-2.0-flash-001,19,0.333333,0.4,0.363636


In [40]:
# Sanity check: number of (metric, model, user story) rows and columns
prf_df.shape

(306, 6)

In [41]:
# Macro-average: unweighted mean of the per-user-story precision, recall and
# F1-score for each metric/model pair.
test = (
    prf_df
    .groupby(["metric", "model"])[["precision", "recall", "f1_score"]]
    .mean()
    .reset_index()
)

In [42]:
# Show the averaged precision / recall / F1-score per metric and model
test

Unnamed: 0,metric,model,precision,recall,f1_score
0,all-MiniLM-L6-v2_embedding_cosine-sim,google-gemini-2.0-flash-001,0.732605,0.761765,0.720306
1,all-MiniLM-L6-v2_embedding_cosine-sim,openai-gpt-4o-mini,0.813652,0.788235,0.772021
2,meteor,google-gemini-2.0-flash-001,0.415579,0.345098,0.327396
3,meteor,openai-gpt-4o-mini,0.481119,0.427451,0.387748
4,tfidf_cosine-sim,google-gemini-2.0-flash-001,0.543714,0.584314,0.525339
5,tfidf_cosine-sim,openai-gpt-4o-mini,0.713141,0.713725,0.677441


In [None]:
# TODO: compute Guiraud's index (unique tokens / sqrt(total tokens)) as a measure of lexical richness