In [1]:
import pandas as pd

import nltk
from nltk import word_tokenize
import contractions
import spacy

# Fetch the NLTK WordNet corpus (a no-op when it is already up to date locally).
# NOTE(review): none of the visible cells use WordNet — confirm this download is still needed.
nltk.download("wordnet")

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\JaneSlevin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Load one CSV per similarity metric.
# Add other similarity dfs (tfidf cosine/embedding cosine) to this
tfidf, meteor, sent_transformer = (
    pd.read_csv(csv_name)
    for csv_name in ("all_tfidf.csv", "all_meteor.csv", "all_sent_transformer.csv")
)

In [3]:
# Stack the per-metric frames into one long table with a fresh 0..n-1 index
df = pd.concat([tfidf, meteor, sent_transformer], ignore_index=True)

In [4]:
# Preview the first rows of the combined score table
df.head()

Unnamed: 0,us_id,scenario_id,score,us_text,scenario_text,model,metric
0,g02-federalspending_100,g02-federalspending_100_openai-gpt-4o-mini_s1,0.495752,"As an Owner, I want to design a schedule from ...",Given the Owner is logged into the system When...,openai-gpt-4o-mini,tfidf_cosine
1,g02-federalspending_100,g02-federalspending_100_openai-gpt-4o-mini_s2,0.413337,"As an Owner, I want to design a schedule from ...",Given the Owner is logged into the system And ...,openai-gpt-4o-mini,tfidf_cosine
2,g02-federalspending_100,g02-federalspending_100_openai-gpt-4o-mini_s3,0.520676,"As an Owner, I want to design a schedule from ...",Given the Owner is logged into the system And ...,openai-gpt-4o-mini,tfidf_cosine
3,g02-federalspending_100,g02-federalspending_100_openai-gpt-4o-mini_s4,0.500605,"As an Owner, I want to design a schedule from ...",Given the Owner is logged into the system And ...,openai-gpt-4o-mini,tfidf_cosine
4,g02-federalspending_100,g02-federalspending_1029_openai-gpt-4o-mini_s1,0.078025,"As an Owner, I want to design a schedule from ...",Given the leadership is reviewing the round 2 ...,openai-gpt-4o-mini,tfidf_cosine


In [5]:
# Count how many rows each similarity metric contributes
df["metric"].value_counts()

metric
tfidf_cosine                                93860
meteor                                      93860
sent-transformer-all-MiniLM-L6-v2_cosine    93860
Name: count, dtype: int64

In [6]:
# Functions to extract the current and root US numbers from us_id and scenario_id values
def parse_current_us(us_id):
    """Return the second underscore-separated field of ``us_id``.

    For example, ``"g02-federalspending_100"`` gives ``"100"``. The result is
    a string. An ID without any underscore raises ``IndexError``.
    """
    fields = us_id.split("_")
    return fields[1]

def parse_root_us(scenario_id):
    """Return the second underscore-separated field of ``scenario_id``.

    For example, ``"g02-federalspending_1029_openai-gpt-4o-mini_s1"`` gives
    ``"1029"``. The result is a string. An ID without any underscore raises
    ``IndexError``.
    """
    fields = scenario_id.split("_")
    return fields[1]

In [7]:
# Derive the current US number (from us_id) and the root US number (from scenario_id)
# for every row, then flag the rows where the two are the same
df["current_us"] = df["us_id"].apply(parse_current_us)
df["true_us"] = df["scenario_id"].apply(parse_root_us)
df["us_match"] = df["current_us"].eq(df["true_us"])

In [8]:
# Preview the frame again to check the current_us / true_us / us_match columns
df.head()

Unnamed: 0,us_id,scenario_id,score,us_text,scenario_text,model,metric,current_us,true_us,us_match
0,g02-federalspending_100,g02-federalspending_100_openai-gpt-4o-mini_s1,0.495752,"As an Owner, I want to design a schedule from ...",Given the Owner is logged into the system When...,openai-gpt-4o-mini,tfidf_cosine,100,100,True
1,g02-federalspending_100,g02-federalspending_100_openai-gpt-4o-mini_s2,0.413337,"As an Owner, I want to design a schedule from ...",Given the Owner is logged into the system And ...,openai-gpt-4o-mini,tfidf_cosine,100,100,True
2,g02-federalspending_100,g02-federalspending_100_openai-gpt-4o-mini_s3,0.520676,"As an Owner, I want to design a schedule from ...",Given the Owner is logged into the system And ...,openai-gpt-4o-mini,tfidf_cosine,100,100,True
3,g02-federalspending_100,g02-federalspending_100_openai-gpt-4o-mini_s4,0.500605,"As an Owner, I want to design a schedule from ...",Given the Owner is logged into the system And ...,openai-gpt-4o-mini,tfidf_cosine,100,100,True
4,g02-federalspending_100,g02-federalspending_1029_openai-gpt-4o-mini_s1,0.078025,"As an Owner, I want to design a schedule from ...",Given the leadership is reviewing the round 2 ...,openai-gpt-4o-mini,tfidf_cosine,100,1029,False


In [9]:
# Show only the rows scored with the sentence-transformer cosine metric
df.loc[df["metric"].eq("sent-transformer-all-MiniLM-L6-v2_cosine")]

Unnamed: 0,us_id,scenario_id,score,us_text,scenario_text,model,metric,current_us,true_us,us_match
187720,g02-federalspending_100,g02-federalspending_100_openai-gpt-4o-mini_s1,0.662921,"As an Owner, I want to design a schedule from ...",Given the Owner is logged into the system When...,openai-gpt-4o-mini,sent-transformer-all-MiniLM-L6-v2_cosine,100,100,True
187721,g02-federalspending_100,g02-federalspending_100_openai-gpt-4o-mini_s2,0.481764,"As an Owner, I want to design a schedule from ...",Given the Owner is logged into the system And ...,openai-gpt-4o-mini,sent-transformer-all-MiniLM-L6-v2_cosine,100,100,True
187722,g02-federalspending_100,g02-federalspending_100_openai-gpt-4o-mini_s3,0.660835,"As an Owner, I want to design a schedule from ...",Given the Owner is logged into the system And ...,openai-gpt-4o-mini,sent-transformer-all-MiniLM-L6-v2_cosine,100,100,True
187723,g02-federalspending_100,g02-federalspending_100_openai-gpt-4o-mini_s4,0.645887,"As an Owner, I want to design a schedule from ...",Given the Owner is logged into the system And ...,openai-gpt-4o-mini,sent-transformer-all-MiniLM-L6-v2_cosine,100,100,True
187724,g02-federalspending_100,g02-federalspending_1029_openai-gpt-4o-mini_s1,0.297325,"As an Owner, I want to design a schedule from ...",Given the leadership is reviewing the round 2 ...,openai-gpt-4o-mini,sent-transformer-all-MiniLM-L6-v2_cosine,100,1029,False
...,...,...,...,...,...,...,...,...,...,...
281575,g02-federalspending_965,g02-federalspending_965_google-gemini-2.0-flas...,0.696545,"As an Agency user, I want all historical Finan...",Given the historical Financial Assistance data...,google-gemini-2.0-flash-001,sent-transformer-all-MiniLM-L6-v2_cosine,965,965,True
281576,g02-federalspending_965,g02-federalspending_965_google-gemini-2.0-flas...,0.451448,"As an Agency user, I want all historical Finan...",Given the source system contains historical Fi...,google-gemini-2.0-flash-001,sent-transformer-all-MiniLM-L6-v2_cosine,965,965,True
281577,g02-federalspending_965,g02-federalspending_965_google-gemini-2.0-flas...,0.545511,"As an Agency user, I want all historical Finan...",Given the source system contains historical Fi...,google-gemini-2.0-flash-001,sent-transformer-all-MiniLM-L6-v2_cosine,965,965,True
281578,g02-federalspending_965,g02-federalspending_965_google-gemini-2.0-flas...,0.504938,"As an Agency user, I want all historical Finan...",Given the historical data load process is init...,google-gemini-2.0-flash-001,sent-transformer-all-MiniLM-L6-v2_cosine,965,965,True


In [10]:
# Summary statistics of the score for each (metric, model, us_match) combination.
# Note: the metrics have different score distributions, so they are not directly comparable.
(
    df
    .groupby(["metric", "model", "us_match"])["score"]
    .describe()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count,mean,std,min,25%,50%,75%,max
metric,model,us_match,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
meteor,google-gemini-2.0-flash-001,False,50196.0,0.135487,0.061715,0.0,0.095238,0.128205,0.165017,0.608625
meteor,google-gemini-2.0-flash-001,True,534.0,0.302574,0.104639,0.065217,0.231678,0.295484,0.365737,0.654829
meteor,openai-gpt-4o-mini,False,42676.0,0.128358,0.061732,0.0,0.087209,0.119048,0.157068,0.527461
meteor,openai-gpt-4o-mini,True,454.0,0.334496,0.112765,0.07874,0.247085,0.322402,0.406885,0.723063
sent-transformer-all-MiniLM-L6-v2_cosine,google-gemini-2.0-flash-001,False,50196.0,0.138632,0.127127,-0.183097,0.04666,0.120738,0.213428,0.839142
sent-transformer-all-MiniLM-L6-v2_cosine,google-gemini-2.0-flash-001,True,534.0,0.605122,0.154151,-0.060614,0.513384,0.633848,0.709492,0.939911
sent-transformer-all-MiniLM-L6-v2_cosine,openai-gpt-4o-mini,False,42676.0,0.146241,0.125664,-0.169223,0.054733,0.129136,0.221302,0.836062
sent-transformer-all-MiniLM-L6-v2_cosine,openai-gpt-4o-mini,True,454.0,0.670167,0.128065,0.288389,0.587541,0.68107,0.771014,0.92991
tfidf_cosine,google-gemini-2.0-flash-001,False,50196.0,0.020645,0.041246,0.0,0.0,0.0,0.025089,0.704237
tfidf_cosine,google-gemini-2.0-flash-001,True,534.0,0.24301,0.161737,0.0,0.111784,0.220632,0.330506,0.73322


In [11]:
# One row per scenario (per metric, per model): the US with the highest similarity score.
# idxmax returns the first row label when several rows tie for the maximum.
best_row_labels = df.groupby(["metric", "model", "scenario_id"])["score"].idxmax()

# In the selected rows, current_us is the prediction for that scenario, so rename it
top_us_per_scenario = df.loc[best_row_labels].rename(columns={"current_us": "predicted_us"})

top_us_per_scenario.head()

Unnamed: 0,us_id,scenario_id,score,us_text,scenario_text,model,metric,predicted_us,true_us,us_match
136990,g02-federalspending_100,g02-federalspending_100_google-gemini-2.0-flas...,0.338873,"As an Owner, I want to design a schedule from ...",Given I am logged in as an Owner And I am on t...,google-gemini-2.0-flash-001,meteor,100,100,True
137145,g02-federalspending_45,g02-federalspending_100_google-gemini-2.0-flas...,0.328414,"As an Owner, I want to design an audit from th...",Given I am logged in as an Owner And I am on t...,google-gemini-2.0-flash-001,meteor,45,100,False
137240,g02-federalspending_45,g02-federalspending_100_google-gemini-2.0-flas...,0.328414,"As an Owner, I want to design an audit from th...",Given I am logged in as an Owner And I am on t...,google-gemini-2.0-flash-001,meteor,45,100,False
137335,g02-federalspending_45,g02-federalspending_100_google-gemini-2.0-flas...,0.330454,"As an Owner, I want to design an audit from th...",Given I am logged in as an Owner And I am on t...,google-gemini-2.0-flash-001,meteor,45,100,False
137370,g02-federalspending_100,g02-federalspending_100_google-gemini-2.0-flas...,0.25974,"As an Owner, I want to design a schedule from ...",Given I am logged in as an Owner And I am on t...,google-gemini-2.0-flash-001,meteor,100,100,True


In [None]:
# Accuracy per (metric, model): the proportion of scenarios whose predicted
# (highest-scoring) US is the root US, i.e. the mean of the boolean us_match column.
# TODO: update to consider all correctly classified
accuracy = (
    top_us_per_scenario
    .groupby(["metric", "model"])["us_match"]
    .mean()
    .reset_index()
)

accuracy

Unnamed: 0,metric,model,us_match
0,meteor,google-gemini-2.0-flash-001,0.546816
1,meteor,openai-gpt-4o-mini,0.689427
2,sent-transformer-all-MiniLM-L6-v2_cosine,google-gemini-2.0-flash-001,0.837079
3,sent-transformer-all-MiniLM-L6-v2_cosine,openai-gpt-4o-mini,0.898678
4,tfidf_cosine,google-gemini-2.0-flash-001,0.586142
5,tfidf_cosine,openai-gpt-4o-mini,0.651982


In [13]:
# Per-US precision, recall, and F1-score for each (metric, model) combination
def _ratio_or_zero(numerator, denominator):
    """Return numerator / denominator, or 0 when the denominator is not positive."""
    return numerator / denominator if denominator > 0 else 0


prf = []

for (metric, model), group in top_us_per_scenario.groupby(["metric", "model"]):
    predicted = group["predicted_us"]
    actual = group["true_us"]

    # Only USs that occur as a root US within this group get a row
    for us in actual.unique():
        is_predicted = predicted == us
        is_actual = actual == us

        true_pos = (is_predicted & is_actual).sum()
        false_pos = (is_predicted & ~is_actual).sum()
        false_neg = (~is_predicted & is_actual).sum()

        precision = _ratio_or_zero(true_pos, true_pos + false_pos)
        recall = _ratio_or_zero(true_pos, true_pos + false_neg)
        f1 = _ratio_or_zero(2 * (precision * recall), precision + recall)

        prf.append({
            "metric": metric,
            "model": model,
            "us_id": us,
            "precision": precision,
            "recall": recall,
            "f1_score": f1
        })


In [14]:
# Build the frame from the list of records in one go (one row per dict in prf)
prf_df = pd.DataFrame(prf)

# Show the first 20 rows
prf_df.head(20)

Unnamed: 0,metric,model,us_id,precision,recall,f1_score
0,meteor,google-gemini-2.0-flash-001,100,1.0,0.571429,0.727273
1,meteor,google-gemini-2.0-flash-001,1037,1.0,0.714286,0.833333
2,meteor,google-gemini-2.0-flash-001,1050,0.666667,1.0,0.8
3,meteor,google-gemini-2.0-flash-001,1067,0.714286,0.833333,0.769231
4,meteor,google-gemini-2.0-flash-001,1084,1.0,0.8,0.888889
5,meteor,google-gemini-2.0-flash-001,1086,1.0,1.0,1.0
6,meteor,google-gemini-2.0-flash-001,1095,1.0,0.375,0.545455
7,meteor,google-gemini-2.0-flash-001,1132,0.5,1.0,0.666667
8,meteor,google-gemini-2.0-flash-001,1141,1.0,1.0,1.0
9,meteor,google-gemini-2.0-flash-001,1165,0.5,0.75,0.6


In [15]:
# (number of rows, number of columns) of the per-US precision/recall/F1 table
prf_df.shape

(558, 6)

In [23]:
# Unweighted mean of the per-row precision, recall and F1 within each (metric, model).
# The f1_score column here is the mean of the row-level F1 values, not an F1 recomputed
# from the averaged precision and recall.
test = (
    prf_df
    .groupby(["metric", "model"])[["precision", "recall", "f1_score"]]
    .mean()
    .reset_index()
)

In [24]:
# Display the mean precision / recall / F1 per (metric, model)
# NOTE(review): `test` is a generic name for this summary table — consider a more descriptive one
test

Unnamed: 0,metric,model,precision,recall,f1_score
0,meteor,google-gemini-2.0-flash-001,0.644306,0.613632,0.563789
1,meteor,openai-gpt-4o-mini,0.71739,0.703158,0.670579
2,sent-transformer-all-MiniLM-L6-v2_cosine,google-gemini-2.0-flash-001,0.852423,0.860841,0.828369
3,sent-transformer-all-MiniLM-L6-v2_cosine,openai-gpt-4o-mini,0.906491,0.907368,0.892679
4,tfidf_cosine,google-gemini-2.0-flash-001,0.53075,0.617979,0.526679
5,tfidf_cosine,openai-gpt-4o-mini,0.617133,0.660175,0.597572


In [None]:
# TODO: compute Guiraud's index as a measure of lexical richness