In [1]:
import pandas as pd
import os
from pathlib import Path

import nltk
from nltk import word_tokenize
import contractions
import spacy

# nltk.download("wordnet")

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Set up paths and directories
experiment_path = Path('../../data/gherkins/sample_data/test/')

# Output directory for the user-story/scenario similarity results; created
# (including any missing parents) if it does not exist yet.
us_scen_sim_path = experiment_path / 'us_scenario_similarity_results'
us_scen_sim_path.mkdir(parents=True, exist_ok=True)

In [3]:
# Load the parsed scenario data from the experiment directory.
# NOTE(review): the preview suggests one row per generated Gherkin scenario,
# with the originating user story repeated in `us_text` — confirm.
df = pd.read_csv(experiment_path / 'parsed_scenario_data.csv')

df.head()

# TODO: include scenario_name in scenario_text?

Unnamed: 0,app_id,model,us_id,scenario_id,scenario_text,feature_name,scenario_name,scenario_examples,us_text
0,g04,google-gemini-2.0-flash-001,1,google-gemini-2.0-flash-001_1_1,Given I am on the page with the address link W...,Address Link Opens Google Maps in New Tab,Clicking the address link opens Google Maps in...,,"As a user, I want to click on the address, so ..."
1,g04,google-gemini-2.0-flash-001,2,google-gemini-2.0-flash-001_2_1,Given I am an anonymous user When I navigate t...,Anonymous User Can View Public Recycling Cente...,Anonymous user can view a list of recycling ce...,,"As a user, I want to be able to anonymously vi..."
2,g04,google-gemini-2.0-flash-001,2,google-gemini-2.0-flash-001_2_2,Given I am an anonymous user And a recycling c...,Anonymous User Can View Public Recycling Cente...,Anonymous user can view details of a specific ...,,"As a user, I want to be able to anonymously vi..."
3,g04,google-gemini-2.0-flash-001,3,google-gemini-2.0-flash-001_3_1,Given I am on the recycling facility search pa...,Find Recycling Facilities by Zip Code,Entering a valid zip code displays nearby recy...,,"As a user, I want to be able to enter my zip c..."
4,g04,google-gemini-2.0-flash-001,3,google-gemini-2.0-flash-001_3_2,Given I am on the recycling facility search pa...,Find Recycling Facilities by Zip Code,Entering an invalid zip code displays an error...,,"As a user, I want to be able to enter my zip c..."


In [4]:
df.shape

(26, 9)

In [5]:
df.model.value_counts()

model
google-gemini-2.0-flash-001    14
openai-gpt-4o-mini             12
Name: count, dtype: int64

#### Preprocessing for TF-IDF.

In [6]:
def expand_contractions(input_string):
    """Return ``input_string`` with contractions expanded via ``contractions.fix``."""
    return contractions.fix(input_string)

def preprocess_text(text):
    """Tokenize ``text`` with NLTK and keep only purely alphanumeric tokens.

    Any token for which ``str.isalnum()`` is false is discarded whole, so
    punctuation tokens are removed, as is any token that still contains a
    non-alphanumeric character after tokenization.

    Returns:
        list[str]: the surviving tokens, in their original order.
    """
    return list(filter(str.isalnum, word_tokenize(text)))

# Load the spaCy pipeline once and reuse it for every call to lemmatize_tokens.
# The "ner" and "parser" components are disabled; only token lemmas are read.
spacy_model = spacy.load("en_core_web_sm", disable=["ner", "parser"])

def lemmatize_tokens(tokens):
    """Lemmatize a list of tokens and return them as one lowercase string.

    The tokens are joined with single spaces, run through ``spacy_model``, and
    each resulting token's lemma is lowercased and re-joined with spaces.

    Args:
        tokens: list of token strings (may be empty).

    Returns:
        str: the space-separated, lowercased lemmas. An empty (or otherwise
        falsy) input yields ``""`` so the return type is always a string;
        returning the empty list here would put an unhashable, non-string
        value into the preprocessed column and break de-duplication and
        TF-IDF vectorization downstream.
    """
    if not tokens:
        return ""

    doc = spacy_model(" ".join(tokens))
    return " ".join(token.lemma_.lower() for token in doc)

In [7]:
def full_preprocess(text):
    """Run the complete preprocessing pipeline on a single text.

    Steps, in order: lowercase, expand contractions, tokenize and drop
    non-alphanumeric tokens, then lemmatize.

    Args:
        text: the raw string to preprocess.

    Returns:
        Whatever ``lemmatize_tokens`` returns for the cleaned tokens.
    """
    expanded = expand_contractions(text.lower())
    return lemmatize_tokens(preprocess_text(expanded))

In [8]:
# Preprocess the user story and scenario text columns into new *_preprocessed columns.
# na_action="ignore" leaves missing values untouched instead of passing them to full_preprocess.
df["us_text_preprocessed"] = df["us_text"].map(full_preprocess, na_action="ignore")
df["scenario_text_preprocessed"] = df["scenario_text"].map(full_preprocess, na_action="ignore")

In [9]:
# Build the fitting corpus: every preprocessed user story followed by every preprocessed
# Gherkin scenario (all models), with duplicates and missing values removed.
corpus = (
    pd.concat(
        [df['us_text_preprocessed'], df['scenario_text_preprocessed']],
        ignore_index=True,
    )
    .drop_duplicates()
    .dropna()
    .reset_index(drop=True)
)

corpus.head()

0    as a user i want to click on the address so th...
1    as a user i want to be able to anonymously vie...
2    as a user i want to be able to enter my zip co...
3    as a user i want to be able to get the hour of...
4    as a user i want to have a flexible pick up ti...
dtype: object

#### Perform TF-IDF.

In [10]:
# Initialise and fit TF-IDF Vectorizer on the corpus of user stories and gherkin scenarios.
# Fitting on the combined corpus gives both text types a single shared vocabulary and
# set of IDF weights, so their vectors are directly comparable.
vectorizer = TfidfVectorizer(stop_words="english")

vectorizer.fit(corpus)

In [11]:
df.head(1)

Unnamed: 0,app_id,model,us_id,scenario_id,scenario_text,feature_name,scenario_name,scenario_examples,us_text,us_text_preprocessed,scenario_text_preprocessed
0,g04,google-gemini-2.0-flash-001,1,google-gemini-2.0-flash-001_1_1,Given I am on the page with the address link W...,Address Link Opens Google Maps in New Tab,Clicking the address link opens Google Maps in...,,"As a user, I want to click on the address, so ...",as a user i want to click on the address so th...,give i be on the page with the address link wh...


Get and transform unique user stories.

In [12]:
# One row per user story: keep the first occurrence of each us_id
us_df = (
    df.loc[:, ["us_id", "us_text", "us_text_preprocessed"]]
    .drop_duplicates(subset="us_id")
    .reset_index(drop=True)
)

# Parallel lists (same order as us_df) of ids, raw texts and preprocessed texts
us_ids, us_texts, us_texts_preprocessed = (
    us_df[column].tolist()
    for column in ("us_id", "us_text", "us_text_preprocessed")
)

us_df.head()

Unnamed: 0,us_id,us_text,us_text_preprocessed
0,1,"As a user, I want to click on the address, so ...",as a user i want to click on the address so th...
1,2,"As a user, I want to be able to anonymously vi...",as a user i want to be able to anonymously vie...
2,3,"As a user, I want to be able to enter my zip c...",as a user i want to be able to enter my zip co...
3,4,"As a user, I want to be able to get the hours ...",as a user i want to be able to get the hour of...
4,5,"As a user, I want to have a flexible pick up t...",as a user i want to have a flexible pick up ti...


In [13]:
us_ids[:5], us_texts[:5], us_texts_preprocessed[:5]

([1, 2, 3, 4, 5],
 ['As a user, I want to click on the address, so that it takes me to a new tab with Google Maps.',
  'As a user, I want to be able to anonymously view public information, so that I know about recycling centers near me before creating an account.',
  'As a user, I want to be able to enter my zip code and get a list of nearby recycling facilities, so that I can determine which ones I should consider.',
  'As a user, I want to be able to get the hours of each recycling facility, so that I can arrange drop-offs on my off days or during after-work hours.',
  'As a user, I want to have a flexible pick up time, so that I can more conveniently use the website.'],
 ['as a user i want to click on the address so that it take i to a new tab with google map',
  'as a user i want to be able to anonymously view public information so that i know about recycle center near i before create an account',
  'as a user i want to be able to enter my zip code and get a list of nearby recyclin

In [14]:
# Transform the unique user stories into TF-IDF vectors with the fitted vectorizer.
# Row i of X_us corresponds to us_ids[i] (both come from us_df in the same order).
X_us = vectorizer.transform(us_df["us_text_preprocessed"])

For each AI model, transform that model's Gherkin scenarios and compute a cosine similarity matrix comparing them against the set of user stories. Append one record per (user story, scenario) pair to `results`.

In [15]:
# NOTE: we don't technically need to split by model here, but maybe if we want to do model-specific vectorizers later it will be useful.
results = []

for model, group in df.groupby("model"):
    # Per-model parallel lists; position j in each list refers to the same scenario
    scenario_ids = group["scenario_id"].tolist()
    scenario_texts = group["scenario_text"].tolist()
    scenario_texts_preprocessed = group["scenario_text_preprocessed"].tolist()

    # TF-IDF vectors for this model's scenarios (same fitted vectorizer as the user stories)
    X_scenario = vectorizer.transform(group["scenario_text_preprocessed"])

    # Rows index user stories, columns index this model's scenarios
    cosine_sim_matrix = cosine_similarity(X_us, X_scenario)

    # One record per (user story, scenario) pair, user stories in the outer position
    results.extend(
        {
            "model": model,
            "us_id": us_id,
            "scenario_id": scenario_id,
            "us_text": us_texts[i],
            "us_text_preprocessed": us_texts_preprocessed[i],
            "scenario_text": scenario_texts[j],
            "scenario_text_preprocessed": scenario_texts_preprocessed[j],
            "metric": "tfidf_cosine-sim",
            "similarity_score": cosine_sim_matrix[i, j],
        }
        for i, us_id in enumerate(us_ids)
        for j, scenario_id in enumerate(scenario_ids)
    )

    

In [16]:
# Build the results table: one row per (model, user story, scenario) record in `results`.
sim_df = pd.DataFrame(results)

sim_df.head()

Unnamed: 0,model,us_id,scenario_id,us_text,us_text_preprocessed,scenario_text,scenario_text_preprocessed,metric,similarity_score
0,google-gemini-2.0-flash-001,1,google-gemini-2.0-flash-001_1_1,"As a user, I want to click on the address, so ...",as a user i want to click on the address so th...,Given I am on the page with the address link W...,give i be on the page with the address link wh...,tfidf_cosine-sim,0.472913
1,google-gemini-2.0-flash-001,1,google-gemini-2.0-flash-001_2_1,"As a user, I want to click on the address, so ...",as a user i want to click on the address so th...,Given I am an anonymous user When I navigate t...,give i be an anonymous user when i navigate to...,tfidf_cosine-sim,0.113745
2,google-gemini-2.0-flash-001,1,google-gemini-2.0-flash-001_2_2,"As a user, I want to click on the address, so ...",as a user i want to click on the address so th...,Given I am an anonymous user And a recycling c...,give i be an anonymous user and a recycling ce...,tfidf_cosine-sim,0.048094
3,google-gemini-2.0-flash-001,1,google-gemini-2.0-flash-001_3_1,"As a user, I want to click on the address, so ...",as a user i want to click on the address so th...,Given I am on the recycling facility search pa...,give i be on the recycling facility search pag...,tfidf_cosine-sim,0.022763
4,google-gemini-2.0-flash-001,1,google-gemini-2.0-flash-001_3_2,"As a user, I want to click on the address, so ...",as a user i want to click on the address so th...,Given I am on the recycling facility search pa...,give i be on the recycling facility search pag...,tfidf_cosine-sim,0.029172


In [17]:
sim_df.shape

(130, 9)

In [18]:
# Write the similarity scores to the results directory; index=False omits the row index from the CSV.
sim_df.to_csv(us_scen_sim_path / 'us_scen_tfidf_cosine_similarity.csv', index=False)