In [1]:
import pandas as pd
import numpy as np
import os
from pathlib import Path

import nltk
from nltk import word_tokenize
import contractions
import spacy

# nltk.download("wordnet")

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering

# embedd_model = SentenceTransformer("all-MiniLM-L6-v2")

In [2]:
# Locations used throughout the notebook: the experiment root plus the two
# output folders derived from it (similarity results and cached embeddings).
experiment_path = Path('../../data/gherkins/sample_data/test/')
us_scen_sim_path = experiment_path / 'us_scenario_similarity_results'
embeddings_path = experiment_path / 'embeddings'

# Create both output folders up front; already-existing folders are left as is.
us_scen_sim_path.mkdir(parents=True, exist_ok=True)
embeddings_path.mkdir(parents=True, exist_ok=True)

In [3]:
# Embedding models to evaluate, keyed by model name.
# Add further model names to the tuple to include them in every loop below.
embedd_model_dict = {
    model_name: SentenceTransformer(model_name)
    for model_name in ("all-MiniLM-L6-v2",)
}

#### Embed user story and scenario data for scenario-user story traceability experiments.

In [4]:
# Load the parsed scenario table from the experiment folder and preview it
df = pd.read_csv(experiment_path.joinpath('parsed_scenario_data.csv'))

df.head(n=5)

Unnamed: 0,app_id,model,us_id,scenario_id,scenario_text,feature_name,scenario_name,scenario_examples,us_text
0,g04,google-gemini-2.0-flash-001,1,google-gemini-2.0-flash-001_1_1,Given I am on the page with the address link W...,Address Link Opens Google Maps in New Tab,Clicking the address link opens Google Maps in...,,"As a user, I want to click on the address, so ..."
1,g04,google-gemini-2.0-flash-001,2,google-gemini-2.0-flash-001_2_1,Given I am an anonymous user When I navigate t...,Anonymous User Can View Public Recycling Cente...,Anonymous user can view a list of recycling ce...,,"As a user, I want to be able to anonymously vi..."
2,g04,google-gemini-2.0-flash-001,2,google-gemini-2.0-flash-001_2_2,Given I am an anonymous user And a recycling c...,Anonymous User Can View Public Recycling Cente...,Anonymous user can view details of a specific ...,,"As a user, I want to be able to anonymously vi..."
3,g04,google-gemini-2.0-flash-001,3,google-gemini-2.0-flash-001_3_1,Given I am on the recycling facility search pa...,Find Recycling Facilities by Zip Code,Entering a valid zip code displays nearby recy...,,"As a user, I want to be able to enter my zip c..."
4,g04,google-gemini-2.0-flash-001,3,google-gemini-2.0-flash-001_3_2,Given I am on the recycling facility search pa...,Find Recycling Facilities by Zip Code,Entering an invalid zip code displays an error...,,"As a user, I want to be able to enter my zip c..."


In [5]:
# Distinct values of the "model" column, in first-seen order, for looping later
llms = df["model"].drop_duplicates().tolist()

llms

['google-gemini-2.0-flash-001', 'openai-gpt-4o-mini']

In [6]:
# One row per user story: keep the first text seen for each us_id
us_df = (
    df.loc[:, ["us_id", "us_text"]]
    .drop_duplicates(subset=["us_id"], ignore_index=True)
)

us_df.head(n=5)

Unnamed: 0,us_id,us_text
0,1,"As a user, I want to click on the address, so ..."
1,2,"As a user, I want to be able to anonymously vi..."
2,3,"As a user, I want to be able to enter my zip c..."
3,4,"As a user, I want to be able to get the hours ..."
4,5,"As a user, I want to have a flexible pick up t..."


In [7]:
# Parallel lists of user-story ids and texts (same row order as us_df)
us_ids, us_texts = (us_df[column].tolist() for column in ("us_id", "us_text"))

In [8]:
# Function to generate and save embeddings (us or scenario) to pickle file
def generate_and_save_embeddings(ids, texts, embedding_model, filename, output_dir=None):
    """Encode ``texts`` and pickle a two-column ``id`` / ``embedding`` table.

    Parameters
    ----------
    ids : sequence
        Identifiers, one per text; stored in the ``id`` column.
    texts : sequence
        Texts handed to ``embedding_model.encode``.
    embedding_model : object
        Any object exposing ``encode(texts, show_progress_bar=...)`` whose
        result can be converted to a NumPy array (one row per text).
    filename : str
        Name of the pickle file to write inside the output directory.
    output_dir : path-like, optional
        Directory to write into. When omitted, the notebook-level
        ``experiment_path / 'embeddings'`` folder is used, as before.

    Raises
    ------
    ValueError
        If ``ids`` and ``texts`` differ in length.
    """
    # Fail before the (potentially slow) encoding step rather than after it.
    if len(ids) != len(texts):
        raise ValueError(
            f"ids and texts must have the same length, got {len(ids)} and {len(texts)}"
        )

    embeddings = embedding_model.encode(texts, show_progress_bar=True)

    # Each embedding is stored as a plain Python list so one vector fits in one cell.
    embeddings_df = pd.DataFrame({
        "id": ids,
        "embedding": np.asarray(embeddings).tolist()
    })

    # The notebook global is only consulted when no directory is supplied.
    target_dir = experiment_path / 'embeddings' if output_dir is None else Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    embeddings_df.to_pickle(target_dir / filename)


In [9]:
# Embed the user stories once per embedding model; each run writes its own pickle
for embedd_model_name, embedd_model in embedd_model_dict.items():
    us_embeddings_filename = f'us_embeddings_{embedd_model_name}.pkl'
    generate_and_save_embeddings(
        ids=us_ids,
        texts=us_texts,
        embedding_model=embedd_model,
        filename=us_embeddings_filename,
    )

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [10]:
# Reminder of the scenario table's columns before embedding the scenarios
df.head(n=1)

Unnamed: 0,app_id,model,us_id,scenario_id,scenario_text,feature_name,scenario_name,scenario_examples,us_text
0,g04,google-gemini-2.0-flash-001,1,google-gemini-2.0-flash-001_1_1,Given I am on the page with the address link W...,Address Link Opens Google Maps in New Tab,Clicking the address link opens Google Maps in...,,"As a user, I want to click on the address, so ..."


In [11]:
# Embed the scenarios separately for every (embedding model, LLM) combination,
# writing one pickle per combination.
for embedd_model_name, embedd_model in embedd_model_dict.items():
    for llm, group in df.groupby("model"):
        scenario_ids, scenario_texts = (
            group[column].tolist() for column in ("scenario_id", "scenario_text")
        )
        scenario_embeddings_filename = f'scenario_embeddings_{embedd_model_name}_{llm}.pkl'

        generate_and_save_embeddings(
            ids=scenario_ids,
            texts=scenario_texts,
            embedding_model=embedd_model,
            filename=scenario_embeddings_filename,
        )

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [12]:
# Long-format similarity records: one dict per (user story, scenario) pair
results = []

for embedd_model_name in embedd_model_dict:
    # Metric label shared by every record produced with this embedding model
    metric_name = f"{embedd_model_name}_embedding_cosine-sim"

    # Load US embeddings from pickle
    us_embeddings_df = pd.read_pickle(experiment_path / f"embeddings/us_embeddings_{embedd_model_name}.pkl")
    us_ids = us_embeddings_df["id"].tolist()
    # Stack the per-row embedding lists into a single numpy array for cosine similarity
    us_embeddings = np.stack(us_embeddings_df["embedding"].values)

    for llm in llms:
        # Load scenario embeddings from pickle
        scenario_embeddings_df = pd.read_pickle(experiment_path / f"embeddings/scenario_embeddings_{embedd_model_name}_{llm}.pkl")
        scenario_ids = scenario_embeddings_df["id"].tolist()
        scenario_embeddings = np.stack(scenario_embeddings_df["embedding"].values)

        # Rows follow us_ids, columns follow scenario_ids
        cosine_sim_matrix = cosine_similarity(us_embeddings, scenario_embeddings)

        # NOTE: we don't preprocess texts for embeddings so preprocessed fields won't be included here
        for us_id, similarity_row in zip(us_ids, cosine_sim_matrix):
            for scenario_id, similarity in zip(scenario_ids, similarity_row):
                results.append({
                    "model": llm,
                    "us_id": us_id,
                    "scenario_id": scenario_id,
                    "metric": metric_name,
                    "similarity_score": similarity,
                })


In [13]:
# Turn the collected similarity records into a table and preview it
sim_df = pd.DataFrame(data=results)

sim_df.head(n=5)

Unnamed: 0,model,us_id,scenario_id,metric,similarity_score
0,google-gemini-2.0-flash-001,1,google-gemini-2.0-flash-001_1_1,all-MiniLM-L6-v2_embedding_cosine-sim,0.789876
1,google-gemini-2.0-flash-001,1,google-gemini-2.0-flash-001_2_1,all-MiniLM-L6-v2_embedding_cosine-sim,0.263878
2,google-gemini-2.0-flash-001,1,google-gemini-2.0-flash-001_2_2,all-MiniLM-L6-v2_embedding_cosine-sim,0.155419
3,google-gemini-2.0-flash-001,1,google-gemini-2.0-flash-001_3_1,all-MiniLM-L6-v2_embedding_cosine-sim,0.131962
4,google-gemini-2.0-flash-001,1,google-gemini-2.0-flash-001_3_2,all-MiniLM-L6-v2_embedding_cosine-sim,0.076466


In [None]:
# Add us_text and scenario_text to sim_df.
#
# The cell rebinds sim_df, so it is written to be safe to re-run:
#   * text columns left by an earlier run are dropped first, otherwise a second
#     run would produce scenario_text_x / scenario_text_y (same for us_text);
#   * both lookup tables are de-duplicated on their key so a repeated id cannot
#     multiply similarity rows, and validate="many_to_one" asserts that.
sim_df = (
    sim_df
    .drop(columns=["scenario_text", "us_text"], errors="ignore")
    .merge(
        df[['scenario_id', 'scenario_text']].drop_duplicates(subset="scenario_id"),
        on="scenario_id",
        how="left",
        validate="many_to_one",
    )
    .merge(
        us_df[['us_id', 'us_text']].drop_duplicates(subset="us_id"),
        on="us_id",
        how="left",
        validate="many_to_one",
    )
)

In [15]:
# Preview the similarity table now that the text columns are attached
sim_df.head(n=5)

Unnamed: 0,model,us_id,scenario_id,metric,similarity_score,scenario_text,us_text
0,google-gemini-2.0-flash-001,1,google-gemini-2.0-flash-001_1_1,all-MiniLM-L6-v2_embedding_cosine-sim,0.789876,Given I am on the page with the address link W...,"As a user, I want to click on the address, so ..."
1,google-gemini-2.0-flash-001,1,google-gemini-2.0-flash-001_2_1,all-MiniLM-L6-v2_embedding_cosine-sim,0.263878,Given I am an anonymous user When I navigate t...,"As a user, I want to click on the address, so ..."
2,google-gemini-2.0-flash-001,1,google-gemini-2.0-flash-001_2_2,all-MiniLM-L6-v2_embedding_cosine-sim,0.155419,Given I am an anonymous user And a recycling c...,"As a user, I want to click on the address, so ..."
3,google-gemini-2.0-flash-001,1,google-gemini-2.0-flash-001_3_1,all-MiniLM-L6-v2_embedding_cosine-sim,0.131962,Given I am on the recycling facility search pa...,"As a user, I want to click on the address, so ..."
4,google-gemini-2.0-flash-001,1,google-gemini-2.0-flash-001_3_2,all-MiniLM-L6-v2_embedding_cosine-sim,0.076466,Given I am on the recycling facility search pa...,"As a user, I want to click on the address, so ..."


In [16]:
# Rows whose us_text is missing after the merge (an empty frame means none)
sim_df.loc[sim_df["us_text"].isna()]

Unnamed: 0,model,us_id,scenario_id,metric,similarity_score,scenario_text,us_text


In [17]:
# Rows whose scenario_text is missing after the merge (an empty frame means none)
sim_df.loc[sim_df["scenario_text"].isna()]

Unnamed: 0,model,us_id,scenario_id,metric,similarity_score,scenario_text,us_text


In [19]:
# Persist the similarity table (without the index) in the results folder
sim_df.to_csv(
    us_scen_sim_path.joinpath('us_scen_embedding_cosine_similarity.csv'),
    index=False,
)