In [2]:
from nltk.translate.meteor_score import meteor_score
import nltk
from nltk import word_tokenize

from nltk.translate.meteor_score import single_meteor_score
from nltk.corpus import wordnet
from nltk.translate.meteor_score import _enum_align_words, _generate_enums

from itertools import product
from joblib import Parallel, delayed
from tqdm import tqdm

import pandas as pd
import os, sys
from pathlib import Path
import seaborn as sns
import matplotlib.pyplot as plt

import re
import math

sys.path.append(os.path.abspath(".."))

from config import DATASET_NAME, EXPERIMENT_NAME, GENERATION_TECHNIQUE

# nltk.download("wordnet")

In [3]:
# Resolve the experiment output directory from the config values and make sure
# the sub-directory that will hold the similarity scores exists.
experiment_path = Path(
    f"../../data/{DATASET_NAME}/experiment_outputs/{EXPERIMENT_NAME}/{GENERATION_TECHNIQUE}/"
)

us_scen_sim_path = experiment_path / 'similarity_scores'
us_scen_sim_path.mkdir(parents=True, exist_ok=True)

In [4]:
# Calculate METEOR, showing word matches
def meteor_verbose(reference, hypothesis):
    """Print the word alignment behind a METEOR score and return the score.

    Args:
        reference: Reference text as a raw (untokenized) string.
        hypothesis: Candidate text as a raw (untokenized) string.

    Returns:
        The result of ``single_meteor_score`` for the tokenized pair.

    Side effects:
        Prints a list of ``(reference_token, hypothesis_token)`` matches.
    """
    token_ref = word_tokenize(reference)
    token_hyp = word_tokenize(hypothesis)

    # Enumerate tokens. NLTK's private helpers take the *hypothesis* first and
    # the reference second; passing them the other way round can change the
    # synonym-stage alignment, so the printed matches would not be the ones
    # the score is computed from.
    # NOTE(review): private NLTK API -- re-check the argument order on upgrade.
    enum_hyp, enum_ref = _generate_enums(token_hyp, token_ref)

    # Find matches (exact, stem, synonym); each pair is (hypothesis idx, reference idx)
    matches, _, _ = _enum_align_words(enum_hyp, enum_ref, wordnet=wordnet)

    # Show matches in reference order, as (reference token, hypothesis token).
    matches_by_ref = sorted(matches, key=lambda pair: pair[1])
    print(
        "Matches:",
        [(token_ref[ref_idx], token_hyp[hyp_idx]) for hyp_idx, ref_idx in matches_by_ref],
    )

    return single_meteor_score(token_ref, token_hyp, wordnet=wordnet)


In [5]:
# Load the parsed scenarios (one row per generated scenario, with its user story text)
parsed_scenarios_csv = experiment_path / 'parsed_scenario_data.csv'
df = pd.read_csv(parsed_scenarios_csv)

df.head()

Unnamed: 0,app_id,model,us_id,scenario_id,scenario_text,feature_name,scenario_name,scenario_examples,us_text
0,g04-recycling,google-gemini-2.0-flash-001,1,google-gemini-2.0-flash-001_1_1,Given I am on the website When I click the add...,Address Link Opens Google Maps in New Tab,Clicking the address link opens Google Maps in...,,"As a user, I want to click on the address, so ..."
1,g04-recycling,google-gemini-2.0-flash-001,1,google-gemini-2.0-flash-001_1_2,Given I am on the website When I click the add...,Address Link Opens Google Maps in New Tab,The Google Maps URL contains the correct address,,"As a user, I want to click on the address, so ..."
2,g04-recycling,google-gemini-2.0-flash-001,1,google-gemini-2.0-flash-001_1_3,Given I am on the website When I click the add...,Address Link Opens Google Maps in New Tab,Clicking the address link does not close the c...,,"As a user, I want to click on the address, so ..."
3,g04-recycling,google-gemini-2.0-flash-001,1,google-gemini-2.0-flash-001_1_4,Given I am on the website Then the address lin...,Address Link Opens Google Maps in New Tab,Address link uses the correct HTML attribute t...,,"As a user, I want to click on the address, so ..."
4,g04-recycling,google-gemini-2.0-flash-001,2,google-gemini-2.0-flash-001_2_1,Given I am an anonymous user When I visit the ...,Anonymous User Can View Public Recycling Cente...,Anonymous user views the recycling center list,,"As a user, I want to be able to anonymously vi..."


In [6]:
# Sanity check: (rows, columns) of the loaded frame. Marked by the author for removal.
df.shape # Remove

(491, 9)

In [7]:
# Sanity check: number of missing values per column. Marked by the author for removal.
df.isna().sum() # Remove

app_id                 0
model                  0
us_id                  0
scenario_id            0
scenario_text          0
feature_name           0
scenario_name          0
scenario_examples    491
us_text                0
dtype: int64

In [8]:
# Shows the number of scenarios per model (absolute counts, not proportions)
df["model"].value_counts()

model
openai-gpt-4o-mini             255
google-gemini-2.0-flash-001    236
Name: count, dtype: int64

#### Prepare data for METEOR scoring.

In [9]:
# Tokenize text, drop punctuation/special-character tokens, and lowercase the rest
def preprocess_text(text):
    """Return the lowercase alphanumeric tokens of ``text``.

    The text is split with ``word_tokenize``; any token that is not purely
    alphanumeric (``str.isalnum``) is discarded, and the survivors are
    lowercased.
    """
    return [tok.lower() for tok in word_tokenize(text) if tok.isalnum()]

# TODO: handle URLs; the alphanumeric-only filter currently removes parts of them
# TODO: check whether lowercasing the tokens is correct for METEOR

In [10]:
# Apply preprocess_text to the user story and scenario text columns, storing the
# token lists in new "*_preprocessed" columns. Missing values are passed through unchanged.
for source_col, target_col in (
    ("us_text", "us_text_preprocessed"),
    ("scenario_text", "scenario_text_preprocessed"),
):
    df[target_col] = df[source_col].apply(
        lambda text: preprocess_text(text) if pd.notnull(text) else text
    )

In [11]:
# One row per user story: keep the first occurrence of each us_id
user_stories = (
    df[["us_id", "us_text", "us_text_preprocessed"]]
    .drop_duplicates(subset=["us_id"])
    .reset_index(drop=True)
)

user_stories.head()

Unnamed: 0,us_id,us_text,us_text_preprocessed
0,1,"As a user, I want to click on the address, so ...","[as, a, user, i, want, to, click, on, the, add..."
1,2,"As a user, I want to be able to anonymously vi...","[as, a, user, i, want, to, be, able, to, anony..."
2,3,"As a user, I want to be able to enter my zip c...","[as, a, user, i, want, to, be, able, to, enter..."
3,4,"As a user, I want to be able to get the hours ...","[as, a, user, i, want, to, be, able, to, get, ..."
4,5,"As a user, I want to have a flexible pick up t...","[as, a, user, i, want, to, have, a, flexible, ..."


In [12]:
# Lookup by ID for fast access
def create_lookups(df, index_col, text_col):
    """Build id -> raw text and id -> token list dictionaries from a DataFrame.

    Args:
        df: DataFrame containing ``index_col`` and ``text_col``.
        index_col: Column whose values become the dictionary keys.
        text_col: Column holding the raw text for each key.

    Returns:
        A ``(text_lookup, tokens_lookup)`` tuple; ``tokens_lookup`` maps each
        key to ``preprocess_text`` applied to its raw text.
    """
    indexed_text = df.set_index(index_col)[text_col]
    text_lookup = indexed_text.to_dict()

    tokens_lookup = {}
    for item_id, raw_text in text_lookup.items():
        tokens_lookup[item_id] = preprocess_text(raw_text)

    return text_lookup, tokens_lookup

In [13]:
# Build id-indexed lookups of raw text and tokens: scenarios first, then user stories
scenario_text_lookup, scenario_tokens_lookup = create_lookups(
    df, index_col="scenario_id", text_col="scenario_text"
)

us_text_lookup, us_tokens_lookup = create_lookups(
    user_stories, index_col="us_id", text_col="us_text"
)

In [14]:
# Cartesian product of scenario IDs and user story IDs: every (scenario_id, us_id)
# pair that will get a METEOR score (scenario varies slowest, user story fastest)
pairs = list(product(scenario_tokens_lookup, us_tokens_lookup))

In [15]:
# Total number of (scenario_id, us_id) pairs to score
len(pairs)

25041

In [16]:
def compute_meteor_batch(pairs_chunk, us_tokens_lookup, scenario_tokens_lookup):
    """Compute METEOR scores for one chunk of (scenario_id, us_id) pairs.

    The user story tokens are passed to ``single_meteor_score`` as the first
    argument and the scenario tokens as the second.

    Note: the raw texts are read from the module-level ``us_text_lookup`` and
    ``scenario_text_lookup`` dictionaries, which are not parameters.

    Args:
        pairs_chunk: Iterable of ``(scenario_id, us_id)`` tuples.
        us_tokens_lookup: Mapping of us_id to its token list.
        scenario_tokens_lookup: Mapping of scenario_id to its token list.

    Returns:
        A list with one result dict per pair, in input order.
    """
    def _score_pair(scenario_id, us_id):
        # Score first, then assemble the record, as the columns are laid out downstream.
        meteor = single_meteor_score(
            us_tokens_lookup[us_id],
            scenario_tokens_lookup[scenario_id],
        )
        return {
            "us_id": us_id,
            "scenario_id": scenario_id,
            "us_text": us_text_lookup[us_id],
            "us_text_preprocessed": us_tokens_lookup[us_id],
            "scenario_text": scenario_text_lookup[scenario_id],
            "scenario_text_preprocessed": scenario_tokens_lookup[scenario_id],
            "metric": "meteor",
            "similarity_score": meteor,
        }

    return [_score_pair(scenario_id, us_id) for scenario_id, us_id in pairs_chunk]

# Pairs handed to each parallel task; a larger chunk reduces overhead but increases memory
chunk_size = 1000

# Slice the pair list into consecutive chunk_size-long pieces (the last one may be shorter)
chunk_starts = range(0, len(pairs), chunk_size)
chunks = [pairs[start : start + chunk_size] for start in chunk_starts]

In [17]:
# Number of chunks, i.e. number of parallel tasks that will be submitted
len(chunks)

26

In [18]:
# Score every chunk in parallel on all available cores and collect the per-chunk
# result lists as they are yielded by the generator.
parallel_runner = Parallel(n_jobs=-1, return_as="generator")
batch_jobs = (
    delayed(compute_meteor_batch)(chunk, us_tokens_lookup, scenario_tokens_lookup)
    for chunk in chunks
)
results = list(parallel_runner(batch_jobs))

In [20]:
# Flatten the list of per-chunk result lists into one list of result dicts
flat_results = []
for batch in results:
    flat_results.extend(batch)

In [21]:
# One row per (user story, scenario) pair, columns taken from the result dict keys
sim_df = pd.DataFrame(data=flat_results)

sim_df.head()

Unnamed: 0,us_id,scenario_id,us_text,us_text_preprocessed,scenario_text,scenario_text_preprocessed,metric,similarity_score
0,1,google-gemini-2.0-flash-001_1_1,"As a user, I want to click on the address, so ...","[as, a, user, i, want, to, click, on, the, add...",Given I am on the website When I click the add...,"[given, i, am, on, the, website, when, i, clic...",meteor,0.284978
1,2,google-gemini-2.0-flash-001_1_1,"As a user, I want to be able to anonymously vi...","[as, a, user, i, want, to, be, able, to, anony...",Given I am on the website When I click the add...,"[given, i, am, on, the, website, when, i, clic...",meteor,0.096525
2,3,google-gemini-2.0-flash-001_1_1,"As a user, I want to be able to enter my zip c...","[as, a, user, i, want, to, be, able, to, enter...",Given I am on the website When I click the add...,"[given, i, am, on, the, website, when, i, clic...",meteor,0.115132
3,4,google-gemini-2.0-flash-001_1_1,"As a user, I want to be able to get the hours ...","[as, a, user, i, want, to, be, able, to, get, ...",Given I am on the website When I click the add...,"[given, i, am, on, the, website, when, i, clic...",meteor,0.126354
4,5,google-gemini-2.0-flash-001_1_1,"As a user, I want to have a flexible pick up t...","[as, a, user, i, want, to, have, a, flexible, ...",Given I am on the website When I click the add...,"[given, i, am, on, the, website, when, i, clic...",meteor,0.163551


In [22]:
# (rows, columns) of the similarity frame; rows should equal len(pairs)
sim_df.shape

(25041, 8)

In [23]:
# Add model column as the first column, derived from the scenario_id prefix.
# Assumes scenario_id has the form "<model>_<us_id>_<n>" (as in the rows shown
# above -- TODO confirm); splitting from the right keeps the full model name
# even if it contains an underscore.
# The guard keeps the cell re-runnable: DataFrame.insert raises ValueError when
# the column already exists.
if 'model' not in sim_df.columns:
    sim_df.insert(
        loc=0,
        column='model',
        value=sim_df['scenario_id'].str.rsplit('_', n=2).str[0],
    )

sim_df.head()

Unnamed: 0,model,us_id,scenario_id,us_text,us_text_preprocessed,scenario_text,scenario_text_preprocessed,metric,similarity_score
0,google-gemini-2.0-flash-001,1,google-gemini-2.0-flash-001_1_1,"As a user, I want to click on the address, so ...","[as, a, user, i, want, to, click, on, the, add...",Given I am on the website When I click the add...,"[given, i, am, on, the, website, when, i, clic...",meteor,0.284978
1,google-gemini-2.0-flash-001,2,google-gemini-2.0-flash-001_1_1,"As a user, I want to be able to anonymously vi...","[as, a, user, i, want, to, be, able, to, anony...",Given I am on the website When I click the add...,"[given, i, am, on, the, website, when, i, clic...",meteor,0.096525
2,google-gemini-2.0-flash-001,3,google-gemini-2.0-flash-001_1_1,"As a user, I want to be able to enter my zip c...","[as, a, user, i, want, to, be, able, to, enter...",Given I am on the website When I click the add...,"[given, i, am, on, the, website, when, i, clic...",meteor,0.115132
3,google-gemini-2.0-flash-001,4,google-gemini-2.0-flash-001_1_1,"As a user, I want to be able to get the hours ...","[as, a, user, i, want, to, be, able, to, get, ...",Given I am on the website When I click the add...,"[given, i, am, on, the, website, when, i, clic...",meteor,0.126354
4,google-gemini-2.0-flash-001,5,google-gemini-2.0-flash-001_1_1,"As a user, I want to have a flexible pick up t...","[as, a, user, i, want, to, have, a, flexible, ...",Given I am on the website When I click the add...,"[given, i, am, on, the, website, when, i, clic...",meteor,0.163551


In [24]:
# Persist the METEOR scores next to the other similarity outputs (index column omitted)
meteor_scores_path = us_scen_sim_path / 'meteor_scores.csv'
sim_df.to_csv(meteor_scores_path, index=False)