In [None]:
#!pip install pyarrow
#!pip install chromadb
#!pip install sentence_transformers
#!pip install tfidf_index
#!pip install pandas
#!pip install numpy
#!pip install datasets
#!jupyter nbextension enable --py widgetsnbextension

# Step - 1 Load the data

1a - Filter for Task 1, the 'us' locale, and the 'E' (exact) ESCI label (ESCI = exact, substitute, complement, irrelevant)

1b - Merge the Datasets using product_locale and product_id (connects each query/label pair to its corresponding product information)

1c - Filtering the large merged DataFrame down to the required data




In [None]:
import pandas as pd
import numpy as np
import os

# Folder holding the ESCI parquet files. The default is the original author's
# location; set the ESCI_DATA_DIR environment variable to run the notebook on
# another machine without editing this cell.
ESCI_DATA_DIR = os.environ.get(
    'ESCI_DATA_DIR',
    r'C:\Users\saiasg\OneDrive - kochind.com\Desktop\Projects\esci\esci_dataset\dataset',
)

# Query/label rows (one per judged query-product pair) and the product catalogue
# they are merged with in the next cell.
df_examples = pd.read_parquet(os.path.join(ESCI_DATA_DIR, 'shopping_queries_dataset_examples.parquet'))
df_products = pd.read_parquet(os.path.join(ESCI_DATA_DIR, 'shopping_queries_dataset_products.parquet'))

In [None]:
# Left-join the product catalogue onto the judged examples so every
# (query, product) row carries its product attributes. Both frames use the
# same key column names, so a single `on=` list describes the join.
df_merged = df_examples.merge(
    df_products,
    on=['product_locale', 'product_id'],
    how='left',
)

# Task 1 rows are the ones flagged with small_version == 1.
is_task_1 = df_merged["small_version"].eq(1)
df_task_1 = df_merged.loc[is_task_1]

# Keep only the training split of Task 1.
is_train_split = df_task_1["split"].eq("train")
df_task_1_train = df_task_1.loc[is_train_split]

# Final filter: 'us' locale combined with the 'E' label.
is_us_locale = df_task_1_train["product_locale"].eq("us")
is_exact_label = df_task_1_train["esci_label"].eq("E")
final_training_data = df_task_1_train.loc[is_us_locale & is_exact_label]

In [None]:
# Preview the first rows of the filtered frame (Task 1, train split, 'us' locale, label 'E').
final_training_data.head()

# Step - 2: Create sample dataset

Task: Create a sample dataset consisting of approximately 500 rows with
around 50 unique queries from point number 1. If this doesn't yield
the desired dataset, you may use the following steps to generate the
sample dataset.

a. Determine a random sample of 50 unique queries from the
dataset derived from point number 1.

b. Filter the dataset derived from point number 1 to contain only
the unique queries from point number 2.a.

c. Create a sample dataset of 500 rows from the dataset derived
from point number 2.b

Approach:

Creating a 500-row sample from the training data, centered on 50 unique queries.

2a: Random sampling of rows

2b: (Simple Random Sampling), select 50 unique queries and then sample rows

2c: Stratified sampling with around 70 candidate unique queries to find the 50 that yield a sample close to 500 rows

In [None]:
# Renumber the rows 0..n-1 (the boolean filters above leave gaps in the index)
# and report how many rows the sampling steps can draw from.
df_full = final_training_data.reset_index(drop=True)
print(f"1. Total rows in dataset: {len(df_full)}")

#### 2a: Random sampling of rows - Basic barebones

We just randomly grab 500 rows

In [None]:
# --- Configuration ---
RANDOM_STATE = 42     # seed shared by every sampling step so results are reproducible
TARGET_QUERIES = 50   # desired number of unique queries in the sample
TARGET_ROWS = 500     # desired number of rows in the sample

df_full = final_training_data.reset_index(drop=True)

print(f"\n--- Step 2a: Simple Random Sample ---")
# Cap the request at the number of available rows: DataFrame.sample raises a
# ValueError when n exceeds the frame length (sampling without replacement).
# Steps 2b and 2c already fall back to "use everything" in that situation.
n_rows_2a = min(TARGET_ROWS, len(df_full))
sample_2a = df_full.sample(n=n_rows_2a, random_state=RANDOM_STATE).reset_index(drop=True)
final_unique_2a = sample_2a['query'].nunique()

print(f"Result: {final_unique_2a} unique queries in the sample.")
print("\nSample Head:")
display(sample_2a[['query', 'product_title']].head())

#### Step 2b: Sample from Filtered Queries



In [None]:
# 1. draw the query pool: TARGET_QUERIES distinct queries, or every query
#    when the dataset holds fewer than that
unique_queries_2b = df_full['query'].unique()
sample_queries_2b = pd.Series(unique_queries_2b)
if len(unique_queries_2b) >= TARGET_QUERIES:
    sample_queries_2b = sample_queries_2b.sample(n=TARGET_QUERIES, random_state=RANDOM_STATE)

# 2. keep only the rows whose query belongs to the pool
rows_in_pool_2b = df_full['query'].isin(sample_queries_2b)
df_filtered_2b = df_full.loc[rows_in_pool_2b].reset_index(drop=True)
total_rows_for_queries_2b = df_filtered_2b.shape[0]

print(f"-> selected {len(sample_queries_2b)} unique queries.")
print(f"-> total rows available after filtering: {total_rows_for_queries_2b}")

# 3. down-sample to TARGET_ROWS only when the filtered set is larger than that
if total_rows_for_queries_2b > TARGET_ROWS:
    sample_2b = (
        df_filtered_2b
        .sample(n=TARGET_ROWS, random_state=RANDOM_STATE)
        .reset_index(drop=True)
    )
    print(f"took random sample of {len(sample_2b)} rows from filtered dataset.")
else:
    sample_2b = df_filtered_2b.copy()
    print(f"filtered set too small ({total_rows_for_queries_2b} rows), using all available.")

# 4. summarise what ended up in the sample
final_unique_2b = sample_2b['query'].nunique()
print(f"result: {len(sample_2b)} rows and {final_unique_2b} unique queries in sample.")

print("\nSample 2b Head:")
display(sample_2b[['query', 'product_title']].head())

#### 2c Stratified sampling

This approach draws a larger pool of candidate queries and then selects queries by row count, to get more sample diversity.

It favours high-frequency queries, and every selected query stays represented as long as the rows do not have to be down-sampled afterwards.

In [None]:
print(f"\n--- Step 2c: Stratified Sampling for Target Rows and Queries ---")

TARGET_QUERIES_2c = 70 # aiming for higher number of unique queries initially

# 1. randomly select TARGET_QUERIES_2c candidate queries (all of them if fewer exist)
unique_queries_full = df_full['query'].unique()
if len(unique_queries_full) < TARGET_QUERIES_2c:
    potential_sample_queries = pd.Series(unique_queries_full)
    print(f"only {len(unique_queries_full)} unique queries available, using all.")
else:
    potential_sample_queries = pd.Series(unique_queries_full).sample(n=TARGET_QUERIES_2c, random_state=RANDOM_STATE)
    print(f"randomly selected {len(potential_sample_queries)} potential unique queries.")

# 2. filtering rows associated with selected queries
df_potential_queries = df_full[df_full['query'].isin(potential_sample_queries)].reset_index(drop=True)
print(f"filtered down to {len(df_potential_queries)} rows associated with these queries.")

# 3. calculate rows per query, largest first
query_row_counts = df_potential_queries.groupby('query').size().sort_values(ascending=False)
print("\nRow counts for potential queries:")
display(query_row_counts.head())

# 4. keep the TARGET_QUERIES queries with the most rows.
#    query_row_counts is already sorted by descending count, so head() is the
#    top-N; it simply returns fewer entries when fewer queries are available.
top_query_counts = query_row_counts.head(TARGET_QUERIES)
selected_queries_2c = top_query_counts.index.tolist()
current_row_count = int(top_query_counts.sum())

# report whether the full quota of queries could be filled
if len(selected_queries_2c) < TARGET_QUERIES and len(potential_sample_queries) >= TARGET_QUERIES:
    print(f"warning: could only select {len(selected_queries_2c)} queries.")
elif len(selected_queries_2c) == TARGET_QUERIES:
    print(f"\nselected {TARGET_QUERIES} queries that yield {current_row_count} rows.")

# 5. filter to the final set of selected queries
df_final_50_queries = df_potential_queries[df_potential_queries['query'].isin(selected_queries_2c)].reset_index(drop=True)

# 6. determine final sample based on row count: only down-sample when there
#    are more rows than TARGET_ROWS, otherwise keep everything
n_final_rows = len(df_final_50_queries)
if n_final_rows == TARGET_ROWS:
    sample_2c = df_final_50_queries.copy()
    print(f"final filtered set has exactly {TARGET_ROWS} rows.")
elif n_final_rows < TARGET_ROWS:
    sample_2c = df_final_50_queries.copy()
    print(f"final filtered set has {n_final_rows} rows, using all available.")
else:
    sample_2c = df_final_50_queries.sample(n=TARGET_ROWS, random_state=RANDOM_STATE).reset_index(drop=True)
    print(f"final filtered set has {n_final_rows} rows, taking random sample of {TARGET_ROWS}.")

# 7. final results
final_unique_2c = sample_2c['query'].nunique()
print(f"\nfinal sample (2c) result: {len(sample_2c)} rows and {final_unique_2c} unique queries.")

print("\nFinal Sample (2c) Head:")
display(sample_2c[['query', 'product_title', 'esci_label']].head())

In [None]:
# Save the sample_2c DataFrame to a CSV file
# index=False keeps the DataFrame index out of the file.
# NOTE(review): this is a machine-specific absolute Windows path, while the
# retrieval section reads SAMPLED_DATA_CSV from a different location — confirm
# both point at the same export before comparing results.
sample_2c.to_csv(r'C:\Users\saiasg\OneDrive - kochind.com\Desktop\Projects\esci\esci_dataset\sample_2c_full_data.csv', index=False)

sample_2c.head()

In [None]:
# Show the first ten distinct queries present in the final sample.
print("Queries in sample:")
print(sample_2c['query'].unique()[:10])

### Some Data Exploration

Interesting queries: One query was "usb2aub2ra1m"

And it's apparently a product ID for a right-angled USB connector

https://www.startech.com/en-eu/cables/usb2aub2ra1m?srsltid=AfmBOoryvB93OxhVQnPUAocknMNz41MVDvr2TJMrWf0ijRnCwf5htlXn

Face urine? - Fake urine but still never knew these existed haha

And some plumbing-related queries: zurn qkipsp 5 port plastic manifold without valves

# Step 3: Vector Index

3a - Baseline tf-idf 

3b - Dense model (all-MiniLM-L6-v2)

3c - Re-ranking model Hybrid model

3d - Eval


In [1]:
import pandas as pd
import numpy as np
import os
import shutil
import warnings
from tqdm.notebook import tqdm

# --- TF-IDF & Sparse Retrieval ---
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

# --- Dense Retrieval (SentenceTransformers & ChromaDB) ---
from sentence_transformers import SentenceTransformer, CrossEncoder
import chromadb

# --- Configuration & Constants ---
RANDOM_STATE = 42
# Silences every Python warning for the rest of the session, including
# deprecation notices from the libraries imported above.
warnings.filterwarnings('ignore')
# Enables tqdm's pandas integration.
tqdm.pandas()

# Model Names
RETRIEVAL_MODEL_NAME = 'all-MiniLM-L6-v2'  # bi-encoder used for the dense index
RERANKER_MODEL_NAME = 'cross-encoder/ms-marco-MiniLM-L-6-v2'  # cross-encoder used for re-ranking
TWO_TOWER_MODEL_PATH = './two_tower_model' # Or './two_tower_model_corrected'

# DB Paths
# On-disk ChromaDB location; the embedding cell deletes and recreates this directory.
DB_PATH = "./chroma_data"
COLLECTION_NAME = "product_embeddings"

# File Paths
# NOTE(review): machine-specific absolute path — adjust before running elsewhere.
SAMPLED_DATA_CSV = r'/Users/ashrithgrandi/Desktop/Grainger/dataset/sample_2c_full_data.csv'

In [2]:
# loading the stored dataset
# NOTE(review): the rendered output below contains an 'Unnamed: 0' column,
# which usually means the CSV was written together with its index — confirm
# which export this file comes from.
df_sample = pd.read_csv(SAMPLED_DATA_CSV)

df_sample.head()

Unnamed: 0,example_id,query,query_id,product_id,product_locale,esci_label,small_version,large_version,split,product_title,product_description,product_bullet_point,product_brand,product_color
0,118060,6 dining chairs,4845,B08CZ6TC2L,us,E,1,1,train,Yaheetech Dining Chairs Velvet Armchairs for C...,Set of 6 Kitchen Dining Chairs for Counter Lou...,STRONG METAL LEGS: To enhance the weight capac...,Yaheetech,Grey
1,118064,6 dining chairs,4845,B08HQG1MFS,us,E,1,1,train,CozyCasa Dining Chairs Modern Style Dining Cha...,<b>If you are in search of some quality-reliab...,Dining Chairs set of 6 -- White PP backrest an...,CozyCasa,White
2,118065,6 dining chairs,4845,B08K2K3J4C,us,E,1,1,train,Yaheetech Dining Chairs with Waterproof leathe...,Make every long-time sitting comfortable. The ...,MULTIPLE USE: Sold in a set of 6 chairs. Desig...,Yaheetech,Brown
3,118066,6 dining chairs,4845,B08K2V66N8,us,E,1,1,train,Yaheetech Dining Chairs Dining Room Chairs Liv...,Make every dinner time comfortable. Constructe...,MULTIPLE USE: Sold in a set of 6 chairs. This ...,Yaheetech,Khaki
4,118067,6 dining chairs,4845,B08K8VDTW8,us,E,1,1,train,Modern Dining Chairs Set of 6 - Faux Leather D...,<b>Modern Dining Chairs Set of 6 - Faux Leathe...,Comfortable Dining Chairs Set of 6 - The dinin...,WENYU,Grey


In [3]:
print("Processing data into a unique Product Corpus...")
product_columns = [
    'product_id',
    'product_title',
    'product_description',
    'product_bullet_point',
    'product_brand',
    'product_color',
]
# One row per product: the first occurrence of each product_id is kept.
product_corpus_df = (
    df_sample.loc[:, product_columns]
    .drop_duplicates(subset=['product_id'])
    .reset_index(drop=True)
)

# Missing text becomes '' so the fields can be concatenated below.
text_cols_to_fill = product_columns[1:]
product_corpus_df = product_corpus_df.fillna({col: '' for col in text_cols_to_fill})

# Single searchable document per product: title, brand, color, description and
# bullet points joined by spaces, with whitespace runs collapsed to one space.
trailing_text_fields = ['product_brand', 'product_color', 'product_description', 'product_bullet_point']
product_corpus_df['product_text'] = (
    product_corpus_df['product_title']
    .str.cat([product_corpus_df[field] for field in trailing_text_fields], sep=' ')
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)

print(f"Created corpus of {len(product_corpus_df)} unique products.")

# --- 2. Create Query Evaluation Set (Ground Truth) ---
eval_columns = ['query', 'query_id', 'product_id', 'esci_label']
query_eval_set = df_sample.loc[:, eval_columns].copy()
print(f"Created evaluation set of {len(query_eval_set)} query-product pairs.")

# Ground truth map: {query -> list of product ids paired with that query}
ground_truth_map = {
    query_text: product_ids.tolist()
    for query_text, product_ids in query_eval_set.groupby('query')['product_id']
}
unique_queries_to_eval = list(ground_truth_map)
print(f"Created ground truth map for {len(unique_queries_to_eval)} unique queries.")

Processing data into a unique Product Corpus...
Created corpus of 485 unique products.
Created evaluation set of 485 query-product pairs.
Created ground truth map for 50 unique queries.


## 3a: tf-idf

In this section I create TF-IDF sparse vectors from the combined product text (title, brand, color, description, bullets), limited to the top 5000 terms.

To test, we use a cosine-similarity index (NearestNeighbors) to serve as a sparse retrieval baseline

- TF-IDF will struggle with synonyms and paraphrases where semantic similarity is required.
  


In [4]:
print("--- Building Model 1: TF-IDF ---")
# One document per product, in product_corpus_df row order, so matrix row i
# corresponds to corpus row i (the evaluation maps indices back with .iloc).
documents_list = product_corpus_df['product_text'].tolist()

# Initialize and fit the vectorizer
# English stop words are dropped and the vocabulary is capped at 5000 terms.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
tfidf_matrix = tfidf_vectorizer.fit_transform(documents_list)
print(f"TF-IDF matrix created with shape: {tfidf_matrix.shape}")

# Build the NearestNeighbors index
# Brute-force search using cosine distance over the TF-IDF rows.
n_neighbors_k = 50 # Retrieve 50 candidates for evaluation
nn_index_tfidf = NearestNeighbors(n_neighbors=n_neighbors_k, metric='cosine', algorithm='brute')
nn_index_tfidf.fit(tfidf_matrix)
print("In-memory sparse TF-IDF index built successfully.")

--- Building Model 1: TF-IDF ---
TF-IDF matrix created with shape: (485, 5000)
In-memory sparse TF-IDF index built successfully.


## 3b: Dense model (all-MiniLM-L6-v2)


In [5]:
print(f"--- Building Model 2: Dense Retrieval ({RETRIEVAL_MODEL_NAME}) ---")

# Load the retrieval model
# The same model instance encodes both the product texts and the queries later on.
retrieval_model = SentenceTransformer(RETRIEVAL_MODEL_NAME)
print("Retrieval model loaded.")

--- Building Model 2: Dense Retrieval (all-MiniLM-L6-v2) ---
Retrieval model loaded.


In [6]:
# Embed all documents in the corpus
print(f"Embedding {len(product_corpus_df)} documents for ChromaDB...")
doc_embeddings = retrieval_model.encode(
    product_corpus_df['product_text'].tolist(), 
    show_progress_bar=True
)
print("Embeddings generated.")

# Create and populate the ChromaDB collection
# Any existing on-disk store at DB_PATH is deleted first, so every run of this
# cell starts from an empty database.
if os.path.exists(DB_PATH):
    shutil.rmtree(DB_PATH)
client = chromadb.PersistentClient(path=DB_PATH)

collection_dense = client.get_or_create_collection(
    name=COLLECTION_NAME, 
    metadata={"hnsw:space": "l2"} # Using 'l2' (Euclidean) distance
)

# Prepare data for Chroma
# Ids are the product_id values as strings; each metadata record is the whole
# corpus row (every column of product_corpus_df, including product_text).
product_ids_str = product_corpus_df['product_id'].astype(str).tolist()
metadatas_list = product_corpus_df.to_dict('records')

collection_dense.add(
    embeddings=doc_embeddings.tolist(),
    metadatas=metadatas_list,
    ids=product_ids_str
)
print(f"Successfully created ChromaDB collection with {collection_dense.count()} vectors.")

Embedding 485 documents for ChromaDB...


Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Embeddings generated.
Successfully created ChromaDB collection with 485 vectors.


## Re-ranking (Cross-Encoder)

We load the `ms-marco-MiniLM-L-6-v2` Cross-Encoder. This model doesn't need an index; it scores `(query, document)` pairs provided by a retrieval model (like Model 2).


In [7]:
print(f"--- Loading Model 3: Re-Ranker ({RERANKER_MODEL_NAME}) ---")
# The cross-encoder scores (query, document) pairs; the evaluation loop uses it
# to re-order the dense model's candidates.
cross_encoder_model = CrossEncoder(RERANKER_MODEL_NAME)
print("Re-ranking model loaded.")

--- Loading Model 3: Re-Ranker (cross-encoder/ms-marco-MiniLM-L-6-v2) ---
Re-ranking model loaded.


# Section 6: Comparative Evaluation

We loop through all 50 unique queries from our ground truth set and run each one against all available models to compare their performance fairly.


In [8]:
def calculate_metrics_for_query(retrieved_ids, ground_truth_ids):
    """Score one query's ranked results against its relevant product ids.

    Args:
        retrieved_ids: Ranked sequence of retrieved ids, best match first.
        ground_truth_ids: Iterable of relevant ids. Duplicates are ignored, so
            every relevant item counts once in all four metrics.

    Returns:
        Tuple ``(recall_at_1, recall_at_5, recall_at_10, mrr)``.

        * ``recall_at_k`` is the share of relevant ids found in the top k.
        * ``mrr`` is the reciprocal rank averaged over *all* relevant ids
          (a relevant id that was not retrieved contributes 0). This is not
          the classic "first relevant hit" MRR.

        All four values are 0.0 when there are no relevant ids.
    """
    # Order-preserving de-duplication: recall and MRR share one denominator,
    # and the summation order stays the same as the caller's list order.
    relevant_ids = list(dict.fromkeys(ground_truth_ids))
    if not relevant_ids:
        return 0.0, 0.0, 0.0, 0.0
    relevant_set = set(relevant_ids)

    def recall_at(k):
        """Fraction of relevant ids that appear among the first k results."""
        return len(relevant_set.intersection(retrieved_ids[:k])) / len(relevant_set)

    # 1-based rank of the first occurrence of every retrieved id, built once so
    # each relevant id is resolved with a single dictionary lookup.
    first_rank = {}
    for position, item_id in enumerate(retrieved_ids, start=1):
        first_rank.setdefault(item_id, position)

    reciprocal_rank_sum = 0.0
    for gt_id in relevant_ids:
        rank = first_rank.get(gt_id)
        if rank is not None:
            reciprocal_rank_sum += 1.0 / rank
    mrr_robust = reciprocal_rank_sum / len(relevant_ids)

    return recall_at(1), recall_at(5), recall_at(10), mrr_robust

# --- Main Evaluation Loop ---

print(f"Starting evaluation for {len(unique_queries_to_eval)} unique queries...")

# We will retrieve 50 candidates for re-ranking
N_CANDIDATES = 50
# Never ask an index for more neighbours than the corpus holds
# (NearestNeighbors.kneighbors raises when n_neighbors > n_samples).
n_candidates = min(N_CANDIDATES, len(product_corpus_df))

# product_id -> product_text, built once so the re-ranker does not have to scan
# the whole corpus DataFrame for every candidate of every query. The first row
# wins if a product_id were ever repeated.
product_text_by_id = (
    product_corpus_df
    .drop_duplicates(subset='product_id')
    .set_index('product_id')['product_text']
    .to_dict()
)

METRIC_NAMES = ("Recall@1", "Recall@5", "Recall@10", "MRR")


def _result_row(model_name, query, retrieved_ids, ground_truth_ids):
    """Build one all_results record: model label, query and its four metrics."""
    scores = calculate_metrics_for_query(retrieved_ids, ground_truth_ids)
    return {"Model": model_name, "Query": query, **dict(zip(METRIC_NAMES, scores))}


all_results = []

for query in tqdm(unique_queries_to_eval, desc="Evaluating All Models"):
    ground_truth_ids = ground_truth_map.get(query, [])
    if not ground_truth_ids:
        continue

    # --- 1. TF-IDF Evaluation ---
    query_vector_sparse = tfidf_vectorizer.transform([query])
    distances, indices = nn_index_tfidf.kneighbors(query_vector_sparse, n_neighbors=n_candidates)
    # kneighbors returns row positions; map them back to product ids.
    tfidf_retrieved_ids = product_corpus_df.iloc[indices[0]]['product_id'].tolist()
    all_results.append(_result_row("1. TF-IDF", query, tfidf_retrieved_ids, ground_truth_ids))

    # --- 2. Dense Model Evaluation ---
    query_vector_dense = retrieval_model.encode([query]).tolist()
    search_results = collection_dense.query(
        query_embeddings=query_vector_dense,
        n_results=n_candidates,
    )
    dense_retrieved_ids = [meta['product_id'] for meta in search_results['metadatas'][0]]
    all_results.append(_result_row("2. Dense (S-BERT)", query, dense_retrieved_ids, ground_truth_ids))

    # --- 3. Re-Ranker Evaluation ---
    if cross_encoder_model:
        # We re-rank the candidates from the Dense model.
        # A product id missing from the corpus is scored against an empty text.
        candidate_texts = [product_text_by_id.get(pid, "") for pid in dense_retrieved_ids]

        rerank_pairs = [(query, doc_text) for doc_text in candidate_texts]
        rerank_scores = cross_encoder_model.predict(rerank_pairs, show_progress_bar=False)

        # Highest cross-encoder score first; the sort is stable, so ties keep
        # the dense model's order.
        reranked_results = sorted(
            zip(dense_retrieved_ids, rerank_scores),
            key=lambda pair: pair[1],
            reverse=True,
        )
        reranked_retrieved_ids = [p_id for p_id, score in reranked_results]
        all_results.append(_result_row("3. Re-Ranker", query, reranked_retrieved_ids, ground_truth_ids))

print("Evaluation complete.")

Starting evaluation for 50 unique queries...


Evaluating All Models:   0%|          | 0/50 [00:00<?, ?it/s]

Evaluation complete.


## Section 7: Final Results

Here is the final comparison of all models across the entire 50-query evaluation set.

In [12]:
# One row per (model, query) record collected by the evaluation loop.
df_results_all = pd.DataFrame(all_results)

# Average every metric per model; one row per model, ordered by model label.
df_summary = df_results_all.groupby('Model')[['Recall@1', 'Recall@5', 'Recall@10', 'MRR']].mean().sort_values(by="Model")

# Reciprocal of the averaged MRR column.
# NOTE(review): the MRR computed here averages reciprocal ranks over every
# ground-truth item (unretrieved items count as 0), so 1/MRR is only a rough
# indicator and not a true average rank; a model with MRR 0 would show inf.
df_summary['Avg_Rank'] = 1 / df_summary['MRR']

print("--- Aggregate Model Performance (Averaged over 50 Queries) ---")
display(df_summary)

--- Aggregate Model Performance (Averaged over 50 Queries) ---


Unnamed: 0_level_0,Recall@1,Recall@5,Recall@10,MRR,Avg_Rank
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1. TF-IDF,0.130202,0.503778,0.690149,0.292254,3.421686
2. Dense (S-BERT),0.136868,0.55422,0.748343,0.319586,3.129052
3. Re-Ranker,0.138118,0.564091,0.785804,0.323647,3.089782


In [None]:
# --- Display Per-Query Results for Analysis ---

print("\n--- Per-Query Performance Breakdown (by MRR) ---")
# Allow up to 100 rows to be rendered instead of a truncated table.
pd.set_option('display.max_rows', 100)
try:
    # Wide view: one row per query, one MRR column per model.
    display(df_results_all.pivot(index="Query", columns="Model", values="MRR"))
except Exception as e:
    # Fall back to the long-format records when the pivot cannot be built.
    print(f"Could not pivot results: {e}")
    print("Displaying raw results instead:")
    display(df_results_all)



--- Per-Query Performance Breakdown (by MRR) ---


Model,1. TF-IDF,2. Dense (S-BERT),3. Re-Ranker
Query,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
6 dining chairs,0.166987,0.159189,0.166538
a intex pool pump,0.408333,0.408333,0.408333
activated carbon mask,0.280489,0.291988,0.292897
adidas original superstar women,0.408333,0.408333,0.408333
balloons yellow and orange,0.202327,0.202327,0.202327
big little lies,0.313131,0.357639,0.358796
brumate hopsulator slim,0.611111,0.611111,0.611111
carpenter bench press,0.287649,0.331503,0.314748
christmas dresses size 3t-4t,0.237834,0.237943,0.24038
computers,0.1,0.205811,0.216783


## Single Query testing

In [11]:
test_query = "carpenter bench press"
if test_query not in ground_truth_map:
    print(f"Test query '{test_query}' not found in the sampled dataset.")
else:
    ground_truth_ids_test = ground_truth_map.get(test_query, [])
    n_ground_truth_test = len(ground_truth_ids_test)
    report_columns = ['product_id', '_distance', 'is_ground_truth', 'product_title']
    print(f"Test Query: '{test_query}'")
    print(f"Ground Truth 'Exact' Product IDs ({n_ground_truth_test}): {ground_truth_ids_test}")

    # --- TF-IDF Sanity Check ---
    print("\n--- TF-IDF Sanity Check ---")
    query_vector = tfidf_vectorizer.transform([test_query])
    distances, indices = nn_index_tfidf.kneighbors(query_vector, n_neighbors=10)
    # Top-10 corpus rows, annotated with their cosine distance and whether the
    # product is one of the query's ground-truth items.
    results_df_tfidf = product_corpus_df.iloc[indices[0]].assign(_distance=distances[0])
    results_df_tfidf['is_ground_truth'] = results_df_tfidf['product_id'].isin(ground_truth_ids_test)
    print(results_df_tfidf[report_columns])
    matches = results_df_tfidf['is_ground_truth'].sum()
    print(f"TF-IDF found {matches} out of {n_ground_truth_test} ground truth items in Top 10.")

    # --- Dense Model Sanity Check ---
    print("\n--- Dense Model Sanity Check ---")
    query_vector = retrieval_model.encode([test_query]).tolist()
    search_results = collection_dense.query(
        query_embeddings=query_vector,
        n_results=10,
    )
    result_metadatas = search_results['metadatas'][0]
    result_distances = search_results['distances'][0]
    dense_product_ids = [meta['product_id'] for meta in result_metadatas]
    results_df_dense = pd.DataFrame({
        'product_id': dense_product_ids,
        '_distance': result_distances,
    })
    results_df_dense['is_ground_truth'] = results_df_dense['product_id'].isin(ground_truth_ids_test)
    # Titles are looked up from the corpus so both reports show the same columns.
    title_lookup = product_corpus_df[['product_id', 'product_title']]
    results_df_dense = results_df_dense.merge(title_lookup, on='product_id', how='left')
    print(results_df_dense[report_columns])
    matches_dense = results_df_dense['is_ground_truth'].sum()
    print(f"Dense Model found {matches_dense} out of {n_ground_truth_test} ground truth items in Top 10.")

Test Query: 'carpenter bench press'
Ground Truth 'Exact' Product IDs (8): ['B088R8VC7V', 'B07N4QN64D', 'B01M4F8JZJ', 'B01LZV1QW1', 'B00HQONFVE', 'B00BHSPJC8', 'B00068U7XQ', 'B00SIQ1DLS']

--- TF-IDF Sanity Check ---
     product_id  _distance  is_ground_truth  \
73   B01LZV1QW1   0.813588             True   
70   B088R8VC7V   0.868601             True   
76   B00068U7XQ   0.883066             True   
431  B06XD1DVBG   0.885864            False   
71   B07N4QN64D   0.893176             True   
427  B06XD5G2XB   0.911316            False   
77   B00SIQ1DLS   0.912220             True   
74   B00HQONFVE   0.916837             True   
129  B07G15QZNQ   0.924860            False   
342  B0015TX0MU   0.925376            False   

                                         product_title  
73   Genesis GDP805P 5-Speed 2.6 Amp 8" Drill Press...  
70   Workbench Mounted Drilling Machine, 350W 5 Spe...  
76              Palmgren Ratcheting arbor press, 3 ton  
431  Creative Teaching Press Safari Fr

# Step 4 Two tower eval vs others

Reference links:

Uber blog on the two-tower architecture: https://www.uber.com/blog/innovative-recommendation-applications-using-two-tower-embeddings/

In [None]:
print("=== Two-Tower Model Evaluation ===")

# Import required libraries
import sys
import os
from two_tower_evaluation import TwoTowerEvaluator

# Query used for the single-query comparison (the same one as the baseline
# sanity check) and the rank cut-off used for its precision/recall.
test_query = "carpenter bench press"
top_k = 10

# Reset per-run results so a skipped or failed evaluation cannot report numbers
# left in the kernel by an earlier execution of this cell.
two_tower_precision = None
two_tower_recall = None
two_tower_averages_ready = False

# Check if model exists - try both possible paths.
# The corrected model is listed first because it is the one this cell has
# always loaded; the other path is used only when the corrected one is absent.
model_paths = ['./two_tower_model_corrected', './two_tower_model']
model_path = None

for path in model_paths:
    if os.path.exists(path):
        model_path = path
        break

if model_path is None:
    print("❌ Two-Tower model not found. Please train the model first using two_tower_final.py")
    print("   Looked for models in:")
    for path in model_paths:
        print(f"   - {path}")
else:
    print(f"✅ Two-Tower model found at: {model_path}")
    print("   Starting evaluation...")

    try:
        # Load the model that was actually found above (previously a fixed path
        # was passed here regardless of which directory existed).
        evaluator = TwoTowerEvaluator(
            model_path=model_path,
            test_data_path='sample_2c_full_data.csv'
        )

        # Run evaluation
        print("\n🔄 Building product embeddings index...")
        evaluator.build_product_index()

        # Test the same query as baselines for comparison
        print(f"\n🔍 Testing query: '{test_query}'")

        # Get ground truth for comparison: products labelled 'E' for this query
        ground_truth_df = evaluator.eval_data[evaluator.eval_data['query'] == test_query]
        ground_truth_ids = set(ground_truth_df[ground_truth_df['esci_label'] == 'E']['product_id'].tolist())

        print(f"📋 Ground Truth: {len(ground_truth_ids)} relevant products")
        print(f"    Product IDs: {list(ground_truth_ids)}")

        # Search using Two-Tower model
        results = evaluator.search_products(test_query, top_k=top_k)

        print(f"\n🎯 Two-Tower Top-{top_k} Results:")
        hits = 0
        for i, (product_id, similarity) in enumerate(results, 1):
            is_relevant = product_id in ground_truth_ids
            if is_relevant:
                hits += 1

            # Get product title
            product_title = evaluator.product_corpus[evaluator.product_corpus['product_id'] == product_id]['product_title'].iloc[0]
            status = "✅ RELEVANT" if is_relevant else "❌"

            print(f"  {i:2d}. {status} | Sim: {similarity:.4f} | {product_title[:60]}...")

        two_tower_precision = hits / top_k
        two_tower_recall = hits / len(ground_truth_ids) if ground_truth_ids else 0

        print(f"\n📊 Two-Tower Results:")
        print(f"    Precision@{top_k}: {two_tower_precision:.4f}")
        print(f"    Recall@{top_k}: {two_tower_recall:.4f}")
        print(f"    Found {hits} out of {len(ground_truth_ids)} ground truth items in Top {top_k}")

        # Compare with previous results.
        # NOTE(review): the baseline figures are hard-coded from the single-query
        # sanity check; update them by hand if that check is re-run on new data.
        print(f"\n📈 Comparison Summary:")
        print("    TF-IDF:              Found 6/8 ground truth items (Precision: 0.6000)")
        print("    SentenceTransformers: Found 7/8 ground truth items (Precision: 0.7000)")
        print(f"    Two-Tower:           Found {hits}/{len(ground_truth_ids)} ground truth items (Precision: {two_tower_precision:.4f})")

        # Run comprehensive evaluation
        print(f"\n🔄 Running comprehensive evaluation on all test queries...")
        metrics = evaluator.calculate_metrics([1, 5, 10, 20])
        avg_metrics, _ = metrics

        print(f"\n📈 Comprehensive Results:")
        print(f"{'Metric':<12} {'Top-1':<8} {'Top-5':<8} {'Top-10':<8} {'Top-20':<8}")
        print("-" * 48)

        for metric_name in ['precision', 'recall', 'ndcg', 'mrr']:
            row = f"{metric_name.upper():<12}"
            for k in [1, 5, 10, 20]:
                row += f"{avg_metrics[k][metric_name]:<8.3f}"
            print(row)

        # Sample additional queries
        print(f"\n🔍 Testing additional sample queries...")
        sample_queries = ['6 dining chairs', 'plants', 'turning shoe']

        for query in sample_queries:
            if query in evaluator.eval_data['query'].values:
                ground_truth = evaluator.eval_data[evaluator.eval_data['query'] == query]
                relevant_ids = ground_truth[ground_truth['esci_label'] == 'E']['product_id']
                relevant_count = len(relevant_ids)
                # Built once per query instead of once per retrieved result.
                relevant_id_set = set(relevant_ids)

                results = evaluator.search_products(query, top_k=5)
                relevant_in_top5 = sum(1 for pid, _ in results if pid in relevant_id_set)

                print(f"  '{query}': {relevant_in_top5}/{relevant_count} relevant in Top-5")

        print(f"\n✅ Two-Tower evaluation completed!")

        # Store final metrics (averaged over all test queries) for the summary
        final_precision_10 = avg_metrics[10]['precision']
        final_recall_10 = avg_metrics[10]['recall']
        two_tower_averages_ready = True

    except Exception as e:
        print(f"❌ Error during evaluation: {e}")
        import traceback
        traceback.print_exc()
        # Set default values for summary
        final_precision_10 = 0.0
        final_recall_10 = 0.0

print("\n" + "="*60)
print("🏁 FINAL COMPARISON SUMMARY")
print("="*60)
# Every row of this table is measured on the single test query, so the three
# methods are compared like for like. The baseline rows are the hard-coded
# 6/8 and 7/8 hits from the sanity check above.
print(f"Single query: '{test_query}'")
print("Method                | Precision@10 | Recall@10 | Notes")
print("-" * 60)
print("TF-IDF               |    0.6000    |   0.7500  | Sparse, keyword-based")
print("SentenceTransformers |    0.7000    |   0.8750  | Dense, pre-trained")
if two_tower_precision is not None:
    print(f"Two-Tower (Custom)   |    {two_tower_precision:.4f}    |   {two_tower_recall:.4f}  | Dense, task-specific")
else:
    # Model missing or the evaluation failed before the query was scored.
    print("Two-Tower (Custom)   |    ?.????    |   ?.????  | Dense, task-specific")

# The all-query averages are reported separately because the baseline rows
# above are not averages.
if two_tower_averages_ready:
    print(f"\nTwo-Tower averaged over all test queries: Precision@10 {final_precision_10:.4f} | Recall@10 {final_recall_10:.4f}")

print("\n💡 The Two-Tower model should perform better due to:")
print("   - Task-specific training on your exact data")
print("   - Separate encoders for queries and products")
print("   - Triplet loss optimization for retrieval")
print("   - Domain-specific fine-tuning")
