In [11]:
# !pip install nltk

In [12]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer, util
import string
import re

In [13]:
#clone the git repo that contains the data and additional information about the dataset
!git clone https://github.com/wayfair/WANDS.git

fatal: destination path 'WANDS' already exists and is not an empty directory.


# **Sections**



*   Moving all original functions to the top;
*   Load WANDS data
*   Data Processing
*   TF-IDF model and evaluation of TF-IDF model result
*   Sentence transformer and evaluation
*   New Evaluation method considering the partial matched product
*   Evaluation of the TF-IDF and sentence transformer with modified evaluation




# **Moving all existing functions to the top of this notebook, this column includes all functions given earlier to evaluate the TF-IDF method.**

In [14]:
#implementing a function to retrieve exact match product IDs for a query_id
def get_exact_matches_for_query(query_id):
    query_group = grouped_label_df.get_group(query_id)
    exact_matches = query_group.loc[query_group['label'] == 'Exact']['product_id'].values
    return exact_matches

#Sanity check code block to see if the search results are relevant
#implementing a function to retrieve top K product IDs for a query
def get_top_product_ids_for_query(query):
    top_product_indices = get_top_products(vectorizer, tfidf_matrix, query, top_n=10)
    top_product_ids = product_df.iloc[top_product_indices]['product_id'].tolist()
    return top_product_ids


#define functions for evaluating retrieval performance
def map_at_k(true_ids, predicted_ids, k=10):
    """
    Calculate the Mean Average Precision at K (MAP@K).

    Parameters:
    true_ids (list): List of relevant product IDs.
    predicted_ids (list): List of predicted product IDs.
    k (int): Number of top elements to consider.
             NOTE: IF you wish to change top k, please provide a justification for choosing the new value

    Returns:
    float: MAP@K score.
    """
    #if either list is empty, return 0
    if not len(true_ids) or not len(predicted_ids):
        return 0.0

    score = 0.0
    num_hits = 0.0

    for i, p_id in enumerate(predicted_ids[:k]):
        if p_id in true_ids and p_id not in predicted_ids[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)

    return score / min(len(true_ids), k)


#define functions for product search using Tf-IDF
def calculate_tfidf(dataframe):
    """
    Calculate the TF-IDF for combined product name and description.

    Parameters:
    dataframe (pd.DataFrame): DataFrame with product_id, and other product information.

    Returns:
    TfidfVectorizer, csr_matrix: TF-IDF vectorizer and TF-IDF matrix.
    """
    # Combine product name and description to vectorize
    # NOTE: Please feel free to use any combination of columns available, some columns may contain NULL values
    combined_text = dataframe['product_name'] + ' ' + dataframe['product_description']
    vectorizer = TfidfVectorizer()
    # convert combined_text to list of unicode strings
    tfidf_matrix = vectorizer.fit_transform(combined_text.values.astype('U'))
    return vectorizer, tfidf_matrix

def get_top_products(vectorizer, tfidf_matrix, query, top_n=10):
    """
    Get top N products for a given query based on TF-IDF similarity.

    Parameters:
    vectorizer (TfidfVectorizer): Trained TF-IDF vectorizer.
    tfidf_matrix (csr_matrix): TF-IDF matrix for the products.
    query (str): Search query.
    top_n (int): Number of top products to return.

    Returns:
    list: List of top N product IDs.
    """
    query_vector = vectorizer.transform([query])
    cosine_similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()
    top_product_indices = cosine_similarities.argsort()[-top_n:][::-1]
    return top_product_indices

# **Load product and query data**

In [15]:
# get search queries
query_df = pd.read_csv("WANDS/dataset/query.csv", sep='\t')

# get product data
product_df = pd.read_csv("WANDS/dataset/product.csv", sep='\t')

# get manually labeled groundtruth lables
label_df = pd.read_csv("WANDS/dataset/label.csv", sep='\t')

#group the labels for each query to use when identifying exact matches
grouped_label_df = label_df.groupby('query_id')

In [16]:
product_df.head()

Unnamed: 0,product_id,product_name,product_class,category hierarchy,product_description,product_features,rating_count,average_rating,review_count
0,0,solid wood platform bed,Beds,Furniture / Bedroom Furniture / Beds & Headboa...,"good , deep sleep can be quite difficult to ha...",overallwidth-sidetoside:64.7|dsprimaryproducts...,15.0,4.5,15.0
1,1,all-clad 7 qt . slow cooker,Slow Cookers,Kitchen & Tabletop / Small Kitchen Appliances ...,"create delicious slow-cooked meals , from tend...",capacityquarts:7|producttype : slow cooker|pro...,100.0,2.0,98.0
2,2,all-clad electrics 6.5 qt . slow cooker,Slow Cookers,Kitchen & Tabletop / Small Kitchen Appliances ...,prepare home-cooked meals on any schedule with...,features : keep warm setting|capacityquarts:6....,208.0,3.0,181.0
3,3,all-clad all professional tools pizza cutter,"Slicers, Peelers And Graters",Browse By Brand / All-Clad,this original stainless tool was designed to c...,overallwidth-sidetoside:3.5|warrantylength : l...,69.0,4.5,42.0
4,4,baldwin prestige alcott passage knob with roun...,Door Knobs,Home Improvement / Doors & Door Hardware / Doo...,the hardware has a rich heritage of delivering...,compatibledoorthickness:1.375 '' |countryofori...,70.0,5.0,42.0


In [17]:
query_df.head()

Unnamed: 0,query_id,query,query_class
0,0,salon chair,Massage Chairs
1,1,smart coffee table,Coffee & Cocktail Tables
2,2,dinosaur,Kids Wall Décor
3,3,turquoise pillows,Accent Pillows
4,4,chair and a half recliner,Recliners


In [18]:
label_df.head()

Unnamed: 0,id,query_id,product_id,label
0,0,0,25434,Exact
1,1,0,12088,Irrelevant
2,2,0,42931,Exact
3,3,0,2636,Exact
4,4,0,42923,Exact


In [19]:
set(label_df.label.tolist())

{'Exact', 'Irrelevant', 'Partial'}

# **Data processing**

The original method only included the product name and description in the matching process. However, incorporating multiple fields from the product file—such as product category and category hierarchy—can enhance the matching process by providing more meaningful context for the query.

In [20]:
# Combine fields and process text
product_df['combined_text'] = product_df['product_name'] + ' ' + product_df['product_description'].fillna('') + ' ' + product_df['category hierarchy'].fillna('') + ' ' + product_df['product_class'].fillna('')
product_df['combined_text'] = product_df['combined_text'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))
product_df['combined_text'] = product_df['combined_text'].apply(lambda x: re.sub('\d+', '', x).lower().strip().replace('  ', ' '))

product_info = product_df['combined_text'].tolist()
query_info = query_df['query'].tolist()

# **The TF-IDF model and its evaluation (previously provided in the code) remain the same, with the only difference being the dataset. The new dataset includes additional fields from the product file, enhancing the model's input.**




In [21]:
# Calculate TF-IDF
vectorizer, tfidf_matrix = calculate_tfidf(product_df)

In [22]:
#Sanity check code block to see if the search results are relevant
#implementing a function to retrieve top K product IDs for a query
def get_top_product_ids_for_query(query):
    top_product_indices = get_top_products(vectorizer, tfidf_matrix, query, top_n=10)
    top_product_ids = product_df.iloc[top_product_indices]['product_id'].tolist()
    return top_product_ids

#define the test query
query = "armchair"

#obtain top product IDs
top_product_ids = get_top_product_ids_for_query(query)

print(f"Top products for '{query}':")
for product_id in top_product_ids:
    product = product_df.loc[product_df['product_id'] == product_id]
    print(product_id, product['product_name'].values[0])

Top products for 'armchair':
12756 24.41 '' wide tufted polyester armchair
42698 donham armchair
42697 donham 25 '' wide armchair
41270 almaraz 33.7 '' wide leather match armchair
23907 faizah 27.6 '' wide tufted polyester armchair
31564 biloxi 34.75 '' wide armchair
41306 hartsell 33 '' wide armchair
1527 howington 39 '' wide tufted linen armchair
42802 donham polyester lounge chair
6532 ogan 29 '' wide polyester armchair


In [23]:
#implementing a function to retrieve exact match product IDs for a query_id
def get_exact_matches_for_query(query_id):
    query_group = grouped_label_df.get_group(query_id)
    exact_matches = query_group.loc[query_group['label'] == 'Exact']['product_id'].values
    return exact_matches

#applying the function to obtain top product IDs and adding top K product IDs to the dataframe
query_df['top_product_ids'] = query_df['query'].apply(get_top_product_ids_for_query)

#adding the list of exact match product_IDs from labels_df
query_df['relevant_ids'] = query_df['query_id'].apply(get_exact_matches_for_query)

#now assign the map@k score
query_df['map@k'] = query_df.apply(lambda x: map_at_k(x['relevant_ids'], x['top_product_ids'], k=10), axis=1)


In [24]:
# calculate the MAP across the entire query set
query_df.loc[:, 'map@k'].mean()

0.29320741016313934

# **Loading Sentence transformer model**

The SentenceTransformer is based on BERT, more specifically, is a BERT variant model like Sentence-BERT, which fine-tune BERT using a Siamese or triplet network structure to create better sentence embeddings. Unlike standard BERT, which processes tokens individually, Sentence-BERT is optimized to capture the meaning of entire sentences efficiently.


In [25]:
# Load the sentence transformer to model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Apply embeddings to product data and query
%time corpus_embeddings = model.encode(product_info, convert_to_tensor = True)
%time query_embeddings = model.encode(query_info, convert_to_tensor = True)

# # Embedding normalization
# corpus_embeddings = util.normalize_embeddings(corpus_embeddings)
# query_embeddings = util.normalize_embeddings(query_embeddings)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

CPU times: user 45.9 s, sys: 381 ms, total: 46.3 s
Wall time: 39.2 s
CPU times: user 117 ms, sys: 1.93 ms, total: 118 ms
Wall time: 108 ms


In [26]:

# Retriving the matching ids using semantic match
def calculate_top_match_semantic(query_embs, product_embs, top_k=10):
  """
    Parameters:
    query_embs: query embeddings
    product_embs: product embeddings

    Returns:
    similarities: top matched product id
  """

  # Comnpute cosine similarity
  hits = util.semantic_search(query_embeddings, corpus_embeddings, score_function = util.dot_score, top_k = top_k)

  # Compile semantic similarity
  similarities = [[y['corpus_id'] for y in x] for x in hits]

  return similarities

# **Model evaluation with MAP**

In [27]:
# # Applying the function to obtain top product IDs and adding top K product IDs to the dataframe
# query_df['relevant_ids'] = query_df['query_id'].apply(get_exact_matches_for_query)

# Add the top matched product id using semantic searching method
query_df['top_semantic_product_ids'] = calculate_top_match_semantic(query_embeddings, corpus_embeddings)
query_df['map@k_semantic'] = query_df.apply(lambda x: map_at_k(x['relevant_ids'], x['top_semantic_product_ids'], k = 10), axis = 1)


In [28]:
# calculate the MAP across the entire query set
query_df.loc[:,'map@k_semantic'].mean()

0.3749306359310699

# **New Evaluation method considering the partial matched product**

This section introduces a modified version of the "map_at_k" function, called "map_at_k_with_partial", which incorporates partial match products into the evaluation.

The function "get_fuzzy_matches_for_query" filters the query/label file and generates a new field, relevant_ids_fuzzy, which includes product IDs that are either an Exact or Partial match to the query.

The "map_at_k_with_partial" function improves accuracy evaluation by assigning different weights to label types. Product IDs labeled as "Exact" matches receive a weight of 1, while those labeled as "Partial" matches currently receive a weight of 0.5 (or can be modified in the input).

In [29]:
#implementing a function to retrieve exact match and partial product IDs for a query_id
def get_fuzzy_matches_for_query(query_id):
    query_group = grouped_label_df.get_group(query_id)
    exact_matches = query_group.loc[query_group['label'] == 'Exact']['product_id'].values
    partial_matches = query_group.loc[query_group['label'] == 'Partial']['product_id'].values
    matches = {"Exact": exact_matches, "Partial": partial_matches}
    return matches


# A modified function of the original map_at_k, it considers both the partial and exact match product ids
def map_at_k_with_partial(true_ids, predicted_ids, k=10, partial_weight=0.5):
    """
    Compute MAP@K, giving partial matches a fair weight.

    Parameters:
        true_labels (list): List of true labels (Exact, Partial).
        predicted_labels (list): List of predicted labels.
        k (int): Number of top elements to consider.
        partial_weight (float): default partial weight is 0.5

    Returns:
        float: Adjusted MAP@K score.
    """
    # get a full list of true id, including partial and exact match
    true_all_ids = true_ids['Exact'].tolist() + true_ids['Partial'].tolist()

    if not len(true_all_ids) or not len(predicted_ids):
        return 0.0

    score = 0.0
    num_hits = 0.0

    for i, p_id in enumerate(predicted_ids[:k]):
      if p_id in true_all_ids and p_id not in predicted_ids[:i]:
        if p_id in true_ids['Exact']:
          weight = 1.0
        if p_id in true_ids['Partial']:
          weight = partial_weight

        num_hits += weight
        score += num_hits / (i + 1.0)

    return score / min(len(true_all_ids), k)


In [30]:
#adding the list of exact match and partial match product_IDs from labels_df
query_df['relevant_ids_fuzzy'] = query_df['query_id'].apply(get_fuzzy_matches_for_query)

# **Evaluation of the TF-IDF model result, and the semantic model result with the modified map function**

In [31]:
# assign the map@k_modified score with TF-IDF predicted output
query_df['map@k_modified'] = query_df.apply(lambda x: map_at_k_with_partial(x['relevant_ids_fuzzy'], x['top_product_ids'], k=10), axis=1)

In [32]:
# assign the map@k_modified_semantic score with Semantic model predicted output

query_df['map@k_modified_semantic'] = query_df.apply(lambda x: map_at_k_with_partial(x['relevant_ids_fuzzy'], x['top_semantic_product_ids'], k = 10), axis = 1)

In [33]:
print("The MAP score for TF-IDF model using the old evaluation method is: {}".format(query_df['map@k'].mean()))
print("The MAP score for Semantic model using the old evaluation method is: {}".format(query_df['map@k_semantic'].mean()))
print("The modified MAP score for TF-IDF model result is: {}".format(query_df['map@k_modified'].mean()))
print("The modified MAP score for Semantic model result is: {}".format(query_df['map@k_modified_semantic'].mean()))

The MAP score for TF-IDF model using the old evaluation method is: 0.29320741016313934
The MAP score for Semantic model using the old evaluation method is: 0.3749306359310699
The modified MAP score for TF-IDF model result is: 0.4617676504629629
The modified MAP score for Semantic model result is: 0.5373210152116402
