# Part 1: Indexing

#### Imports

In [1]:
import nltk
import json
from collections import defaultdict
from array import array
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import math
import numpy as np
import collections
from numpy import linalg as la
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# from wordcloud import WordCloud
from collections import Counter

#### Useful code from part 1

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
def remove_punctuation(text):
    """
    Replace every punctuation character in ``text`` with a single space.

    Alphanumeric characters, whitespace and hyphens ("-") are kept as they
    are; any other character is swapped for " " so that neighbouring words
    do not get glued together.
    """
    def _is_kept(ch):
        return ch.isalnum() or ch.isspace() or ch == "-"

    return "".join(ch if _is_kept(ch) else " " for ch in text)


In [4]:
# Location of the raw product catalogue (relative to the notebook's working directory).
products_path = '../../data/fashion_products_dataset.json'
# Parse from the handle we opened so the explicit UTF-8 encoding is the one actually
# used; previously the handle was opened and then ignored while pandas re-opened the path.
with open(products_path, "r", encoding="utf-8") as f:
    products = pd.read_json(f)

# Shared NLTK resources for build_terms, created on first use. build_terms is applied to
# every product row, so rebuilding the stemmer and reloading the stop-word list per call
# is wasted work.
_BUILD_TERMS_RESOURCES = {}


def _get_build_terms_resources():
    """Return the cached (stemmer, stop-word set) pair, building it on the first call."""
    if not _BUILD_TERMS_RESOURCES:
        _BUILD_TERMS_RESOURCES["stemmer"] = PorterStemmer()
        _BUILD_TERMS_RESOURCES["stop_words"] = frozenset(stopwords.words("english"))
    return _BUILD_TERMS_RESOURCES["stemmer"], _BUILD_TERMS_RESOURCES["stop_words"]


def build_terms(line):
    """
    Preprocess a line, applying the steps in this order:
    ●  Transforming to lowercase
    ●  Removing punctuation marks (replaced by spaces, hyphens are kept)
    ●  Tokenization (whitespace split)
    ●  Removing stop words
    ●  Stemming

    Argument:
    line -- string (text) to be preprocessed; any non-string value
            (e.g. None or NaN from a missing DataFrame cell) yields []

    Returns:
    line - a list of tokens corresponding to the input text after the preprocessing
    """
    # Missing text in a DataFrame arrives as None / float NaN; treat it as "no tokens"
    # instead of failing on .lower().
    if not isinstance(line, str):
        return []

    stemmer, stop_words = _get_build_terms_resources()
    tokens = remove_punctuation(line.lower()).split()
    # Stop words are filtered before stemming, so the comparison is against surface forms.
    return [stemmer.stem(token) for token in tokens if token not in stop_words]

def get_products_information(products_df):
    """
    Keep only the product columns used in this notebook.

    Argument:
    products_df -- DataFrame containing (at least) the columns listed in ``elements``

    Returns:
    A new DataFrame with exactly those columns, in that order.

    Raises:
    KeyError if any of the expected columns is missing from ``products_df``.
    """
    elements = ["pid", "title", "description", "brand", "category", "sub_category",
                "product_details", "seller", "out_of_stock", "selling_price",
                "discount", "actual_price", "average_rating", "url"]

    # Return an explicit copy: the caller adds new columns to the result, and doing that
    # on a bare column slice can trigger pandas' SettingWithCopyWarning.
    return products_df[elements].copy()

# Restrict the catalogue to the columns selected by get_products_information.
products = get_products_information(products)
# Each processed_* column holds, per row, the token list returned by build_terms.
products["processed_title"] = products["title"].apply(build_terms)
products["processed_description"] = products["description"].apply(build_terms)
# Combined label of the form "<category>: <sub_category>".
products['cat_subcat'] = products['category'] + ": " + products['sub_category']

# Part 1: Indexing 

## 1. Build inverted index

For each product, we concatenate the processed title tokens and the processed description tokens into a single token list.

In [5]:
# Both columns hold Python lists, so "+" concatenates them row by row:
# title tokens first, then description tokens.
products["title_description"] = products["processed_title"] + products["processed_description"]
display(products.head(5))

Unnamed: 0,pid,title,description,brand,category,sub_category,product_details,seller,out_of_stock,selling_price,discount,actual_price,average_rating,url,processed_title,processed_description,cat_subcat,title_description
0,TKPFCZ9EA7H5FYZH,Solid Women Multicolor Track Pants,Yorker trackpants made from 100% rich combed c...,York,Clothing and Accessories,Bottomwear,"[{'Style Code': '1005COMBO2'}, {'Closure': 'El...",Shyam Enterprises,False,921,69% off,2999,3.9,https://www.flipkart.com/yorker-solid-men-mult...,"[solid, women, multicolor, track, pant]","[yorker, trackpant, made, 100, rich, comb, cot...",Clothing and Accessories: Bottomwear,"[solid, women, multicolor, track, pant, yorker..."
1,TKPFCZ9EJZV2UVRZ,Solid Men Blue Track Pants,Yorker trackpants made from 100% rich combed c...,York,Clothing and Accessories,Bottomwear,"[{'Style Code': '1005BLUE'}, {'Closure': 'Draw...",Shyam Enterprises,False,499,66% off,1499,3.9,https://www.flipkart.com/yorker-solid-men-blue...,"[solid, men, blue, track, pant]","[yorker, trackpant, made, 100, rich, comb, cot...",Clothing and Accessories: Bottomwear,"[solid, men, blue, track, pant, yorker, trackp..."
2,TKPFCZ9EHFCY5Z4Y,Solid Men Multicolor Track Pants,Yorker trackpants made from 100% rich combed c...,York,Clothing and Accessories,Bottomwear,"[{'Style Code': '1005COMBO4'}, {'Closure': 'El...",Shyam Enterprises,False,931,68% off,2999,3.9,https://www.flipkart.com/yorker-solid-men-mult...,"[solid, men, multicolor, track, pant]","[yorker, trackpant, made, 100, rich, comb, cot...",Clothing and Accessories: Bottomwear,"[solid, men, multicolor, track, pant, yorker, ..."
3,TKPFCZ9ESZZ7YWEF,Solid Women Multicolor Track Pants,Yorker trackpants made from 100% rich combed c...,York,Clothing and Accessories,Bottomwear,"[{'Style Code': '1005COMBO3'}, {'Closure': 'El...",Shyam Enterprises,False,911,69% off,2999,3.9,https://www.flipkart.com/yorker-solid-men-mult...,"[solid, women, multicolor, track, pant]","[yorker, trackpant, made, 100, rich, comb, cot...",Clothing and Accessories: Bottomwear,"[solid, women, multicolor, track, pant, yorker..."
4,TKPFCZ9EVXKBSUD7,"Solid Women Brown, Grey Track Pants",Yorker trackpants made from 100% rich combed c...,York,Clothing and Accessories,Bottomwear,"[{'Style Code': '1005COMBO1'}, {'Closure': 'Dr...",Shyam Enterprises,False,943,68% off,2999,3.9,https://www.flipkart.com/yorker-solid-men-brow...,"[solid, women, brown, grey, track, pant]","[yorker, trackpant, made, 100, rich, comb, cot...",Clothing and Accessories: Bottomwear,"[solid, women, brown, grey, track, pant, yorke..."


In [6]:
def create_index_tfidf_products(products):
    """
    Build a positional inverted index plus TF / DF / IDF tables for the products.

    Argument:
    products -- DataFrame with a "pid" column and a "title_description" column
                holding each product's list of tokens; a "title" column is
                used for the title lookup when present

    Returns:
    index -- defaultdict(list): term -> list of [pid, array('I', positions)],
             one posting per product containing the term, in row order
    tf -- defaultdict(list): term -> normalised term frequencies rounded to 4
          decimals, aligned position-for-position with index[term]
    df -- defaultdict(int): term -> number of products containing the term
    idf -- defaultdict(float): term -> ln(num_documents / df[term]) rounded to 4 decimals
    title_index -- defaultdict(str): pid -> title ("" when there is no "title" column)
    """
    index = defaultdict(list)
    tf = defaultdict(list)
    df = defaultdict(int)
    idf = defaultdict(float)
    title_index = defaultdict(str)

    num_documents = len(products)

    # Iterate over the columns directly rather than calling products.iloc[i] repeatedly,
    # which builds a fresh row Series on every access.
    if "title" in products.columns:
        titles = products["title"]
    else:
        titles = [""] * num_documents

    for pid, words, title in zip(products["pid"], products["title_description"], titles):
        title_index[pid] = title

        # term -> positions of that term inside the current product
        positions_by_term = {}
        for position, term in enumerate(words):
            # Explicit membership test instead of a bare except used as control flow,
            # which would also have hidden unrelated errors.
            if term in positions_by_term:
                positions_by_term[term].append(position)
            else:
                positions_by_term[term] = array('I', [position])

        # Euclidean norm of the raw term counts; used to length-normalise tf.
        # It is 0 only when the product has no terms, and then the loop below does not run.
        norm = math.sqrt(sum(len(positions) ** 2 for positions in positions_by_term.values()))

        for term, positions in positions_by_term.items():
            # index[term] and tf[term] are appended together so they stay aligned.
            index[term].append([pid, positions])
            tf[term].append(np.round(len(positions) / norm, 4))
            df[term] += 1

    for term in df:
        idf[term] = np.round(np.log(float(num_documents / df[term])), 4)

    return index, tf, df, idf, title_index

## 2. Propose test queries

In [7]:
# Five free-text test queries proposed for the TF-IDF search.
queries = [
    "casual half sleeve polo shirt for men",
    "light blue jeans slim fit",
    "trousers chino casual men",
    "black sports shoes",
    "fancy t-shirt"
]

## 3. Rank your results

In [8]:
def rank_documents(terms, prods, index, idf, tf, title_index):
    """
    Rank products by the dot product of their TF-IDF vector with the query's.

    Arguments:
    terms -- list of preprocessed query terms
    prods -- iterable of candidate pids; only these are scored
    index -- inverted index: term -> list of [pid, positions]
    idf -- term -> inverse document frequency
    tf -- term -> list of normalised term frequencies, aligned with index[term]
    title_index -- pid -> title (not used for scoring; kept for interface compatibility)

    Returns:
    ranked_prods -- list of pids ordered by decreasing score
    product_scores -- list of [score, pid] pairs in the same order

    Candidates that contain none of the query terms never receive a vector
    and are therefore absent from both returned lists.
    """
    # Set membership is O(1); testing against the original list made every
    # posting lookup scan all candidates.
    candidate_pids = set(prods)

    product_vectors = defaultdict(lambda: [0] * len(terms))
    query_vector = [0] * len(terms)

    query_terms_count = collections.Counter(terms)
    query_norm = la.norm(list(query_terms_count.values()))

    for term_index, term in enumerate(terms):
        # Terms unknown to the index contribute 0 to every score.
        if term not in index:
            continue

        query_vector[term_index] = query_terms_count[term] / query_norm * idf[term]

        # tf[term][i] belongs to the i-th posting of index[term].
        for posting_index, (pid, positions) in enumerate(index[term]):
            if pid in candidate_pids:
                product_vectors[pid][term_index] = tf[term][posting_index] * idf[term]

    product_scores = [[np.dot(vec, query_vector), pid] for pid, vec in product_vectors.items()]
    # Sorting [score, pid] lists descending orders by score, ties broken by pid.
    product_scores.sort(reverse=True)
    ranked_prods = [pid for _, pid in product_scores]

    return ranked_prods, product_scores

def search_tf_idf(query, index, idf, tf, title_index):
    """
    Conjunctive (AND) search: return the products containing every query term,
    ranked by TF-IDF.

    Arguments:
    query -- raw query string; it is preprocessed with build_terms
    index -- inverted index: term -> list of [pid, positions]
    idf -- term -> inverse document frequency
    tf -- term -> list of normalised term frequencies, aligned with index[term]
    title_index -- pid -> title, forwarded to rank_documents

    Returns:
    ranked_prods -- list of pids ordered by decreasing score
    prod_scores -- list of [score, pid] pairs in the same order
    Both lists are empty when the query has no terms or no product matches all of them.
    """
    query = build_terms(query)
    prods = None

    for term in query:
        # .get() instead of index[term]: subscripting a defaultdict inserts an empty
        # posting list for every unseen term, silently growing the shared index.
        postings = index.get(term)
        if not postings:
            # An unknown term makes the AND intersection empty.
            return [], []

        term_prods = {posting[0] for posting in postings}
        prods = term_prods if prods is None else prods & term_prods

        if not prods:
            return [], []

    # Also covers a query with no terms left after preprocessing (prods is still None).
    if not prods:
        return [], []

    ranked_prods, prod_scores = rank_documents(query, list(prods), index, idf, tf, title_index)
    return ranked_prods, prod_scores


In [9]:
# Build the inverted index and the TF / DF / IDF tables for the whole catalogue.
# NOTE(review): `df` is the document-frequency table here; a later cell rebinds
# the name `df` to a DataFrame read from the validation-labels CSV.
index, tf, df, idf, title_index = create_index_tfidf_products(products)

In [10]:
# Re-declares the same query list as the "Propose test queries" cell.
queries = [
    "casual half sleeve polo shirt for men",
    "light blue jeans slim fit",
    "trousers chino casual men",
    "black sports shoes",
    "fancy t-shirt"
]

for query in queries:
    print(f"\nQuery: {query}")
    # ranked_prods: pids by decreasing score; scores: the matching [score, pid] pairs.
    ranked_prods, scores = search_tf_idf(query, index, idf, tf, title_index)
    # Show at most 10 results.
    top = min(10, len(ranked_prods))

    print("\n======================\nTop {} results out of {} for the searched query:\n".format(top, len(ranked_prods)))
    for score, pid in scores[:top]:
        # Look the URL up by pid; fall back to "N/A" when no row matches.
        url = products.loc[products["pid"] == pid, "url"].values
        url = url[0] if len(url) > 0 else "N/A"
        print("product_id = {} - product_title: {} - score: {:.4f} - url: {}".format(pid, title_index[pid], score, url))


Query: casual half sleeve polo shirt for men

Top 10 results out of 43 for the searched query:

product_id = TSHEG64SDHDEZAGH - product_title: Solid Men Polo Neck Blue T-Shirt - score: 2.1961 - url: https://www.flipkart.com/teemoods-solid-men-polo-neck-blue-t-shirt/p/itmf3ysfnqxuen9r?pid=TSHEG64SDHDEZAGH&lid=LSTTSHEG64SDHDEZAGHDFBM4L&marketplace=FLIPKART&srno=b_1_22&otracker=browse&fm=organic&iid=4448bc13-3c41-4a59-8584-a811c8e4784f.TSHEG64SDHDEZAGH.SEARCH&ssid=5yyou4hsu80000001612412157763
product_id = TSHEG5Y3ZNYCADMV - product_title: Solid Men Polo Neck White T-Shirt - score: 2.1961 - url: https://www.flipkart.com/teemoods-solid-men-polo-neck-white-t-shirt/p/itmf3ytvdrcjqbb2?pid=TSHEG5Y3ZNYCADMV&lid=LSTTSHEG5Y3ZNYCADMVIW3S0U&marketplace=FLIPKART&srno=b_1_26&otracker=browse&fm=organic&iid=4448bc13-3c41-4a59-8584-a811c8e4784f.TSHEG5Y3ZNYCADMV.SEARCH&ssid=5yyou4hsu80000001612412157763
product_id = TSHEG5FPJMEZQXBD - product_title: Solid Men Polo Neck Black T-Shirt - score: 2.1961 - ur

# Part 2: Evaluation

## 2.1 Implement the evaluation metrics

In [11]:
def precision_at_k(doc_score, y_score, k=10):
    """
    Parameters
    ----------
    doc_score: dict mapping doc_id to ground truth relevance (0 or 1)
    y_score: list of tuples (doc_id, predicted_score)
    k : number of docs to consider

    Returns
    -------
    precision @k : float
        Relevant docs among the k highest-scored ones, divided by k (the
        divisor stays k even when fewer than k docs were scored).
        0.0 when k <= 0.
    """
    # Guard the division below; matches the k > 0 check in f1_score_at_k.
    if k <= 0:
        return 0.0

    sorted_docs = sorted(y_score, key=lambda x: x[1], reverse=True)

    top_k_docs = [doc_id for doc_id, _ in sorted_docs[:k]]

    # Docs without a ground-truth label count as non-relevant.
    relevant = sum(doc_score.get(doc_id, 0) == 1 for doc_id in top_k_docs)

    return float(relevant) / k

In [12]:
def recall_at_k(doc_score, y_score, k=10):
    """
    Recall restricted to the k best-scored documents.

    Parameters
    ----------
    doc_score: dict mapping doc_id to ground truth relevance (0 or 1)
    y_score: list of tuples (doc_id, predicted_score)
    k : number of docs to consider

    Returns
    -------
    recall @k : float
        Relevant docs found in the top k divided by all relevant docs in
        ``doc_score``; 0.0 when ``doc_score`` has no relevant doc.
    """
    ranking = sorted(y_score, key=lambda pair: pair[1], reverse=True)

    # Count the hits among the first k entries; unlabelled docs are non-relevant.
    hits = 0
    for doc_id, _ in ranking[:k]:
        if doc_score.get(doc_id, 0) == 1:
            hits += 1

    n_relevant = list(doc_score.values()).count(1)
    if n_relevant == 0:
        return 0.0

    return float(hits) / n_relevant

In [13]:
def avg_precision_at_k(doc_score, y_score, k=10):
    """
    Average of the precision values taken at each relevant hit in the top k.

    Parameters
    ----------
    doc_score: dict mapping doc_id to ground truth relevance (0 or 1)
    y_score: list of tuples (doc_id, predicted_score)
    k : number of docs to consider

    Returns
    -------
    average precision @k : float
        Normalised by the number of relevant docs retrieved in the top k;
        0.0 when there is none.
    """
    ranking = sorted(y_score, key=lambda pair: pair[1], reverse=True)[:k]

    hits = 0
    running_precision = 0.0
    for position, (pid, _) in enumerate(ranking):
        if doc_score.get(pid, 0) != 1:
            continue
        hits += 1
        # Precision at this rank (ranks are 1-based).
        running_precision += hits / (position + 1)

    return running_precision / hits if hits else 0.0


In [14]:
def f1_score_at_k(doc_score, y_score, k=10):
    """
    Harmonic mean of precision@k and recall@k.

    Parameters
    ----------
    doc_score: dict mapping doc_id to ground truth relevance (0 or 1)
    y_score: list of tuples (doc_id, predicted_score)
    k : number of docs to consider

    Returns
    -------
    F1-score @k : float
        0.0 when precision and recall are both zero.
    """
    ranking = sorted(y_score, key=lambda pair: pair[1], reverse=True)

    hits = 0
    for pid, _ in ranking[:k]:
        if doc_score.get(pid, 0) == 1:
            hits += 1

    n_relevant = 0
    for rel in doc_score.values():
        if rel == 1:
            n_relevant += 1

    # Both ratios default to 0 when their denominator is not positive.
    precision = 0
    if k > 0:
        precision = hits / k

    recall = 0
    if n_relevant > 0:
        recall = hits / n_relevant

    denominator = precision + recall
    if denominator == 0:
        return 0.0

    return 2 * (precision * recall) / denominator


In [15]:
def map_at_k(all_doc_scores, all_y_scores, k=10):
    """
    Parameters
    ----------
    all_doc_scores: list of dicts mapping doc_id to ground truth relevance (0 or 1)
    all_y_scores: list of lists of tuples (doc_id, predicted_score)
    k : number of docs to consider

    The two lists are paired positionally (one entry per query).

    Returns
    -------
    mean average precision @k : float (0.0 when there are no queries)
    list of average precision scores per query : list
    """
    ap_list = [
        avg_precision_at_k(doc_score, y_score, k)
        for doc_score, y_score in zip(all_doc_scores, all_y_scores)
    ]

    # np.mean([]) returns nan and emits a RuntimeWarning; report 0.0 for "no queries".
    if not ap_list:
        return 0.0, ap_list

    return np.mean(ap_list), ap_list


In [16]:
def rr_at_k(doc_score, y_score, k=10):
    """
    Reciprocal of the rank of the first relevant document in the top k.

    Parameters
    ----------
    doc_score: dict mapping doc_id to ground truth relevance (0 or 1)
    y_score: list of tuples (doc_id, predicted_score)
    k : number of docs to consider

    Returns
    -------
    reciprocal rank @k : float
        0.0 when no relevant doc appears in the top k.
    """
    ranking = sorted(y_score, key=lambda pair: pair[1], reverse=True)[:k]

    # 1-based rank of the first hit, or None when there is no hit.
    first_hit = next(
        (rank for rank, (pid, _) in enumerate(ranking, start=1)
         if doc_score.get(pid, 0) == 1),
        None,
    )

    return 0.0 if first_hit is None else 1.0 / first_hit


In [17]:
def dcg_at_k(doc_score, y_score, k=10):
    """
    Discounted cumulative gain over the k best-scored documents.

    Parameters
    ----------
    doc_score: dict mapping doc_id to ground truth relevance (0 or 1)
    y_score: list of tuples (doc_id, predicted_score)
    k : number of docs to consider

    Returns
    -------
    discounted cumulative gain @k : float
        Sum of (2**relevance - 1) / log2(rank + 1) with 1-based ranks.
    """
    ranking = sorted(y_score, key=lambda pair: pair[1], reverse=True)

    gains = []
    for pid, _ in ranking[:k]:
        # Docs without a label contribute a gain of 0.
        gains.append(2 ** doc_score.get(pid, 0) - 1)

    # 0-based positions 0, 1, 2, ... give discounts log2(2), log2(3), log2(4), ...
    positions = np.arange(len(gains))
    discounts = np.log2(positions + 2)
    discounted_gains = np.array(gains) / discounts

    return np.sum(discounted_gains)

def ndcg_at_k(doc_score, y_score, k=10):
    """
    DCG@k divided by the DCG@k of the ideal ordering of the labelled docs.

    Parameters
    ----------
    doc_score: dict mapping doc_id to ground truth relevance (0 or 1)
    y_score: list of tuples (doc_id, predicted_score)
    k : number of docs to consider

    Returns
    -------
    normalized discounted cumulative gain @k : float
        Rounded to 4 decimals; 0.0 when the ideal DCG is zero.
    """
    # Scoring each labelled doc by its own relevance yields the best possible ranking.
    perfect_ranking = list(doc_score.items())
    best_dcg = dcg_at_k(doc_score, perfect_ranking, k)

    if best_dcg == 0:
        return 0.0

    achieved_dcg = dcg_at_k(doc_score, y_score, k)
    return np.round(achieved_dcg / best_dcg, 4)


## 2.2 Apply the evaluation metrics

In [18]:
# Read validation labels
# NOTE: this rebinds `df`, which until now held the document-frequency table
# returned by create_index_tfidf_products.
df = pd.read_csv("../../data/validation_labels.csv")

# Define example queries
query_texts = {
    1: "women full sleeve sweatshirt cotton",
    2: "men slim jeans blue"
}

# Preprocess queries
processed_queries = {qid: build_terms(text) for qid, text in query_texts.items()}

# Create relevance maps for each query (keyed by the tuple of processed terms)
relevance_maps = {}
for query_id, group in df.groupby("query_id"):
    processed_query = tuple(processed_queries[query_id])
    relevance_maps[processed_query] = {
        row["pid"]: int(row["labels"]) for _, row in group.iterrows()
    }

# Collected per query so MAP can be computed afterwards without ranking every
# query a second time.
all_doc_scores = []
all_y_scores = []

# Evaluate each query
for query_id, terms in processed_queries.items():
    query_key = tuple(terms)
    # Only the labelled products of this query are ranked.
    ranked_prods, prod_scores = rank_documents(
        terms,
        list(relevance_maps[query_key].keys()),
        index,
        idf,
        tf,
        title_index
    )

    # Pair product ids with their scores
    y_score = [(pid, score) for score, pid in prod_scores]
    doc_score = relevance_maps[query_key]

    # Compute evaluation metrics
    p_at_10 = round(precision_at_k(doc_score, y_score, k=10), 3)
    r_at_10 = round(recall_at_k(doc_score, y_score, k=10), 3)
    ap_at_10 = round(avg_precision_at_k(doc_score, y_score, k=10), 3)
    f1_at_10 = round(f1_score_at_k(doc_score, y_score, k=10), 3)
    rr = round(rr_at_k(doc_score, y_score, k=10), 3)
    ndcg = round(ndcg_at_k(doc_score, y_score, k=10), 3)

    # Print results in a readable way
    print("-" * 50)
    print(f"Query {query_id}: {' '.join(terms)}")
    print(f"Precision@10 : {p_at_10}")
    print(f"Recall@10    : {r_at_10}")
    print(f"Avg Precision: {ap_at_10}")
    print(f"F1-Score@10  : {f1_at_10}")
    print(f"Reciprocal R.: {rr}")
    print(f"NDCG@10      : {ndcg}")
    print("-" * 50)

    all_doc_scores.append(doc_score)
    all_y_scores.append(y_score)

# Compute MAP@10
mean_ap, ap_list = map_at_k(all_doc_scores, all_y_scores, k=10)

# Print MAP results clearly
print("-" * 50)
print("Overall Performance Summary")
print(f"Mean Average Precision (MAP@10): {round(mean_ap, 3)}")
print("Average precision per query:", " ".join(str(round(ap, 3)) for ap in ap_list))
print("-" * 50)


--------------------------------------------------
Query 1: women full sleev sweatshirt cotton
Precision@10 : 1.0
Recall@10    : 0.769
Avg Precision: 1.0
F1-Score@10  : 0.87
Reciprocal R.: 1.0
NDCG@10      : 1.0
--------------------------------------------------
--------------------------------------------------
Query 2: men slim jean blue
Precision@10 : 0.6
Recall@10    : 0.6
Avg Precision: 0.68
F1-Score@10  : 0.6
Reciprocal R.: 1.0
NDCG@10      : 0.615
--------------------------------------------------
--------------------------------------------------
Overall Performance Summary
Mean Average Precision (MAP@10): 0.84
Average precision per query: 1.0 0.68
--------------------------------------------------


In [19]:
def evaluate_queries(df, query_texts, index, idf, tf, title_index, k=10):
    """
    Evaluate a set of queries and print per-query metrics + MAP@k.

    Parameters
    ----------
    df: DataFrame of relevance labels with columns "query_id", "pid" and "labels"
    query_texts: dict mapping query_id to the raw query string
    index, idf, tf, title_index: structures returned by create_index_tfidf_products
    k : number of docs to consider for every metric

    Returns
    -------
    None; the metrics are printed.

    For each query only the products labelled for it in ``df`` are ranked.
    Label rows whose query_id is not in ``query_texts`` are ignored, and a
    query without any labels is reported and skipped.
    """

    # Preprocess queries
    processed_queries = {qid: build_terms(text) for qid, text in query_texts.items()}

    # Build relevance maps, keyed by query_id. Keying by the processed-term tuple
    # would let two queries that normalise to the same terms overwrite each other.
    relevance_maps = {}
    for query_id, group in df.groupby("query_id"):
        if query_id not in processed_queries:
            continue
        relevance_maps[query_id] = {
            row["pid"]: int(row["labels"]) for _, row in group.iterrows()
        }

    # Evaluate each query
    all_doc_scores = []
    all_y_scores = []

    for query_id, terms in processed_queries.items():
        doc_score = relevance_maps.get(query_id)
        if doc_score is None:
            print(f"Query {query_id}: no relevance labels found, skipping")
            continue

        ranked_prods, prod_scores = rank_documents(
            terms,
            list(doc_score.keys()),
            index,
            idf,
            tf,
            title_index
        )

        # rank_documents yields [score, pid]; the metrics expect (pid, score).
        y_score = [(pid, score) for score, pid in prod_scores]

        # Metrics
        precision = round(precision_at_k(doc_score, y_score, k), 3)
        recall = round(recall_at_k(doc_score, y_score, k), 3)
        avg_precision = round(avg_precision_at_k(doc_score, y_score, k), 3)
        f1 = round(f1_score_at_k(doc_score, y_score, k), 3)
        rr = round(rr_at_k(doc_score, y_score, k), 3)
        ndcg = round(ndcg_at_k(doc_score, y_score, k), 3)

        print("-" * 55)
        print(f"Query {query_id}: {' '.join(terms)}")
        print(f"Precision@{k}       : {precision}")
        print(f"Recall@{k}          : {recall}")
        print(f"Avg Precision@{k}   : {avg_precision}")
        print(f"F1-Score@{k}        : {f1}")
        print(f"Reciprocal Rank@{k} : {rr}")
        print(f"NDCG@{k}            : {ndcg}")
        print("-" * 55)

        all_doc_scores.append(doc_score)
        all_y_scores.append(y_score)

    # Compute and print MAP@k
    mean_ap, ap_list = map_at_k(all_doc_scores, all_y_scores, k)
    print("\nOverall Performance Summary")
    print("=" * 55)
    print(f"Mean Average Precision (MAP@{k}): {round(mean_ap, 3)}")
    print("Average precision per query   :", " ".join(str(round(ap, 3)) for ap in ap_list))
    print("=" * 55)


In [21]:
# Relevance labels for the five test queries; rebinds `df` once more.
df = pd.read_csv("../../data/student_validation_labels.csv")

# query_id -> raw query text; the ids are matched against the "query_id"
# column of the labels inside evaluate_queries.
query_texts = {
    1: "casual half sleeve polo shirt for men",
    2: "light blue jeans slim fit",
    3: "trousers chino casual men",
    4: "black sports shoes",
    5: "fancy t-shirt"
}

# Print the per-query metrics and MAP at a cutoff of 5.
evaluate_queries(df, query_texts, index, idf, tf, title_index, k=5)


-------------------------------------------------------
Query 1: casual half sleev polo shirt men
Precision@5       : 1.0
Recall@5          : 0.5
Avg Precision@5   : 1.0
F1-Score@5        : 0.667
Reciprocal Rank@5 : 1.0
NDCG@5            : 1.0
-------------------------------------------------------
-------------------------------------------------------
Query 2: light blue jean slim fit
Precision@5       : 1.0
Recall@5          : 0.625
Avg Precision@5   : 1.0
F1-Score@5        : 0.769
Reciprocal Rank@5 : 1.0
NDCG@5            : 1.0
-------------------------------------------------------
-------------------------------------------------------
Query 3: trouser chino casual men
Precision@5       : 1.0
Recall@5          : 0.556
Avg Precision@5   : 1.0
F1-Score@5        : 0.714
Reciprocal Rank@5 : 1.0
NDCG@5            : 1.0
-------------------------------------------------------
-------------------------------------------------------
Query 4: black sport shoe
Precision@5       : 0.2
Recall