# Part 1: Indexing

#### Imports

In [1]:
import nltk
import json
from collections import defaultdict
from array import array
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import math
import numpy as np
import collections
from numpy import linalg as la
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# from wordcloud import WordCloud
from collections import Counter

#### Useful code from part 1

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yuxia\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
def remove_punctuation(text):
    cleaned = ""
    for char in text:
        if char.isalnum() or char.isspace() or char == "-":
            cleaned += char
        else:
            cleaned += " "  # Replace punctuation with space
    return cleaned


In [4]:
products_path = '../../data/fashion_products_dataset.json'
with open(products_path, "r", encoding="utf-8") as f:
    products = pd.read_json(products_path)

def build_terms(line):
    """
    Preprocess a line:
    ●  Removing stop words 
    ●  Tokenization 
    ●  Removing punctuation marks 
    ●  Stemming 
    ●  Transforming to lowercase

    Argument:
    line -- string (text) to be preprocessed

    Returns:
    line - a list of tokens corresponding to the input text after the preprocessing
    """

    stemmer = PorterStemmer()
    stop_words = set(stopwords.words("english"))
    line = line.lower()
    line = remove_punctuation(line)
    line = line.split()
    line = [x for x in line if x not in stop_words]
    line = [stemmer.stem(word) for word in line]
    return line

def get_products_information(products_df):
    elements = ["pid", "title", "description", "brand", "category", "sub_category", 
                "product_details", "seller", "out_of_stock", "selling_price", 
                "discount", "actual_price", "average_rating", "url"]
    
    products_df = products_df[elements]
    
    return products_df

products = get_products_information(products)
products["processed_title"] = products["title"].apply(build_terms)
products["processed_description"] = products["description"].apply(build_terms)
products['cat_subcat'] = products['category'] + ": " + products['sub_category']

## 1. Build inverted index

We join the words in the processed titles and the processed descriptions for each product

In [5]:
products["title_description"] = products["processed_title"] + products["processed_description"]
display(products.head(5))

Unnamed: 0,pid,title,description,brand,category,sub_category,product_details,seller,out_of_stock,selling_price,discount,actual_price,average_rating,url,processed_title,processed_description,cat_subcat,title_description
0,TKPFCZ9EA7H5FYZH,Solid Women Multicolor Track Pants,Yorker trackpants made from 100% rich combed c...,York,Clothing and Accessories,Bottomwear,"[{'Style Code': '1005COMBO2'}, {'Closure': 'El...",Shyam Enterprises,False,921,69% off,2999,3.9,https://www.flipkart.com/yorker-solid-men-mult...,"[solid, women, multicolor, track, pant]","[yorker, trackpant, made, 100, rich, comb, cot...",Clothing and Accessories: Bottomwear,"[solid, women, multicolor, track, pant, yorker..."
1,TKPFCZ9EJZV2UVRZ,Solid Men Blue Track Pants,Yorker trackpants made from 100% rich combed c...,York,Clothing and Accessories,Bottomwear,"[{'Style Code': '1005BLUE'}, {'Closure': 'Draw...",Shyam Enterprises,False,499,66% off,1499,3.9,https://www.flipkart.com/yorker-solid-men-blue...,"[solid, men, blue, track, pant]","[yorker, trackpant, made, 100, rich, comb, cot...",Clothing and Accessories: Bottomwear,"[solid, men, blue, track, pant, yorker, trackp..."
2,TKPFCZ9EHFCY5Z4Y,Solid Men Multicolor Track Pants,Yorker trackpants made from 100% rich combed c...,York,Clothing and Accessories,Bottomwear,"[{'Style Code': '1005COMBO4'}, {'Closure': 'El...",Shyam Enterprises,False,931,68% off,2999,3.9,https://www.flipkart.com/yorker-solid-men-mult...,"[solid, men, multicolor, track, pant]","[yorker, trackpant, made, 100, rich, comb, cot...",Clothing and Accessories: Bottomwear,"[solid, men, multicolor, track, pant, yorker, ..."
3,TKPFCZ9ESZZ7YWEF,Solid Women Multicolor Track Pants,Yorker trackpants made from 100% rich combed c...,York,Clothing and Accessories,Bottomwear,"[{'Style Code': '1005COMBO3'}, {'Closure': 'El...",Shyam Enterprises,False,911,69% off,2999,3.9,https://www.flipkart.com/yorker-solid-men-mult...,"[solid, women, multicolor, track, pant]","[yorker, trackpant, made, 100, rich, comb, cot...",Clothing and Accessories: Bottomwear,"[solid, women, multicolor, track, pant, yorker..."
4,TKPFCZ9EVXKBSUD7,"Solid Women Brown, Grey Track Pants",Yorker trackpants made from 100% rich combed c...,York,Clothing and Accessories,Bottomwear,"[{'Style Code': '1005COMBO1'}, {'Closure': 'Dr...",Shyam Enterprises,False,943,68% off,2999,3.9,https://www.flipkart.com/yorker-solid-men-brow...,"[solid, women, brown, grey, track, pant]","[yorker, trackpant, made, 100, rich, comb, cot...",Clothing and Accessories: Bottomwear,"[solid, women, brown, grey, track, pant, yorke..."


In [6]:
def create_index(products):
    """
    Implement the inverted index.

    Argument:
    products - collection of products, where each product contains the list 
               of words for the title and description of the product.

    Returns:
    index - the inverted index (implemented through a Python dictionary) containing terms 
            as keys and the corresponding list of documents where these keys appears.
    """
    index = defaultdict(list)
    for i in range(len(products)):
        pid = products.iloc[i]["pid"]
        words = products.iloc[i]["title_description"]
        unique_terms = set(words)
        for term in unique_terms:
            index[term].append(pid)

    return dict(index)

In [7]:
inverted_index = create_index(products)
num_products = products.shape[0]
for i, (term, doc_list) in enumerate(inverted_index.items()):
    print(f"{term} DF = {round(len(doc_list)/num_products, 3)} : {doc_list}")
    if i == 9:
        break

year DF = 0.016 : ['TKPFCZ9EA7H5FYZH', 'TKPFCZ9EJZV2UVRZ', 'TKPFCZ9EHFCY5Z4Y', 'TKPFCZ9ESZZ7YWEF', 'TKPFCZ9EVXKBSUD7', 'TKPFCZ9EFK9DNWDA', 'TKPFDABN3GXYPFHE', 'TKPFCZ9ESGZYT8NH', 'TKPFCZ9DYU33FFXS', 'TKPFDABN4NQFVKZY', 'TKPFCZ9ENWGMX23W', 'TKPFCZ9EHCNAPKPU', 'TKPFDACEXAWUHGR7', 'TKPFCZ9ETR6YVXNG', 'TKPFD3K6K5TNYZGF', 'TKPFCZ9EGGYENTZS', 'TKPFD3K6ZMN79MPH', 'TKPFD3K6UZBYDZNY', 'TKPFD3K62JB9PEMR', 'TKPFCZ9EZDPZR5AH', 'TKPFCZ9EVM2GZ4GF', 'TKPFCZ9E2UC3DR3F', 'TKPFCZ9ECDYYDNKA', 'CTPFVZHSA7G4PFC5', 'CTPFVZD8CNSZ3AMR', 'CTPFVQNNHGYFTGFN', 'CTPFVZT3UFN99ZTH', 'CTPFVSU7CXFCXEHD', 'CTPFVZGRKPGSFPUU', 'CTPFVZT7EFZWVRUP', 'CTPFVZT2GYAVYEE6', 'CTPFVZHFTSBTMH9M', 'CTPFVZTPGHGVFCFE', 'CTPFVSSQHD96FH9Z', 'CTPFVZEYHCRQ27Y2', 'CTPFVZFYR8KGYYBJ', 'CTPFW3W97JMQVBYG', 'CTPFW3VX6NGRNZUE', 'CTPFVQNGFCRGYK2H', 'CTPFVPN4CUY6QZXD', 'CTPFVPMUAHEJX8EW', 'CTPFVPMZV7RCDNVR', 'CTPFVZT46SYT5GTB', 'CTPFVZT9ZYJB4WJZ', 'CTPFVTZ9GYNEDU54', 'CTPFVZTZNFBGXUMD', 'CTPFVZTBCHWHDMGJ', 'CTPFVQSGZGPHUEFX', 'CTPFVZTBN4GRZKXH', '

## 2. Propose test queries

In [None]:
q1 = "casual half sleeve polo shirt for men"
q2 = "light blue jeans slim fit"
q3 = "trousers chino casual men"
q4 = "black sports shoes"
q5 = "fancy t-shirt"

q = []
q.append(q1)
q.append(q2)
q.append(q3)
q.append(q4)
q.append(q5)

for i in range(5):
    query_terms = build_terms(q[i])

    result_docs = set(inverted_index.get(query_terms[0], []))

    for term in query_terms[1:]:
        result_docs &= set(inverted_index.get(term, []))

    print(result_docs)  


{'TSHFFYTQNA9CXMTZ', 'TSHFG2HT6EMKGP7W', 'TSHFK68CSHUDZWAD', 'TSHFFXX4YMHWGGPF', 'TSHFK68PHMV7JVRU', 'TSHFPD3VM8GGGEEX', 'TSHFHVPB7F33EBNN', 'TSHEGHGENWAEJ3JV', 'TSHFK68GQPGQEVZK', 'TSHFHVPBAUCHFQ3K', 'TSHFG2HTPGKEK3WX', 'TSHFG2HTNYF2VG9Q', 'TSHFK68RZ9PZDXFK', 'TSHFG2HTQDXM3SQX', 'TSHEG64SDHDEZAGH', 'TSHFZP6HBDFGFWFC', 'TSHFPKNC3JEGVQCZ', 'TSHFG2HTXEHZNGGZ', 'TSHFG2HTPXQAWZTS', 'TSHEG5Y3ZNYCADMV', 'TSHEG5FPJMEZQXBD', 'TSHFHVPBFGSGVXFM', 'TSHFHVPBVZ9HY9GG', 'TSHFHVPBFZSXYHEP', 'TSHFKZYFJ8DXYPJG', 'TSHFHVPBF4BJYW7N', 'TSHFHVPBYQP9SNBP', 'TSHFFXTHJPHFU8HA', 'TSHFFXTNZRTNP9JG', 'TSHFPKNCFYGMWDDH', 'TSHFK68SC4GBWKHN', 'TSHFK68PGB5P77Z3', 'TSHFG2HTNZJZEPUH', 'TSHFG2HTGWSVXBZB', 'TSHFHVPBSDYRNG9N', 'TSHFPKNCFHB6QTFP', 'TSHEGHG2YWG7TZJQ', 'TSHFG2HTBE9JC7D9', 'TSHFPKNCQAQFNVKB', 'TSHFK68HVCGVFQZE', 'TSHFG2HT2XZVHGPH', 'TSHFG2HT8YNZHWA5', 'TSHFG2HTZBX7RMGZ'}
{'JEAFPMKZGHS2ZTKG', 'JEAFTZFX7HR5HZVW', 'JEAFUZXSHCZNHQE4', 'JEAFRARNKDFYXUYW', 'JEAFWH29FHYPZFSE', 'JEAFWH29AAGEZANB', 'JEAFH5H7ZNHRZQBJ'

IndexError: list index out of range

## 3. Rank your results

In [20]:
def create_index_tfidf_products(products):
    index = defaultdict(list)
    tf = defaultdict(list)
    df = defaultdict(int)
    idf = defaultdict(float)
    title_index = defaultdict(str)

    num_documents = len(products)

    for i in range(num_documents):
        pid = products.iloc[i]["pid"]
        words = products.iloc[i]["title_description"]
        title_index[pid] = products.iloc[i].get("title", "")

        current_product_index = {}

        for position, term in enumerate(words):
            try:
                current_product_index[term][1].append(position)
            except:
                current_product_index[term] = [pid, array('I', [position])]

        norm = math.sqrt(sum(len(posting[1]) ** 2 for posting in current_product_index.values()))

        for term, posting in current_product_index.items():
            tf[term].append(np.round(len(posting[1]) / norm, 4))
            df[term] += 1

        for term, posting in current_product_index.items():
            index[term].append(posting)

    for term in df:
        idf[term] = np.round(np.log(float(num_documents / df[term])), 4)

    return index, tf, df, idf, title_index

def rank_documents(terms, prods, index, idf, tf, title_index):
    product_vectors = defaultdict(lambda: [0] * len(terms))
    query_vector = [0] * len(terms)

    query_terms_count = collections.Counter(terms)
    query_norm = la.norm(list(query_terms_count.values()))

    for term_index, term in enumerate(terms):
        if term not in index:
            continue

        query_vector[term_index] = query_terms_count[term] / query_norm * idf[term]

        for posting_index, (pid, positions) in enumerate(index[term]):
            if pid in prods:
                product_vectors[pid][term_index] = tf[term][posting_index] * idf[term]

    product_scores = [[np.dot(vec, query_vector), pid] for pid, vec in product_vectors.items()]
    product_scores.sort(reverse=True)
    ranked_prods = [pid for _, pid in product_scores]

    return ranked_prods, product_scores

def search_tf_idf(query, index, idf, tf, title_index):
    query = build_terms(query)
    prods = None

    for term in query:
        try:
            term_prods = set(posting[0] for posting in index[term])
            if prods is None:
                prods = term_prods
            else:
                prods &= term_prods
        except:
            return [], []

    if not prods:
        return [], []

    prods = list(prods)
    ranked_prods, prod_scores = rank_documents(query, prods, index, idf, tf, title_index)
    return ranked_prods, prod_scores


In [None]:
index, tf, df, idf, title_index = create_index_tfidf_products(products)

In [76]:
print("Insert your query:\n")
query = input()
print(query)

ranked_prods, scores = search_tf_idf(query, index, idf, tf, title_index)
top = 10

print("\n======================\nTop {} results out of {} for the searched query:\n".format(top, len(ranked_prods)))
# for score, pid in scores[:top]:
#     print("product_id = {} - product_title: {} - score: {:.4f}".format(products[products["pid"] == pid]["url"], title_index[pid], score))

for score, pid in scores[:top]:
    url = products.loc[products["pid"] == pid, "url"].values
    url = url[0] if len(url) > 0 else "N/A"
    print("product_id = {} - product_title: {} - score: {:.4f} - url: {}".format(pid, title_index[pid], score, url))


Insert your query:

black sports shoes

Top 10 results out of 60 for the searched query:

product_id = SHOFN3EJHXHGSGUC - product_title: RS-7250 Running Shoes For Men  (Black) - score: 8.7852 - url: https://www.flipkart.com/vector-x-rs-7250-running-shoes-men/p/itm315827b1dcd91?pid=SHOFN3EJHXHGSGUC&lid=LSTSHOFN3EJHXHGSGUCGWFHSS&marketplace=FLIPKART&srno=b_5_189&otracker=nmenu_sub_Men_0_Footwear&fm=organic&iid=2c9c6405-15f7-4765-be59-536d21002294.SHOFN3EJHXHGSGUC.SEARCH&ssid=z6yjmdcc8w0000001612688550527
product_id = SHOFN3EJZKBG6QUF - product_title: JAGUAR Football Shoes For Women  (Red, Black) - score: 8.3345 - url: https://www.flipkart.com/vector-x-jaguar-football-shoes-men/p/itmb9b13375a136c?pid=SHOFN3EJZKBG6QUF&lid=LSTSHOFN3EJZKBG6QUFNSOSIE&marketplace=FLIPKART&srno=b_2_62&otracker=nmenu_sub_Men_0_Footwear&fm=organic&iid=ed342989-9f17-43c2-b620-87aa4485e449.SHOFN3EJZKBG6QUF.SEARCH&ssid=3u6pawgow00000001612688548473
product_id = SHOFUH52V5AVNHMN - product_title: Loafers For Men  (Bla

## 4. Evaluation

In [None]:
def precision_at_k(doc_score, y_score, k=10):
    """
    Parameters
    ----------
    doc_score: dict mapping doc_id to ground truth relevance (0 or 1)
    y_score: list of tuples (doc_id, predicted_score)
    k : number of docs to consider

    Returns
    -------
    precision @k : float
    """
    sorted_docs = sorted(y_score, key=lambda x: x[1], reverse=True)

    top_k_docs = [doc_id for doc_id, _ in sorted_docs[:k]]

    relevant = sum(doc_score.get(doc_id, 0) == 1 for doc_id in top_k_docs)

    return float(relevant) / k


In [None]:
def recall_at_k(doc_score, y_score, k=10):
    """
    Parameters
    ----------
    doc_score: dict mapping doc_id to ground truth relevance (0 or 1)
    y_score: list of tuples (doc_id, predicted_score)
    k : number of docs to consider

    Returns
    -------
    recall @k : float
    """
    sorted_docs = sorted(y_score, key=lambda x: x[1], reverse=True)

    top_k_docs = [doc_id for doc_id, _ in sorted_docs[:k]]

    relevant_retrieved = sum(doc_score.get(doc_id, 0) == 1 for doc_id in top_k_docs)

    total_relevant = sum(1 for relevance in doc_score.values() if relevance == 1)

    if total_relevant == 0:
        return 0.0

    return float(relevant_retrieved) / total_relevant


In [25]:
def avg_precision_at_k(doc_score, y_score, k=10):
    sorted_docs = sorted(y_score, key=lambda x: x[1], reverse=True)
    top_k = sorted_docs[:k]

    relevant = 0
    precision_sum = 0.0

    for i, (pid, _) in enumerate(top_k, start=1):
        if doc_score.get(pid, 0) == 1:
            relevant += 1
            precision_sum += relevant / i

    if relevant == 0:
        return 0.0

    return precision_sum / relevant


In [26]:
def f1_score_at_k(doc_score, y_score, k=10):
    sorted_docs = sorted(y_score, key=lambda x: x[1], reverse=True)
    top_k = sorted_docs[:k]

    relevant_retrieved = sum(doc_score.get(pid, 0) == 1 for pid, _ in top_k)
    total_relevant = sum(1 for rel in doc_score.values() if rel == 1)

    precision = relevant_retrieved / k if k > 0 else 0
    recall = relevant_retrieved / total_relevant if total_relevant > 0 else 0

    if precision + recall == 0:
        return 0.0

    return 2 * (precision * recall) / (precision + recall)


In [27]:
def map_at_k(all_doc_scores, all_y_scores, k=10):
    ap_list = []
    for doc_score, y_score in zip(all_doc_scores, all_y_scores):
        ap_list.append(avg_precision_at_k(doc_score, y_score, k))
    return np.mean(ap_list), ap_list


In [28]:
def rr_at_k(doc_score, y_score, k=10):
    sorted_docs = sorted(y_score, key=lambda x: x[1], reverse=True)
    top_k = sorted_docs[:k]

    for rank, (pid, _) in enumerate(top_k, start=1):
        if doc_score.get(pid, 0) == 1:
            return 1.0 / rank
    return 0.0


In [29]:
def dcg_at_k(doc_score, y_score, k=10):
    sorted_docs = sorted(y_score, key=lambda x: x[1], reverse=True)
    top_k = sorted_docs[:k]

    gains = [(2 ** doc_score.get(pid, 0) - 1) for pid, _ in top_k]
    discounts = np.log2(np.arange(len(gains)) + 2)

    return np.sum(np.array(gains) / discounts)

def ndcg_at_k(doc_score, y_score, k=10):
    ideal_y_score = [(pid, doc_score.get(pid, 0)) for pid in doc_score]
    ideal_dcg = dcg_at_k(doc_score, ideal_y_score, k)
    if ideal_dcg == 0:
        return 0.0
    return np.round(dcg_at_k(doc_score, y_score, k) / ideal_dcg, 4)


In [None]:
df = pd.read_csv("../../data/validation_labels.csv")

query_texts = {
    1: "women full sleeve sweatshirt cotton",
    2: "men slim jeans blue"
}

processed_queries = {qid: build_terms(text) for qid, text in query_texts.items()}

relevance_maps = {}
for query_id, group in df.groupby("query_id"):
    processed_query = tuple(processed_queries[query_id])
    relevance_maps[processed_query] = {
        row["pid"]: int(row["labels"]) for _, row in group.iterrows()
    }

for query_id, terms in processed_queries.items():
    query_key = tuple(terms)
    ranked_prods, prod_scores = rank_documents(terms, list(relevance_maps[query_key].keys()), index, idf, tf, title_index)
    y_score = [(pid, score) for score, pid in prod_scores]
    doc_score = relevance_maps[query_key]
    

    p_at_10 = round(precision_at_k(doc_score, y_score, k=10), 3)
    r_at_10 = round(recall_at_k(doc_score, y_score, k=10), 3)
    ap_at_10 = round(avg_precision_at_k(doc_score, y_score, k=10), 3)
    f1_at_10 = round(f1_score_at_k(doc_score, y_score, k=10), 3)
    rr = round(rr_at_k(doc_score, y_score, k=10), 3)
    ndcg = round(ndcg_at_k(doc_score, y_score, k=10), 3)

    print(f"{p_at_10}")
    print(f"{r_at_10}")
    print(f"{ap_at_10}")
    print(f"{f1_at_10}")
    print(f"{rr}")
    print(f"{ndcg}")
    print()

    # print("Top 10 products:")
    # for score, pid in prod_scores[:10]:
    #     title = title_index.get(pid, "N/A")
    #     print(f"{pid} - {title}")
    # print()

    # Collect all scores for MAP@10
all_doc_scores = []
all_y_scores = []

for query_id, terms in processed_queries.items():
    query_key = tuple(terms)
    ranked_prods, prod_scores = rank_documents(terms, list(relevance_maps[query_key].keys()), index, idf, tf, title_index)
    y_score = [(pid, score) for score, pid in prod_scores]
    doc_score = relevance_maps[query_key]

    all_doc_scores.append(doc_score)
    all_y_scores.append(y_score)

mean_ap, ap_list = map_at_k(all_doc_scores, all_y_scores, k=10)

print("MAP@10:")
print(f"{round(mean_ap, 3)}")
for ap in ap_list:
    print(f"{round(ap, 3)}", end=" ")



1.0
0.769
1.0
0.87
1.0
1.0

0.6
0.6
0.68
0.6
1.0
0.615

MAP@10:
0.84
1.0 0.68 