# Part 1: Indexing

#### Imports

In [67]:
import nltk
import json
from collections import defaultdict
from array import array
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import math
import numpy as np
import collections
from numpy import linalg as la
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# from wordcloud import WordCloud
from collections import Counter

#### Useful code from part 1

In [68]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yuxia\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [69]:
def remove_punctuation(text):
    cleaned = ""
    for char in text:
        if char.isalnum() or char.isspace() or char == "-":
            cleaned += char
        else:
            cleaned += " "  # Replace punctuation with space
    return cleaned


In [70]:
products_path = '../../data/fashion_products_dataset.json'
with open(products_path, "r", encoding="utf-8") as f:
    products = pd.read_json(products_path)

def build_terms(line):
    """
    Preprocess a line:
    ●  Removing stop words 
    ●  Tokenization 
    ●  Removing punctuation marks 
    ●  Stemming 
    ●  Transforming to lowercase

    Argument:
    line -- string (text) to be preprocessed

    Returns:
    line - a list of tokens corresponding to the input text after the preprocessing
    """

    stemmer = PorterStemmer()
    stop_words = set(stopwords.words("english"))
    line = line.lower()
    line = remove_punctuation(line)
    line = line.split()
    line = [x for x in line if x not in stop_words]
    line = [stemmer.stem(word) for word in line]
    return line

def get_products_information(products_df):
    elements = ["pid", "title", "description", "brand", "category", "sub_category", 
                "product_details", "seller", "out_of_stock", "selling_price", 
                "discount", "actual_price", "average_rating", "url"]
    
    products_df = products_df[elements]
    
    return products_df

products = get_products_information(products)
products["processed_title"] = products["title"].apply(build_terms)
products["processed_description"] = products["description"].apply(build_terms)
products['cat_subcat'] = products['category'] + ": " + products['sub_category']

## 1. Build inverted index

We join the words in the processed titles and the processed descriptions for each product

In [71]:
products["title_description"] = products["processed_title"] + products["processed_description"]
display(products.head(5))

Unnamed: 0,pid,title,description,brand,category,sub_category,product_details,seller,out_of_stock,selling_price,discount,actual_price,average_rating,url,processed_title,processed_description,cat_subcat,title_description
0,TKPFCZ9EA7H5FYZH,Solid Women Multicolor Track Pants,Yorker trackpants made from 100% rich combed c...,York,Clothing and Accessories,Bottomwear,"[{'Style Code': '1005COMBO2'}, {'Closure': 'El...",Shyam Enterprises,False,921,69% off,2999,3.9,https://www.flipkart.com/yorker-solid-men-mult...,"[solid, women, multicolor, track, pant]","[yorker, trackpant, made, 100, rich, comb, cot...",Clothing and Accessories: Bottomwear,"[solid, women, multicolor, track, pant, yorker..."
1,TKPFCZ9EJZV2UVRZ,Solid Men Blue Track Pants,Yorker trackpants made from 100% rich combed c...,York,Clothing and Accessories,Bottomwear,"[{'Style Code': '1005BLUE'}, {'Closure': 'Draw...",Shyam Enterprises,False,499,66% off,1499,3.9,https://www.flipkart.com/yorker-solid-men-blue...,"[solid, men, blue, track, pant]","[yorker, trackpant, made, 100, rich, comb, cot...",Clothing and Accessories: Bottomwear,"[solid, men, blue, track, pant, yorker, trackp..."
2,TKPFCZ9EHFCY5Z4Y,Solid Men Multicolor Track Pants,Yorker trackpants made from 100% rich combed c...,York,Clothing and Accessories,Bottomwear,"[{'Style Code': '1005COMBO4'}, {'Closure': 'El...",Shyam Enterprises,False,931,68% off,2999,3.9,https://www.flipkart.com/yorker-solid-men-mult...,"[solid, men, multicolor, track, pant]","[yorker, trackpant, made, 100, rich, comb, cot...",Clothing and Accessories: Bottomwear,"[solid, men, multicolor, track, pant, yorker, ..."
3,TKPFCZ9ESZZ7YWEF,Solid Women Multicolor Track Pants,Yorker trackpants made from 100% rich combed c...,York,Clothing and Accessories,Bottomwear,"[{'Style Code': '1005COMBO3'}, {'Closure': 'El...",Shyam Enterprises,False,911,69% off,2999,3.9,https://www.flipkart.com/yorker-solid-men-mult...,"[solid, women, multicolor, track, pant]","[yorker, trackpant, made, 100, rich, comb, cot...",Clothing and Accessories: Bottomwear,"[solid, women, multicolor, track, pant, yorker..."
4,TKPFCZ9EVXKBSUD7,"Solid Women Brown, Grey Track Pants",Yorker trackpants made from 100% rich combed c...,York,Clothing and Accessories,Bottomwear,"[{'Style Code': '1005COMBO1'}, {'Closure': 'Dr...",Shyam Enterprises,False,943,68% off,2999,3.9,https://www.flipkart.com/yorker-solid-men-brow...,"[solid, women, brown, grey, track, pant]","[yorker, trackpant, made, 100, rich, comb, cot...",Clothing and Accessories: Bottomwear,"[solid, women, brown, grey, track, pant, yorke..."


In [72]:
def create_index(products):
    """
    Implement the inverted index.

    Argument:
    products - collection of products, where each product contains the list 
               of words for the title and description of the product.

    Returns:
    index - the inverted index (implemented through a Python dictionary) containing terms 
            as keys and the corresponding list of documents where these keys appears.
    """
    index = defaultdict(list)
    for i in range(len(products)):
        pid = products.iloc[i]["pid"]
        words = products.iloc[i]["title_description"]
        unique_terms = set(words)
        for term in unique_terms:
            index[term].append(pid)

    return dict(index)

In [73]:
inverted_index = create_index(products)
num_products = products.shape[0]
for i, (term, doc_list) in enumerate(inverted_index.items()):
    print(f"{term} DF = {round(len(doc_list)/num_products, 3)} : {doc_list}")
    if i == 9:
        break

great DF = 0.056 : ['TKPFCZ9EA7H5FYZH', 'TKPFCZ9EJZV2UVRZ', 'TKPFCZ9EHFCY5Z4Y', 'TKPFCZ9ESZZ7YWEF', 'TKPFCZ9EVXKBSUD7', 'TKPFCZ9EFK9DNWDA', 'TKPFDABN3GXYPFHE', 'TKPFCZ9ESGZYT8NH', 'TKPFCZ9DYU33FFXS', 'TKPFDABN4NQFVKZY', 'TKPFCZ9ENWGMX23W', 'TKPFCZ9EHCNAPKPU', 'TKPFDACEXAWUHGR7', 'TKPFCZ9ETR6YVXNG', 'TKPFD3K6K5TNYZGF', 'TKPFCZ9EGGYENTZS', 'TKPFD3K6ZMN79MPH', 'TKPFD3K6UZBYDZNY', 'TKPFD3K62JB9PEMR', 'TKPFCZ9EZDPZR5AH', 'TKPFCZ9EVM2GZ4GF', 'TKPFCZ9E2UC3DR3F', 'TKPFCZ9ECDYYDNKA', 'CTPFVZTEMJWEJJJV', 'CTPFVZHSA7G4PFC5', 'CTPFVZD8CNSZ3AMR', 'CTPFVQNNHGYFTGFN', 'CTPFVZT3UFN99ZTH', 'CTPFVSU7CXFCXEHD', 'CTPFVZGRKPGSFPUU', 'CTPFVZT7EFZWVRUP', 'CTPFVZT2GYAVYEE6', 'CTPFVZHFTSBTMH9M', 'CTPFVZTPGHGVFCFE', 'CTPFVZTCQC7ZFWSH', 'CTPFVSSQHD96FH9Z', 'CTPFVZEYHCRQ27Y2', 'CTPFVZFYR8KGYYBJ', 'CTPFVQNGFCRGYK2H', 'CTPFVPN4CUY6QZXD', 'CTPFVPMUAHEJX8EW', 'CTPFVPMZV7RCDNVR', 'CTPFVZT46SYT5GTB', 'CTPFVZT9ZYJB4WJZ', 'CTPFVTZ9GYNEDU54', 'CTPFVZTZNFBGXUMD', 'CTPFVZTBCHWHDMGJ', 'CTPFVQSGZGPHUEFX', 'CTPFVZTBN4GRZKXH', 

## 2. Propose test queries

In [109]:
q1 = "casual half sleeve polo shirt for men"
q2 = "light blue jeans slim fit"
q3 = "high quality fabric dress"
q4 = "sweatshirt cotton material"
q5 = "cotton t-shirt casual wear for women"

q = []
q.append(q1)
q.append(q2)
q.append(q3)
q.append(q4)
q.append(q5)

for i in range(5):
    query_terms = build_terms(q[i])

    # Start with the posting list of the first term
    result_docs = set(inverted_index.get(query_terms[0], []))

    # Perform intersection for AND logic
    for term in query_terms[1:]:
        result_docs &= set(inverted_index.get(term, []))

    print(result_docs)  # documents that contain ALL query terms


{'TSHFHVPBFGSGVXFM', 'TSHFHVPBFZSXYHEP', 'TSHFG2HTPXQAWZTS', 'TSHEG64SDHDEZAGH', 'TSHFG2HTGWSVXBZB', 'TSHFG2HTPGKEK3WX', 'TSHFPKNCFHB6QTFP', 'TSHFHVPBAUCHFQ3K', 'TSHFK68HVCGVFQZE', 'TSHFZP6HBDFGFWFC', 'TSHFHVPBVZ9HY9GG', 'TSHFPKNC3JEGVQCZ', 'TSHFFXTNZRTNP9JG', 'TSHFPD3VM8GGGEEX', 'TSHFK68SC4GBWKHN', 'TSHFG2HTNYF2VG9Q', 'TSHFHVPB7F33EBNN', 'TSHFFXX4YMHWGGPF', 'TSHFHVPBYQP9SNBP', 'TSHFG2HTBE9JC7D9', 'TSHFG2HTXEHZNGGZ', 'TSHFG2HT8YNZHWA5', 'TSHEGHG2YWG7TZJQ', 'TSHFG2HT6EMKGP7W', 'TSHEG5Y3ZNYCADMV', 'TSHFK68CSHUDZWAD', 'TSHEG5FPJMEZQXBD', 'TSHFHVPBSDYRNG9N', 'TSHEGHGENWAEJ3JV', 'TSHFHVPBF4BJYW7N', 'TSHFPKNCQAQFNVKB', 'TSHFFYTQNA9CXMTZ', 'TSHFK68PHMV7JVRU', 'TSHFK68GQPGQEVZK', 'TSHFK68PGB5P77Z3', 'TSHFKZYFJ8DXYPJG', 'TSHFFXTHJPHFU8HA', 'TSHFK68RZ9PZDXFK', 'TSHFG2HT2XZVHGPH', 'TSHFPKNCFYGMWDDH', 'TSHFG2HTZBX7RMGZ', 'TSHFG2HTQDXM3SQX', 'TSHFG2HTNZJZEPUH'}
{'JEAFEC2GEMGBWHA5', 'JEAFWH29ZSDRZCZ8', 'JEAFZQ8YZB7EBFHA', 'JEAFRARZ8V5JXDVU', 'JEAFPMKZC5EYHCHZ', 'JEAFSKYHHDQ2SDZQ', 'JEAFW4DUEGRPEYDR'