# Part 3: Ranking & Filtering

#### Imports

In [1]:
import nltk
import json
from collections import defaultdict
from array import array
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import math
import numpy as np
import collections
from numpy import linalg as la
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# from wordcloud import WordCloud
from collections import Counter

#### Useful code from part 1 & part 2

In [2]:
# Fetch the NLTK stop-word corpus that build_terms reads via stopwords.words("english").
# When the package is already installed this only reports that it is up to date.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\isaac\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
def remove_punctuation(text):
    """
    Blank out punctuation in a piece of text.

    Letters, digits, whitespace and hyphens are kept as they are; every other
    character is swapped for a single space, so the output always has the same
    number of characters as the input.

    Argument:
    text -- string to be cleaned

    Returns:
    the cleaned string
    """

    def _is_kept(symbol):
        # Hyphens survive so that compound words such as "t-shirt" stay in one piece.
        return symbol.isalnum() or symbol.isspace() or symbol == "-"

    return "".join(symbol if _is_kept(symbol) else " " for symbol in text)


In [4]:
# Load the raw product catalogue into a DataFrame.
products_path = '../../data/fashion_products_dataset.json'
with open(products_path, "r", encoding="utf-8") as f:
    # Parse from the handle opened above so the file is opened only once and the
    # explicit UTF-8 encoding is the one actually used for the read.
    products = pd.read_json(f)

# Stemmer and stop-word set shared by every build_terms call; filled on first use.
_TEXT_TOOLS = {}


def _get_text_tools():
    """Return the shared (stemmer, stop_words) pair, creating it on first use."""
    if not _TEXT_TOOLS:
        # Read the corpus before touching the cache, so a failed lookup
        # (e.g. corpus not downloaded yet) leaves the cache empty and retryable.
        stop_words = frozenset(stopwords.words("english"))
        _TEXT_TOOLS.update(stemmer=PorterStemmer(), stop_words=stop_words)
    return _TEXT_TOOLS["stemmer"], _TEXT_TOOLS["stop_words"]


def build_terms(line):
    """
    Preprocess a line. The steps are applied in this order:
    1. Transforming to lowercase
    2. Replacing punctuation marks with spaces (hyphens are kept)
    3. Tokenization on whitespace
    4. Removing English stop words
    5. Stemming (Porter stemmer)

    Argument:
    line -- string (text) to be preprocessed

    Returns:
    a list of tokens corresponding to the input text after the preprocessing
    """

    # The stemmer and the stop-word set are identical for every call, so they are
    # built once instead of re-reading the stop-word corpus for each row.
    stemmer, stop_words = _get_text_tools()
    tokens = remove_punctuation(line.lower()).split()
    return [stemmer.stem(token) for token in tokens if token not in stop_words]

def get_products_information(products_df):
    """
    Keep only the product attributes used by the notebook.

    Argument:
    products_df -- DataFrame containing at least the columns listed in `elements`

    Returns:
    a new DataFrame holding exactly those columns, in that order. It is an
    independent copy, so adding or changing columns on it never touches
    products_df.

    Raises:
    KeyError if any of the listed columns is missing from products_df
    """
    elements = ["pid", "title", "description", "brand", "category", "sub_category",
                "product_details", "seller", "out_of_stock", "selling_price",
                "discount", "actual_price", "average_rating", "url"]

    # .copy() detaches the selection from products_df; without it pandas may flag the
    # result as derived from the source frame and warn (SettingWithCopyWarning) when
    # the caller later assigns new columns to it.
    return products_df[elements].copy()

# Restrict the frame to the columns selected by get_products_information.
products = get_products_information(products)
# Token lists produced by build_terms for the title and the description of each product.
products["processed_title"] = products["title"].apply(build_terms)
products["processed_description"] = products["description"].apply(build_terms)
# Combined label of the form "<category>: <sub_category>".
products['cat_subcat'] = products['category'] + ": " + products['sub_category']

In [5]:
# Per product, join the title tokens and the description tokens into one token list
# (title tokens first); create_index_tfidf_products reads this column.
products["title_description"] = products["processed_title"] + products["processed_description"]
# Preview the first five rows.
display(products.head(5))

Unnamed: 0,pid,title,description,brand,category,sub_category,product_details,seller,out_of_stock,selling_price,discount,actual_price,average_rating,url,processed_title,processed_description,cat_subcat,title_description
0,TKPFCZ9EA7H5FYZH,Solid Women Multicolor Track Pants,Yorker trackpants made from 100% rich combed c...,York,Clothing and Accessories,Bottomwear,"[{'Style Code': '1005COMBO2'}, {'Closure': 'El...",Shyam Enterprises,False,921,69% off,2999,3.9,https://www.flipkart.com/yorker-solid-men-mult...,"[solid, women, multicolor, track, pant]","[yorker, trackpant, made, 100, rich, comb, cot...",Clothing and Accessories: Bottomwear,"[solid, women, multicolor, track, pant, yorker..."
1,TKPFCZ9EJZV2UVRZ,Solid Men Blue Track Pants,Yorker trackpants made from 100% rich combed c...,York,Clothing and Accessories,Bottomwear,"[{'Style Code': '1005BLUE'}, {'Closure': 'Draw...",Shyam Enterprises,False,499,66% off,1499,3.9,https://www.flipkart.com/yorker-solid-men-blue...,"[solid, men, blue, track, pant]","[yorker, trackpant, made, 100, rich, comb, cot...",Clothing and Accessories: Bottomwear,"[solid, men, blue, track, pant, yorker, trackp..."
2,TKPFCZ9EHFCY5Z4Y,Solid Men Multicolor Track Pants,Yorker trackpants made from 100% rich combed c...,York,Clothing and Accessories,Bottomwear,"[{'Style Code': '1005COMBO4'}, {'Closure': 'El...",Shyam Enterprises,False,931,68% off,2999,3.9,https://www.flipkart.com/yorker-solid-men-mult...,"[solid, men, multicolor, track, pant]","[yorker, trackpant, made, 100, rich, comb, cot...",Clothing and Accessories: Bottomwear,"[solid, men, multicolor, track, pant, yorker, ..."
3,TKPFCZ9ESZZ7YWEF,Solid Women Multicolor Track Pants,Yorker trackpants made from 100% rich combed c...,York,Clothing and Accessories,Bottomwear,"[{'Style Code': '1005COMBO3'}, {'Closure': 'El...",Shyam Enterprises,False,911,69% off,2999,3.9,https://www.flipkart.com/yorker-solid-men-mult...,"[solid, women, multicolor, track, pant]","[yorker, trackpant, made, 100, rich, comb, cot...",Clothing and Accessories: Bottomwear,"[solid, women, multicolor, track, pant, yorker..."
4,TKPFCZ9EVXKBSUD7,"Solid Women Brown, Grey Track Pants",Yorker trackpants made from 100% rich combed c...,York,Clothing and Accessories,Bottomwear,"[{'Style Code': '1005COMBO1'}, {'Closure': 'Dr...",Shyam Enterprises,False,943,68% off,2999,3.9,https://www.flipkart.com/yorker-solid-men-brow...,"[solid, women, brown, grey, track, pant]","[yorker, trackpant, made, 100, rich, comb, cot...",Clothing and Accessories: Bottomwear,"[solid, women, brown, grey, track, pant, yorke..."


In [6]:
def create_index_tfidf_products(products):
    """
    Build a positional inverted index together with TF, DF and IDF tables.

    Argument:
    products -- DataFrame with a "pid" column, a "title_description" column holding
                the token list of each product and, optionally, a "title" column

    Returns:
    index -- term -> list of postings [pid, array('I', positions)], one posting per
             product that contains the term, in row order
    tf -- term -> list of term frequencies divided by the L2 norm of the product's
          raw term counts (rounded to 4 decimals), aligned with index[term]
    df -- term -> number of products that contain the term
    idf -- term -> natural log of (number of products / df[term]), rounded to 4 decimals
    title_index -- pid -> title of the product ("" when there is no "title" column)
    """
    index = defaultdict(list)
    tf = defaultdict(list)
    df = defaultdict(int)
    idf = defaultdict(float)
    title_index = defaultdict(str)

    num_documents = len(products)

    for i in range(num_documents):
        # Materialise the row once; every .iloc[i] call builds a new Series.
        row = products.iloc[i]
        pid = row["pid"]
        title_index[pid] = row.get("title", "")

        # term -> [pid, positions of the term inside this product's token list]
        current_product_index = {}
        for position, term in enumerate(row["title_description"]):
            # Explicit membership test instead of a bare except, which would also
            # have hidden unrelated errors (including KeyboardInterrupt).
            if term in current_product_index:
                current_product_index[term][1].append(position)
            else:
                current_product_index[term] = [pid, array('I', [position])]

        # L2 norm of the raw term counts of this product. A product without tokens
        # gives 0 here, but then the loop below has nothing to divide.
        norm = math.sqrt(sum(len(posting[1]) ** 2 for posting in current_product_index.values()))

        for term, posting in current_product_index.items():
            tf[term].append(np.round(len(posting[1]) / norm, 4))
            df[term] += 1
            index[term].append(posting)

    for term in df:
        idf[term] = np.round(np.log(float(num_documents / df[term])), 4)

    return index, tf, df, idf, title_index

## Score