# Part 1: Text Processing and Exploratory Data Analysis

## 1. Data preparation

In [None]:
import os, string
import numpy as np
import pandas as pd

from unidecode import unidecode
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

In [None]:
# Resolve the dataset location relative to the notebook's working directory.
data_path = os.path.join(os.getcwd(), '../../data/')
doc_path = os.path.join(data_path, 'fashion_products_dataset.json')

# Read the whole JSON document into a DataFrame.
data = pd.read_json(path_or_buf=doc_path)

# Final expression of the cell: preview of the first five rows.
data.head(n=5)

1. As a first step, you must pre-process the documents. In particular, for the text fields (title,
description) you should:

- Remove stop words
- Tokenize the text
- Remove punctuation marks
- Apply stemming
- and... anything else you think is needed (bonus point)

In [None]:
# Shared text-processing resources, built once and reused for every document.
stemmer = PorterStemmer()
# Kept as a set so that stop-word membership tests are constant-time.
stop_words = set(stopwords.words("english"))
# Translation table that deletes every ASCII punctuation character.
translator = str.maketrans('', '', string.punctuation)

# Maps every ASCII punctuation character to a space. Replacing (instead of
# deleting) keeps neighbouring words apart: "cotton,polyester" becomes
# "cotton polyester" rather than the single token "cottonpolyester".
_PUNCT_TO_SPACE = str.maketrans(string.punctuation, ' ' * len(string.punctuation))


def preprocess_text(text):
    """Turn a free-text field into a space-separated string of stemmed tokens.

    Steps, in order:
      1. transliterate to ASCII with ``unidecode``;
      2. lowercase;
      3. replace ASCII punctuation with spaces;
      4. tokenize with ``word_tokenize``;
      5. drop non-alphabetic tokens and tokens in ``stop_words``;
      6. stem with ``stemmer`` and drop stems of length <= 2.

    Args:
        text: The raw text. Any non-string value (e.g. NaN or None for a
            missing field) is treated as empty.

    Returns:
        The surviving stems joined by single spaces; ``''`` when nothing
        survives or when *text* is not a string.
    """
    if not isinstance(text, str):
        return ''

    # Transliterate first: non-ASCII dashes/quotes become ASCII punctuation,
    # which the next steps can strip. Doing this after punctuation removal
    # would leave tokens such as "men-women", which isalpha() then discards.
    text = unidecode(text)
    # Lowercase after transliteration, since transliteration may emit capitals.
    text = text.lower()
    text = text.translate(_PUNCT_TO_SPACE)

    tokens = word_tokenize(text)
    stems = [
        stemmer.stem(word)
        for word in tokens
        if word.isalpha() and word not in stop_words
    ]
    # Very short stems carry little meaning for retrieval; drop them.
    return ' '.join(stem for stem in stems if len(stem) > 2)

# Stage the cleaned text in temporary columns so it can be compared with the
# raw text before the originals are overwritten.
data['title_clean'] = data['title'].apply(preprocess_text)
data['description_clean'] = data['description'].apply(preprocess_text)

# A bare expression is only rendered when it is the last statement of a cell,
# so the preview needs an explicit display() call here.
display(data[['pid', 'title', 'title_clean', 'description_clean']].head(5))

# Replace original columns with cleaned versions.
# NOTE(review): this discards the raw title/description text. If search
# results must show the original wording, keep the raw columns and store the
# cleaned text under separate names instead — confirm against the requirements.
data['title'] = data['title_clean']
data['description'] = data['description_clean']
data.drop(columns=['title_clean', 'description_clean'], inplace=True)

2. Take into account that for future queries, the final output must return (when present) the following information for each of the selected documents: pid, title, description, brand, category, sub_category, product_details, seller, out_of_stock, selling_price, discount, actual_price, average_rating, url

In [None]:
# Fields that every returned document should expose, in output order.
relevant_columns = [
    'pid',
    'title',
    'description',
    'brand',
    'category',
    'sub_category',
    'product_details',
    'seller',
    'out_of_stock',
    'selling_price',
    'discount',
    'actual_price',
    'average_rating',
    'url',
]

# Keep only the requested fields that the dataset actually contains.
available_fields = [name for name in relevant_columns if name in data.columns]

# Work on an independent copy so later edits do not write back into `data`.
clean_data = data.loc[:, available_fields].copy()
clean_data.head(n=5)

3. Decide how to handle the fields category, sub_category, brand, product_details, and seller during pre-processing. Should they be merged into a single text field, indexed as separate fields in the inverted index, or handled in some other way? Justify your choice, considering how their distinctiveness may affect retrieval effectiveness. What are the pros and cons of each approach?

In [None]:
# Deletes every ASCII punctuation character; built once instead of per call.
_METADATA_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def _metadata_text_parts(value):
    """Yield every string found in *value*, descending into dicts, lists and tuples."""
    if isinstance(value, str):
        yield value
    elif isinstance(value, dict):
        for key, item in value.items():
            yield from _metadata_text_parts(key)
            yield from _metadata_text_parts(item)
    elif isinstance(value, (list, tuple)):
        for item in value:
            yield from _metadata_text_parts(item)
    # Anything else (None, NaN, numbers, ...) contributes no text.


def clean_metadata_field(text):
    """Normalize a metadata value into lowercase, punctuation-free text.

    Args:
        text: A string, or a (possibly nested) list/tuple/dict whose strings
            are concatenated in iteration order (dict keys before their
            values). Any other value is treated as missing.

    Returns:
        The text lowercased, with ASCII punctuation deleted and every run of
        whitespace collapsed to a single space; ``''`` when there is no text.
    """
    joined = ' '.join(_metadata_text_parts(text))
    cleaned = joined.lower().translate(_METADATA_PUNCT_TABLE)
    # Deleting punctuation such as "&" leaves doubled spaces; split/join
    # collapses them and also trims both ends.
    return ' '.join(cleaned.split())

metadata_fields = ['category', 'sub_category', 'brand', 'product_details', 'seller']

# Pair each metadata column with the temporary column that stages its cleaned text.
staged_columns = {field: f'{field}_clean' for field in metadata_fields}

for field, staged in staged_columns.items():
    clean_data[staged] = clean_data[field].apply(clean_metadata_field)

# Preview raw and cleaned values side by side before overwriting anything.
display(clean_data.head(5))

# Overwrite each original column with its cleaned version, then drop the staging columns.
for field, staged in staged_columns.items():
    clean_data[field] = clean_data[staged]
clean_data.drop(columns=list(staged_columns.values()), inplace=True)

**TODO**

4. Consider the fields out_of_stock, selling_price, discount, actual_price, and average_rating. Decide how these should be handled during pre-processing to use in further search. Should they be indexed as textual terms?

In [None]:
def _coerce_numeric(series):
    """Return *series* converted to numbers; unparseable entries become NaN.

    Text values are stripped of surrounding whitespace and of comma thousands
    separators first, so that a value like "2,999" parses as 2999 instead of
    being silently coerced to NaN.
    """
    if pd.api.types.is_numeric_dtype(series):
        return pd.to_numeric(series, errors='coerce')
    as_text = series.astype(str).str.replace(',', '', regex=False).str.strip()
    return pd.to_numeric(as_text, errors='coerce')


# NOTE(review): these conversions modify `data`. `clean_data` was copied from
# `data` earlier in the notebook and will not see them — confirm which frame
# later steps are meant to use.

# Treat discount separately to handle the "% off" suffix. astype(str) keeps
# the .str accessor usable even when the column is not string-typed.
discount_text = data['discount'].astype(str).str.replace('% off', '', regex=False)
data['discount'] = _coerce_numeric(discount_text)

# Convert to binary (0/1) for out_of_stock
data['out_of_stock'] = data['out_of_stock'].map({True: 1, False: 0})

# Convert the price and rating fields to numeric types
for field in ('selling_price', 'actual_price', 'average_rating'):
    data[field] = _coerce_numeric(data[field])

# These fields are numeric attributes, not text: summarize them as numbers.
numerical_fields = ['out_of_stock', 'discount', 'selling_price', 'actual_price', 'average_rating']

display(data[numerical_fields].describe())

## 2. Exploratory Data Analysis

When working with data, it is important to have a better understanding of the content and some statistics. Provide an exploratory data analysis to describe the dataset you are working on in this project and explain the decisions made for the analysis. For example, word counting distribution, average sentence length, vocabulary size, ranking of products based on rating, price, discount, top sellers and brands, out_of_stock distribution, word clouds for the most frequent words, and entity recognition. Feel free to do the exploratory analysis and report your findings in the report.

In [None]:
# TODO