# Part 1: Text Processing and Exploratory Data Analysis

## 1. Data preparation

In [None]:
# %pip install nltk

In [None]:
import os, string
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from unidecode import unidecode
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('punkt_tab')

In [None]:
# Resolve the dataset location relative to the current working directory.
data_path = os.path.join(os.getcwd(), '../../data/')
doc_path = os.path.join(data_path, 'fashion_products_dataset.json')

# Read the product catalogue into a DataFrame and preview its first rows.
data = pd.read_json(doc_path)
data.head()

1. As a first step, you must pre-process the documents. In particular, for the text fields (title,
description) you should:

- Remove stop words
- Tokenize the text
- Remove punctuation marks
- Apply stemming
- and... anything else you think is needed (bonus point)

In [None]:
# Text-processing resources built once and reused for every document.
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))  # set gives O(1) membership tests
translator = str.maketrans('', '', string.punctuation)  # deletes ASCII punctuation

def preprocess_text(text):
    """Turn a free-text field into a space-joined string of stemmed tokens.

    Steps, in order: transliterate to ASCII, lowercase, delete punctuation,
    tokenize, drop stop words and non-alphabetic tokens, stem, and finally
    drop stems of two characters or fewer.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (e.g. None or NaN for a missing
        field) yields an empty string instead of raising.

    Returns
    -------
    str
        The surviving stems joined by single spaces ('' if none survive).
    """
    if not isinstance(text, str):
        return ''

    # Transliterate first so that Unicode punctuation (curly quotes, dashes)
    # becomes ASCII punctuation and is removed by the same translation table
    # as its ASCII counterpart.
    text = unidecode(text).lower()
    text = text.translate(translator)

    tokens = word_tokenize(text)
    # Keep purely alphabetic tokens that are not stop words.
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    stemmed_tokens = [stemmer.stem(word) for word in tokens]
    # The length filter runs after stemming, so it applies to the final stems.
    stemmed_tokens = [word for word in stemmed_tokens if len(word) > 2]
    return ' '.join(stemmed_tokens)

# NOTE: this cell reads and then overwrites data['title'] / data['description'],
# so running it a second time re-processes already cleaned text.
data['title_clean'] = data['title'].apply(preprocess_text)
data['description_clean'] = data['description'].apply(preprocess_text)

# A bare expression is only rendered when it is the last statement of a cell,
# so the before/after preview has to be displayed explicitly.
display(data[['pid', 'title', 'title_clean', 'description_clean']].head(5))

# Replace original columns with cleaned versions
data['title'] = data['title_clean']
data['description'] = data['description_clean']
data = data.drop(columns=['title_clean', 'description_clean'])

2. Take into account that for future queries, the final output must return (when present) the following information for each of the selected documents: pid, title, description, brand, category, sub_category, product_details, seller, out_of_stock, selling_price, discount, actual_price, average_rating, url

In [None]:
# Fields that every returned document must expose, when the dataset has them.
relevant_columns = [
    'pid',
    'title',
    'description',
    'brand',
    'category',
    'sub_category',
    'product_details',
    'seller',
    'out_of_stock',
    'selling_price',
    'discount',
    'actual_price',
    'average_rating',
    'url',
]

# Keep only the requested columns that actually exist in the loaded frame.
available_fields = [column for column in relevant_columns if column in data.columns]
clean_data = data.loc[:, available_fields].copy()
clean_data.head()

3. Decide how to handle the fields category, sub_category, brand, product_details, and seller during pre-processing. Should they be merged into a single text field, indexed as separate fields in the inverted index, or handled in some other way? Justify your choice, considering how their distinctiveness may affect retrieval effectiveness. What are the pros and cons of each approach?

In [None]:
# Translation table that deletes every ASCII punctuation character.
_METADATA_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def _flatten_metadata(value):
    """Collapse a string / dict / list / tuple into one space-joined string."""
    if isinstance(value, str):
        return value
    if isinstance(value, dict):
        parts = (
            _flatten_metadata(item)
            for pair in value.items()
            for item in pair
        )
        return ' '.join(part for part in parts if part)
    if isinstance(value, (list, tuple)):
        return ' '.join(part for part in map(_flatten_metadata, value) if part)
    # Anything else (None, NaN, numbers, ...) contributes no text.
    return ""


def clean_metadata_field(text):
    """Lowercase a metadata value and delete its punctuation.

    Strings are lowercased, stripped of ASCII punctuation and trimmed.
    Lists, tuples and dicts are first flattened into a single string
    (dict keys and values both contribute) so structured fields are not
    discarded. Any other value returns an empty string.

    NOTE(review): product_details looks like it may be loaded as a list of
    dicts rather than a string -- confirm against the dataset.

    Parameters
    ----------
    text : str | list | tuple | dict | Any
        Raw metadata value.

    Returns
    -------
    str
        The normalised text ('' when there is no usable text).
    """
    flattened = _flatten_metadata(text)
    return flattened.lower().translate(_METADATA_PUNCT_TABLE).strip()

metadata_fields = ['category', 'sub_category', 'product_details']

# Stage the normalised text next to the raw columns so both can be compared.
for field in metadata_fields:
    staged_name = f'{field}_clean'
    clean_data[staged_name] = clean_data[field].apply(clean_metadata_field)

display(clean_data.head())

# Replace original metadata columns with cleaned versions
for field in metadata_fields:
    # pop() removes the staging column and hands back its values in one step.
    clean_data[field] = clean_data.pop(f'{field}_clean')

display(clean_data.head())

4. Consider the fields out_of_stock, selling_price, discount, actual_price, and average_rating. Decide how these should be handled during pre-processing to use in further search. Should they be indexed as textual terms?

In [None]:
def _to_float(column, remove=()):
    """Return `column` as floats after deleting the literal substrings in `remove`.

    Parameters
    ----------
    column : pd.Series
        Column holding numbers encoded as text (e.g. '1,299', '69% off').
    remove : iterable of str
        Literal (non-regex) substrings deleted from every value before casting.

    Returns
    -------
    pd.Series
        Float column in which empty strings have become NaN.
    """
    # Already numeric (e.g. this cell is being re-run): the `.str` accessor
    # would raise on such a column, so only normalise the dtype.
    if pd.api.types.is_numeric_dtype(column):
        return column.astype(float)
    for token in remove:
        column = column.str.replace(token, '', regex=False)
    # '' cannot be cast to float, so map it to NaN first.
    return column.replace('', np.nan).astype(float)


# Convert data types to numeric
clean_data["out_of_stock"] = clean_data["out_of_stock"].astype(int)
clean_data["selling_price"] = _to_float(clean_data["selling_price"], remove=[','])
clean_data["discount"] = _to_float(clean_data["discount"], remove=['%', 'off', ' '])
clean_data["actual_price"] = _to_float(clean_data["actual_price"], remove=[','])
clean_data["average_rating"] = _to_float(clean_data["average_rating"])

# Mid-cell expressions are not rendered automatically, hence the explicit display().
display(clean_data.head(5))

num_fields = ['out_of_stock', 'discount', 'selling_price', 'actual_price', 'average_rating']
display(clean_data[num_fields].describe())

## 2. Exploratory Data Analysis

When working with data, it is important to have a better understanding of the content and some statistics. Provide an exploratory data analysis to describe the dataset you are working on in this project and explain the decisions made for the analysis. For example, word counting distribution, average sentence length, vocabulary size, ranking of products based on rating, price, discount, top sellers and brands, out_of_stock distribution, word clouds for the most frequent words, and entity recognition. Feel free to do the exploratory analysis and report your findings in the report.

In [None]:
r = 25  # number of top entries shown in the ranking plots

# One panel per text field: the `r` most frequent tokens of the cleaned text.
fig, axes = plt.subplots(1, 2, figsize=(15, 8))
for ax, field in zip(axes, ['description', 'title']):
    word_counts = (
        clean_data[field]
        .str.split()
        .explode()
        .value_counts()
        .sort_values(ascending=False)
        # Name the columns explicitly: what a bare reset_index() produces
        # after value_counts() differs between pandas versions.
        .rename_axis(field)
        .reset_index(name='count')
    )
    sns.barplot(data=word_counts[:r], y=field, x='count', ax=ax)
    # Derive the title from `r` so it cannot drift from the plotted slice.
    ax.set_title(f'Top {r} most common "{field}" words')

plt.tight_layout()
plt.show()

In [None]:
# Mean number of whitespace-separated tokens per pre-processed description.
# Computed outside the f-string: reusing the same quote character inside an
# f-string expression is a SyntaxError before Python 3.12.
avg_description_len = clean_data['description'].str.split().str.len().mean()
print(f'Average sentence length: {avg_description_len:.2f} words')

In [None]:
# Number of distinct tokens, counted only over the description field.
# Computed outside the f-string: reusing the same quote character inside an
# f-string expression is a SyntaxError before Python 3.12.
vocabulary_size = clean_data['description'].str.split().explode().nunique()
print(f'Vocabulary size is {vocabulary_size} words')

In [None]:
# NaN count for every column, most affected columns first
missing = clean_data.isnull().sum().sort_values(ascending=False)
display(missing.to_frame(name="missing"))

In [None]:
# Number of products in each category
cats = clean_data["category"].value_counts()

plt.figure()
ax = cats.plot.bar()
ax.set_title("Categories")
ax.set_xlabel("Category")
ax.set_ylabel("Count")
plt.tight_layout()
plt.show()

#### Numeric columns

In [None]:
# Share of products whose out_of_stock flag equals 1
ratio = clean_data["out_of_stock"].eq(1).mean()
print(f'Out of stock ratio: {ratio:.2%}')

In [None]:
# Distributions of numerical fields
num_fields = ['selling_price', 'actual_price', 'discount', 'average_rating']

# 2x2 grid, one histogram per numeric field (missing values are excluded).
fig, axes = plt.subplots(2, 2, figsize=(15, 15))
for ax, col in zip(axes.flat, num_fields):
    ax.hist(clean_data[col].dropna(), bins=50, edgecolor='black', alpha=0.7)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

# The layout only needs to be computed once, after every panel is drawn.
fig.tight_layout()
plt.show()

In [None]:
# Pairwise correlations between the numeric columns only
corr_matrix = clean_data.corr(numeric_only=True)

heatmap_ax = sns.heatmap(corr_matrix, annot=True)
heatmap_ax.set_title("Correlation heatmap")
plt.show()

In [None]:
# Per-brand mean of every numeric column.
tmp = clean_data.groupby('brand').mean(numeric_only=True)

# (column to rank by, panel title) for each of the four panels.
brand_panels = [
    ('out_of_stock', 'Most out-of-stock brands'),
    ('discount', 'Most discounted brands'),
    ('actual_price', 'Most actually expensive brands'),
    ('selling_price', 'Most supposedly expensive brands'),
]

fig, axes = plt.subplots(2, 2, figsize=(15, 16))
for ax, (metric, title) in zip(axes.flat, brand_panels):
    # Top `r` brands by the mean of `metric`, highest first.
    top_brands = tmp.sort_values(metric, ascending=False).reset_index()[:r]
    sns.barplot(data=top_brands, y='brand', x=metric, ax=ax)
    ax.set_title(title)

plt.tight_layout()
plt.show()