# Projet 6
## Description du projet

## Configuration

In [None]:
# Imports
import sys, os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
import skimage.io as io
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import missingno as msno
import seaborn as sns
import random
import string
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Custom
from src.utils import (
    count_categories, find_dots, extract_label, tokenize,
    word_vectorizer
)

In [None]:
# Fetch the NLTK resources used by the text-processing cells below
for resource in ("stopwords", "wordnet"):
    nltk.download(resource)

In [None]:
# Global variables: dataset locations, relative to the working directory
path_to_images_folder = os.path.join("data", "images")
path_to_csv = os.path.join("data", "flipkart_com-ecommerce_sample_1050.csv")

## I. Chargement des données

In [None]:
# Load the product catalogue and report its (rows, columns) shape
df_raw = pd.read_csv(filepath_or_buffer=path_to_csv)
print(df_raw.shape)

In [None]:
# Show the raw category tree of the first product
df_raw.iloc[0]["product_category_tree"]

In [None]:
# Display the first 3 products: name, category tree and picture.
# Iterating over the index slice works with fewer than 3 rows and with a
# non-default index; the image folder comes from the global defined above
# instead of a second hard-coded copy of the path.
for idx in df_raw.index[:3]:
    img_path = os.path.join(path_to_images_folder, df_raw.loc[idx, "image"])
    img = io.imread(img_path)
    print("Short description:", df_raw.loc[idx, "product_name"])
    print("Category tree:", df_raw.loc[idx, "product_category_tree"])
    plt.imshow(img)
    plt.show()

## II. Définitions des catégories (labels pour la classification)
Le but de cette partie est d'étudier les différents labels que nous pouvons attribuer aux données.
La catégorie d'un produit est disponible via un arbre.
La question que nous nous poserons est la suivante : jusqu'à quelle "profondeur" pourrons-nous aller ?

In [None]:
# Store, for every product, the value count_categories returns for its
# category tree (count_categories comes from src.utils and is not shown here;
# the messages below treat the result as a number of sub categories).
print("Count number of sub categories min/max for each product")
nb_of_categories = df_raw["product_category_tree"].apply(count_categories)
df_raw["nb_of_categories"] = nb_of_categories

# Report both extremes of the stored column
nb_min = df_raw["nb_of_categories"].min()
nb_max = df_raw["nb_of_categories"].max()
print(f"    There is/are at least  {nb_min} "
      "sub categories for each product")
print(f"    There are at max       {nb_max} "
      "sub categories for each product\n")

# Add one column per position 0..7: product_category_<pos> holds whatever
# extract_label(tree, pos) returns for each product's category tree.
for pos in range(8):
    column = f"product_category_{pos}"
    df_raw[column] = df_raw["product_category_tree"].apply(
        lambda tree, level=pos: extract_label(tree, level)
    )

print("Count all possible combinations:")
# For each depth i, keep only the products whose tree has at least i levels
# and count the distinct (level 0, ..., level i-1) combinations among them.
# Counting distinct rows avoids building a throw-away string key on a full
# copy of the dataframe, and restricting to the filtered rows keeps products
# without a level-i label out of the count.
for i in range(1, 8):
    deep_enough = df_raw[df_raw["nb_of_categories"] >= i]
    print(f"    There are {len(deep_enough)}"
          f" data with {i} sub category(ies)")
    level_columns = [f"product_category_{k}" for k in range(i)]
    nb_combinations = len(deep_enough[level_columns].drop_duplicates())
    print(f"    -> Which makes {nb_combinations}"
          " possible combinations\n")


print("==> To many data is lost if take in a category_tree with at least 4 sub"
      "\n    categories. We will create 3 dataframes : df_1, df_2 and df_3"
      "\n    corresponding to the category tree depth wanted")

def _make_labelled_frame(source, depth):
    """Return the products with at least `depth` levels, labelled at that depth.

    Parameters:
        source: dataframe holding "nb_of_categories" and the
            "product_category_<k>" columns.
        depth: number of leading category levels joined into the label.

    Returns:
        (frame, cat_to_label): an independent copy of the selected rows with
        two extra columns, "final_category" (levels 0..depth-1 joined with
        "_") and "label" (integer code, in order of first appearance), plus
        the category -> code mapping.
    """
    # .copy() makes the result independent of `source`, so adding columns
    # neither triggers SettingWithCopyWarning nor depends on view semantics.
    frame = source[source["nb_of_categories"] >= depth].copy()

    level_columns = [f"product_category_{level}" for level in range(depth)]
    final_category = frame[level_columns[0]]
    for column in level_columns[1:]:
        final_category = final_category + "_" + frame[column]
    frame["final_category"] = final_category

    cat_to_label = {
        cat: i for (i, cat) in enumerate(frame["final_category"].unique())
    }
    frame["label"] = frame["final_category"].map(cat_to_label)
    return frame, cat_to_label


# Creates dataframes
# Each one keeps only the products whose tree is at least as deep as the
# label it uses (previously all three were filtered with ">= 1").
# For 1 sub category
df1, _ = _make_labelled_frame(df_raw, 1)

# For 2 sub categories
df2, _ = _make_labelled_frame(df_raw, 2)

# For 3 sub categories
# dict_cat_to_label ends up holding the depth-3 mapping, as before.
df3, dict_cat_to_label = _make_labelled_frame(df_raw, 3)

## III. Traitement du texte

### III.1. Analyse exploratoire

In [None]:
# Init parameters
df = df1.copy()
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
tokenizer = nltk.RegexpTokenizer(r"\w+")


def _print_most_common(corpus, title, top_n=20):
    """Print the `top_n` most frequent tokens of `corpus`.

    Parameters:
        corpus: iterable of token lists (one list per document).
        title: line printed before the ranking.
        top_n: maximum number of tokens listed; fewer are printed when the
            corpus holds fewer distinct tokens.

    Returns:
        dict mapping every token to its count, ordered by decreasing count.
    """
    counts = Counter(word for text in corpus for word in text)
    print(title)
    for word, count in counts.most_common(top_n):
        print(f"{word:<9}: {count}")
    return dict(counts.most_common())


# Tokenize each description, then lower-case and lemmatize (as verbs)
tokenized_corpus = [
    [
        lemmatizer.lemmatize(word.lower(), pos="v")
        for word in tokenizer.tokenize(doc)
    ]
    for doc in df["description"]
]

# Count each word
bow = _print_most_common(tokenized_corpus, "20 most occurences:")
print("==> What is rs? Indian Rupees (money)\n")

# Drop stopwords
tokenized_corpus = [
    [word for word in text if word not in stop_words]
    for text in tokenized_corpus
]

# Count each word again, without the stop words
bow = _print_most_common(
    tokenized_corpus,
    "20 most occurences after removing stopwords:",
)

print(df["description"].iloc[0])

In [None]:
def preprocess(doc):
    """Tokenize `doc`, drop English stop words and lemmatize the rest.

    Parameters:
        doc: text of one document.

    Returns:
        list of lower-cased tokens, lemmatized as verbs. The stop-word test
        runs on the lower-cased token before lemmatization.
    """
    english_stop_words = set(stopwords.words('english'))
    wordnet = WordNetLemmatizer()
    # Tokens are maximal runs of word characters
    tokens = nltk.RegexpTokenizer(r"\w+").tokenize(doc)

    kept = []
    for token in tokens:
        lowered = token.lower()
        if lowered in english_stop_words:
            continue
        kept.append(wordnet.lemmatize(lowered, pos="v"))
    return kept

# Compare, on the first description, the tokens held in tokenized_corpus[0]
# with the output of preprocess(). preprocess() is run once (it used to be
# re-run for every word) and membership is tested against sets; the printed
# words and their order are unchanged.
preprocessed_words = preprocess(df["description"].iloc[0])
preprocessed_set = set(preprocessed_words)
exploratory_set = set(tokenized_corpus[0])

# Tokens present in tokenized_corpus[0] but absent from preprocess()
for word in tokenized_corpus[0]:
    if word not in preprocessed_set:
        print(word)
print()
# Tokens produced by preprocess() but absent from tokenized_corpus[0]
for word in preprocessed_words:
    if word not in exploratory_set:
        print(word)

##

### III.2. Extraction de features : BoW

In [None]:
# Work on the dataframe labelled with 3 category levels
df = df3.copy()
corpus = df["description"]

# Vectorize the tokenized descriptions; the second value returned by
# word_vectorizer is not needed here.
tokenized_docs = list(map(tokenize, corpus))
X, _ = word_vectorizer(tokenized_docs, False)

# Hold out 30% of the samples for evaluation (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    df["label"],
    test_size=0.3,
    random_state=42,
)

In [None]:
preprocessor = StandardScaler()
clf = RandomForestClassifier(min_samples_leaf=2, random_state=77)

# The scaling step is currently disabled.
# NOTE(review): if X is a sparse matrix, StandardScaler needs
# with_mean=False before this step can be enabled - confirm X's type.
pipe = Pipeline(steps=[
    # ("preprocessor", preprocessor),
    ("classifier", clf)
])

pipe.fit(X_train, y_train)

# Predict through the pipeline rather than the bare classifier, so every
# step (including the scaler once it is enabled) is applied at prediction
# time. Test predictions are computed once and reused.
y_pred_train = pipe.predict(X_train)
y_pred = pipe.predict(X_test)
print(accuracy_score(y_train, y_pred_train))
print(accuracy_score(y_test, y_pred))

### III.3. Word TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
# Fit TF-IDF weights on X, then show the re-weighted features as a dense array
transformer = TfidfTransformer()
transformer.fit(X)
transformer.transform(X).toarray()

In [None]:
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk

# Download necessary NLTK data
# NOTE(review): recent NLTK releases may also need 'punkt_tab' for
# word_tokenize - confirm against the installed version.
for resource in ('punkt', 'wordnet', 'stopwords'):
    nltk.download(resource)

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

# Custom stop words: NLTK's English list plus extra entries.
# NOTE(review): the four extra entries look like placeholders - replace them
# with real domain-specific words.
extra_stop_words = {'specific', 'list', 'of', 'stopwords'}
custom_stop_words = set(stopwords.words('english')) | extra_stop_words

# Custom preprocessing function
# NOTE(review): this rebinds the name `preprocess` already defined in an
# earlier cell, which returns a list of tokens; this one returns a string.
def preprocess(doc):
    """Clean, tokenize, filter and lemmatize `doc`.

    Parameters:
        doc: text of one document.

    Returns:
        The kept tokens, lower-cased and lemmatized, joined by single spaces.
        Tokens whose lower-cased form is in `custom_stop_words` are dropped.
    """
    # Replace every non-word character with a space before tokenizing
    cleaned = re.sub(r'\W', ' ', doc)

    kept = []
    for token in word_tokenize(cleaned):
        lowered = token.lower()
        if lowered not in custom_stop_words:
            kept.append(lemmatizer.lemmatize(lowered))
    return ' '.join(kept)

# Example corpus: two short sentences
corpus = [
    'Cats are running faster than dogs.',
    'The cat ran faster than the dog.'
]

# Run every document through preprocess()
preprocessed_corpus = list(map(preprocess, corpus))

# Raw counts: fit and transform the preprocessed corpus
count_vectorizer = CountVectorizer()
X_count = count_vectorizer.fit_transform(preprocessed_corpus)

# TF-IDF weights: fit and transform the same corpus
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(preprocessed_corpus)

# Print the vocabulary and the dense matrix for each vectorizer
count_features = count_vectorizer.get_feature_names_out()
print("Count Vectorizer Feature Names:", count_features)
print("Count Vectorizer Array:\n", X_count.toarray())
tfidf_features = tfidf_vectorizer.get_feature_names_out()
print("TF-IDF Vectorizer Feature Names:", tfidf_features)
print("TF-IDF Vectorizer Array:\n", X_tfidf.toarray())


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk

# Download necessary NLTK data
for resource in ('punkt', 'wordnet'):
    nltk.download(resource)

# Lemmatizer used by lemmatize_text below
lemmatizer = WordNetLemmatizer()

# Custom preprocessing function
def lemmatize_text(text):
    """Tokenize `text` and return its lemmatized tokens joined by spaces.

    Tokens are passed to the lemmatizer unchanged (no lower-casing).
    """
    return ' '.join(map(lemmatizer.lemmatize, word_tokenize(text)))

def _lowercase_then_lemmatize(text):
    """Lower-case `text`, then lemmatize it with lemmatize_text."""
    return lemmatize_text(text.lower())


# Create the CountVectorizer with the custom preprocessing function.
# A custom `preprocessor` replaces CountVectorizer's built-in preprocessing
# stage (accent stripping and lower-casing), so lower-casing is done
# explicitly here; otherwise "The" and "the" become separate features.
vectorizer = CountVectorizer(preprocessor=_lowercase_then_lemmatize)

# Example usage
# NOTE(review): `corpus` and `X` rebind the variables of the same name that
# earlier cells use for the product descriptions and their features.
corpus = [
    'Cats are running faster than dogs.',
    'The cat ran faster than the dog.'
]

X = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names_out())
print(X.toarray())
