# 1. Understanding the problem

## 1.1 Mission



## 1.2 Requirements: Libraries used in this notebook

- See [`requirements.txt`](./requirements.txt) for the versions of the libraries tested with this notebook

In [None]:
# If this notebook does not work with the library versions
# installed in your environment,
# uncomment the following line to install the tested versions:

# %pip install -r requirements.txt

In [None]:
def install_libraries(required=None) -> None:
    """Install with pip every library of ``required`` that is not installed yet.

    Names are compared in normalised form (lower case, runs of ``-``, ``_``
    and ``.`` collapsed to ``-``), so ``tensorflow_hub`` matches an installed
    ``tensorflow-hub`` distribution instead of being reported as missing and
    reinstalled on every run.

    Args:
        required: iterable of distribution names. ``None`` or an empty
            iterable means there is nothing to check.

    Raises:
        subprocess.CalledProcessError: if ``pip install`` exits with an error.
    """
    import re
    import subprocess
    import sys
    from importlib import metadata

    def normalise(name: str) -> str:
        return re.sub(r'[-_.]+', '-', name).lower()

    if not required:
        return

    installed = set()
    for dist in metadata.distributions():
        meta = dist.metadata
        name = meta['Name'] if meta is not None else None
        if name:  # skip distributions whose metadata has no usable name
            installed.add(normalise(name))

    missing = {name for name in required if normalise(name) not in installed}
    if missing:
        print(f'missing libraries: {missing}')
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', *missing],
                              stdout=subprocess.DEVNULL)


# Distributions imported by this notebook; the ones that are missing are
# installed before the import cell runs.
required_libraries = {'numpy', 'pandas',
                      'matplotlib', 'seaborn', 'scikit-learn',
                      'scipy',
                      'yellowbrick',  # imported below for SilhouetteVisualizer
                      'nltk',
                      'gensim',
                      'tensorflow',
                      'transformers',
                      'tensorflow_hub',
                      'tensorflow_text',
                      'wordcloud',
                      'plotly',
                      'kaleido'}
install_libraries(required_libraries)

### 1.2.1 Import of libraries

In [None]:
import json
from typing import List
import transformers
import tensorflow_hub
import tensorflow as tf
import gensim
import nltk
from sklearn import metrics
from sklearn import cluster
from sklearn import manifold, decomposition
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import preprocessing, pipeline
import re
import string
import pickle
import scipy
from yellowbrick.cluster import SilhouetteVisualizer
import plotly.io as pio
import IPython
import plotly.graph_objects as go
from wordcloud import WordCloud
import seaborn as sns
from matplotlib.ticker import MaxNLocator
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import time
import os
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)




### 1.2.2 List of versions of the libraries used

In [None]:
from platform import python_version

# A bare python_version() call in the middle of a cell is evaluated but never
# displayed, so the interpreter version is printed explicitly.
print(f'python=={python_version()}')
print('versions of libraries used:')
# Every global object exposing a non-empty __version__ is listed.
print('; '.join(f'{m.__name__}=={m.__version__}' for m in globals(
).values() if getattr(m, '__version__', None)))

### 1.2.3 Configuration of display defaults

In [None]:
from sklearn import set_config

# pandas display options: up to 200 columns, 20 rows, 100 characters per
# cell, floats shown with 2 decimals
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_colwidth', 100)
pd.set_option('display.precision', 2)
pd.options.display.float_format = '{:.2f}'.format

# Render matplotlib figures inline in the notebook
%matplotlib inline
# seaborn theme, colour codes and default palette (applied in this order)
sns.set_theme(style="white", context="notebook")
sns.set_color_codes("pastel")
sns.set_palette("tab20")

# Display scikit-learn estimators as HTML diagrams in a Jupyter context
set_config(display='diagram')
# Displays HTML Representation in a jupyter Context

### 1.2.4 some constant

In [None]:
# Set RANDOM_SEED = None to get different results on each run.
# A constant value is used here only for reproducibility.
RANDOM_SEED = 42

## 1.3 Utility functions

### 1.3.1 Graphics recording

To save the figures, set **`SAVE_IMAGES = True`**

In [None]:
# Set SAVE_IMAGES to False to skip writing figures to disk.
SAVE_IMAGES = True
IMAGE_FOLDER = './images/analyse'
# exist_ok=True creates the folder only when needed, without the
# check-then-create race of os.path.exists() followed by os.makedirs().
os.makedirs(IMAGE_FOLDER, exist_ok=True)

In [None]:
def sanitize(fig_name: str) -> str:
    """Turn a figure title into a string usable as a file name.

    Spaces and '/' become '_', ':' and '.' become '-', and '>' / '<' are
    spelled out as 'gt.' / 'lt.'.
    """
    substitutions = str.maketrans({
        ' ': '_',
        ':': '-',
        '.': '-',
        '/': '_',
        '>': 'gt.',
        '<': 'lt.',
    })
    return fig_name.translate(substitutions)


def to_png(fig_name=None) -> None:
    """Save the current matplotlib figure as a PNG file in ``IMAGE_FOLDER``.

    Nothing is done unless ``SAVE_IMAGES`` is true.

    Args:
        fig_name: base name of the file. ``None`` means "use the figure
            title"; a name shorter than 9 characters is used as a prefix of
            the figure title. The name is passed through ``sanitize``.
    """

    def get_title() -> str:
        """Return the figure suptitle if there is one, else the current axes title."""
        suptitle = getattr(plt.gcf(), '_suptitle', None)
        if suptitle is None:
            return plt.gca().get_title()
        return suptitle.get_text()

    if not SAVE_IMAGES:
        return
    if fig_name is None:
        fig_name = get_title()
    elif len(fig_name) < 9:
        fig_name = f'{fig_name}_{get_title()}'
    # An untitled figure would otherwise be written to a file named ".png".
    fig_name = sanitize(fig_name) or 'figure'
    # IMAGE_FOLDER is reassigned by later cells, so make sure the folder
    # that is current at call time exists before writing into it.
    os.makedirs(IMAGE_FOLDER, exist_ok=True)
    print(f'"{fig_name}.png"')
    plt.gcf().savefig(
        f'{IMAGE_FOLDER}/{fig_name}.png', bbox_inches='tight')

In [None]:
OUT_FOLDER = 'data/out'


def save_pickle(obj, filename, filepath=OUT_FOLDER):
    """Serialise ``obj`` to ``<filepath>/<filename>.pickle``.

    The target folder is created when it does not exist yet. The default
    ``filepath`` is the value ``OUT_FOLDER`` had when this cell was run; pass
    ``filepath`` explicitly if ``OUT_FOLDER`` is changed afterwards.
    """
    if filepath:
        os.makedirs(filepath, exist_ok=True)
    with open(f'{filepath}/{filename}.pickle', 'wb') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)


def load_pickle(filename, filepath=OUT_FOLDER):
    """Load and return the object stored in ``<filepath>/<filename>.pickle``.

    Warning: unpickling can execute arbitrary code, so only load files that
    were written by this notebook or come from a trusted source.

    Raises:
        FileNotFoundError: if the pickle file does not exist.
    """
    with open(f'{filepath}/{filename}.pickle', 'rb') as handle:
        return pickle.load(handle)

### 1.3.2 Check that the columns are in the dataframe

- without changing the order of the columns

In [None]:
def cols_in_df(df: pd.DataFrame, colonnes: list = None) -> list:
    """Return the entries of ``colonnes`` that are columns of ``df``.

    The order of ``colonnes`` is preserved. ``None`` (the default) is treated
    as an empty list instead of raising ``TypeError``.
    """
    if colonnes is None:
        return []
    return [col for col in colonnes if col in df.columns]

# 2. Import, cleaning and exploratory analysis of data

A first articles database with the photo and an associated description:
[Download link](https://s3-eu-west-1.amazonaws.com/static.oc-statatic.com/prod/courses/files/parcours_data_scientist/projet+-+textimage+das+v2/dataset+Project+Pre%CC%81Trament+texts+images.zip)

## 2.1 Import of data

Once downloaded, the data from the Zip file (329 MB) is extracted into the folder defined by `DATA_FOLDER` below.

In [None]:
# Execution environment: 'local' or 'colab'
ENV = 'local'

if ENV == 'local':
    # Local development
    DATA_FOLDER = 'data/raw'
    OUT_FOLDER = 'data/out'
    IMAGE_FOLDER = 'images/textes'
elif ENV == 'colab':
    # Colaboratory - uncomment the following 2 lines to connect to your Drive
    # from google.colab import drive
    # drive.mount('/content/drive')
    DATA_FOLDER = '/content/drive/MyDrive/data/OC6'
    OUT_FOLDER = '/content/drive/MyDrive/data/OC6'
    IMAGE_FOLDER = '/content/drive/MyDrive/images/OC6/nettoyage'
else:
    # Fail here rather than later with an undefined or stale folder name.
    raise ValueError(f"unknown ENV {ENV!r}: expected 'local' or 'colab'")

In [None]:
import os

# Raw data file and base data folder.
# NOTE(review): RAW_DATA_FILENAME is an absolute path rather than a bare file
# name, and DATA_FOLDER here replaces the value chosen in the ENV cell above;
# confirm that both are intended.
RAW_DATA_FILENAME = '/content/data/raw/flipkart_com-ecommerce_sample_1050.csv'
DATA_FOLDER = '/content'  # Modify this path as per your actual data folder location

def os_make_dir(folder):
    """Create ``folder`` and any missing parents; do nothing if it already exists."""
    # exist_ok avoids the race between an existence check and the creation.
    os.makedirs(folder, exist_ok=True)

def os_path_join(folder, file):
    """Join ``folder`` and ``file`` into one path with ``os.path.join``."""
    joined_path = os.path.join(folder, file)
    return joined_path

# Image folder located under DATA_FOLDER, created if necessary
IMAGE_FOLDER = os_path_join(DATA_FOLDER, 'images')  # Assuming you want to store images in this folder
os_make_dir(IMAGE_FOLDER)
# NOTE(review): os.path.join discards DATA_FOLDER when RAW_DATA_FILENAME is an
# absolute path, in which case RAW_DATA is simply RAW_DATA_FILENAME.
RAW_DATA = os_path_join(DATA_FOLDER, RAW_DATA_FILENAME)

print(f'Data file: {RAW_DATA}')

def read_file_rows(file_path=None, nb_rows=1):
    """Print the first line of a text file and the ``nb_rows`` lines after it.

    Args:
        file_path: path of a UTF-8 text file. ``None`` (the default) means the
            value of ``RAW_DATA`` at call time, so a later change of
            ``RAW_DATA`` is taken into account.
        nb_rows: number of lines printed after the first one.
    """
    if file_path is None:
        file_path = RAW_DATA
    with open(file_path, encoding='UTF-8') as fp:
        for i in range(nb_rows + 1):
            row = fp.readline()
            if not row:
                # End of file: do not print empty "Line i" entries
                break
            print(f'Line {i} :\n{row}')

# Show the first line of the raw file and the line after it
read_file_rows(RAW_DATA, nb_rows=1)


### Data format:

- The first line contains the headers (Column Names)
- The data seems separated by comma
- We use `encoding='UTF-8'`
- The column `product_specifications` is made up of key-value pairs.

In [None]:
# Load the raw CSV file; row 0 of the file holds the column names
df_data = pd.read_csv(RAW_DATA, sep=',', header=0, encoding='UTF-8')
df_data.head(2)

## 2.2 Data cleaning

In [None]:
# Column dtypes and non-null counts of the raw data
df_data.info()

### Missing data

There is little missing data (see the Non-Null Count column of `df_data.info()`):

- 1 price not indicated
- 25% of the data do not have a brand

### Missing data filling

In [None]:
def fill_missing_values(df: pd.DataFrame):
    """Return ``df`` with missing ``brand`` values replaced by ``'missing'``.

    A frame without a ``brand`` column is returned as is (same object);
    otherwise a new frame is returned and the input is left untouched.
    """
    if 'brand' not in df.columns:
        return df
    return df.assign(brand=df['brand'].fillna('missing'))


# Cleaned frame: raw data with missing brands filled
df_cleaned = df_data.pipe(fill_missing_values)

### Duplicated data

In [None]:
# Number of duplicated values in each candidate identifier column
for id_col in ('uniq_id', 'pid', 'product_name'):
    print(df_data.duplicated(subset=[id_col]).sum())

In [None]:
# Length of the longest value in each candidate identifier column
for col in ['uniq_id', 'pid', 'product_name']:
    print(f'max_chars [{col}] = {df_data[col].map(len).max()}')

We can use `pid` as the product identifier and delete `uniq_id`.
The `product_name` is unique in this sample but not necessarily in the population.

## 2.3 Exploratory analysis

Analysis of the relevance of columns for classification:

- categories
- Brands
- Product names and descriptions
- sample dates
- the costs

### 2.3.1 The categories

The objective is to be able to classify the products in the `product_category_tree` (or another classification) automatically.

In [None]:
# First values of the raw category tree column
df_data['product_category_tree'].head()

In [None]:
# Number of distinct category trees, as written and after lowercasing
print(df_data['product_category_tree'].nunique())
print(df_data['product_category_tree'].str.lower().nunique())

With 1000 samples, it will be difficult to automate the classification in 640 categories.

We look at the higher levels of the tree.

In [None]:
def get_category_count(row):
    """Return the number of '[' occurrences in ``row``."""
    bracket_total = row.count('[')
    return bracket_total


df_data['product_category_tree'].map(get_category_count).max()

- Each product is associated with only one category in the Category Tree
- The column `product_category_tree` is a tree whose levels are separated by '>>'.

#### What is the depth of the tree?

In [None]:
def get_depth(categorie: str) -> int:
    """Return the number of ' >> '-separated levels in ``categorie``."""
    levels = categorie.split(' >> ')
    return len(levels)


df_data['product_category_tree'].map(get_depth).max()

- Products have up to 7 category levels

In [None]:

# Preview of the tree split on ' >>' into one column per level, after
# stripping the leading '[' / '"' and trailing '"' / ']' characters
df_data['product_category_tree'].str.lstrip(
    '["').str.rstrip('"]').str.split(' >>', expand=True).head()

In [None]:
# First level of the category tree (column 0 of the split) and its frequencies
categ_level_1 = (df_data['product_category_tree']
                 .str.lstrip('["').str.rstrip('"]')
                 .str.split(' >>', expand=True)).iloc[:, 0]
categ_level_1.value_counts()

### 2.3.2 Brands

In [None]:
def plot_brands(df: pd.DataFrame):
    """Plot the cumulative count of rows per brand, most frequent brand first.

    The title shows the number of distinct brands in ``df['brand']``.
    """
    nb_brands = df['brand'].nunique()
    cumulative_counts = (df['brand']
                         .value_counts(normalize=False)
                         .cumsum()
                         .reset_index(drop=True))
    cumulative_counts.plot(kind='line')
    ax = plt.gca()
    ax.set_xlabel('brand')
    ax.set_ylabel('cumulative count')
    plt.title(f'Cumulative brand count (name unique = {nb_brands})')


plot_brands(df_data)
to_png()

In [None]:
def top_n_freq(df: pd.DataFrame, col, nb=10, others=True, normalize=False):
    """Return the ``nb`` most frequent values of ``df[col]`` with their frequency.

    The result has two columns, ``col`` and ``'freq'``. ``nb`` is clamped to
    the range 1 .. number of distinct values. When ``others`` is true, an
    extra ``'other'`` row holds the summed frequency of the remaining values.
    """
    frequencies = (df[col].value_counts(normalize=normalize)
                   .to_frame(name='freq')
                   .rename_axis(col))
    limit = min(max(1, nb), len(frequencies))
    head = frequencies.head(limit).copy()
    if others:
        remainder = frequencies.iloc[limit:, 0].sum()
        head.loc['other', 'freq'] = remainder
    return head.reset_index()


def plot_bar_top_n(df: pd.DataFrame, col, nb=20, others=True, normalize=False, sort_values=False, palette=None,
                   ylabel=None, titre='', subtitle='', figsize=None):
    """Draw a horizontal bar chart of the ``nb`` most frequent values of ``df[col]``.

    Args:
        df, col, nb, others, normalize: passed on to ``top_n_freq``.
        sort_values: sort the bars by label instead of by frequency.
        palette: seaborn palette passed to ``sns.barplot``.
        ylabel: label of the y axis, when given.
        titre: figure suptitle, when not empty.
        subtitle: axes title. The total of the values outside the top ``nb``
            is appended when ``others`` is true and that total is positive.
        figsize: when given, a new figure of that size is created.
    """
    data = top_n_freq(df, col, nb, others, normalize).copy()

    ax = None
    if figsize is not None:
        _, ax = plt.subplots(figsize=figsize)

    # The 'other' row is reported in the title rather than drawn as a bar.
    other_count = 0
    if others:
        is_other = data[col] == 'other'
        other_count = data.loc[is_other, 'freq'].values.sum()
        data = data[~is_other]
    if sort_values:
        data = data.sort_values(by=col)

    x_values = data['freq'] * 100 if normalize else data['freq']
    x_label = 'frequence (%)' if normalize else "number of occurrences"
    ax = sns.barplot(y=data[col], x=x_values, palette=palette, ax=ax)
    ax.set_xlabel(x_label)

    autres = ''
    if others and (other_count > 0):
        shown = f'{other_count * 100:.2f} %' if normalize else f'{int(other_count)}'
        autres = f' [Other values = {shown}]'
    if ylabel:
        ax.set_ylabel(ylabel)
    sns.despine()
    if len(titre) > 0:
        plt.suptitle(titre, y=1.05)
    plt.title(f'{subtitle} {autres}')
    plt.tight_layout()


plot_bar_top_n(df_data, 'brand', nb=20,
               subtitle="Top 20 brands of the sample")
to_png()

In [None]:
# Ten most frequent (brand, level 1 category) combinations
df_data[['brand']].join(categ_level_1).value_counts().head(10)

The name of the brand will help the classification (Netgear, Asus, HP will be classified in the 'Computers' category)

### 2.3.3 Product name, description

In [None]:
# Number of distinct product names
df_data['product_name'].nunique()

In [None]:
# Brand, name and description of the rows labelled 0 to 3
df_data.loc[0:3, ['brand', 'product_name', 'description']]

We see that the `brand` and `product_name` are included in the `description`:

- These 2 fields can be deleted
- We keep them to see if we can categorize only based on brand or product name

### 2.3.4 Product specifications

The column `product_specifications` contains key-value pairs for product specifications

In [None]:
# Raw `product_specifications` value of the row labelled 0
df_data.loc[0, ['product_specifications']].values

In [None]:
# import json


def get_spec_keys(spec_text):
    """Return the ``'key'`` of every entry of a JSON list of specification dicts.

    Args:
        spec_text: JSON text expected to decode to a list of dicts that each
            have a ``'key'`` entry.

    Returns:
        The list of keys, or ``np.nan`` when ``spec_text`` is not a string,
        is not valid JSON, or does not have the expected structure.
    """
    try:
        items = json.loads(spec_text)
        return [item['key'] for item in items]
    except (ValueError, TypeError, KeyError):
        # json.JSONDecodeError is a ValueError; TypeError covers non-string
        # input (e.g. NaN) and entries that are not dicts; KeyError covers
        # entries without a 'key'. np.nan is used because the np.NaN alias
        # was removed in NumPy 2.0.
        return np.nan


# str.lstrip / str.rstrip remove any run of the given *characters*, not a
# prefix or suffix, so the wrapper is removed with anchored patterns instead:
# the exact leading '{"product_specification":' and one trailing '}'.
specification_keys = (df_data['product_specifications']
                      .str.replace('=>', ':', regex=False)
                      .str.replace(r'^\{"product_specification":', '', regex=True)
                      .str.replace(r'\}$', '', regex=True)
                      .map(get_spec_keys)
                      .rename('specification_keys')
                      )

# One row per (product, specification key)
spec_keys = specification_keys.explode().to_frame()

print(
    f"number of unique specification_keys : {spec_keys['specification_keys'].nunique()}")

plot_bar_top_n(spec_keys, 'specification_keys',
               subtitle='Top Specification Keys')
to_png()

It seems that the `value` of each key-value pair is included in the `description` field

- So we can delete this field.

### 2.3.5 Sample dates (crawl_timestamp)

- Is the data comparable (they come from the same dates)?
- Samples come from a period of 7 months

In [None]:
# Crawl month (YYYY_MM) of every product, sorted so the bars are chronological
crawl_months = (pd.to_datetime(df_data['crawl_timestamp'])
                .dt.strftime('%Y_%m')
                .sort_values())
sns.countplot(x=crawl_months)
sns.despine()
plt.title('Data date distribution')
to_png()

### 2.3.6 Price distribution

The columns `retail_price` and `discounted_price` can help classify products between categories, because we expect healthcare products to be cheaper than computers, for example

Nevertheless, as data comes from different dates, we avoid using the price for classification

In [None]:
# Level 1 category of every product, used to colour the histogram
category_levels = (df_data['product_category_tree']
                   .str.lstrip('["')
                   .str.rstrip('"]')
                   .str.split(' >>', expand=True))
categ_level_1 = category_levels.iloc[:, 0]

sns.histplot(
    data=df_data,
    x='retail_price',
    hue=categ_level_1,
    palette='nipy_spectral',
    alpha=0.2,
    log_scale=True,
    kde=True,
)
sns.despine()
plt.title('Price distribution by category (logarithmic scale)')
to_png()

## 2.4 Elimination of columns not relevant to the problem

Before doing feature engineering, the unused columns are deleted for classification

In [None]:
# Column names of the raw data
print(list(df_data.columns))

In [None]:
def drop_unused_columns(df):
    """Return ``df`` without the columns that are not used for classification.

    Only the listed columns actually present in ``df`` are dropped, so the
    function can be applied to a frame from which some are already missing.
    The number of dropped columns is printed.
    """
    colonnes_non_pertinentes = ['uniq_id', 'crawl_timestamp', 'product_url', 'retail_price', 'discounted_price',
                                'is_FK_Advantage_product', 'product_rating', 'overall_rating', 'product_specifications']
    # Check against the frame received as argument, not the global df_cleaned:
    # the two can have different columns, and the global may not exist yet.
    cols_to_drop = cols_in_df(df, colonnes_non_pertinentes)
    print(f'dropping {len(cols_to_drop)} unused columns')
    return df.drop(columns=cols_to_drop)


# Rebuild the cleaned frame from the raw data:
# fill missing brands, then drop the unused columns
df_cleaned = drop_unused_columns(fill_missing_values(df_data))

print(f'{df_data.shape} --> {df_cleaned.shape}')

# 3. Feature Engineering / Preprocessing

Features to create to classify texts:

- Preparation of **categories** from `product_category_tree`
- Preparation of **cleaned descriptions** to feed the models
- Creation of **topics** (subjects) based on these cleaned descriptions (to compare with the categories of `product_category_tree`)

## 3.1 Preparation of categories

Features to be created from `product_category_tree`:

- `Categ_level_1`: categories at the root of the tree
- `Categ_level_2`: subcategories (divisions in` Categ_level_1`)
- `Categ_level_3`: mini-categories (divisions in` Categ_level_2`)
- etc

In [None]:
def remove_brackets(node: str):
    """Remove every '["' and '"]' marker from a category tree string."""
    for marker in ('["', '"]'):
        node = node.replace(marker, '')
    return node


def create_categ_level(df):
    """Append one ``categ_level_<i>`` column per level of ``product_category_tree``.

    The '["' and '"]' markers are removed and the tree is split on ' >> ';
    level numbering starts at 1. A frame without a ``product_category_tree``
    column is returned unchanged. The shape of the level table is printed.
    """
    if 'product_category_tree' not in df.columns:
        return df
    levels = (df['product_category_tree']
              .map(lambda tree: tree.replace('["', '').replace('"]', ''))
              .str.split(' >> ', expand=True))
    print(f'create_categ_level, shape= {levels.shape}')
    levels.columns = [f'categ_level_{level}'
                      for level in range(1, levels.shape[1] + 1)]
    return pd.concat([df, levels], axis=1)


# Cleaned frame rebuilt from the raw data, now with the category level columns
df_filled = fill_missing_values(df_data)
df_relevant = drop_unused_columns(df_filled)
df_cleaned = create_categ_level(df_relevant)

In [None]:
# Number of distinct categories at every level of the tree
categ_columns = [name for name in df_cleaned.columns if 'categ_level_' in name]
for col in categ_columns:
    print(f'{col}.nunique() = {df_cleaned[col].nunique()}')

### 3.1.1 Categories - Level 1

- We will first try to classify by level 1

In [None]:
# List of the level 1 categories and their relative frequencies
print(df_cleaned['categ_level_1'].unique().tolist())
plot_bar_top_n(df_cleaned, 'categ_level_1', normalize=True,
               subtitle='Level 1 Categories')
to_png()

### 3.1.2 Categories - Level 2

In [None]:
# Number of level 2 categories and relative frequencies of the most common ones
print(df_cleaned['categ_level_2'].nunique())
plot_bar_top_n(df_cleaned, 'categ_level_2', normalize=True,
               subtitle='Level 2 Categories')
to_png()

In [None]:
# Counts of the 20 most frequent (level 2, level 1) pairs and their running total
pair_counts = df_cleaned[['categ_level_2', 'categ_level_1']].value_counts()
pd.concat(
    [pair_counts.head(20),
     pair_counts.cumsum().head(20).rename('cum_sum')],
    axis=1)

For the Categories tree, even if there are 62 level 2 categories, approximately 90% of the products are in the 20 largest categories of level 2.

- We can also assess the level 2 classification performance, to better understand the descriptions that are best classified in each main category.

### 3.1.3 Categories - Level 3

In [None]:
# Number of level 3 categories and relative frequencies of the most common ones
print(df_cleaned['categ_level_3'].nunique())
plot_bar_top_n(df_cleaned, 'categ_level_3', normalize=True,
               subtitle='Level 3 Categories')
to_png()

With only 1050 data, it will be very difficult to classify between 241 different level 3 categories.

- For example, distinguish between the brands of 'Coffee Mugs'

## 3.2 Preparation (preprocessing) of the descriptions

We try to use the `description` field of the products to classify them into the categories of levels 1, 2, 3 ...

It is necessary to prepare the descriptions before providing them to the models of machine learning:

- Convert everything to lowercase
- Remove the linking words
- Remove the punctuation
- Remove the numbers (if necessary)
- Transform sentences into a list of tokens (a list of words)
- Remove linking words or words that carry no meaning (**stopwords**)
- Lemmatize
- Rebuild sentences with the remaining words

### 3.2.1 Features to be created from the `description`:

Each model has different requirements:

- `sentence_bow`: descriptions in lowercase, without unnecessary words, but not lemmatized, for BoW (Bag-of-Words) and TF-IDF (Term Frequency-Inverse Document Frequency)
  - We will use these 'Bag-of-Words' for the 'Baseline' model
- `sentence_bow_lem`: `sentence_bow` after lemmatization, for BoW, TF-IDF and Word2Vec
- `sentence_dl`: cleaned descriptions

#### Options for cleaning:

- NLTK library
- The `gensim.utils` library
- Our own library

### 3.2.1.2 IMPORT from NLTK for text cleaning

We use the NLTK library (Natural Language Toolkit) to help with the cleaning

We download stopwords, punctuation and lemmatization

- <https://www.nltk.org/data.html>

In [None]:
# NLTK resources used for cleaning: stop words, the 'punkt' tokenizer data,
# WordNet (lemmatization) and the Open Multilingual WordNet data
for resource in ('stopwords', 'punkt', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

# Language used to clean the `description` text field
NTLK_LANGUAGE = 'english'

### 3.2.2 frequent words in descriptions

Before cleaning of descriptions, we look at the most used words

In [None]:
# English stop words from NLTK, without duplicates
STOP_WORDS_EN = list(set(nltk.corpus.stopwords.words('english')))


def word_filter(list_words, stop_w=STOP_WORDS_EN):
    """Return the words of ``list_words`` that are not in ``stop_w``, order kept."""
    # A set gives constant-time membership tests instead of scanning the
    # stop-word list once per word.
    stop_set = set(stop_w)
    return [w for w in list_words if w not in stop_set]


# Tokens are runs of lowercase ASCII letters
tokenizer = nltk.RegexpTokenizer(r'[a-z]+')


def freq_words(sentences: pd.Series,
               token_fct=tokenizer.tokenize,
               filter_fct=word_filter,
               exclude=STOP_WORDS_EN,
               include=None,
               show_freq=True,
               normalize=True,
               nb=10) -> pd.DataFrame:
    """Return the ``nb`` most frequent words of ``sentences``.

    Args:
        sentences: texts to analyse; missing values are treated as empty.
        token_fct: function turning a lowercased text into a list of tokens.
        filter_fct: function applied to each token list before counting.
        exclude: words removed from the corpus before counting, or ``None``.
        include: when given, only these words are kept. The filter is applied
            after counting, so frequencies stay relative to the whole corpus.
        show_freq: return the frequencies alongside the words.
        normalize: frequencies in percent instead of counts.
        nb: maximum number of words returned.

    Returns:
        With ``show_freq`` true, a transposed frame with a ``'word'`` row and
        a ``'freq'`` (percent) or ``'count'`` row, one column per word.
        With ``show_freq`` false, the two transpositions cancel out and the
        result is a frame with a single ``'word'`` column, one row per word;
        ``freq_words_by_category`` relies on that shape.
    """
    corpus = sentences.fillna('').str.lower().map(token_fct)
    corpus = corpus.map(filter_fct).explode()

    if exclude is not None:
        excluded = set(exclude)  # constant-time membership tests
        corpus = [w for w in corpus if w not in excluded]

    word_count = (pd.Series(corpus)
                  .value_counts(normalize=normalize)
                  .to_frame('freq')
                  .rename_axis('word')
                  .reset_index()
                  )

    if include is not None:
        # Keep only the selected words; done after value_counts so that the
        # percentages are computed over the full corpus
        word_count = word_count[word_count['word'].isin(include)]

    nb = min(nb, len(word_count))
    # Work on a copy so the in-place scaling below never writes to a slice
    word_count = word_count.head(nb).copy()

    if not show_freq:
        # Top words only
        word_count = word_count.drop('freq', axis=1).T
    elif normalize:
        # Top words and frequency (%)
        word_count['freq'] *= 100
    else:
        # Top words and counts
        word_count = word_count.rename(columns={'freq': 'count'})

    # The result is transposed to reduce the display space
    return word_count.T


# Ten most frequent words of the product names, then of the descriptions
print("most frequent words in 'product_name''")
print(freq_words(df_cleaned['product_name']))
print('----------------------------')
print("most frequent words in 'description'")
print(freq_words(df_cleaned['description']))
# Twenty most frequent description words, displayed as the cell output
freq_words(df_cleaned['description'], nb=20, show_freq=True)

We see that the 20 most frequent words do not seem descriptive of a specific category, except the word 'Watch'

- They are rather the words of advertising, dimensions or color.

We check if these words are present more in each category

In [None]:
import pandas as pd

def freq_words_by_category(df: pd.DataFrame,
                           categ_col='categ_level_1',
                           desc_col='description',
                           exclude=None,
                           include=None,
                           show_freq=True,
                           normalize=True,
                           nb=10) -> pd.DataFrame:
    """Return the most frequent words of ``desc_col`` for each value of ``categ_col``.

    Args:
        df: frame holding the category and text columns.
        categ_col: column whose distinct values define the groups.
        desc_col: text column analysed with ``freq_words``.
        exclude, include, show_freq, normalize, nb: passed to ``freq_words``.

    Returns:
        One row per category, plus a ``categ_col`` column. With ``show_freq``
        true the columns are the words and the cells their frequency; with
        ``show_freq`` false the columns are ranks and the cells the words.
        An empty frame is returned when ``df`` has no category.
    """
    # Collect one frame per category and concatenate once at the end instead
    # of growing a DataFrame inside the loop.
    frames = []
    for categ in df[categ_col].unique():
        item_descriptions = df.loc[df[categ_col] == categ, desc_col]

        df_categ_words = freq_words(item_descriptions, exclude=exclude, include=include,
                                    show_freq=show_freq, normalize=normalize, nb=nb)
        if show_freq:
            # Words become the column labels, frequencies the single row
            df_categ_words = df_categ_words.T.set_index('word').T
        else:
            # freq_words returns one row per word here; make it a single row
            df_categ_words = df_categ_words.T

        df_categ_words[categ_col] = categ
        frames.append(df_categ_words)

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)



#### 3.4.1 Visualization as word clouds

In [None]:
# from WordCloud Import Wordcloud

def plot_wordcloud(sentences: pd.Series, cmap='nipy_spectral', ax=None, nb=20):
    """Draw a word cloud of the ``nb`` most frequent words of ``sentences``.

    Args:
        sentences: texts analysed with ``freq_words``.
        cmap: colormap passed to ``WordCloud``.
        ax: axes to draw on; a new figure is created when ``None``.
        nb: number of words shown.
    """
    cloud = WordCloud(stopwords=None, background_color=None,
                      colormap=cmap)
    topwords: pd.DataFrame = freq_words(sentences, nb=nb).T.set_index('word')
    topword_dict = dict(zip(list(topwords.index), list(topwords['freq'])))
    cloud.generate_from_frequencies(topword_dict)
    if ax is None:
        plt.figure()
        ax = plt.gca()
    ax.imshow(cloud, interpolation="bilinear")
    # Hide the ticks of the axes that was drawn on: plt.axis() acts on the
    # current axes, which is not necessarily `ax`.
    ax.axis("off")


def plot_wordclouds_by_categ(df, feature='description', categ_col='categ_level_1',
                             cmap='nipy_spectral', nb=10):
    """Draw one word cloud per value of ``categ_col`` on a grid 4 columns wide.

    Categories are shown in sorted order. Each one gets its own colour taken
    from ``cmap``, used as the dark end of a light-to-colour colormap.
    """
    categories = sorted(df[categ_col].unique())
    colors = sns.color_palette(cmap, n_colors=len(categories)).as_hex()
    n_cols = 4
    n_rows = (len(categories) + n_cols - 1) // n_cols  # ceiling division
    plt.figure(figsize=(n_cols*4, n_rows*3))
    for position, (category, hex_color) in enumerate(zip(categories, colors), start=1):
        ax = plt.subplot(n_rows, n_cols, position)
        category_cmap = sns.color_palette(f'light:{hex_color}', as_cmap=True)
        category_texts = df[df[categ_col] == category][feature]
        plot_wordcloud(category_texts, ax=ax, cmap=category_cmap, nb=nb)
        plt.title(category, fontweight='bold')

    plt.subplots_adjust(hspace=0, wspace=0.2)


plot_wordclouds_by_categ(df_cleaned, feature='description',
                         categ_col='categ_level_1', cmap='Set1')
plt.suptitle('Frequent words in each level 1 category')
to_png()

Advertising words are present more in certain categories

- They are false category indicators, as probably added if the product lacks description, or after categorization of the product.

We compare with the most frequent words in 'Product_Name'

In [None]:
# Most frequent product-name words for each level 1 category (words only)
freq_words_by_category(df_cleaned, desc_col='product_name', show_freq=False)

In [None]:
# Same word clouds as above, built from the product names
plot_wordclouds_by_categ(df_cleaned, feature='product_name',
                         categ_col='categ_level_1', cmap='Set1')

We ask if the classification will work better on product_name, because it is cleaner.

In [None]:
# Row label of the example shown; to pick a random row instead, use:
# seed = np.random.randint(0, 1050)
seed = 18
print(f"[{seed}] : {df_data['product_name'][seed]}")
print(f"[{seed}] : {df_data['description'][seed]}")

### 3.2.3 Removing "Flipkart" advertising (Adwords)

We see that many of the descriptions contains sentences that have nothing to do with the product.

For products that have no description or specifications, Flipkart may have added these advertising sentences to avoid leaving the description empty

In [None]:
# Example descriptions that contain advertising sentences
for row_label in (795, 263, 746):
    print(df_data['description'][row_label])

For example :

> '** buy ** Ecraftindia Floral Cushions Cover at RS. 404 ** at flipkart.com.Only Genuine Products.Free Shipping.Cash on Delivery! ** '

These sentences make noise and reduces discrimination between products

In [None]:
# Advertising phrases found in the descriptions (matched case-insensitively)
ADVERTS = [
    'Buy', 'Only Genuine Products', '!', 'Cash On Delivery', 'Free Shipping', '30 Day Replacement Guarantee',
    'Online', 'at Flipkart.com', 'from Flipkart.com', 'Flipkart.com', 'best prices', 'Lowest Prices',
    'Great Discounts', 'in India Only'
]


def get_useless_adwords(df, adverts=ADVERTS):
    """Print, for each advertising phrase, how many descriptions contain it.

    Phrase number ``i`` of ``adverts`` is reported as ``AD_<i>``. Matching is
    case-insensitive and literal: characters such as '.' are not treated as
    regular-expression wildcards. Missing descriptions count as no match.
    ``df`` is not modified.
    """
    descriptions = df['description'].str.lower()
    counts = pd.Series(
        {f'AD_{idx}': descriptions.str.contains(ad.lower(), regex=False, na=False).sum()
         for idx, ad in enumerate(adverts)},
        dtype='int64')
    print(counts)


# Count the advertising phrases in the raw descriptions
get_useless_adwords(df_data)

### Deletion of advertising sentences in descriptions

Advertisements seem to be added as whole sentences

- We delete them here as a sentence

Note: an alternative will be to add the words to the 'Stopwords', but

- We risk losing the same words out of the context of advertisements
- This procedure is not called in the preprocessing for deep learning.

In [None]:
# import re


def remplace(sentence: str, old_str: str, new_str: str = ' ', case_sensitive=False) -> str:

    if case_sensitive:
        return sentence.replace(old_str, new_str)
    else:
        return re.sub(re.escape(old_str), new_str, sentence, flags=re.IGNORECASE)


def remove_useless_adwords(sentence: str, adverts=ADVERTS):
    """Replace each advertising phrase of ``adverts`` in ``sentence`` by a space.

    Phrases are matched literally and regardless of case, one after the other
    in the order of ``adverts``.
    """
    cleaned = sentence
    for advert in adverts:
        cleaned = re.sub(re.escape(advert), ' ', cleaned, flags=re.IGNORECASE)
    return cleaned


# Example description, before and after removal of the advertising phrases
TEST_ADVERTS = df_data['description'][746]
print(TEST_ADVERTS)
TEST_DESCRIPTION_1 = remove_useless_adwords(TEST_ADVERTS)
print(TEST_DESCRIPTION_1)

### 3.2.4 Deletion of prices in product description

We can consider that the price is not a product description:

- The products were obtained on various dates
- prices depend on the seller, date, currencies.

In [None]:

# A price: optional lead-in ("at", "for", "only for", "price"), the currency
# marker "Rs" as a separate word followed by a dot and/or whitespace, then
# the amount with optional thousands separators and decimal part.
_PRICE_PATTERN = re.compile(
    r'\b(?:(?:only\s+for|at|for|price)\s+)?rs(?:\.\s*|\s+)[0-9][0-9,]*(?:\.[0-9]+)?',
    flags=re.IGNORECASE)
# A "Price:" label on its own
_PRICE_LABEL_PATTERN = re.compile(r'\bprice\s*:', flags=re.IGNORECASE)


def remove_prices(sentence: str) -> str:
    """Remove rupee prices such as "at Rs. 404" or "Rs.1,299" from a text.

    "Rs" is only recognised as a separate word, so words that merely end in
    "rs." ("flowers. The ...") are left intact.

    Args:
        sentence: text to clean; a list of strings is joined with spaces first.

    Returns:
        The text without prices and "Price:" labels, with whitespace runs
        collapsed to one space and no leading or trailing whitespace.
    """
    if isinstance(sentence, list):
        sentence = ' '.join(sentence)
    sentence = _PRICE_PATTERN.sub(' ', sentence)
    sentence = _PRICE_LABEL_PATTERN.sub(' ', sentence)
    # Collapse the multiple spaces left behind by the removals
    return re.sub(r'\s+', ' ', sentence).strip()


# Same example, before and after removal of the prices
print(TEST_DESCRIPTION_1)
TEST_DESCRIPTION_2 = remove_prices(TEST_DESCRIPTION_1)
print(TEST_DESCRIPTION_2)

#### Process test to remove advertisements and prices

The above processes are improved by testing randomly on the dataset

In [None]:
# Row label of the description used for the test; to pick a random row, use:
# seed = np.random.randint(0, 1050)
seed = 237
test_description = df_data['description'][seed]
test_cleaned = remove_prices(remove_useless_adwords(test_description))
print(f'original [{seed}] : \n{test_description}')
print(f'cleaned [{seed}] : \n{test_cleaned}')

### 3.2.4 Tokenization

Tokenization is the process of converting a string into tokens (in general, words and punctuation)

In [None]:
import nltk
# NOTE(review): 'punkt' is already downloaded by the NLTK download cell
# earlier in the notebook; this call looks redundant.
nltk.download('punkt')

In [None]:
from typing import List
import string
import re

def lower_start_fct(list_words: List[str]) -> List[str]:
    """
    Lowercase every word, leaving out the words that start with '@' or 'http'
    """
    kept_words = []
    for word in list_words:
        if word.startswith("@") or word.startswith("http"):
            continue
        kept_words.append(word.lower())
    return kept_words

# remove_useless_adwords is defined earlier in this notebook (cell under
# "Deletion of advertising sentences in descriptions"). The no-op placeholder
# that was defined here replaced that implementation for the rest of the
# notebook, so tokenizer_fct(..., sans_ads=True) and the later corpus
# cleaning kept every advertising phrase. It has been removed on purpose.

# remove_prices is defined earlier in this notebook (section "Deletion of
# prices in product description"). The no-op placeholder that was defined
# here replaced that implementation for the rest of the notebook, so prices
# were no longer removed by the cells below. It has been removed on purpose.

def simple_tokenize(text: str) -> List[str]:
    """
    Return the lowercased runs of word characters (letters, digits, '_') of ``text``
    """
    return re.findall(r'\b\w+\b', text.lower())


# Maps every ASCII punctuation character to a space
_PUNCT_TO_SPACE = str.maketrans(string.punctuation, ' ' * len(string.punctuation))


def tokenizer_fct(sentence: str, sans_ads=False) -> List[str]:
    """
    Clean a text and split it into lowercase word tokens.

    E-mail addresses, apostrophes and digits are deleted. Every other ASCII
    punctuation character is replaced by a space, so that "wash-care" or
    "Products.Free" yield two tokens instead of being glued together.

    Args:
        sentence: text to tokenize.
        sans_ads: when true, advertising phrases and prices are first removed
            with ``remove_useless_adwords`` and ``remove_prices``.

    Returns:
        The list of tokens, in order of appearance.
    """
    if sans_ads:
        sentence = remove_useless_adwords(sentence)
        sentence = remove_prices(sentence)

    sentence = re.sub(r'\S*@\S*\s?', '', sentence)  # remove e-mail addresses
    sentence = re.sub(r'\s+', ' ', sentence)  # newlines and runs of whitespace
    sentence = sentence.replace("'", "")  # "don't" -> "dont"

    # Punctuation becomes a space (deleting it would merge neighbouring words)
    sentence = sentence.translate(_PUNCT_TO_SPACE)

    # Remove digits
    sentence = ''.join(char for char in sentence if not char.isdigit())

    # simple_tokenize lowercases the text
    return simple_tokenize(sentence)

### 3.2.5 Stop Words

Stop words provide context for the other words.
We do not delete them for deep-learning models such as BERT and USE

For other algorithms:

- We delete the stopwords of the NLTK bookstore (in the language of descriptions)
- We analyze the most frequent words that remain

- We eliminate:
  - The most frequent words that are not discriminating (noise)
  - Words that do not discriminate between products ('Key features', ...)
  - Leftover advertising words

In [None]:
# English stop words from NLTK, without duplicates; show 40 of them
STOP_WORDS_EN = list(set(nltk.corpus.stopwords.words('english')))
print(STOP_WORDS_EN[:40])

In [None]:
# from typing import list


def lower_start_fct(list_words: List[str]) -> List[str]:
    """
    Lower-case the words and drop mentions and links.

    Words starting with "@" or "http" are discarded (the prefix test is
    case-sensitive); words starting with "#" are kept.

    Args:
        list_words: Tokens to filter
    Returns:
        Lower-cased tokens that passed the filter, in their original order
    """
    kept = []
    for word in list_words:
        # str.startswith accepts a tuple of alternative prefixes
        if word.startswith(("@", "http")):
            continue
        kept.append(word.lower())
    return kept


def stop_word_en_filter(list_words: List[str], stop_w=STOP_WORDS_EN) -> List[str]:
    """
    Remove stop words from a sequence of tokens.

    Args:
        list_words: Tokens to filter (any iterable of words)
        stop_w: Words to discard; defaults to the English NLTK stop words
    Returns:
        Tokens that are not in stop_w, in their original order
    """
    # Build the set once: membership is then O(1) per token instead of a scan
    # of the whole stop list (this function is also applied to the full corpus).
    stop_set = set(stop_w)
    return [w for w in list_words if w not in stop_set]


# Smoke test on TEST_ADVERTS (defined earlier in the notebook): tokenize with
# sans_ads=True, then drop the English stop words
print(stop_word_en_filter(tokenizer_fct(TEST_ADVERTS, sans_ads=True)))

#### Find the most frequent words in the corpus

We are looking for frequent non-discriminatory words.

- some frequent words are useful for classification, others not

In [None]:
# from typing import list

def get_corpus_freq(descriptions: pd.Series, nb=20, stop_w=STOP_WORDS_EN) -> pd.Series:
    """
    Count the most frequent words of a corpus after stop-word removal.

    Prints the number of unique tokens found before filtering.

    Args:
        descriptions: One text per row
        nb: Number of most frequent words to return
        stop_w: Stop words to exclude from the counts
    Returns:
        Series of counts indexed by word, most frequent first (at most nb rows)
    """
    word_lists = descriptions.str.lower().map(tokenizer_fct)
    # explode() yields NaN for texts without any token; drop them so that only
    # real words reach the stop-word filter
    corpus = word_lists.explode().dropna()
    print(f'unique words: {corpus.nunique()}')
    # dtype=object keeps an empty result well-typed
    kept_words = pd.Series(stop_word_en_filter(corpus, stop_w=stop_w), dtype=object)
    return kept_words.value_counts().iloc[:nb]


# Apply the ad-word and price removal hooks to every description
cleaned_corpus = df_data['description'].map(
    lambda x: remove_prices(remove_useless_adwords(x)))

# 40 most frequent words once the English stop words are removed
most_freq = get_corpus_freq(cleaned_corpus, nb=40)
print(f'most_freq: {most_freq.index.to_list()}')

#### Creation of a specific stopwords list for this corpus

Corpus-specific stop words are added in order to reduce the noise words in the corpus:

- The most common words
- The least frequent words
- Words with a very low IDF (inverse document frequency) score

- <https://kavita-ganesan.com/tips-for-constructing-custom-stop-word-lists/>

In [None]:
# freq_words is defined elsewhere in the notebook; its transposed result is
# expected to expose a 'word' entry — TODO confirm against its definition
stop_frequent = freq_words(df_cleaned['description'], nb=20)
stop_frequent = stop_frequent.T['word'].tolist()
# Keep these words out of the stop list even when they are frequent
for word in ['watch', 'box', 'set', 'pack', 'color', 'cm']:
    if word in stop_frequent:
        stop_frequent.remove(word)
# 'details' is always treated as a stop word
stop_frequent.extend(['details'])
print(stop_frequent)

In [None]:
# TfidfVectorizer comes from sklearn.feature_extraction.text

def get_low_idf_words(sentences: pd.Series):
    """
    Rank the vocabulary of a corpus by inverse document frequency.

    Each text is tokenized and stripped of English stop words before a
    TfidfVectorizer is fitted on the corpus.

    Args:
        sentences: One text per row
    Returns:
        DataFrame with columns 'word' and 'idf', sorted by ascending idf
    """
    def _prepare(text):
        tokens = stop_word_en_filter(tokenizer_fct(text))
        return ' '.join(tokens)

    prepared = sentences.map(_prepare)
    vectorizer = TfidfVectorizer(use_idf=True)
    vectorizer.fit_transform(prepared)
    idf_table = pd.DataFrame({'word': vectorizer.get_feature_names_out(),
                              'idf': vectorizer.idf_})
    return idf_table.sort_values(by='idf')


# IDF table of the cleaned descriptions; show the 30 lowest-IDF words
idf = get_low_idf_words(df_cleaned['description'])
idf.head(30).T

In [None]:
# The 20 lowest-IDF words are stop-word candidates
stop_low_idf = idf.head(20)['word'].to_list()
# We keep the frequent words that are indicative of certain categories,
# e.g. 'watch' for the category 'Watches'
for word in ['watch', 'box', 'set', 'pack', 'color', 'cm']:
    if word in stop_low_idf:
        stop_low_idf.remove(word)
print(stop_low_idf)

In [None]:
# NOTE(review): `time.strftime` is used below while this import is commented out;
# the cell relies on `time` having been imported earlier in the notebook — confirm.
# import time

# Punctuation-like tokens treated as stop words
STOP_WORDS_BASIC = ['[', ']', ',', '.', ':', '!',
                    '?', '(', ')', '%', '&', "'", "''", "'s"]

# Earlier hand-written list of frequent words (disabled):
# stop_frequent = ['Key', 'GENERAL', 'Detail', 'Details',
#                  'feature', 'features', 'specification', 'specials']
# NOTE: stop_id, stop_pub and stop_adjectives are defined here but are not part
# of STOP_WORDS_CUSTOM below
stop_id = ['brand', 'product', 'products', 'id', 'model']
stop_pub = ['sales', 'price', 'warranty']
stop_adjectives = ['best', 'design', 'yes']


# Disabled alternative: STOP_WORDS_CUSTOM = [*stop_frequent, *stop_id, *stop_pub, *stop_adjectives]
# Custom stop words = frequent words + low-IDF words, de-duplicated through a set
STOP_WORDS_CUSTOM = list({*stop_frequent, *stop_low_idf})
# Disabled alternative: STOP_WORDS_CUSTOM = list(set([*stop_low_idf]))
print(f'stop_words_custom : {STOP_WORDS_CUSTOM}')

# Save a timestamped copy of the custom stop words in IMAGE_FOLDER
timestr = time.strftime("%Y-%m-%d-%H.%M.%S")
pd.DataFrame(STOP_WORDS_CUSTOM).to_csv(
    f'{IMAGE_FOLDER}/stopwords_custom_{timestr}.csv')

# Full stop list: English + punctuation tokens + custom words
STOP_WORDS_ALL = list(
    set([*STOP_WORDS_EN, *STOP_WORDS_BASIC, *STOP_WORDS_CUSTOM]))
# Most frequent words that remain once the full stop list is applied
most_freq = get_corpus_freq(cleaned_corpus, stop_w=STOP_WORDS_ALL, nb=40)
print(f'most_frequent other words: {most_freq.index.to_list()}')

In [None]:
# English stop words plus punctuation tokens (without the custom words)
stop_w_en = list(set([*STOP_WORDS_EN, *STOP_WORDS_BASIC]))

# freq_words is defined elsewhere in the notebook; here it is restricted to the
# custom stop words via `include` — TODO confirm the exact semantics of include/exclude
freq_words(df_data['description'].str.lower(),
           exclude=stop_w_en,
           include=STOP_WORDS_CUSTOM)

In [None]:
# Same call as in the previous cell, with normalize=False
freq_words(df_data['description'].str.lower(),
           exclude=stop_w_en,
           include=STOP_WORDS_CUSTOM,
           normalize=False)

In [None]:
# Frequency of the custom stop words in the 'description' column
# (freq_words_by_category is defined elsewhere in the notebook)
freq_words_by_category(df_cleaned, desc_col='description',
                       include=STOP_WORDS_CUSTOM)

In [None]:
# Frequency of the custom stop words in the 'product_name' column
# (freq_words_by_category is defined elsewhere in the notebook)
freq_words_by_category(df_cleaned, desc_col='product_name',
                       include=STOP_WORDS_CUSTOM)

In [None]:
def stop_word_filter_fct(list_words: List[str], stop_w=STOP_WORDS_ALL) -> List[str]:
    """
    Remove stop words from a sequence of tokens.

    Args:
        list_words: Tokens to filter (any iterable of words)
        stop_w: Words to discard; defaults to STOP_WORDS_ALL
    Returns:
        Tokens that are not in stop_w, in their original order
    """
    # Build the set once: membership is then O(1) per token instead of a scan
    # of the whole stop list
    stop_set = set(stop_w)
    return [w for w in list_words if w not in stop_set]


# Smoke test: tokenize -> remove stop words (STOP_WORDS_ALL) -> lower-case/filter
print(lower_start_fct(stop_word_filter_fct(
    tokenizer_fct("The movie was not good at all."))))

In [None]:


# Same chain applied to the description at index label 6 of df_data
print(lower_start_fct(stop_word_filter_fct(
    tokenizer_fct(df_data['description'][6]), stop_w=STOP_WORDS_ALL)))

### 3.2.5 Lemmatization

In [None]:
# from typing import list
# from nltk.stem import wordnetlemmatizer


def lemma_fct(list_words: List[str]) -> List[str]:
    """
    Lemmatize every word with NLTK's WordNetLemmatizer.

    Args:
        list_words: Tokens to lemmatize
    Returns:
        List of lemmas, one per input token, in the same order
    """
    to_lemma = nltk.stem.WordNetLemmatizer().lemmatize
    return list(map(to_lemma, list_words))


# Lemmatizer test on regular and irregular plurals.
# NOTE(review): the original comment asked "bug: men -> man?"; mapping 'men' to
# 'man' looks like the expected noun lemma rather than a bug — confirm.
TEST_LEMMA = 'boys girls men and women shows lots of highs and lows qualities in inches'.split(
    ' ')
print(TEST_LEMMA)
print(lemma_fct(TEST_LEMMA))

### 3.2.6 Common preparation of treatments

To analyze descriptions

#### creation of feature `sentence_dl` for Deep Learning

Deep-learning models like [BERT](https://arxiv.org/abs/1810.04805) and [USE](https://arxiv.org/abs/1803.11175) process words in the context of their sentences

- BERT: Bidirectional Encoder Representations from Transformers
- USE: Universal Sentence Encoder

With BERT and USE, the texts should not be preprocessed too heavily; otherwise the context is lost (stemming, lemmatization) or the texts are simply altered (removal of stop words).

The product descriptions already seem to have been processed, so we avoid stop-word removal and lemmatization.

In [None]:
# Reminder of what stop-word removal does to a short sentence
# (same call as in the stop-words section above)
print(lower_start_fct(stop_word_filter_fct(
    tokenizer_fct("The movie was not good at all."))))

In [None]:

def transform_dl_fct(desc_text: str) -> str:
    """
    Prepare a description for the deep-learning models.

    Ad words and prices are removed and the text is tokenized and lower-cased;
    stop words are kept and no lemmatization is applied.

    Args:
        desc_text: Raw description
    Returns:
        Cleaned tokens joined by single spaces
    """
    without_ads = remove_prices(remove_useless_adwords(desc_text))
    tokens = lower_start_fct(tokenizer_fct(without_ads))
    return ' '.join(tokens)


# Sample description (index label 142) used to compare the transforms
TEST_TRANSFORM = df_data['description'][142]
print('---Original------')
print(TEST_TRANSFORM)
print('---Transformed----')
# Last expression of the cell: displayed as the cell output
transform_dl_fct(TEST_TRANSFORM)

#### creation of feature `sentence_bow`

In [None]:
def transform_bow_fct(desc_text: str) -> str:
    """
    Prepare a description for bag-of-words models (no lemmatization).

    Ad words, prices and stop words are removed; the remaining tokens are
    lower-cased.

    Args:
        desc_text: Raw description
    Returns:
        Cleaned tokens joined by single spaces
    """
    without_ads = remove_prices(remove_useless_adwords(desc_text))
    content_words = stop_word_filter_fct(tokenizer_fct(without_ads))
    return ' '.join(lower_start_fct(content_words))


# Bag-of-words version of the sample description (displayed as the cell output)
transform_bow_fct(TEST_TRANSFORM)

#### creation of feature `sentence_bow_lem`

In [None]:

def transform_bow_lem_fct(desc_text: str) -> str:
    """
    Prepare a description for bag-of-words models, with lemmatization.

    Ad words, prices and stop words are removed; the remaining tokens are
    lower-cased and lemmatized.

    Args:
        desc_text: Raw description
    Returns:
        Lemmas joined by single spaces
    """
    without_ads = remove_prices(remove_useless_adwords(desc_text))
    content_words = stop_word_filter_fct(tokenizer_fct(without_ads))
    lemmas = lemma_fct(lower_start_fct(content_words))
    return ' '.join(lemmas)


# Before/after comparison on the two sample texts
print(TEST_ADVERTS)
print(transform_bow_lem_fct(TEST_ADVERTS))
print(TEST_TRANSFORM)
print(transform_bow_lem_fct(TEST_TRANSFORM))

In [None]:
def transform_bow_lem_ads_fct(desc_text: str) -> str:
    """
    Same as transform_bow_lem_fct but WITHOUT removing ad words and prices.

    The text is tokenized, stop words are removed and the remaining tokens
    are lower-cased and lemmatized.

    Args:
        desc_text: Raw description
    Returns:
        Lemmas joined by single spaces
    """
    content_words = stop_word_filter_fct(tokenizer_fct(desc_text))
    lemmas = lemma_fct(lower_start_fct(content_words))
    return ' '.join(lemmas)


# Sample advert before and after the transform that keeps ad words and prices
print(TEST_ADVERTS)
transform_bow_lem_ads_fct(TEST_ADVERTS)

### 3.2.7 Text transformation ready for models

In [None]:

def transform_field(df: pd.DataFrame, text_col: str = 'description') -> pd.DataFrame:
    """
    Add the model-ready text columns to a copy of the DataFrame.

    Args:
        df: DataFrame containing text_col and a 'product_name' column
        text_col: Column holding the text to transform
    Returns:
        Copy of df with the columns 'sentence_bow', 'sentence_bow_lem',
        'product_name_bow_lem', 'sentence_bow_lem_ads' and 'sentence_dl' added
    """
    df = df.copy()
    # (new column, source column, transform), applied in this order
    recipes = [
        ('sentence_bow', text_col, transform_bow_fct),
        ('sentence_bow_lem', text_col, transform_bow_lem_fct),
        ('product_name_bow_lem', 'product_name', transform_bow_lem_fct),
        ('sentence_bow_lem_ads', text_col, transform_bow_lem_ads_fct),
        ('sentence_dl', text_col, transform_dl_fct),
    ]
    for new_col, source_col, transform in recipes:
        df[new_col] = df[source_col].apply(transform)
    print(f'transform_field [{text_col}], df.shape={df.shape}')
    return df


# Add the sentence_* columns (text_col defaults to 'description') and preview one row
df_cleaned = df_cleaned.pipe(transform_field)
df_cleaned.head(1)

### 3.2.9 Calculation of the length of the sentences (number of words)

Some models need to know the length of the sentence as a hyper parameter

In [None]:

def calc_length_bow(df: pd.DataFrame):
    """
    Add word counts of 'sentence_bow' and 'sentence_dl' using NLTK's word_tokenize.

    NOTE: a second `calc_length_bow` is defined later in the notebook (section 3.3,
    whitespace-based counting); once that cell runs it replaces this definition.

    Args:
        df: DataFrame containing 'sentence_bow' and 'sentence_dl' columns
    Returns:
        Copy of df with 'length_bow' and 'length_dl' columns added
    """
    def _n_tokens(sentence):
        return len(nltk.tokenize.word_tokenize(sentence))

    with_lengths = df.copy()
    with_lengths['length_bow'] = with_lengths['sentence_bow'].apply(_n_tokens)
    print(f"max length bow : {with_lengths['length_bow'].max()}")
    with_lengths['length_dl'] = with_lengths['sentence_dl'].apply(_n_tokens)

    print(f"max length dl : {with_lengths['length_dl'].max()}")
    return with_lengths

## 3.3 pipeline for creation of features 'sentence' for models

In [None]:
import pandas as pd
import nltk
from typing import Callable

def calc_length_bow(df: pd.DataFrame) -> pd.DataFrame:
    """
    Count the words of the bag-of-words and deep-learning sentences.

    Words are counted by splitting on whitespace; missing values count as 0.
    The maximum and mean lengths are printed for both columns.

    Args:
        df: DataFrame containing 'sentence_bow' and 'sentence_dl' columns
    Returns:
        Copy of df with 'length_bow' and 'length_dl' columns added
    """
    def _word_count(x: str) -> int:
        # Whitespace split instead of NLTK's word_tokenize
        return 0 if pd.isna(x) else len(str(x).split())

    with_lengths = df.copy()

    bow_lengths = with_lengths['sentence_bow'].apply(_word_count)
    with_lengths['length_bow'] = bow_lengths
    print(f"max length bow: {bow_lengths.max()}")
    print(f"mean length bow: {bow_lengths.mean():.2f}")

    dl_lengths = with_lengths['sentence_dl'].apply(_word_count)
    with_lengths['length_dl'] = dl_lengths
    print(f"max length dl: {dl_lengths.max()}")
    print(f"mean length dl: {dl_lengths.mean():.2f}")

    return with_lengths

def safe_pipe(df: pd.DataFrame, func: Callable, func_name: str, *args, **kwargs) -> pd.DataFrame:
    """
    Run one pipeline step, logging its outcome.

    On success the step name and the shape of the result are printed; on failure
    the error is printed and the exception is re-raised unchanged.

    Args:
        df: Input DataFrame
        func: Step to apply, called as func(df, *args, **kwargs)
        func_name: Name of the step, used in the log messages
        *args, **kwargs: Extra arguments forwarded to func
    Returns:
        Whatever func returns (expected to expose a `.shape`)
    """
    try:
        processed = func(df, *args, **kwargs)
        # Read inside the try block so that a result without `.shape` is logged too
        shape = processed.shape
    except Exception as err:
        print(f"Error in {func_name}: {str(err)}")
        raise
    print(f"Applied {func_name}, shape={shape}")
    return processed

def process_data(raw_data_path: str) -> pd.DataFrame:
    """
    Read the raw CSV and run every preparation step on it, in order.

    Steps: fill_missing_values, drop_unused_columns, create_categ_level,
    transform_field (on 'description'), calc_length_bow. Each step goes through
    safe_pipe; any error is printed and re-raised.

    Args:
        raw_data_path: Path to the raw CSV file
    Returns:
        Processed DataFrame
    """
    def transform_desc(df):
        # transform_field bound to the 'description' column
        return transform_field(df, 'description')

    steps = (
        ("fill_missing_values", fill_missing_values),
        ("drop_unused_columns", drop_unused_columns),
        ("create_categ_level", create_categ_level),
        ("transform_field_description", transform_desc),
        ("calc_length_bow", calc_length_bow),
    )

    try:
        df = pd.read_csv(raw_data_path, sep=',', encoding='UTF-8')
        for name, func in steps:
            df = safe_pipe(df, func, name)
        return df
    except Exception as e:
        print(f"Error processing data: {str(e)}")
        raise

# Usage: rebuild df_cleaned from the raw CSV (RAW_DATA is defined elsewhere in the notebook)
df_cleaned = process_data(RAW_DATA)

## 3.4 Verification of more frequent words

In [None]:

# get_corpus_freq prints the number of unique words of each prepared column;
# the returned frequency tables are not displayed
print('---- bag-of-words ------')
get_corpus_freq(df_cleaned['sentence_bow'])
print('---- bag-of-words with lemmatization ------')
get_corpus_freq(df_cleaned['sentence_bow_lem'])
print('---- sentence for deep learning ------')
get_corpus_freq(df_cleaned['sentence_dl'])
# Ends the cell on a statement so that the last result is not echoed
pass

### 3.4.1 Visualize as Word Clouds

In [None]:
# Word clouds of the lemmatized bag-of-words text per level-1 category
# (plot_wordclouds_by_categ and to_png are defined elsewhere in the notebook)
plot_wordclouds_by_categ(
    df_cleaned, feature='sentence_bow_lem', categ_col='categ_level_1')
plt.suptitle('Frequent words in each level 1 category after cleaning')
to_png()

## 3.5 saves prepared data

In [None]:
# Write the prepared data to OUT_FOLDER; it is read back in section 4.1
# (os_make_dir is defined elsewhere in the notebook)
os_make_dir(OUT_FOLDER)
df_cleaned.to_csv(f'{OUT_FOLDER}/data_cleaned.csv',
                  encoding='UTF-8', index=False)

## 3.6 Cleaning global variables

In [None]:
# Drop the preparation-stage frames; section 4 reloads the cleaned data from CSV
del df_data
del df_cleaned

---

# 4. Common functions for NLP models

- Data reading
- Model scoring
- Visualization of performance metrics
- Visualization of clusters

## 4.1 Data reading

In [None]:
# Reload the prepared data saved in section 3.5
data_T = pd.read_csv(f'{OUT_FOLDER}/data_cleaned.csv')
print(data_T.shape)

### 4.1.1 Variables to compare the categories

In [None]:
# LabelEncoder comes from sklearn.preprocessing
cat_encoder = preprocessing.LabelEncoder()

# Sorted list of the distinct level-1 categories
list_categories = sorted(list(set(data_T['categ_level_1'])))

# Fit the encoder on the category names and get their numeric codes
l_cat_num = cat_encoder.fit_transform(list_categories)

# Category map: numeric code -> category name
cat_map = dict(zip(l_cat_num, list_categories))
print("categories : ", cat_map)
# NOTE: the result of this call is discarded (it is not the last expression of the cell)
cat_encoder.inverse_transform(l_cat_num)


# True category of every product, as a numeric code
y_cat_num = cat_encoder.transform(data_T['categ_level_1'])
# True category of every product, as text
y_cat_txt = data_T['categ_level_1'].copy()

In [None]:
# Sanity check: decode a few numeric codes back to category names
cat_encoder.inverse_transform([1, 0, 5, 6])

## 4.2 Modeling and evaluation of models:

- **Preparation** of the data (choice of description, lemmatization, stop words, ...) - see above. We use `feature` to choose the column of preprocessed data.

- **Feature extraction** (bag-of-words, TF-IDF, Word2Vec, etc.)

- **Dimensionality reduction** (t-SNE, PCA, NMF, TruncatedSVD, ...)

- **Unsupervised classification (clustering) on the reduced dimensions** (grid search over the hyperparameters, with or without a choice of the number of clusters)

  - by KMeans (default)
  - by LDA (topic modeling)
  - by NMF (not used here)

- **Performance evaluation**:

  - Distortion score (inertia, or sum of squared errors of each cluster)
  - Davies-Bouldin score (measure of cluster separation)
  - Silhouette score of the clusters (+ visualization of the silhouettes of each cluster)
  - Stability score of the clusters (on sub-samples -> standard deviation)

- **Evaluation of the correspondence with the defined categories**
  - ARI score
  - Percentage of misclassified items
  - Visualization to compare clusters on the main reduced dimensions (t-SNE or PCA, for example)
  - Sankey visualization of the correspondence between clusters

### 4.2.1 Scoring: performance of a model

It is complicated to set up a pipeline of preprocessing, feature extraction, dimensionality reduction, classification and scoring, because the libraries sometimes return a sparse matrix and sometimes not.

Below is my generic procedure for scoring a model

- We vary the parameter k to find the number of clusters that gives the best performance

You can easily add a param_grid and run a grid search over all combinations of the parameters:

```python
# generate parameter grid for the classifier
param_sets = ParameterGrid(param_grid)

for params in param_sets:
    # print(f'Score model for params = {params}')
    # set model parameters (usually just n_clusters for the classifier)
    model.set_params(**params)
    model.fit ...
```

In [None]:
import pandas as pd
import numpy as np
from sklearn import manifold, cluster, metrics
from sklearn.feature_extraction.text import TfidfVectorizer
import time
import scipy
from scipy.sparse import csr_matrix

def get_classname(estimator):
    """Return the name of the class of `estimator` (e.g. 'KMeans')."""
    estimator_class = estimator.__class__
    return estimator_class.__name__

def score_model(df: pd.DataFrame,
                feature='sentence_bow_lem',
                labels_true=None,
                document_preprocessor=None,
                feature_extractor=TfidfVectorizer(
                    stop_words='english', min_df=3),
                dimension_reducer=manifold.TSNE(n_components=2, perplexity=30, n_iter=2000,
                                              init='pca', learning_rate=200, random_state=42),
                kmin=7, kmax=7,
                random_state=None):
    """
    Extract features, reduce dimensions, then score KMeans for k in [kmin, kmax].

    NOTE: the default feature_extractor and dimension_reducer are created once,
    when the function is defined, and are re-fitted in place on every call.
    NOTE(review): TSNE's `n_iter` argument was renamed `max_iter` in recent
    scikit-learn releases — confirm against the version in requirements.txt.

    Args:
        df: DataFrame holding the prepared text
        feature: Column of df to use as documents
        labels_true: Optional true labels; when given, the ARI is computed
        document_preprocessor: Optional transformer applied to the documents first
        feature_extractor: Vectorizer turning the documents into features
        dimension_reducer: Transformer reducing the features before clustering
        kmin, kmax: Inclusive range of cluster counts to evaluate
        random_state: Seed passed to KMeans (None keeps KMeans unseeded)
    Returns:
        (df_scores, labels_pred, reduced_dimensions, best_k) where labels_pred are
        the labels of the KMeans fit that was scored for best_k (the k with the
        highest silhouette score, or kmin when kmin == kmax)
    Raises:
        ValueError: if kmax < kmin
    """
    if kmax < kmin:
        raise ValueError(f'kmax ({kmax}) must be >= kmin ({kmin})')

    # Preprocess
    if document_preprocessor is None:
        documents = df[feature]
    else:
        documents = document_preprocessor.fit_transform(df[feature])

    # Extract features
    start_fit_extract = time.time()
    features = feature_extractor.fit_transform(documents)
    fit_time_extract = round(time.time()-start_fit_extract, 2)
    print(
        f'Extract_features ({get_classname(feature_extractor)}), fit time = {fit_time_extract} s')
    print(f'type(features) = {type(features)}')

    # issparse covers every sparse format, not only csr_matrix
    if scipy.sparse.issparse(features):
        print('converting features from sparse to dense array')
        features = features.toarray()

    print(f'features.shape ={features.shape}')

    # Reduce Dimensions
    start_fit_reduce = time.time()
    reduced_dimensions = dimension_reducer.fit_transform(features)
    fit_time_reduce = round(time.time()-start_fit_reduce, 2)
    print(
        f'Reduced dimensions [{reduced_dimensions.shape}] ({get_classname(dimension_reducer)}), fit time = {fit_time_reduce} s')

    scores_list = []
    # Labels of every scored fit, so that the labels returned for the best k are
    # exactly those the scores were computed on (no unseeded refit afterwards)
    labels_by_k = {}

    for k in range(kmin, kmax+1):
        clusterer = cluster.KMeans(n_clusters=k, random_state=random_state)

        start_fit_clf = time.time()
        clusterer.fit(reduced_dimensions)
        fit_time_clf = round(time.time()-start_fit_clf, 2)

        labels_pred = clusterer.labels_
        labels_by_k[k] = labels_pred

        # Share of the points in each cluster, largest first
        cluster_shares = pd.Series(labels_pred).value_counts(normalize=True)
        cluster_sizes = (cluster_shares.values*100).astype(int)
        min_cluster_pct = cluster_shares.min()

        res = {
            'k': k,
            'min_cluster_pct': round(min_cluster_pct*100, 2),
            'distortion_score': round(getattr(clusterer, 'inertia_', 0), 0),
            'davies_bouldin': round(metrics.davies_bouldin_score(reduced_dimensions, labels_pred), 2),
            'calinski_harabasz': round(metrics.calinski_harabasz_score(reduced_dimensions, labels_pred), 2),
            'silhouette_score': round(metrics.silhouette_score(reduced_dimensions, labels_pred), 3),
            'fit_time_cluster': fit_time_clf
        }

        ARI = ''
        if labels_true is not None:
            res['ARI'] = round(metrics.adjusted_rand_score(
                labels_true, labels_pred), 3)
            ARI = f" ARI = {res['ARI']},"

        print(
            f"k={k}, fit: {fit_time_clf} s, silhouette= {res['silhouette_score']:.3},{ARI} cluster sizes (%) = {cluster_sizes}")

        scores_list.append(res)

    df_scores = pd.DataFrame(scores_list)

    df_scores['feature_extraction'] = get_classname(feature_extractor)
    df_scores['dimension_reduction'] = get_classname(dimension_reducer)
    df_scores['classifier'] = get_classname(clusterer)
    df_scores['k'] = df_scores['k'].astype(int)

    if kmin == kmax:
        return df_scores, labels_pred, reduced_dimensions, k

    # Best k = highest silhouette score
    best_score_idx = df_scores['silhouette_score'].idxmax()
    best_k = df_scores.loc[best_score_idx, 'k']
    return df_scores, labels_by_k[int(best_k)], reduced_dimensions, best_k

# Usage: score KMeans for k = 4..12 on the lemmatized bag-of-words text and
# compare the clusters with the level-1 categories (ARI)
scores, labels, red_dim, best_k = score_model(data_T, 'sentence_bow_lem',
                                            labels_true=data_T['categ_level_1'],
                                            kmin=4, kmax=12)
print(f'K for best silhouette: {best_k}')

## 4.3 Visualizations of performance metrics

Inspired by Yellowbrick's KElbowVisualizer, generalized to other metrics (stability, for example)

In [None]:

def angle_between_vectors(v1, v2):
    """
    Unsigned angle between two 2-D vectors, in degrees.

    Args:
        v1, v2: Sequences of two numbers (x, y)
    Returns:
        Angle in the range [0, 180]
    """
    # v[::-1] is (y, x), the argument order expected by arctan2
    ang1 = np.arctan2(*v1[::-1])
    ang2 = np.arctan2(*v2[::-1])
    ang = np.rad2deg(abs(ang1 - ang2) % (2 * np.pi))
    if ang > 180:
        # Reflex angle: the angle between the vectors is its complement to 360°
        # (the previous `ang - 180` returned e.g. 168° instead of 12°)
        ang = 360 - ang
    return ang


# Quick check of the angle of [1, 1] with a few other vectors
print([
    angle_between_vectors([1, 1], [0, 0]),
    angle_between_vectors([1, 1], [0, 1]),
    angle_between_vectors([1, 1], [-1, 0]),
    angle_between_vectors([1, 1], [-1, -1]),
])

In [None]:

def find_elbow(df: pd.DataFrame, x_col='k', y_col='distortion_score'):
    """
    Locate the elbow of a curve: the point where its direction changes the most.

    Args:
        df: One row per point, ordered along x_col
        x_col: Column holding the x values
        y_col: Column holding the scores
    Returns:
        (x_elbow, y_score): x and score of the elbow point
    Raises:
        ValueError: if there are fewer than 3 points or all x values are equal
    """
    df = df.copy().reset_index()
    n = len(df)
    if n < 3:
        # An angle needs two consecutive segments, i.e. three points
        raise ValueError(f'find_elbow needs at least 3 points, got {n}')

    # Put y on the same scale as x so that the angles between segments are comparable
    x_scale = df[x_col].max()-df[x_col].min()
    if x_scale == 0:
        raise ValueError(f'all values of {x_col!r} are equal, no elbow can be found')
    y_min = df[y_col].min()
    y_mult = (df[y_col].max()-y_min)/x_scale
    if y_mult == 0:
        # Flat curve: avoid 0/0, every angle is then 0
        df['y_scaled'] = 0.0
    else:
        df['y_scaled'] = (df[y_col]-y_min)/y_mult

    # Angle between consecutive segments, stored on the middle point
    for i in range(0, n-2):
        point1 = df.loc[i, [x_col, 'y_scaled']]
        point2 = df.loc[i+1, [x_col, 'y_scaled']]
        point3 = df.loc[i+2, [x_col, 'y_scaled']]
        vec1 = point2-point1
        vec2 = point3-point2
        df.loc[i+1, 'angle'] = angle_between_vectors(vec1, vec2)

    # Elbow is at the largest change of direction (assumes a monotonic function)
    row = df['angle'].idxmax()
    x_elbow = df.loc[row, x_col]
    y_score = df.loc[row, y_col]
    return x_elbow, y_score

In [None]:

# Elbow of the distortion curve computed by score_model
k_best, distortion_score = find_elbow(scores, y_col='distortion_score')
print(f'Best score for k = {k_best}')
# check labels for best score have been returned
print(pd.Series(labels).nunique())

### 4.3.1 Plot Elbow Visualize

In [None]:

def plot_second_ax(df, x_col, y2_col, ax, color='grey'):
    """
    Plot df[y2_col] against df[x_col] on a secondary y axis of `ax`.

    Nothing is drawn when y2_col is not a column of df.

    Args:
        df: DataFrame holding the data
        x_col: Column for the x axis
        y2_col: Column for the secondary y axis
        ax: Matplotlib axes to attach the secondary axis to
        color: Color of the line, ticks and label of the secondary axis
    """
    if y2_col not in df.columns:
        return
    twin = ax.twinx()
    twin.plot(df[x_col], df[y2_col], label=y2_col,
              c=color, marker='o', linestyle='--', alpha=0.75)
    twin.tick_params(axis='y', colors=color)
    twin.set_ylabel(y2_col, color=color)


def plot_elbow(df, x_col, y_col, ax):
    """
    Mark the elbow of the df[y_col] curve with a dashed vertical line on `ax`.

    Nothing is drawn when df has fewer than 3 rows: an elbow cannot be computed
    from fewer than three points (e.g. scores for a single value of k).

    Args:
        df: One row per point, ordered along x_col
        x_col: Column holding the x values
        y_col: Column holding the scores
        ax: Matplotlib axes to draw on
    """
    if len(df) < 3:
        return
    elbow_k, _ = find_elbow(df, x_col, y_col)
    elbow_label = f'elbow at ${x_col}={elbow_k}$'
    ax.axvline(elbow_k, c='k', linestyle="--", label=elbow_label)
    ax.legend(frameon=True)


def plot_vline(df, x_col, y_col, ax, line_at='max'):
    """
    Draw a dashed vertical line at the x where df[y_col] is maximal or minimal.

    Args:
        df: DataFrame holding the data
        x_col: Column holding the x values
        y_col: Column holding the scores
        ax: Matplotlib axes to draw on
        line_at: 'max' or 'min'
    Raises:
        ValueError: if line_at is neither 'max' nor 'min'
    """
    if line_at == 'max':
        row_pos = df[y_col].argmax()
    elif line_at == 'min':
        row_pos = df[y_col].argmin()
    else:
        raise ValueError(f"line_at must be 'max' or 'min', got {line_at!r}")
    # argmax/argmin return a POSITION, so read x positionally; a label lookup
    # (.loc) is only equivalent when the index is 0..n-1
    line_x = df[x_col].iloc[row_pos]
    line_label = f'{line_at} at ${x_col}={line_x}$'
    ax.axvline(line_x, c='k', linestyle="--", label=line_label)
    ax.legend(frameon=True)


def plot_elbow_visualiser(df: pd.DataFrame, x_col='k', score_col='distortion_score',
                          show_elbow=False, time_col=None, ax1=None):
    """
    Plot a score against x_col, optionally with fit times and the elbow.

    Args:
        df: Scores, one row per value of x_col
        x_col: Column for the x axis
        score_col: Score column plotted on the main y axis
        show_elbow: When True, mark the elbow of the score curve
        time_col: Optional column plotted on a secondary y axis
        ax1: Axes to draw on; a new figure is created when None
    """
    score_color = sns.color_palette("tab20").as_hex()[0]
    if ax1 is None:
        ax1 = plt.subplots()[1]

    # Score curve on the main axis
    ax1.plot(df[x_col], df[score_col],
             marker="D", c=score_color, linestyle="-")
    ax1.grid(False)
    ax1.tick_params(axis='y', colors=score_color)
    ax1.set_xlabel(x_col)
    ax1.set_ylabel(score_col, c=score_color)

    # Fit times on a secondary axis
    if time_col is not None:
        plot_second_ax(df, x_col, y2_col=time_col, ax=ax1)

    if show_elbow:
        plot_elbow(df, x_col, score_col, ax1)


# Visual check: distortion curve of the scores computed above, with its elbow
# (to_png is defined elsewhere in the notebook)
plot_elbow_visualiser(scores, score_col='distortion_score', show_elbow=True)
plt.suptitle(f'Kmeans Distortion Score')
to_png()

### 4.3.2 Plot Summary Metrics

- Visualize several metrics with subplots

In [None]:
# Columns available in the scores table
print(sorted(list(scores.columns)))

In [None]:

def plot_metrics(df_scores, x_col='k'):
    """
    Plot every available clustering metric of df_scores in a grid of subplots.

    One subplot is drawn per metric of METRICS kept by cols_in_df (a helper
    defined elsewhere in the notebook; presumably it returns the names that are
    columns of df_scores — TODO confirm). The grid has at most 2 columns.

    Args:
        df_scores: Scores table; must contain 'feature_extraction' and
            'dimension_reduction' columns (used in the figure title)
        x_col: Column for the x axis of every subplot
    """

    if 'clf' in df_scores.columns and 'preprocessor' in df_scores.columns:
        print(
            f"plotting metrics (clf: {df_scores['clf'][0]}, preprocessor: {df_scores['preprocessor'][0]})")

    METRICS = ['distortion_score', 'calinski_harabasz',
               'davies_bouldin', 'silhouette_score', 'stability_score']
    # NOTE: this local name hides the `metrics` module of sklearn inside the function
    metrics = cols_in_df(df_scores, METRICS)
    n_plots = len(metrics)
    n_cols = min(n_plots, 2)
    # Ceiling division, equivalent to int(ceil(n_plots / n_cols))
    # NOTE(review): n_cols is 0 when no metric column is present, which makes the
    # next line divide by zero
    n_rows = n_plots // n_cols + (n_plots % n_cols > 0)
    _ = plt.figure(figsize=(n_cols*5, n_rows*3))
    for n, metric in enumerate(metrics):
        ax = plt.subplot(n_rows, n_cols, n + 1)
        if metric == 'distortion_score':
            # Distortion: curve + elbow; fit times only if a 'fit time (s)' column exists
            plot_elbow_visualiser(
                df_scores, x_col, metric, ax1=ax, time_col='fit time (s)', show_elbow=True)
            plt.title(metric)
        if metric == 'calinski_harabasz':
            # Higher is better: mark the maximum
            plot_elbow_visualiser(df_scores, x_col, metric, ax1=ax)
            plot_vline(df_scores, x_col, metric, ax=ax, line_at='max')
            plt.title(f'{metric} score (max={df_scores[metric].max():.0f})')
        if metric == 'davies_bouldin':
            # Lower is better: mark the minimum
            plot_elbow_visualiser(df_scores, x_col, metric, ax1=ax)
            plot_vline(df_scores, x_col, metric, ax=ax, line_at='min')
            plt.title(f'{metric} score (min={df_scores[metric].min():.2f})')
        if metric == 'silhouette_score':
            titre = f'{metric} (max={df_scores[metric].max():.2f})'
            # Error bars are drawn only when a standard-deviation column is present
            if 'silhouette_score_std' in df_scores.columns:
                if 'silhouette_sample_sizes' in df_scores.columns:
                    sample_size = int(
                        df_scores['silhouette_sample_sizes'].max())
                    titre += f' 10 samples de {sample_size} pts'
                plt.errorbar(data=df_scores, x=x_col, y='silhouette_score',
                             yerr='silhouette_score_std')
            plot_elbow_visualiser(df_scores, x_col, metric,
                                  ax1=ax, time_col='silhouette_time')
            plot_vline(df_scores, x_col, metric, ax=ax, line_at='max')
            plt.title(titre)
        if metric == 'stability_score':
            titre = f'{metric} (max={df_scores[metric].max():.2f})'
            # Error bars are drawn only when a standard-deviation column is present
            if 'stability_score_std' in df_scores.columns:
                if 'stability_sample_sizes' in df_scores.columns:
                    sample_size = int(
                        df_scores['stability_sample_sizes'].max())
                    titre += f' 10 samples de {sample_size} pts'
                plt.errorbar(data=df_scores, x=x_col, y='stability_score',
                             yerr='stability_score_std')
            plot_elbow_visualiser(df_scores, x_col, metric,
                                  ax1=ax, time_col='stability_time')
            plot_vline(df_scores, x_col, metric, ax=ax, line_at='max')
            plt.title(titre)

    plt.subplots_adjust(hspace=0.5, wspace=0.5)
    plt.suptitle(
        f"Plot metrics (feature extraction : {df_scores['feature_extraction'][0]};   dimension reduction {df_scores['dimension_reduction'][0]})")


# Visual check on the scores computed above (to_png is defined elsewhere in the notebook)
plot_metrics(scores)
to_png()

## 4.4 Visualization of clusters on the 'reduced dimensions'

In [None]:
# `labels` and `red_dim` come from the score_model call above
# Number of distinct cluster labels
print(pd.Series(labels).nunique())
fig = plt.figure(figsize=(15, 6))
ax1 = fig.add_subplot(121)
ax2 = fig.add_subplot(122)
# Left: points on the two reduced dimensions, colored by true category
sns.scatterplot(
    ax=ax1, x=red_dim[:, 0], y=red_dim[:, 1], hue=pd.Series(y_cat_txt, dtype=str))
# Right: the same points, colored by cluster label
sns.scatterplot(
    ax=ax2, x=red_dim[:, 0], y=red_dim[:, 1], hue=pd.Series(labels, dtype=str))

## 4.4 Visualization of clusters on the 'reduced dimensions'

### 4.4.1 Calculation of better correspondence between the category and the "cluster label"

The cluster labels assigned by the model (e.g. KMeans) are arbitrary, which makes it difficult to compare the true category with the cluster label: they will not have the same colors.

We try to put the cluster labels in the same order as categ_true, to facilitate:

- The confusion matrix between category and cluster
- The comparison of clusters on t-SNE/PCA plots (matching colors)
- Sankey diagrams to show relationships

References

- https://sparse-plex.readthedocs.io/en/latest/book/Clustering/Comparing_Clusterings.html
- https://python.plainenglish.io/hungarian-algorithm-introduction-python-implementation-93e7c0890e15
- https://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.linear_sum_assignment.html

As an alternative to Kmeans (algorithm published October 2021):

- https://www.researchgate.net/publication/353696146_the_utility_of_clusters_and_a_hungarian_clustering_algorithm

## Create the confusion matrix

When the number of predicted classes is identical to the number of "real" classes, the confusion matrix is best computed with the Hungarian algorithm:

- associate each real class with the closest cluster on the diagonal, so as to maximize the correspondence between true classes and predicted classes (the ARI score is calculated based on this relationship 1 to 1)

### Confusion Matrix - Approach 1: Maximize the correspondence between real class and cluster predicted

In [None]:
import pandas as pd
import numpy as np

def squarify_df(df: pd.DataFrame, val=0):
    """
    Pad a DataFrame with constant rows or columns until it is square.

    Args:
        df: DataFrame to pad
        val: Value used to fill the padding (default 0)
    Returns:
        Square DataFrame: padding columns are named 'pad_0', 'pad_1', ... and
        padding rows are labelled the same way. A square input is returned as is.
    """
    rows, cols = df.shape

    if rows > cols:
        # Append columns to end of DataFrame
        pad_cols = rows - cols
        pad_df = pd.DataFrame(
            np.full((rows, pad_cols), val),
            index=df.index,
            columns=[f'pad_{i}' for i in range(pad_cols)]
        )
        df = pd.concat([df, pad_df], axis=1)

    elif cols > rows:
        # Append rows to end of DataFrame
        pad_rows = cols - rows
        pad_df = pd.DataFrame(
            data=np.full((pad_rows, cols), val),
            columns=df.columns,
            index=[f'pad_{i}' for i in range(pad_rows)]
        )
        df = pd.concat([df, pad_df], axis=0)

    return df

# Test the function on a 2x3 frame (rows are padded) and its 3x2 transpose (columns are padded)
df_test_rows = pd.DataFrame([[1, 0, 6], [0, 3, 3]])
df_test_cols = df_test_rows.T

print("Original shape:", df_test_rows.shape)
print("\nSquarified rows:")
print(squarify_df(df_test_rows))
print("\nSquarified columns:")
print(squarify_df(df_test_cols))

In [None]:
def confusion_matrix_crosstab(y_true, y_pred, normalize=False) -> pd.DataFrame:
    """
    Cross-tabulate true labels (rows) against predicted labels (columns).

    Args:
        y_true: True labels (Series or any array-like)
        y_pred: Predicted labels (Series or any array-like), same length as y_true
        normalize: Passed to pd.crosstab
    Returns:
        Contingency table of y_true against y_pred
    """
    true_s = y_true if isinstance(y_true, pd.Series) else pd.Series(
        np.array(y_true), name='y_true')
    pred_s = y_pred if isinstance(y_pred, pd.Series) else pd.Series(
        np.array(y_pred), name='y_pred')

    # pd.crosstab aligns two Series on their index: pair the values by position
    # instead, working on a copy so that the caller's Series is left untouched
    pred_s = pred_s.copy()
    pred_s.index = true_s.index

    return pd.crosstab(true_s, pred_s, normalize=normalize)

In [None]:
def conf_matrix_labels(categ_true: pd.Series, clust_labels: pd.Series, normalize=False):
    """
    Confusion matrix whose columns are reordered to best match the rows.

    The category/cluster contingency table is padded to a square matrix and the
    Hungarian algorithm (linear_sum_assignment, maximizing) picks the column
    order; padding columns are dropped from the result.

    Args:
        categ_true: True categories
        clust_labels: Cluster labels, same length as categ_true
        normalize: Passed to the contingency table; when it equals False the
            result is rounded and cast to int
    Returns:
        Contingency table with reordered columns
    """
    df_matr = confusion_matrix_crosstab(
        categ_true, clust_labels, normalize=normalize)
    n_labels = df_matr.shape[1]

    # The assignment algorithm needs a square matrix
    padded = squarify_df(df_matr).fillna(0)

    # Column order that maximizes the values on the diagonal
    _, cols = scipy.optimize.linear_sum_assignment(
        padded.values, maximize=True)

    # Ignore the padding columns added by squarify_df
    if len(cols) > n_labels:
        cols = [idx for idx in cols if idx < n_labels]

    df_opt = df_matr.iloc[:, cols]

    if normalize == False:
        df_opt = df_opt.round(0).astype(int)

    return df_opt


# Test: 3 true classes against 5 cluster labels
y_true = ['a', 'a', 'a', 'a', 'a', 'b', 'b',
          'b', 'b', 'b', 'c', 'c', 'c', 'c', 'c']
y_pred = [0, 3, 4, 1, 0, 4, 3, 4, 2, 4, 2, 1, 3, 1, 3]

# Row-wise color gradient to highlight the best-matching cluster of each class
conf_matrix_labels(y_true, y_pred).style.background_gradient(axis=1)

### Display of the confusion matrix

In [None]:
from typing import Union

# Type alias for label containers. `list or np.ndarray or pd.Series` evaluated to
# plain `list` (the first truthy operand); a Union names all three types.
Serie = Union[list, np.ndarray, pd.Series]


def plot_confusion_matrix_heatmap(y_true: Serie, y_pred: Serie, cmap='Blues', figsize=(6, 4)):
    """
    Draw the matched confusion matrix (see conf_matrix_labels) as an annotated heatmap.

    Args:
        y_true: True labels
        y_pred: Predicted labels
        cmap: Colormap of the heatmap
        figsize: Size of the new figure
    """
    matched_cm = conf_matrix_labels(y_true, y_pred)
    plt.figure(figsize=figsize)
    sns.heatmap(matched_cm, cmap=cmap, annot=True, fmt='.0f')


# Test: same sample labels as in the conf_matrix_labels test above
y_true = ['a', 'a', 'a', 'a', 'a', 'b', 'b',
          'b', 'b', 'b', 'c', 'c', 'c', 'c', 'c']
y_pred = [0, 3, 4, 1, 0, 4, 3, 4, 2, 4, 2, 1, 3, 1, 3]
plot_confusion_matrix_heatmap(y_true, y_pred)

### Confusion Matrix - Approach 2: Associate each predicted cluster with one of the real classes

When the number of predicted classes is not identical to the number of "real" classes, we can present the confusion matrix between true labels and predicted labels in 2 ways:

- Associate each real class with the closest predicted cluster on the diagonal (approach 1 - "Hungarian clustering algorithm"). The other clusters, less close to the real classes, are considered to be "new" classes - they are treated as unknown in the calculation of the confusion matrix.

- Associate each predicted class with the closest real class (approach 2)
  - Several predicted classes may be associated with one real class (we treat them as a single cluster for the calculation of the confusion matrix),
  - There may be real classes without any associated cluster.

Below, the functions for approach 2

In [None]:

def conf_mat_transform(y_true: Serie, y_pred: Serie) -> pd.Series:
    """Remap every predicted cluster to the row it overlaps most in the crosstab.

    For each column of the table returned by ``confusion_matrix_crosstab`` the
    row position with the largest count is looked up (``argmax`` over axis 0)
    and every prediction is replaced by that row position.
    NOTE(review): this assumes the crosstab columns are ordered like the codes
    produced by ``LabelEncoder`` on ``y_pred`` (sorted unique values) - confirm
    against ``confusion_matrix_crosstab``.

    Args:
        y_true: reference labels.
        y_pred: predicted labels / cluster ids, same length as ``y_true``.

    Returns:
        pd.Series of integer row positions (not the original true labels),
        indexed like ``pd.Series(y_true)``. It is named
        ``'<y_pred.name> (transformed)'`` when ``y_pred`` is a Series and
        ``'y_pred'`` otherwise.
    """
    crosstab_counts = confusion_matrix_crosstab(y_true, y_pred, False).values
    pred_codes = preprocessing.LabelEncoder().fit_transform(y_pred)
    # One entry per crosstab column: position of the row with the highest count.
    corresp = np.argmax(crosstab_counts, axis=0).astype(int)
    print("Cluster matching:", corresp)

    # The frame is only used to give the encoded predictions y_true's index.
    frame = pd.Series(y_true, name="y_true").to_frame()
    frame['y_pred'] = pred_codes
    remapped = frame['y_pred'].map(lambda code: corresp[code])

    if not isinstance(y_pred, pd.Series):
        return remapped
    return pd.Series(remapped, name=f'{y_pred.name} (transformed)')



# Check the remapping on the toy data: count every
# (remapped label, original cluster id) pair and show the first rows.
new_labels = conf_mat_transform(y_true, y_pred)
pd.DataFrame({'new_label': new_labels, 'y_pred': y_pred}
             ).value_counts().sort_index().head()

### Display of the classification report

In [None]:


def print_classification_report(y_true: Serie, y_pred: Serie, target_names=None):
    """Print scikit-learn's classification report for two label sequences.

    ``metrics.classification_report`` rejects a mix of string and numeric
    labels, so when exactly one side contains strings that side is converted
    to integer codes with a ``LabelEncoder`` (codes follow the sorted order of
    its unique values) before the report is computed.

    Args:
        y_true: reference labels.
        y_pred: predicted labels.
        target_names: display names for the classes; defaults to the sorted
            unique values of ``y_true`` converted to ``str``.

    Returns:
        None. The report is printed.
    """
    y_true_unique = sorted(set(y_true))
    if target_names is None:
        target_names = [str(label) for label in y_true_unique]

    is_str_y_true = any(isinstance(label, str) for label in y_true_unique)
    is_str_y_pred = any(isinstance(label, str) for label in set(y_pred))

    if is_str_y_true and not is_str_y_pred:
        y_true = preprocessing.LabelEncoder().fit_transform(y_true)
    elif is_str_y_pred and not is_str_y_true:
        # The symmetric case was previously unreachable because the condition
        # compared is_str_y_pred with itself.
        # NOTE(review): the codes follow the sorted order of the predicted
        # strings; confirm this lines up with the numeric classes in y_true.
        y_pred = preprocessing.LabelEncoder().fit_transform(y_pred)

    print(metrics.classification_report(y_true, y_pred,
          target_names=target_names, zero_division=0))



### Display of classification metrics

To compare the models, we standardize the presentation of metrics

- ARI score
- Precision and Recall (classification report)
- Confusion matrix

In [None]:
def calc_ARI(labels_true, labels_pred):
    """Return the Adjusted Rand Index of two labelings, rounded to 4 decimals.

    NOTE: a later cell of this notebook defines ``calc_ARI`` again with a
    different signature (``labels_pred, labels_true=y_cat_num``); once that
    cell has run, it replaces this definition.
    """
    score = metrics.adjusted_rand_score(labels_true, labels_pred)
    return np.round(score, 4)


def plot_classification_metrics(y_tru_, y_predi_, cmap='Blues', combine=False, target_names=None):
    """Print and plot the standard metrics for one clustering result.

    The predictions are remapped with ``conf_mat_transform``, then the ARI, the
    classification report and the confusion-matrix heatmap are produced from
    the remapped labels.

    Args:
        y_tru_: reference labels (must provide ``.copy()``).
        y_predi_: predicted cluster ids.
        cmap: colour map for the heatmap.
        combine: when true, the "combining clusters" diagnostic is printed even
            if there are not more clusters than classes. The remapping itself
            is applied regardless of this flag.
        target_names: forwarded to ``print_classification_report``.

    Returns:
        The ARI computed by ``calc_ARI`` on the true and remapped labels.
    """
    y_true_ = y_tru_.copy()
    n_true = len(set(y_true_))
    pred_values = list(set(y_predi_))

    if combine or n_true < len(pred_values):
        print(
            f'combining clusters {n_true} <  {len(pred_values)}')
        print(pred_values)

    # Every code path remaps the predictions the same way.
    y_pred1_ = conf_mat_transform(y_true_, y_predi_)

    ari = calc_ARI(y_true_, y_pred1_)
    print(f'ARI = {ari:.3f}')
    print_classification_report(y_true_, y_pred1_, target_names=target_names)
    plot_confusion_matrix_heatmap(y_true_, y_pred1_, cmap=cmap)
    return ari


# Smoke test with named Series built from the toy y_true / y_pred lists;
# the returned ARI is written into the figure title before saving.
y_true2 = pd.Series(y_true, name='categ_level_1')
y_pred2 = pd.Series(y_pred, name='labels_LDA')
ari = plot_classification_metrics(y_true2, y_pred2, cmap='Greens')
plt.suptitle('test confusion matrix')
plt.title(f'ARI = {ari:.3f}')
to_png()

In [None]:
# Smoke test for the opposite situation: 5 true classes ('a'-'e') but only
# 4 predicted cluster ids (0-3), passed as plain lists.
y_true1 = ['a', 'a', 'a', 'b', 'b', 'b', 'c',
           'c', 'c', 'd', 'd', 'd', 'e', 'e', 'e']
y_pred1 = [0, 3, 3, 1, 0, 1, 3, 2, 2, 2, 2, 1, 3, 1, 3]
print(len(y_true1), len(y_pred1))
plot_classification_metrics(y_true1, y_pred1, cmap='Blues', combine=False)

### Visualization of matrix confusion as a Sankey diagram

In [None]:
# import plotly.graph_objects as go
# Import Plotly.io as Pio


def add_alpha(color, alpha):
    """Format an (r, g, b) colour and an opacity as a CSS ``rgba(...)`` string.

    Args:
        color: 3-tuple ``(r, g, b)``. Either 0-255 channel values, or floats in
            [0, 1] as produced by seaborn / matplotlib palettes.
        alpha: opacity, inserted as given.

    Returns:
        A string such as ``'rgba(255,0,51,0.5)'``.
    """
    r, g, b = color
    # CSS rgba() expects 0-255 channels. Palette colours come as floats in
    # [0, 1]; passed through unchanged they would all render as near-black.
    if all(isinstance(channel, float) and 0.0 <= channel <= 1.0 for channel in (r, g, b)):
        r, g, b = (int(round(channel * 255)) for channel in (r, g, b))
    return f'rgba({r},{g},{b},{alpha})'


def plot_sankey_confusion_diagram(source: pd.Series, target: pd.Series = None,
                                  titre='Sankey confusion diagram',
                                  descriptors=['categories', 'clusters'],
                                  figsize=(2, 1), font_size=14,
                                  to_image=True,
                                  palette='nipy_spectral', alpha=0.5):
    """Draw a contingency (confusion) table as a Plotly Sankey diagram.

    Rows of the table become the left-hand nodes, columns the right-hand
    nodes, and every non-zero cell becomes a link coloured like its source.

    Args:
        source: either a ready-made contingency table (a DataFrame, or any
            input with fewer than 10 rows, e.g. a small 2-D array), or the
            first label sequence to cross-tabulate.
        target: second label sequence; only used when ``source`` is a label
            sequence.
        titre: title prefix; the row / column names are appended to it.
        descriptors: fallback names for the row and column axes.
        figsize: (width, height) of the returned PNG, in units of 300 pixels.
        font_size: font size of the figure.
        to_image: when true, save a PNG under ``IMAGE_FOLDER`` (requires the
            'kaleido' package) and return it; otherwise call ``fig.show()``.
        palette: seaborn palette name used for the nodes.
        alpha: opacity of the links.

    Returns:
        ``IPython.display.Image`` when ``to_image`` is true, otherwise None.
    """
    # Pixels per `figsize` unit. Passing figsize straight through as pixels
    # would request a 2x1 px image with the default figsize.
    pixels_per_unit = 300

    # Either a contingency table is supplied directly, or two label sequences.
    if isinstance(source, pd.DataFrame) or len(source) < 10:
        ct = source.copy()
        if isinstance(ct, np.ndarray):
            print('ct is numpy array')
            ct = pd.DataFrame(ct)
            ct = ct.rename_axis(descriptors[0], axis=0)
            ct = ct.rename_axis(descriptors[1], axis=1)
    else:
        if isinstance(source, np.ndarray):
            source = pd.Series(source, name=descriptors[0])
        if isinstance(target, np.ndarray):
            target = pd.Series(target, name=descriptors[1])
        ct = pd.crosstab(source, target)

    source_col = ct.index.name if ct.index.name else descriptors[0]
    target_col = ct.T.index.name if ct.T.index.name else descriptors[1]
    print(source_col, target_col)

    ct.index = ct.index.astype(str)
    ct.columns = ct.columns.astype(str)

    # Replace source and target labels with unique node ids
    # (row nodes first, then column nodes).
    node_labels = list(ct.index)+list(ct.columns)
    node_ids = range(len(node_labels))
    label_map = dict(zip(node_ids, node_labels))

    # One colour per distinct label, in order of first appearance, so nodes
    # with the same name share a colour.
    uniq_nodes = []
    for node in node_labels:
        if not node in uniq_nodes:
            uniq_nodes.append(node)
    colors = sns.color_palette(palette, n_colors=len(uniq_nodes))
    solid_color_map = dict(zip(uniq_nodes, colors.as_hex()))
    node_colors = list(pd.Series(node_labels).map(solid_color_map))

    # Semi-transparent colours for the links (same colour as the source node).
    alpha_colors = list([add_alpha(color, alpha) for color in colors])
    alpha_color_map = dict(zip(uniq_nodes, alpha_colors))

    # Map the (possibly duplicated) labels to the unique node ids.
    ct.index = node_ids[:len(ct.index)]
    ct.index.name = source_col
    ct.columns = node_ids[-len(ct.columns):]

    # Long-format [source, target, value, colour] table describing the links.
    data = pd.melt(ct.reset_index(), id_vars=source_col)
    print(data.shape)
    print(data.columns)
    data.columns = [source_col, target_col, 'value']
    # .copy() so the column added below is written to an independent frame
    # rather than to a slice of `data` (avoids SettingWithCopyWarning).
    data = data[data['value'] > 0].copy()
    data['src_color'] = data[source_col].map(label_map).map(alpha_color_map)

    fig = go.Figure(data=[
        go.Sankey(node=dict(label=node_labels, color=node_colors),
                  link=dict(source=data[source_col], target=data[target_col],
                    value=data['value'], color=data['src_color'])
                  )
    ])

    titre = f'{titre} : {source_col} vs. {target_col}'
    fig.update_layout(title_text=titre, title_x=0.5, font_size=font_size)

    if to_image:
        # Static export requires the 'kaleido' package.
        w, h = figsize
        fig_name = sanitize(titre)
        filename = f'{IMAGE_FOLDER}/{fig_name}.png'
        pio.write_image(fig, file=filename, format="png")
        img = fig.to_image(format="png",
                           width=int(w * pixels_per_unit),
                           height=int(h * pixels_per_unit),
                           scale=1)
        return IPython.display.Image(img)
    else:
        fig.show()

### 4.4.2 KMeans clustering via t-SNE, and ARI metric calculation

Once the scores are calculated, the procedures below allow you to view the results.

In [None]:
# From Sklearn Import Decomposition
def reducer_pca(features: pd.Series, n_components=0.99) -> np.ndarray:
    """Reduce ``features`` with PCA and print the shapes before and after.

    Args:
        features: feature matrix accepted by ``PCA.fit_transform``.
        n_components: forwarded to ``decomposition.PCA``. Values >= 1 are
            reported as a number of components, smaller values as a percentage
            of explained variance.

    Returns:
        The PCA-transformed features.
    """
    print(f'Dimensions before PCA reduction : { features.shape}')
    pca = decomposition.PCA(n_components=n_components)
    # Only the wording of the log message depends on n_components.
    param_str = (f'(n_components = {n_components})' if n_components >= 1
                 else f'({n_components*100:.0f} % variance explained)')
    feat_pca = pca.fit_transform(features)
    print(f'Dimensions after PCA reduction {param_str} : {feat_pca.shape}')
    return feat_pca

In [None]:
# from Sklearn Import Manifold
# import time


def reducer_tsne(features: pd.Series) -> np.ndarray:
    """Project ``features`` onto 2 dimensions with t-SNE and report the time taken.

    The t-SNE model uses a random initialisation seeded with ``RANDOM_SEED``.

    Args:
        features: feature matrix accepted by ``TSNE.fit_transform``.

    Returns:
        The 2-D embedding.
    """
    print(f'reducer t-SNE, input shape={features.shape}')
    started = time.time()
    embedder = manifold.TSNE(n_components=2, perplexity=30, n_iter=2000,
                             init='random', learning_rate=200, random_state=RANDOM_SEED)
    X_tsne = embedder.fit_transform(features)
    # Elapsed wall-clock time, rounded to whole seconds.
    elapsed = np.round(time.time() - started, 0)
    print(f'reducer t-SNE, shape ={X_tsne.shape} time : {elapsed}')
    return X_tsne

In [None]:
# from Sklearn Import Cluster, Metrics
# import time

def calc_ARI(labels_pred, labels_true=y_cat_num):
    """Return the Adjusted Rand Index rounded to 4 decimals, or -1 on length mismatch.

    NOTE: this redefines the earlier ``calc_ARI(labels_true, labels_pred)``
    with the two parameters in the opposite order. The default for
    ``labels_true`` is the value ``y_cat_num`` holds when this cell runs.

    Args:
        labels_pred: predicted labels.
        labels_true: reference labels.

    Returns:
        The rounded score, or -1 when the two sequences differ in length.
    """
    if len(labels_pred) != len(labels_true):
        return -1
    score = metrics.adjusted_rand_score(labels_true, labels_pred)
    return np.round(score, 4)


def calc_tsne_cluster(features, categories_=list_categories, y_cat_num_=y_cat_num, k=None):
    """Embed ``features`` with t-SNE, cluster the embedding with KMeans, score it.

    Args:
        features: feature matrix handed to ``reducer_tsne``.
        categories_: its length is the number of clusters when ``k`` is not a
            positive number.
        y_cat_num_: reference labels used for the ARI.
        k: number of clusters.

    Returns:
        Tuple ``(ARI, X_tsne, labels)``; ``ARI`` is -1 when ``y_cat_num_`` and
        the cluster labels differ in length.
    """
    X_tsne = reducer_tsne(features)

    if not (k is not None and k > 0):
        k = len(categories_)

    started = time.time()
    # Clusters are determined on the 2-D t-SNE embedding, not on the raw features.
    kmeans = cluster.KMeans(n_clusters=k, n_init=100,
                            random_state=RANDOM_SEED)
    kmeans.fit(X_tsne)
    elapsed = np.round(time.time() - started, 0)

    labels = kmeans.labels_
    ARI = (np.round(metrics.adjusted_rand_score(y_cat_num_, labels), 4)
           if len(y_cat_num_) == len(labels) else -1)

    print("ARI : ", ARI, "time : ", elapsed)
    return ARI, X_tsne, labels

### 4.2.2 Visualization of clusters on reduced dimensions

In [None]:

#
def plot_clusters_sur_2D(X_tsne_, y_cat_num_, labels_, ARI_, l_cat_=list_categories,
                         bbox_left=-0.05, loc=1,
                         palette='bright',
                         titre1='Representation of products by real categories',
                         titre2='Representation of products by clusters'):
    """Plot the same 2-D embedding twice: coloured by category and by cluster.

    Args:
        X_tsne_: 2-D array; columns 0 and 1 are the x and y coordinates.
        y_cat_num_: reference category of each point (left panel hue).
        labels_: cluster label of each point (right panel hue).
        ARI_: score to print; computed with ``calc_ARI`` when None.
        l_cat_: not used in the body.
        bbox_left: x anchor of the left panel's legend.
        loc: legend location code of the left panel.
        palette: seaborn palette name.
        titre1: title of the left panel.
        titre2: title of the right panel.

    Returns:
        The matplotlib figure holding both panels.
    """
    if ARI_ is None:
        ARI_ = calc_ARI(y_cat_num_, labels_)

    if len(y_cat_num_) == len(labels_):
        # Use the column order of the matched confusion matrix so the cluster
        # hues follow the same order as the categories.
        labels_hue_order = list(conf_matrix_labels(y_cat_num_, labels_).columns)
        n_labels = len(labels_hue_order)
    else:
        labels_hue_order = None
        n_labels = pd.Series(np.array(labels_)).nunique()
    s_labels = pd.Series(labels_)

    categ_hue_order = sorted(pd.Series(y_cat_num_).unique())
    n_categ = len(categ_hue_order)

    # Both panels draw from one palette, sized for whichever side has more groups.
    colors = sns.color_palette(palette, n_colors=max(n_labels, n_categ))
    fig = plt.figure(figsize=(15, 6))

    ax1 = fig.add_subplot(121)
    sns.scatterplot(x=X_tsne_[:, 0], y=X_tsne_[:, 1], ax=ax1,
                    hue=y_cat_num_, hue_order=categ_hue_order,
                    palette=colors[:n_categ])
    ax1.legend(title="Category", bbox_to_anchor=(bbox_left, 0.5), loc=loc)
    ax1.set_title(titre1)

    ax2 = fig.add_subplot(122)
    sns.scatterplot(x=X_tsne_[:, 0], y=X_tsne_[:, 1], ax=ax2,
                    hue=s_labels, hue_order=labels_hue_order,
                    palette=colors[:n_labels])
    ax2.legend(loc="best", title="Clusters")
    ax2.set_title(titre2)

    print("ARI : ", ARI_)
    return fig

## 4.5 Accumulation of model performance

In [None]:
import pandas as pd

def add_model_score(df: pd.DataFrame = None, model_name: str = 'none', ARI: float = 0, k: int = 0, **kwargs):
    """
    Return a copy of a results table with one model's scores appended.

    Args:
        df: table to extend; when None the notebook-level ``df_resultats`` is used
        model_name: value stored in the 'model' column
        ARI: Adjusted Rand Index of the model
        k: number of clusters (the whole 'k' column is cast to int)
        **kwargs: any extra metrics, each stored in a column of the same name

    Returns:
        A new DataFrame; the input table is not modified.
    """
    base = df_resultats if df is None else df

    # One-row frame describing this model; extra metrics follow the fixed columns.
    row = {'model': model_name, 'ARI': ARI, 'k': k, **kwargs}
    combined = pd.concat([base, pd.DataFrame([row])], ignore_index=True)

    return combined.astype({'k': int})

# Initialize the global results table (re-running this cell resets it to empty).
df_resultats = pd.DataFrame()

# Smoke test on a throwaway empty table: df_resultats itself is left untouched.
result = add_model_score(pd.DataFrame(), model_name='test_model', optimizer='adam', k=7)
print(result)

# 5. Bag-of-Words and TF-IDF

## 5.1 Try 1: Bag of Words (Bow)

### Creation of the Bag of Words (Countvectorizer)

We want to find characteristic words of each group

In [None]:
# CountVectorizer comes from sklearn.feature_extraction.text.

# Drop English stop words and keep only words present in at least 3 documents.
cvect = CountVectorizer(stop_words='english', min_df=3)

# Column holding the lemmatized bag-of-words sentence of each product.
feature = 'sentence_bow_lem'

# Fit the vocabulary and build the sparse document-term count matrix.
cv_transform = cvect.fit_transform(data_T[feature])
print(cv_transform.shape)

The matrix `cv_transform` represents the bag-of-words created by CountVectorizer:

In [None]:
def show_example_bow():
    """Print three example sentences and return the matching slice of the count matrix.

    Reads the notebook-level ``data_T``, ``feature``, ``cv_transform`` and
    ``cvect``.

    Returns:
        Rows labelled 40 to 43 of the bag-of-words table, restricted to a few
        hand-picked word columns plus 'product_name'.
    """
    for row_label in (40, 42, 43):
        print(data_T[feature][row_label])

    # Sparse count matrix as a labelled DataFrame, with the product names joined on.
    df_bow = pd.DataFrame.sparse.from_spmatrix(
        cv_transform,
        columns=cvect.get_feature_names_out(),
        index=data_T.index,
    ).join(data_T[['product_name']])

    print('Portion du bag-of-words - vectors')
    shown_columns = ['boy', 'baby', 'girl', 'grey', 'blue',
                     'pyjama', 'cotton', 'hair', 'product_name']
    return df_bow.loc[40:43, shown_columns]


show_example_bow()

### Calculation of clusters and display

### Uni-grams

- Define `ngram_range = (1,1)`

In [None]:
# Unigram bag-of-words: single words present in at least 3 documents.
unigram_count_vectorizer = CountVectorizer(ngram_range=(1, 1), min_df=3)
# t-SNE starts from a random initialisation; seed it so the embedding (and the
# clusters / ARI derived from it) are reproducible across runs.
tsne_reducer = manifold.TSNE(n_components=2, perplexity=30,
                             n_iter=2000, init='random', learning_rate=200,
                             random_state=RANDOM_SEED)

scores1a, labels1a, X_tsne1a, k1a = score_model(data_T, labels_true=y_cat_txt,
                                                feature_extractor=unigram_count_vectorizer,
                                                dimension_reducer=tsne_reducer,
                                                kmin=4, kmax=12)
ARI1a = calc_ARI(y_cat_txt, labels1a)
# Add to global scores
df_resultats = add_model_score(
    model_name='BOW (unigrams) + TSNE', ARI=ARI1a, k=k1a)
plot_metrics(scores1a)

In [None]:
# Re-run with kmin = kmax = 7 to obtain the labels for k = 7.
# NOTE: this overwrites scores1a / labels1a / X_tsne1a / k1a from the previous cell.
scores1a, labels1a, X_tsne1a, k1a = score_model(data_T, labels_true=y_cat_txt,
                                                feature_extractor=unigram_count_vectorizer,
                                                dimension_reducer=tsne_reducer,
                                                kmin=7, kmax=7)

ARI1a = calc_ARI(y_cat_txt, labels1a)
# Add to global scores (same model name as in the previous cell, so the table
# receives a second 'BOW (unigrams) + TSNE' row).
df_resultats = add_model_score(
    model_name='BOW (unigrams) + TSNE', ARI=ARI1a, k=k1a)

In [None]:
# Metrics report for the unigram + t-SNE clusters; the returned ARI goes into the title.
ari_1a = plot_classification_metrics(y_cat_num, labels1a)
plt.title(
    f'BOW (unigrams) + TSNE - confusion matrix (ARI={ari_1a:.3f})', fontsize=14)
to_png()

In [None]:

# Side-by-side scatter plots of the embedding: real categories vs. clusters.
fig = plot_clusters_sur_2D(
    X_tsne1a, y_cat_txt, labels1a, ARI1a, palette='bright')
plt.suptitle(
    f"Visualization of clusters (feature extraction: Bag-Of-Words(unigrams); dimension reduction: TSNE, ARI = {ARI1a:.3f}")
to_png()

### Bag-of-words with reduction of dimensions via PCA

`PCA(n_components=0.99)` - Run PCA and only retain the components which explain 99% of the variance

- We eliminate the words that are rare or very associated with each other.

In [None]:
# PCA comes from sklearn.decomposition.

# Keep as many principal components as needed to explain 99 % of the variance.
pca_reducer = decomposition.PCA(n_components=0.99)

scores1b, labels1b, X_tsne1b, k1b = score_model(data_T, labels_true=y_cat_txt,
                                                feature_extractor=unigram_count_vectorizer,
                                                dimension_reducer=pca_reducer,
                                                kmin=4, kmax=12)
ARI1b = calc_ARI(y_cat_txt, labels1b)
# Add to global scores
df_resultats = add_model_score(
    model_name='BOW (unigrams) + PCA', ARI=ARI1b, k=k1b)
plot_metrics(scores1b)

In [None]:
# Metrics report for the unigram + PCA clusters.
ari_1b = plot_classification_metrics(y_cat_txt, labels1b)
plt.title(
    f'BOW (unigrams) + PCA - confusion matrix (ARI={ari_1b:.3f})', fontsize=14)
to_png()

#### View clusters for K = 7

In [None]:
# Re-run with kmin = kmax = 7 to obtain the labels for k = 7.
scores1c, labels1c, X_tsne1c, k1c = score_model(data_T, labels_true=y_cat_txt,
                                                feature_extractor=unigram_count_vectorizer,
                                                dimension_reducer=pca_reducer,
                                                kmin=7, kmax=7)

ARI1c = calc_ARI(y_cat_txt, labels1c)
# Add to global scores (k is hard-coded to 7 here rather than taken from k1c).
df_resultats = add_model_score(
    model_name='BOW (unigrams) + PCA', ARI=ARI1c, k=7)

# NOTE: despite its name, X_tsne1c is the output of the PCA reducer; only its
# first two columns are plotted.
fig = plot_clusters_sur_2D(
    X_tsne1c, y_cat_txt, labels1c, ARI1c, palette='bright')
plt.suptitle(
    f"Visualization of clusters (feature extraction: Bag-Of-Words; dimension reduction: PCA; ARI = {ARI1c:.3f}")
to_png()

Reducing dimensions only with PCA does not seem very useful.

### BAG-OF-Words with reduction of dimensions via PCA + TSNE

We try reduction of dimensions by PCA, followed by tsne on the reduced dimensions (reduction of dimensions in two steps)

In [None]:
# Two-step reduction: PCA first, then t-SNE on the PCA output.
# The pipeline reuses the pca_reducer and tsne_reducer objects defined above.
pca_tsne_reducer_pipeline = pipeline.Pipeline(
    steps=[('pca', pca_reducer), ('tsne', tsne_reducer)])

scores1d, labels1d, X_tsne1d, k1d = score_model(data_T, labels_true=y_cat_txt,
                                                feature_extractor=unigram_count_vectorizer,
                                                dimension_reducer=pca_tsne_reducer_pipeline,
                                                kmin=4, kmax=12)
plot_metrics(scores1d)

In [None]:
# Metrics report for the unigram + PCA + t-SNE clusters.
ari_1d = plot_classification_metrics(y_cat_txt, labels1d)
plt.title(
    f'BOW (unigrams) PCA + TSNE - confusion matrix (ARI={ari_1d:.3f})', fontsize=14)
to_png()

The feature reduction pca + tsne creates a better silhouette score than feature reduction with only tsne

#### Visualise Clusters produced by PCA+Tsne

In [None]:

# ARI computed directly with scikit-learn (unlike calc_ARI, the value is not rounded).
ARI1d = metrics.adjusted_rand_score(
    y_cat_txt, labels1d)

# Add to global scores
df_resultats = add_model_score(
    model_name='BOW (unigrams) + PCA + TSNE', ARI=ARI1d, k=k1d)

fig = plot_clusters_sur_2D(
    X_tsne1d, y_cat_txt, labels1d, ARI1d, palette='bright')
plt.suptitle(
    f'Bag-of-Words (CountVectorizer), reducer(PCA + TSNE), ARI = {ARI1d:.3f}')
to_png()

In [None]:
# Same report as produced earlier for labels1d; only the plot title differs.
ari_1d = plot_classification_metrics(y_cat_txt, labels1d)
plt.title(
    f'BOW (unigrams) + PCA + TSNE - confusion matrix (ARI={ari_1d:.3f})', fontsize=14)
to_png()

### Bi-grams

In [None]:
# Bigram bag-of-words: pairs of consecutive words present in at least 3 documents.
bigram_count_vectorizer = CountVectorizer(ngram_range=(2, 2), min_df=3)

scores1_bigrammes, labels1_bi, X_tsne1_bi, k1_bi = score_model(data_T, labels_true=y_cat_txt,
                                                               feature_extractor=bigram_count_vectorizer,
                                                               dimension_reducer=tsne_reducer,
                                                               kmin=4, kmax=12)
ARI1_bi = calc_ARI(y_cat_txt, labels1_bi)
print(f'Bi-grammes, ARI = {ARI1_bi}')

# Add to global scores
df_resultats = add_model_score(
    model_name='BOW (bigrams) + TSNE', ARI=ARI1_bi, k=k1_bi)
plot_metrics(scores1_bigrammes)

### Tri-grams

In [None]:
# Trigram bag-of-words: sequences of exactly 3 consecutive words present in at
# least 3 documents. ngram_range must be (3, 3); (2, 2) would merely repeat the
# bigram experiment under a "trigram" label.
trigram_count_vectorizer = CountVectorizer(ngram_range=(3, 3), min_df=3)

scores1_trigrammes, labels1_tri, X_tsne1_tri, k1_tri = score_model(data_T, labels_true=y_cat_txt,
                                                                   feature_extractor=trigram_count_vectorizer,
                                                                   dimension_reducer=tsne_reducer,
                                                                   kmin=4, kmax=12)
ARI1_tri = calc_ARI(y_cat_txt, labels1_tri)

print(f'Tri-grammes, ARI = {ARI1_tri}')

# Add to global scores
df_resultats = add_model_score(
    model_name='BOW (trigrams) + TSNE', ARI=ARI1_tri, k=k1_tri)
plot_metrics(scores1_trigrammes)

### Silhouette visualization of the best clusters

The best performances are for unigram_vectorizer (bag_of_words) and reduction of dimensions by tsne.

We look at the homogeneity of clusters for k = 7

In [None]:
# SilhouetteVisualizer comes from yellowbrick.cluster; KMeans from sklearn.cluster.

def plot_silhouettes(df: pd.DataFrame, k_clusters=7, titre=''):
    """Fit KMeans on ``df`` and draw the silhouette plot of its clusters.

    Args:
        df: data to cluster (here, a 2-D embedding).
        k_clusters: number of KMeans clusters, also the number of colours.
        titre: first line of the figure title and prefix of the saved image name.

    Returns:
        None. The figure is titled and saved through ``to_png``.
    """
    palette = sns.color_palette('nipy_spectral_r', n_colors=k_clusters)
    kmeans = cluster.KMeans(n_clusters=k_clusters, random_state=RANDOM_SEED)
    visualizer = SilhouetteVisualizer(kmeans, colors=palette)
    # Fitting can be slow (the original author's note says about 5 minutes).
    visualizer.fit(df)
    # Finalize and render the figure.
    visualizer.finalize()

    score = visualizer.silhouette_score_
    subtitre = f'Silhouette_score for {k_clusters} clusters : {score:.3f}'
    plt.suptitle(f'{titre}\n{subtitre}', fontsize=12, y=1.05)
    to_png(f'{titre}-{subtitre}')


plot_silhouettes(X_tsne1a, k_clusters=7,
                 titre='Silhouettes pour Bag-Of-Words (+ TSNE feature reduction)')

In [None]:
# Same silhouette analysis on the PCA + t-SNE embedding.
plot_silhouettes(X_tsne1d, k_clusters=7,
                 titre='Silhouettes pour Bag-Of-Words (PCA + TSNE feature reduction)')

### Visualize the best results

In [None]:
print("CountVectorizer : ")
print("-----------------")

# Refit the unigram vectorizer on the lemmatized sentences.
data_vectorized = unigram_count_vectorizer.fit_transform(
    data_T['sentence_bow_lem'])

# t-SNE embedding followed by KMeans with 7 clusters.
ARI1, X_tsne1, labels1 = calc_tsne_cluster(data_vectorized, k=7)

# Add to global scores
df_resultats = add_model_score(
    model_name='BOW (unigrams) + TSNE', ARI=ARI1, k=7)
# Store the cluster labels as a new column of data_T.
data_T['labels_bow'] = labels1

In [None]:
# Scatter plots of the embedding: real categories vs. bag-of-words clusters.
fig = plot_clusters_sur_2D(X_tsne1, y_cat_txt, labels1, ARI1, palette='bright')
plt.suptitle(f'Bag-of-Words (CountVectorizer), ARI = {ARI1:.3f}')
to_png()

In [None]:
# Sankey view of how the real categories flow into the bag-of-words clusters.
plot_sankey_confusion_diagram(y_cat_txt, labels1,
                              titre='Bag-of-Words (CountVectorizer)', palette='bright')

### Comparison if we do not remove advertisements

In [None]:
# Same pipeline on the lemmatized sentences that still contain the advertisements.
feature_ads = 'sentence_bow_lem_ads'

# NOTE: this refits the shared unigram_count_vectorizer on the ads text.
data_ads_vectorized = unigram_count_vectorizer.fit_transform(
    data_T[feature_ads])

print("CountVectorizer (avec ads): ")
print("-----------------")
ARI1_ads, X_tsne1_ads, labels1_ads = calc_tsne_cluster(
    data_ads_vectorized, k=7)

# Add to global scores: record the score of THIS run (ARI1_ads), not ARI1 from
# the run without advertisements.
df_resultats = add_model_score(
    model_name='BOW (avec publicités) + TSNE', ARI=ARI1_ads, k=7)

fig = plot_clusters_sur_2D(X_tsne1_ads, y_cat_txt, labels1_ads, ARI1_ads)
plt.suptitle(f'Bag-of-Words avec Ads (CountVectorizer), ARI = {ARI1_ads:.3f}')
to_png()

### Metrics evaluation

In [None]:
# Metrics report for the clusters obtained with advertisements kept in the text.
ari_1_ads = plot_classification_metrics(y_cat_num, pd.Series(labels1_ads))
plt.title(
    f'BOW avec Ads (CountVectorizer) - confusion matrix (ARI={ari_1_ads:.3f})', fontsize=14)
to_png()

In [None]:
# Sankey view of categories vs. clusters for the run that keeps the advertisements.
plot_sankey_confusion_diagram(
    y_cat_txt, labels1_ads, titre='Bag-of-Words avec ads')

Indeed, Flipkart advertisements:

- Create category markers, because only certain categories contain these advertisements
- Create noise that prevents distinguishing the products.

We keep the descriptions without advertising, since advertising is not a property of the product.

In [None]:
# Remove the large matrices from the namespace.
# NOTE: show_example_bow() reads cv_transform, so it cannot be re-run after this cell.
del data_ads_vectorized, data_vectorized, cv_transform

## 5.2 Try 2: TF-IDF

(Term Frequency - Inverse Document Frequency)

### Creation of the Word Matrix (TF-IDF)

We do as for the bag-of-words

In [None]:
# TfidfVectorizer comes from sklearn.feature_extraction.text.

# Drop English stop words and keep only words present in at least 3 documents.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', min_df=3)

# Column holding the lemmatized bag-of-words sentence of each product.
feature = 'sentence_bow_lem'
tfidf_vectors = tfidf_vectorizer.fit_transform(data_T[feature])

#### Save TF-IDF features

We save the features extracted from the texts by TF-IDF to add to the features extracted from the images by VGG-16

In [None]:
# Persist the TF-IDF matrix under the name 'features_tfidf' in OUT_FOLDER
# (save_pickle is defined elsewhere in this notebook).
save_pickle(tfidf_vectors, 'features_tfidf', OUT_FOLDER)

The matrix `tfidf_vectors` holds the TF-IDF vectors created by TfidfVectorizer:

In [None]:
# Sparse TF-IDF matrix as a labelled DataFrame, with the product names joined on.
df_tf = pd.DataFrame.sparse.from_spmatrix(
    tfidf_vectors,
    columns=tfidf_vectorizer.get_feature_names_out(),
    index=data_T.index).join(data_T['product_name'])


# Print three example sentences, then show their TF-IDF weights for a few words.
print(data_T[feature][40])
print(data_T[feature][42])
print(data_T[feature][43])

df_tf.loc[40:45, ['baby', 'girl', 'grey', 'blue', 'pyjama',
                  'cotton', 'hair', 'product_name']]

### Execution of the model (clustering) and visualization

In [None]:
print("Tf-idf : ")
print("--------")
# No k is passed, so calc_tsne_cluster uses the length of its default category list.
ARI2, X_tsne2, labels2 = calc_tsne_cluster(tfidf_vectors)
# Add to global scores (k is recorded as a hard-coded 7).
df_resultats = add_model_score(model_name='TF-IDF + TSNE', ARI=ARI2, k=7)
# Store the cluster labels as a new column of data_T.
data_T['labels_tfidf'] = labels2

In [None]:
# Scatter plots of the embedding: real categories vs. TF-IDF clusters.
fig = plot_clusters_sur_2D(X_tsne2, y_cat_txt, labels2, ARI2)
plt.suptitle(f'Bag-of-Words (TF-IDF), ARI = {ARI2:.3f}')
to_png()

In [None]:
# Sankey view of how the real categories flow into the TF-IDF clusters.
plot_sankey_confusion_diagram(y_cat_txt, labels2,
                              titre='Bag-of-Words (TF-IDF)')

### Metrics evaluation

In [None]:
# Metrics report for the TF-IDF clusters; the cluster remapping happens inside
# plot_classification_metrics.
ari_2 = plot_classification_metrics(y_cat_txt, labels2)
plt.title(f'TF-IDF - confusion matrix (ARI={ari_2:.3f})', fontsize=14)
to_png()

# 6. LDA - (Topic Modeling)

- <https://www.machinelearningplus.com/nlp/topic-modeling-python-sklearn-examples/>

## 6.1 Try 3: LDA - Latent Dirichlet Allocation

We will try to find the topics (clusters)

- from Bag-of-Words
- from TF-IDF

Features to create:

- `Topic_lda_Bow`: Categories created from 'Bag-Of-Words' Descriptions
- `Topic_lda_tf`: Categories created from 'TF-IDF' Matrix

### 6.3.1 Try 3a: LDA on Bag-of-Words

In [None]:
# Vectorizer used for the LDA experiments. Its settings (and therefore its
# vocabulary) differ from those of `cvect` defined in section 5.
count_vectorizer = CountVectorizer(analyzer='word',
                                   ngram_range=(1, 1),
                                   min_df=3,                        # minimum reqd occurrences of a word
                                   stop_words='english',             # remove stop words
                                   lowercase=True,                   # convert all words to lowercase
                                   # keep only tokens made of at least 3 alphanumeric characters
                                   token_pattern='[a-zA-Z0-9]{3,}',
                                   )


# Column holding the lemmatized bag-of-words sentence of each product.
feature = 'sentence_bow_lem'
data_vectorized = count_vectorizer.fit_transform(data_T[feature])

#### Sparsity of the vectorized data

In [None]:

# Convert the sparse document-term matrix to a dense one (every cell is stored).
data_dense = data_vectorized.todense()

# Percentage of cells holding a non-zero count (printed under the label "Sparsicity").
print("Sparsicity: ", ((data_dense > 0).sum()/data_dense.size)*100, "%")

#### gridsearch for the best settings

In [None]:
# LatentDirichletAllocation comes from sklearn.decomposition,
# GridSearchCV from sklearn.model_selection.

lda = decomposition.LatentDirichletAllocation(random_state=RANDOM_SEED,
                                              max_iter=10,               # Max learning iterations
                                              learning_method='online',
                                              batch_size=128,            # n docs in each learning iter
                                              )

# Search grid: 4 topic counts x 3 learning decays.
# (The original author notes that this grid takes about 5 minutes.)
search_params = {
    'n_components': [7, 8, 9, 10],
    'learning_decay': [.5, .7, .9]
    # Other candidates, currently disabled:
    # 'learning_method': ['batch', 'online'],
    # 'learning_offset': [2, 5, 10, 20, 50, 100],  # a larger offset reduces the weight of early iterations
}

# Best parameters noted by the original author on a previous run:
# param_grid = {'learning_decay': [0.9], 'learning_method': ['online'], 'learning_offset': [20]}

# 2-fold cross-validated grid search; training scores are kept as well.
model = GridSearchCV(lda, param_grid=search_params, verbose=1,
                     cv=2, return_train_score=True)

# Run the grid search on the count matrix.
model.fit(data_vectorized)

#### Best Model Performance (Perplexity and Log-Likelihood)

- Higher log-likelihood
- Lower perplexity (= exp(-1. \* log-likelihood per word))

Note: Perplexity does not consider the context and semantic associations between words.

In [None]:
# Best estimator found by the grid search.
best_lda_model = model.best_estimator_

# Parameter combination of the best estimator.
print("Best Model's Params: ", model.best_params_)

# Log-likelihood scores (higher = better). Both lines print the same label:
# the first is the grid search's cross-validated best score, the second is the
# best model's score on the full count matrix.
print("Best Log Likelihood Score: ", model.best_score_)
print("Best Log Likelihood Score: ", best_lda_model.score(data_vectorized))

# Perplexity = exp(-1. * log-likelihood per word) (lower = better).
print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))

# Full parameter set of the best model.
print(best_lda_model.get_params())

### Compare LDA Model Performance Scores

In [None]:

# Grid-search results as a table: one row per parameter combination.
gridscores = pd.DataFrame(model.cv_results_)
gridscores.head()

In [None]:
# from Matplotlib.ticher Import Maxnlocator
def plot_lda_cross_validation(model):
    """Plot the mean test score of a fitted grid search against the number of topics.

    One line is drawn per value of the 'learning_decay' parameter.
    MaxNLocator (from matplotlib.ticker) must be available in the notebook
    namespace.

    Args:
        model: fitted search object exposing ``cv_results_`` with the
            'param_n_components', 'param_learning_decay' and 'mean_test_score'
            entries.
    """
    cv_table = pd.DataFrame(model.cv_results_)

    ax = sns.lineplot(x='param_n_components', y='mean_test_score',
                      hue='param_learning_decay', data=cv_table, palette='tab10')
    # Topic counts are integers: keep the x ticks on whole numbers.
    ax.xaxis.set_major_locator(MaxNLocator(integer=True))
    sns.despine()
    plt.title("Choosing Optimal LDA Model")
    plt.xlabel("Num Topics")
    plt.ylabel("Log Likelyhood Scores")
    plt.legend(title='Learning decay', loc='best')


plot_lda_cross_validation(model)
to_png()

In [None]:
# Document-topic matrix: one row per document, one column per topic.
lda_output = model.best_estimator_.transform(data_vectorized)
# The number of topics is the number of columns of that matrix.
n_topics = lda_output.shape[1]
# Column names: 'Topic 0' .. 'Topic n-1'.
topics = [f'Topic {i}' for i in range(0, n_topics)]
# Weights rounded to 2 decimals, indexed like data_T.
df_topics = pd.DataFrame(np.round(lda_output, 2),
                         columns=topics,
                         index=data_T.index)

# Dominant topic of each document = column position of the largest (rounded) weight.
df_topics['pred_topic'] = np.argmax(df_topics.values, axis=1)
df_data_topics = df_topics.join(data_T[['product_name', 'categ_level_1']])
df_data_topics.head()

In [None]:

# Number of documents assigned to each dominant topic.
df_data_topics['pred_topic'].value_counts()

#### Predicted Topic and Level 1 comparison

In [None]:
# Level-1 categories against predicted topics, displayed with a per-row colour gradient.
# (A plain alternative would be
#  pd.crosstab(df_data_topics['categ_level_1'], df_data_topics['pred_topic']).)
conf_matrix_labels(df_data_topics['categ_level_1'],
                   df_topics['pred_topic']).style.background_gradient(axis=1)

In [None]:

# ARI between the level-1 categories and the dominant LDA topics.
ARI3a = calc_ARI(df_data_topics['categ_level_1'], df_data_topics['pred_topic'])
print(ARI3a)
# Add to global scores
df_resultats = add_model_score(
    model_name='LDA(BOW) topics', ARI=ARI3a, k=n_topics)
# Store the dominant topic of each product as a new column of data_T.
data_T['labels_LDA1'] = df_data_topics['pred_topic']

### Metrics - LDA (Bow) evaluation

In [None]:
# Metrics report for the LDA topics taken as clusters.
ari_3a = plot_classification_metrics(y_cat_txt, data_T['labels_LDA1'])
plt.title(
    f'LDA(BOW) topics - confusion matrix (ARI={ari_3a:.3f})', fontsize=14)
to_png()

In [None]:
# X_tsne1 is the t-SNE embedding computed earlier by
# calc_tsne_cluster(data_vectorized, k=7) on the unigram bag-of-words; it is
# reused here only to position the points, coloured by LDA topic.
fig = plot_clusters_sur_2D(X_tsne1, y_cat_txt,
                           data_T['labels_LDA1'], ARI3a)
fig.axes[1].set_title('Representation des produits par topic LDA')
plt.suptitle(f'LDA sur Bag-of-Words, ARI = {ARI3a:.3f}')
to_png()

In [None]:
def get_topic_words(model, words, nb_words=10):
    """Print the highest-weighted words of every topic of a fitted topic model.

    Args:
        model: object exposing ``components_`` (one row of word weights per topic).
        words: sequence mapping a column position of ``components_`` to its word.
        nb_words: how many words to list per topic.

    Returns:
        None. One line ``'Topic <n>: w1, w2, ...'`` is printed per topic,
        numbered from 1.
    """
    for topic_number, weights in enumerate(model.components_, start=1):
        # Column positions sorted by decreasing weight, cut to the first nb_words.
        ranked = weights.argsort()[::-1][:nb_words]
        listing = ", ".join(words[position] for position in ranked)
        print(f'Topic {topic_number}: {listing}')


# The LDA model was fitted on the matrix produced by `count_vectorizer`, so its
# components_ columns follow that vectorizer's vocabulary (not the vocabulary
# of `cvect`, which was built with a different token pattern).
get_topic_words(model.best_estimator_, count_vectorizer.get_feature_names_out())

In [None]:
def plot_topic_words(model, topics, titre='', n_top_words=10, cmap='nipy_spectral'):
    """Draw one horizontal bar chart per topic showing its top-weighted words.

    Args:
        model: object exposing ``components_`` (one weight vector per topic).
        topics: sequence mapping a feature index to its word.
        titre: figure super-title.
        n_top_words: number of words (bars) per topic.
        cmap: seaborn palette name; one colour is drawn per topic.

    The charts are laid out on a grid of 4 columns in a new figure.
    """
    components = model.components_
    nb_topics = len(components)
    palette = sns.color_palette(cmap, n_colors=nb_topics).as_hex()
    n_cols = 4
    # Ceiling division: enough rows to hold every topic.
    n_rows = -(-nb_topics // n_cols)
    plt.figure(figsize=(n_cols*4, n_rows*3))

    for position, (topic_weights, bar_color) in enumerate(zip(components, palette), start=1):
        ax = plt.subplot(n_rows, n_cols, position)
        # Feature indices by descending weight, limited to n_top_words.
        ranked = topic_weights.argsort()[::-1][:n_top_words]
        bar_labels = [topics[feature_idx] for feature_idx in ranked]
        ax.barh(bar_labels, topic_weights[ranked], height=0.7, color=bar_color)
        ax.set_title(f"Topic {position}", fontdict={"fontsize": 16})
        # Largest weight at the top of each chart.
        ax.invert_yaxis()
        ax.tick_params(axis="both", which="major", labelsize=14)
        ax.grid(visible=False)
        for side in ("top", "right", "left"):
            ax.spines[side].set_visible(False)

    plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)
    plt.suptitle(titre, fontsize=20, y=1.02)


plot_topic_words(model=model.best_estimator_, topics=cvect.get_feature_names_out(),
                 titre='LDA sur bag-of-words, top 10 mots pour chaque topic')
to_png()

In [None]:
def plot_top_words_from_data(df=data_T, feature='sentence_bow_lem', label_col='labels_LDA1',
                             titre='', label_prefix='cluster', nb_words=10, cmap='nipy_spectral'):
    """Plot, for each label value, the most frequent words of its documents.

    Args:
        df: DataFrame holding the text column and the label column.
        feature: name of the text column passed to ``get_corpus_freq``.
        label_col: name of the column whose distinct values define the groups.
        titre: figure super-title.
        label_prefix: prefix added to short labels (fewer than 3 characters)
            in the subplot titles.
        nb_words: number of words requested from ``get_corpus_freq``.
        cmap: seaborn palette name; one desaturated colour per label.

    One bar chart per label is drawn on a 4-column grid, labels in sorted order.
    """
    distinct_labels = df[label_col].unique()
    nb_labels = len(distinct_labels)
    palette = sns.color_palette(cmap, n_colors=nb_labels, desat=0.5).as_hex()
    n_cols = 4
    # Ceiling division: enough rows to hold every label.
    n_rows = -(-nb_labels // n_cols)
    plt.figure(figsize=(n_cols*4, n_rows*3))

    for slot, label in enumerate(sorted(distinct_labels)):
        subset = df[df[label_col] == label]
        # get_corpus_freq is defined elsewhere in the notebook; its result is
        # reshaped into a two-column frame ('word', 'count') for plotting.
        counts = get_corpus_freq(subset[feature], nb=nb_words)
        freq_table = counts.to_frame('count').rename_axis('word').reset_index()

        ax = plt.subplot(n_rows, n_cols, slot + 1)
        ax.barh(data=freq_table, y='word', width='count',
                height=0.7, color=palette[slot])

        # Short labels (e.g. numeric ids) get the prefix; long ones are self-describing.
        if len(str(label)) < 3:
            panel_title = f'{label_prefix} {label}'
        else:
            panel_title = label
        ax.set_title(panel_title, fontdict={"fontsize": 16})
        ax.invert_yaxis()
        ax.tick_params(axis="both", which="major", labelsize=14)
        ax.grid(visible=False)
        for side in ("top", "right", "left"):
            ax.spines[side].set_visible(False)

    plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)
    plt.suptitle(titre, fontsize=20, y=1.02)


plot_top_words_from_data(
    data_T, feature='sentence_bow_lem', label_col='labels_LDA1',
    label_prefix='topic', titre='Top mots par fréquence dans chaque topic (LDA, Bag-of-Words)')
to_png()

In [None]:
plot_wordclouds_by_categ(
    data_T, feature='sentence_bow_lem', categ_col='labels_LDA1', nb=10)

### 6.3.2 Try 3b: LDA on TF-IDF

In [None]:
# TfidfVectorizer, decomposition (LatentDirichletAllocation) and GridSearchCV
# are imported in the notebook's import cell.


tfidf_vectorizer = TfidfVectorizer(analyzer='word',
                                   min_df=3,                        # minimum reqd occurrences of a word
                                   stop_words='english',             # remove stop words
                                   lowercase=True,                   # convert all words to lowercase
                                   # keep only alphanumeric tokens of 3 or more characters
                                   token_pattern='[a-zA-Z0-9]{3,}',
                                   )


# The vectorizer is fitted on the sentences built from the lemmatized bag-of-words
feature = 'sentence_bow_lem'
tfidf_vectors = tfidf_vectorizer.fit_transform(data_T[feature])

lda = decomposition.LatentDirichletAllocation(random_state=RANDOM_SEED,
                                              max_iter=10,               # Max learning iterations
                                              learning_method='online',
                                              batch_size=128,            # n docs in each learning iter
                                              )

# This param_grid takes about 5 minutes
# Hyper-parameters explored by the grid search
search_params = {
    'n_components': [7, 8, 9, 10],
    'learning_decay': [.5, .7, .9]
    # Other candidates, disabled to keep the search short:
    # 'learning_method': ['batch', 'online'],
    # (increasing the learning offset reduces the weight of the first iterations)
    # 'learning_offset': [2, 5, 10, 20, 50, 100],
}

# Best parameters found in a previous run:
# param_grid = {'learning_decay': [0.9], 'learning_method': ['online'], 'learning_offset': [20]}

# Grid search with 2-fold cross-validation; train scores are kept as well
model = GridSearchCV(lda, param_grid=search_params, verbose=1,
                     cv=2, return_train_score=True)

# Run the grid search on the TF-IDF matrix
model.fit(tfidf_vectors)

print(model.best_params_)

In [None]:
plot_lda_cross_validation(model)

In [None]:
lda_model: decomposition.LatentDirichletAllocation = model.best_estimator_
# Document-topic matrix: one row per document, one column per topic.
lda_output = lda_model.transform(tfidf_vectors)

nb_topics = lda_model.components_.shape[0]

# Column names
topics = [f'Topic {i}' for i in range(nb_topics)]
# Rounded copy of the topic weights, for display only.
df_topics = pd.DataFrame(np.round(lda_output, 2),
                         columns=topics, index=data_T.index)
# Dominant topic of each document, taken from the UNROUNDED weights:
# rounding to 2 decimals can create ties that argmax would then resolve
# in favour of the lowest topic index.
df_topics['pred_topic'] = np.argmax(lda_output, axis=1)
print(df_topics.head())

df_data_topics = data_T.join(df_topics)
df_data_topics['pred_topic'].value_counts()

In [None]:
labels_true = cat_encoder.fit_transform(df_data_topics['categ_level_1'])
labels_pred = df_data_topics['pred_topic']
data_T['labels_LDA2'] = labels_pred
data_T['labels_LDA2'] = data_T['labels_LDA2'].astype(str)

In [None]:
data_T.columns

In [None]:
conf_matrix_labels(data_T['categ_level_1'],
                   data_T['labels_LDA2']).style.background_gradient(axis=1)

In [None]:
ARI3b = calc_ARI(df_data_topics['categ_level_1'], labels_pred)
print(ARI3b)
# Add to global scores
df_resultats = add_model_score(
    model_name='LDA(TF_IDF) topics', ARI=ARI3b, k=nb_topics)

In [None]:
print(data_T['categ_level_1'].nunique())
print(data_T['labels_LDA2'].nunique())

ari_3b2 = plot_classification_metrics(
    data_T['categ_level_1'], data_T['labels_LDA2'])
plt.title(
    f'LDA (TF-IDF) topics - confusion matrix (ARI={ari_3b2:.3f})', fontsize=14)
to_png()

In [None]:
ARI3b2, X_tsne3b, labels3b = calc_tsne_cluster(tfidf_vectors)
print(ARI3b2)

fig = plot_clusters_sur_2D(X_tsne3b, y_cat_txt,
                           data_T['labels_LDA2'], ARI3b)
fig.axes[1].set_title('Representation des produits par topic LDA')
plt.suptitle(f'LDA sur TF-IDF, ARI = {ARI3b:.3f}')
to_png()

In [None]:

# Flow from true categories to LDA topics. The original passed `labels3b`
# (the labels returned by calc_tsne_cluster) while the title reports the
# LDA score ARI3b; the LDA topic labels are the ones matching the title.
plot_sankey_confusion_diagram(y_cat_txt, data_T['labels_LDA2'],
                              titre=f'LDA sur TF-IDF, ARI = {ARI3b:.3f}')

In [None]:

# Top words of each topic of the LDA model fitted on TF-IDF vectors.
# The vocabulary comes from the TF-IDF vectorizer, so the figure title
# must name TF-IDF (it previously said "bag-of-words", copied from try 3a).
get_topic_words(model=model.best_estimator_,
                words=tfidf_vectorizer.get_feature_names_out())
plot_topic_words(model=model.best_estimator_, topics=tfidf_vectorizer.get_feature_names_out(),
                 titre='LDA sur TF-IDF, top 10 mots pour chaque topic')
to_png()

In [None]:
plot_top_words_from_data(
    data_T, feature='sentence_bow_lem', label_col='labels_LDA2', nb_words=10)
plt.suptitle('LDA sur TF-IDF, top 10 mots pour chaque topic')
to_png()

In [None]:
plot_wordclouds_by_categ(
    data_T, feature='sentence_bow_lem', categ_col='labels_LDA2', nb=10)
plt.suptitle(
    'LDA on TF-IDF, 10 most frequent words in the product descriptions of each topic')
to_png()

### Prediction of a topic

In [None]:
# from Sklearn.decomposition Import latentdirichletallocation

def predict_topic(text,
                  preprocessor=transform_bow_lem_fct,
                  feature_extractor=tfidf_vectorizer,
                  dimension_reducer=best_lda_model
                  ):
    """Predict the dominant LDA topic of a raw text.

    Args:
        text: raw text to classify.
        preprocessor: callable turning the raw text into the cleaned
            sentence expected by the feature extractor.
        feature_extractor: fitted vectorizer exposing ``transform``.
        dimension_reducer: fitted ``LatentDirichletAllocation`` model.
            NOTE(review): it must have been fitted on features produced by
            ``feature_extractor`` — confirm that the default
            ``best_lda_model`` matches the default ``tfidf_vectorizer``.

    Returns:
        ``(topic, topic_probability_scores)`` where ``topic`` is a list with
        the index of the most probable topic and ``topic_probability_scores``
        is the matrix returned by the reducer's ``transform``.
        ``None`` when ``dimension_reducer`` is not an LDA model.

    Raises:
        ValueError: if no feature extractor is supplied.
    """
    # Step 1: clean the text with the injected preprocessor
    # (the original always called transform_bow_lem_fct, ignoring the argument).
    cleaned_text = preprocessor(text)
    print(cleaned_text)

    # Step 2: extract features
    if feature_extractor is None:
        # Previously this path failed later with an unbound local variable.
        raise ValueError('predict_topic requires a fitted feature_extractor')
    features = feature_extractor.transform([cleaned_text])

    # Step 3: reduce dimensions / predict the topic
    if isinstance(dimension_reducer, decomposition.LatentDirichletAllocation):
        topic_probability_scores = dimension_reducer.transform(features)
        topic = list(np.argmax(topic_probability_scores, axis=1))
        return topic, topic_probability_scores

    # Unsupported reducer type: kept as a None result for backward compatibility.
    return None


print(predict_topic('analog watch battery'))
print(predict_topic('soap cream massage'))

# 7. Word2Vec

Word2vec is based on ID

- <https://www.tensorflow.org/tutorials/text/word2vec>

## 7.1 Try 4: Word2Vec with lemmatized sentences

In [None]:
# import tensorflow as tf

# from tensorflow.keras.preprocessing.sequence import pad_sequences

# from Tensorflow.keras.models Import Model
# For Word2W
# Import Gensim

### Creation of the Word2Vec model

In [None]:
# Length phrases
_ = calc_length_bow(data_T)

In [None]:
w2v_size = 300
w2v_window = 5
w2v_min_count = 1
w2v_epochs = 100
maxlen = 50  # adapt to length of sentences
sentences = data_T['sentence_bow_lem'].to_list()
sentences = [gensim.utils.simple_preprocess(text) for text in sentences]

In [None]:
# Creation and training of the Word2Vec model

print("Build & train Word2Vec model ...")
# A fixed seed together with a single worker thread is used here; a
# multi-worker alternative is kept commented out below.
w2v_model = gensim.models.Word2Vec(min_count=w2v_min_count, window=w2v_window,
                                   vector_size=w2v_size,
                                   seed=42,
                                   workers=1)
# workers=multiprocessing.cpu_count())
w2v_model.build_vocab(sentences)
w2v_model.train(
    sentences, total_examples=w2v_model.corpus_count, epochs=w2v_epochs)
# Keyed vectors of the trained model and the list of words in its vocabulary
model_vectors = w2v_model.wv
w2v_words = model_vectors.index_to_key
print("Vocabulary size: %i" % len(w2v_words))
print("Word2Vec trained")

In [None]:
# Fit A 2D PCA Model to the Vectors
X = w2v_model.wv.get_normed_vectors()
# pca = pca (n_components = 0.95)
# RESULT = PCA.FIT_TRANSFORM (x)

X_red = reducer_pca(X)

In [None]:
# from Sklearn.Manifold Import Tsne

def plot_top_word_vectors(w2v_model_, X_, max_w=50):
    """Project the first ``max_w`` word vectors to 2D with t-SNE and label them.

    Args:
        w2v_model_: trained Word2Vec model; ``wv.index_to_key`` supplies the
            words, in the same order as the rows of ``X_``.
        X_: 2-D array of word vectors, one row per word.
        max_w: maximum number of words to project and annotate.
    """
    vectors = X_[:max_w, :]
    # t-SNE needs perplexity < number of samples; keep 40 whenever possible
    # so the default call (max_w=50) is unchanged.
    perplexity = min(40, vectors.shape[0] - 1)
    tsne_model = manifold.TSNE(perplexity=perplexity, n_components=2,
                               init='pca', learning_rate=200, n_iter=1000, random_state=23)
    X_tsne = tsne_model.fit_transform(vectors)

    # Plot the t-SNE output
    _, ax = plt.subplots()
    sns.scatterplot(x=X_tsne[:, 0], y=X_tsne[:, 1], s=1, ax=ax)
    ax.set_title('Words')
    ax.grid(False)
    ax.set_yticklabels([])  # Hide ticks
    ax.set_xticklabels([])  # Hide ticks

    # zip stops at the shorter of (vocabulary, projected points): no scan of
    # the whole vocabulary and no index past the projected rows.
    for word, (x_pos, y_pos) in zip(w2v_model_.wv.index_to_key, X_tsne):
        ax.annotate(word, xy=(x_pos, y_pos))
    plt.show()


plot_top_word_vectors(w2v_model, X, max_w=50)

In [None]:
# Seed words whose nearest neighbours in the Word2Vec space are explored
keys = ['baby', 'soap', 'laptop', 'showpiece', 'curtain', 'kitchen', 'watch']
embedding_clusters = []
word_clusters = []
for word in keys:
    embeddings = []
    words = []
    # Collect the 10 most similar words of the seed word and their vectors
    for similar_word, _ in w2v_model.wv.most_similar(word, topn=10):
        words.append(similar_word)
        embeddings.append(w2v_model.wv[similar_word])
    embedding_clusters.append(embeddings)
    word_clusters.append(words)

tsne_model_en_2d = manifold.TSNE(
    perplexity=30, n_components=2, init='pca', learning_rate=200, n_iter=3500, random_state=32)
# Stack to shape (n seed words, m neighbours, k vector size), flatten the
# first two axes for t-SNE, then restore the (n, m, 2) grouping.
embedding_clusters = np.array(embedding_clusters)
n, m, k = embedding_clusters.shape
embeddings_en_2d = np.array(tsne_model_en_2d.fit_transform(
    embedding_clusters.reshape(n * m, k))).reshape(n, m, 2)


def tsne_plot_similar_words(labels, embedding_clusters, word_clusters, a=0.7):
    """Scatter-plot groups of 2-D word embeddings, one colour per group.

    Args:
        labels: one legend label per group.
        embedding_clusters: per group, a 2-D array whose columns 0 and 1
            are the x and y coordinates of its words.
        word_clusters: per group, the words to annotate (same order as the
            rows of the matching embedding array).
        a: alpha (opacity) of the scatter markers.
    """
    plt.figure(figsize=(16, 9))
    palette = sns.color_palette('Dark2', n_colors=len(labels)).as_hex()
    groups = zip(labels, embedding_clusters, word_clusters, palette)
    for group_label, points, group_words, hue in groups:
        xs, ys = points[:, 0], points[:, 1]
        plt.scatter(xs, ys, c=hue, alpha=a, label=group_label, s=1)
        # Write each word next to its point, in the colour of its group.
        for row, text in enumerate(group_words):
            plt.annotate(text, color=hue, alpha=0.5, xy=(xs[row], ys[row]), xytext=(5, 2),
                         textcoords='offset points', ha='right', va='bottom', size=20)
    plt.legend(loc=4)
    plt.grid(False)
    plt.show()
# tsne_plot_similar_words (Keys, Embeddings_en_2d, Word_Clusters)

In [None]:
max_w = 70
ARI4a, X_tsne4a, labels4a = calc_tsne_cluster(X[:max_w, :])
print(ARI4a)

_, ax = plt.subplots(figsize=(16, 9))

sns.scatterplot(x=X_tsne4a[:, 0], y=X_tsne4a[:, 1],
                hue=labels4a, palette='Dark2', s=1, ax=ax, alpha=0.2, legend=None)

# One colour per cluster id. The palette size is derived from labels4a
# itself; the original used len(labels), an unrelated global that only
# exists if another cell has been run first.
# assumes labels4a holds integer cluster ids starting at 0 — TODO confirm
n_clusters4a = int(np.max(labels4a)) + 1
colors = sns.color_palette('Dark2', n_colors=n_clusters4a).as_hex()
words = list(w2v_model.wv.index_to_key)[:max_w]

# Write each word at its t-SNE position, coloured by its cluster.
for i, (word, label) in enumerate(zip(words, labels4a)):
    ax.annotate(word, color=colors[label], alpha=0.8, size=20,
                ha='center', va='center',
                xy=(X_tsne4a[i, 0], X_tsne4a[i, 1]))

plt.grid(False)
plt.title('Representation des top mots par cluster', fontsize=24)
to_png()

In [None]:
def tsne_plot(model, max_w=70, n_clusters=7):
    """Plot the first ``max_w`` words of a Word2Vec model in 2D, coloured by cluster.

    The normalized word vectors are projected with t-SNE, the 2-D points are
    grouped with KMeans, and every word is drawn in the colour of its cluster.

    Args:
        model: trained Word2Vec model (uses ``wv.get_normed_vectors()`` and
            ``wv.index_to_key``).
        max_w: number of leading vocabulary words to plot.
        n_clusters: number of KMeans clusters / colours (default 7, the value
            previously hard-coded).
    """
    tokens = model.wv.get_normed_vectors()[:max_w]
    labels = model.wv.index_to_key[:max_w]

    tsne_model = manifold.TSNE(
        perplexity=40, n_components=2, learning_rate=200, n_iter=2500, random_state=23)
    new_values = tsne_model.fit_transform(tokens)

    # Seeded so that the cluster colours are reproducible between runs
    # (same seed as the t-SNE projection above).
    clf = cluster.KMeans(n_clusters=n_clusters, random_state=23)
    clf.fit(new_values)
    label_color_id = clf.labels_
    colors = sns.color_palette('Dark2', n_colors=n_clusters)

    plt.figure(figsize=(10, 10))
    for word, (x, y), color_id in zip(labels, new_values, label_color_id):
        color = colors[color_id]
        plt.scatter(x, y, color=color, s=1)
        plt.annotate(word,
                     color=color,
                     xy=(x, y),
                     xytext=(5, 2),
                     textcoords='offset points',
                     fontsize=14,
                     ha='center',
                     va='center')
    plt.grid(False)
# # Plt.show ()

# # Add the word to the groups and focus on specific sets.


tsne_plot(w2v_model)

### sentence preparation (tokenization)

In [None]:
print("Fit Tokenizer ...")
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(sentences)
x_sentences = tf.keras.preprocessing.sequence.pad_sequences(
    tokenizer.texts_to_sequences(sentences),
    maxlen=maxlen, padding='post')

num_words = len(tokenizer.word_index) + 1
print("Number of unique words: %i" % num_words)

## Creation of the Embedding matrix

In [None]:
# Creation of the embedding matrix: row `idx` holds the Word2Vec vector of
# the word the Keras tokenizer maps to `idx`; rows of unknown words stay zero.

print("Create Embedding matrix ...")
word_index = tokenizer.word_index
vocab_size = len(word_index) + 1
embedding_matrix = np.zeros((vocab_size, w2v_size))

# key_to_index is a dict, so membership is O(1); the original tested
# membership in the list w2v_words, an O(vocabulary) scan per word.
known_words = model_vectors.key_to_index
n_embedded = 0
for word, idx in word_index.items():
    if word in known_words:
        embedding_matrix[idx] = model_vectors[word]
        n_embedded += 1

# Share of tokenizer words that received a vector (0.0 for an empty vocabulary)
word_rate = np.round(n_embedded / len(word_index), 4) if word_index else 0.0
print("Word embedding rate : ", word_rate)
print("Embedding matrix: %s" % str(embedding_matrix.shape))

In [None]:
for i, key in enumerate(word_index.keys()):
    if i < 5:
        print(f'{key} : {word_index[key]}')

## Creation of the embedding model

In [None]:
# Creation of the Model

# Input = tf.keras.layers.input (shape = (len (x_sentencies), maxlen), dtype = 'float64')
word_input = tf.keras.layers.Input(shape=(maxlen,), dtype='float64')
word_embedding = tf.keras.layers.Embedding(input_dim=vocab_size,
                                           output_dim=w2v_size,
                                           weights=[embedding_matrix],
                                           input_length=maxlen)(word_input)
word_vec = tf.keras.layers.GlobalAveragePooling1D()(word_embedding)
embed_model = tf.keras.models.Model([word_input], word_vec)

embed_model.summary()

## Model execution

In [None]:
embeddings = embed_model.predict(x_sentences)
embeddings.shape

In [None]:
ARI4, X_tsne4, labels4 = calc_tsne_cluster(embeddings)
# Add to global scores
df_resultats = add_model_score(
    model_name='Word2Vec', ARI=ARI4, k=pd.Series(labels4).nunique())
data_T['labels_W2V'] = labels4

### Metrics evaluation (Word2Vec)

In [None]:
ari_4 = plot_classification_metrics(data_T['categ_level_1'], labels4)
plt.title(f'Word2Vec - confusion matrix (ARI={ari_4:.3f})', fontsize=14)
to_png()

In [None]:
fig = plot_clusters_sur_2D(X_tsne4, y_cat_txt, labels4, ARI4)
plt.suptitle(f'Word2Vec, ARI = {ARI4:.3f}')
to_png()

In [None]:
# Cleanup Keras Model
del embed_model
del embeddings
tf.keras.backend.clear_session()

# 8. Bert

In [None]:

# Formerly imported in this cell (now commented out):
# from transformers import AutoTokenizer
# import os
# from tensorflow.keras.preprocessing.sequence import pad_sequences
# from tensorflow.keras.preprocessing.text import Tokenizer
# import tensorflow as tf
# import tensorflow_hub as hub
# NOTE(review): `os` is used below, so it must be imported in the notebook's
# import cell — confirm.

print(tf.keras.__version__)

# BERT
# from transformers import *

# Set the TF_KERAS environment variable to '1'
os.environ["TF_KERAS"] = '1'

In [None]:

print(tf.__version__)
print("Num GPUs Available: ", len(
    tf.config.experimental.list_physical_devices('GPU')))
print(tf.test.is_built_with_cuda())

## 8.1 Bert - common functions

In [None]:
# import time
# Import Transformers

# Sentences Preparation Function
def bert_inp_fct(sentences, bert_tokenizer, max_length):
    """Tokenize sentences into the three fixed-length BERT input arrays.

    Args:
        sentences: iterable of texts.
        bert_tokenizer: tokenizer exposing ``encode_plus``.
        max_length: length every sequence is padded / truncated to.

    Returns:
        ``(input_ids, token_type_ids, attention_mask, bert_inp_tot)``: the
        first three are arrays with one row per sentence, the last is a list
        of per-sentence ``(input_ids, token_type_ids, attention_mask)`` tuples.
    """
    encode_options = dict(add_special_tokens=True,
                          max_length=max_length,
                          padding='max_length',
                          return_attention_mask=True,
                          return_token_type_ids=True,
                          truncation=True,
                          return_tensors="tf")

    per_sentence = []
    for sentence in sentences:
        encoded = bert_tokenizer.encode_plus(sentence, **encode_options)
        # The tokenizer returns batched outputs; keep the single row.
        per_sentence.append((encoded['input_ids'][0],
                             encoded['token_type_ids'][0],
                             encoded['attention_mask'][0]))

    ids_array = np.asarray([triple[0] for triple in per_sentence])
    types_array = np.asarray([triple[1] for triple in per_sentence])
    mask_array = np.array([triple[2] for triple in per_sentence])

    return ids_array, types_array, mask_array, per_sentence


# Features Creation Function
def feature_BERT_fct(model, model_type: str, sentences, max_length, b_size, mode='HF'):
    """Compute one BERT feature vector per sentence (mean of the last hidden states).

    Args:
        model: BERT model. With ``mode='HF'`` it is called through
            ``model.predict`` and must return an object exposing
            ``last_hidden_state``; with ``mode='TFhub'`` it is called on a
            dict of inputs and must return a mapping with ``'sequence_output'``.
        model_type: name passed to ``transformers.AutoTokenizer.from_pretrained``.
        sentences: sequence of texts.
        max_length: token length every sentence is padded / truncated to.
        b_size: number of sentences processed per batch.
        mode: ``'HF'`` (HuggingFace) or ``'TFhub'`` (TensorFlow Hub).

    Returns:
        ``(features_bert, last_hidden_states_tot)``: the per-sentence mean over
        axis 1 of the hidden states, and the concatenated hidden states.

    Raises:
        ValueError: if ``mode`` is unknown or ``sentences`` is empty.
    """
    if mode not in ('HF', 'TFhub'):
        raise ValueError(f"mode must be 'HF' or 'TFhub', got {mode!r}")
    if len(sentences) == 0:
        raise ValueError('feature_BERT_fct needs at least one sentence')

    bert_tokenizer = transformers.AutoTokenizer.from_pretrained(model_type)
    time1 = time.time()

    # Stepping by b_size up to len(sentences) also processes the final,
    # possibly shorter, batch; `len(sentences) // b_size` steps silently
    # dropped it, leaving fewer feature rows than sentences.
    hidden_batches = []
    for idx in range(0, len(sentences), b_size):
        input_ids, token_type_ids, attention_mask, _ = bert_inp_fct(
            sentences[idx:idx + b_size], bert_tokenizer, max_length)

        if mode == 'HF':    # Bert HuggingFace
            outputs = model.predict(
                [input_ids, attention_mask, token_type_ids], batch_size=b_size)
            last_hidden_states = outputs.last_hidden_state
        else:               # Bert Tensorflow Hub
            text_preprocessed = {"input_word_ids": input_ids,
                                 "input_mask": attention_mask,
                                 "input_type_ids": token_type_ids}
            outputs = model(text_preprocessed)
            last_hidden_states = outputs['sequence_output']

        hidden_batches.append(np.asarray(last_hidden_states))

    # Single concatenation instead of growing the array at every step.
    last_hidden_states_tot = np.concatenate(hidden_batches)
    features_bert = last_hidden_states_tot.mean(axis=1)

    time2 = np.round(time.time() - time1, 0)
    print("processing time: ", time2)

    return features_bert, last_hidden_states_tot

## 8.2 Try 5: Bert Huggingface

### Model 'Bert-Base-Uncased'

- Downloads 511 MB Pretrained Model

In [None]:
# from Transformers Import Tfautomodel

max_length = 64
batch_size = 10
model_type = 'bert-base-uncased'
model = transformers.TFAutoModel.from_pretrained(model_type)

sentences = data_T['sentence_dl'].to_list()

In [None]:
# Creation of Features
# Downloads Bert Features
features_bert, last_hidden_states_tot = feature_BERT_fct(model, model_type, sentences,
                                                         max_length, batch_size, mode='HF')

In [None]:
ARI5, X_tsne5, labels5 = calc_tsne_cluster(features_bert)
# Add to global scores
df_resultats = add_model_score(
    model_name='Bert HuggingFace', ARI=ARI5, k=pd.Series(labels5).nunique())
data_T['labels_berthf'] = labels5

### Metrics evaluation (BERT HuggingFace)

In [None]:
ari_5 = plot_classification_metrics(data_T['categ_level_1'], labels5)
plt.title(
    f'Bert HuggingFace - confusion matrix (ARI={ari_5:.3f})', fontsize=14)
to_png()

In [None]:
plot_clusters_sur_2D(X_tsne5, y_cat_txt, labels5, ARI5)
plt.suptitle(f'BERT HuggingFace, ARI = {ARI5:.3f}')
to_png()

In [None]:
# Cleanup Keras Model
del model
del features_bert
tf.keras.backend.clear_session()

## 8.3 Try 6: Bert Hub Tensorflow

In [None]:
# import tensorflow_hub as hub


# Guide on the tensorflow hub: https://www.tensorflow.org/text/tutorials/classify_text_with_bert
# Downloads 500MB of Model
small_model_url = 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1'
model_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4'
bert_layer = tensorflow_hub.KerasLayer(model_url, trainable=True)

In [None]:
sentences = data_T['sentence_dl'].to_list()

In [None]:
max_length = 64
batch_size = 10
model_type = 'bert-base-uncased'
model = bert_layer

features_bert, last_hidden_states_tot = feature_BERT_fct(model, model_type, sentences,
                                                         max_length, batch_size, mode='TFhub')

In [None]:
ARI6, X_tsne6, labels6 = calc_tsne_cluster(features_bert)
# Add to global scores
df_resultats = add_model_score(
    model_name='BERT Hub (base uncased)', ARI=ARI6, k=pd.Series(labels6).nunique())
data_T['labels_berthub'] = labels6

### Metrics evaluation (Bert Uncased)

In [None]:
ari_6 = plot_classification_metrics(data_T['categ_level_1'], labels6)
plt.title(
    f'Bert base uncased - confusion matrix (ARI={ari_6:.3f})', fontsize=14)
to_png()

In [None]:
plot_clusters_sur_2D(X_tsne6, y_cat_txt, labels6, ARI6)
print(f'BERT model_url : {model_url}')
plt.suptitle(f'BERT Hub (base uncased), ARI = {ARI6:.3f}')

to_png()

In [None]:
# Cleanup Keras Model
del model
del bert_layer
del features_bert
tf.keras.backend.clear_session()

# 9. USE - Universal Sentence Encoder

## 9.1 Try 7: USE - Universal Sentence Encoder

In [None]:
# import tensorflow as tf

# Bert
# Import Transformers
os.environ["TF_KERAS"] = '1'

In [None]:

print(tf.__version__)
print("Num GPUs Available: ", len(
    tf.config.experimental.list_physical_devices('GPU')))
print(tf.test.is_built_with_cuda())

In [None]:
# import tensorflow_hub as hub
USE_model_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
# 1GB Model
embed = tensorflow_hub.load(USE_model_url)

In [None]:
def feature_USE_fct(sentences, b_size, embed_fn=None):
    """Embed sentences batch by batch and stack the results.

    Args:
        sentences: sequence of texts.
        b_size: number of sentences embedded per call.
        embed_fn: callable mapping a list of sentences to a 2-D array-like
            of embeddings. Defaults to the notebook-level ``embed`` model.

    Returns:
        numpy array with one embedding row per sentence.

    Raises:
        ValueError: if ``sentences`` is empty.
    """
    if len(sentences) == 0:
        raise ValueError('feature_USE_fct needs at least one sentence')

    embed_callable = embed if embed_fn is None else embed_fn
    time1 = time.time()

    # Stepping by b_size up to len(sentences) includes the final, possibly
    # shorter, batch; `len(sentences) // b_size` steps silently dropped it.
    batches = [np.asarray(embed_callable(sentences[start:start + b_size]))
               for start in range(0, len(sentences), b_size)]
    features = np.concatenate(batches)

    time2 = np.round(time.time() - time1, 0)
    print(f'feature_USE_fct, time_taken = {time2} s')
    return features

In [None]:
batch_size = 10
sentences = data_T['sentence_dl'].to_list()

In [None]:
features_USE = feature_USE_fct(sentences, batch_size)

In [None]:
ARI7, X_tsne7, labels7 = calc_tsne_cluster(features_USE)
# Add to global scores under the USE model's own name (the original reused
# 'BERT Hub (base uncased)', so the USE score was filed under the BERT Hub name).
df_resultats = add_model_score(
    model_name='Universal Sentence Encoder', ARI=ARI7, k=pd.Series(labels7).nunique())
data_T['labels_USE'] = labels7

### Evaluation Metrics (USE)

In [None]:
ari_7 = plot_classification_metrics(data_T['categ_level_1'], labels7)
plt.title(
    f'Universal Sentence Encoder - confusion matrix (ARI={ari_7:.3f})', fontsize=14)
to_png()

In [None]:
plot_clusters_sur_2D(X_tsne7, y_cat_txt, labels7, ARI7)

USE_model_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
print(f'USE model_url : {USE_model_url}')
plt.suptitle(f'USE - Universal Sentence Encoder, ARI = {ARI7:.3f}')
to_png()

In [None]:
# Cleanup Keras Model
del embed
del features_USE
tf.keras.backend.clear_session()

# 10. Supervised Labeling with a simple neural network

The best (and fastest) score was obtained with TF-IDF.

We will train a neural network to see if we can improve the predictions.

Note: This model uses a vocabulary limited to our samples, and may not generalize to other products.

If this is the case, the Tokenizer must be replaced with an Embedding layer, as in the Word2Vec model below. In that case, transfer learning becomes possible.

In [None]:
# from tensorflow.keras.models import sequential
# from tensorflow.keras.layers import dense
# from tensorflow.keras.preprocessing.text import tokenizer

# from Sklearn.Preprocessing import labelencoder
# from Sklearn.FEATURE Extraction.Text Import TF IDF QUOTRIER
# from Sklearn.model_selection import train_test_split

# Top 1000 Words Present in Minimum of 3 Products
# tfidf_vectorizer = tfidfvectorizer (max_Features = 1000,
# stop_words = 'English', min_df = 3)

# We apply to the sentence created from bag-of-words with lemmatization
# feature = 'sentence_bow_lem'
# tfidf_vectors = tfidf_vectorizer.fit_transform (data_t [feature])


# Alternative inputs: data_T['description'] or data_T['sentence_dl']
documents = data_T['sentence_bow_lem']
labels = data_T['categ_level_1']

# Stratified, seeded 80/20 split: every category is present in both subsets
# (the label encoder is fitted on the training labels only) and the split is
# identical on every Restart & Run All.
train_documents, test_documents, train_labels, test_labels = train_test_split(
    documents, labels, train_size=0.8, stratify=labels, random_state=RANDOM_SEED)

# Preprocessing
# With TF-IDF, we need to build a large sparse matrix of word frequencies.
# For this study, it works,
# but with 50,000 products the top 1000 terms may not be enough to group them.
vocab_size = 1000
tokenize = tf.keras.preprocessing.text.Tokenizer(num_words=vocab_size)
tokenize.fit_on_texts(train_documents)

# TF-IDF weighted document/word matrices (one row per document)
x_train = tokenize.texts_to_matrix(train_documents, mode='tfidf')
x_test = tokenize.texts_to_matrix(test_documents, mode='tfidf')
print(x_train.shape)


# Integer-encode the category names; the encoder is fitted on the training labels
encoder = preprocessing.LabelEncoder()
encoder.fit(train_labels)
y_train = encoder.transform(train_labels)
y_test = encoder.transform(test_labels)
num_labels = labels.nunique()

# Single hidden layer of 512 ReLU units followed by a softmax over the categories
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Dense(
    512, input_shape=(vocab_size,), activation='relu'))
model.add(tf.keras.layers.Dense(num_labels, activation='softmax'))
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

In [None]:
history = model.fit(x_train, y_train,
                    batch_size=16,
                    epochs=10,
                    verbose=1,
                    validation_data=(x_test, y_test))

In [None]:

def plot_diagnostic_learning_curves(history):
    """Plot training vs validation loss and accuracy, side by side.

    Args:
        history: object whose ``history`` attribute is a mapping with the
            keys 'loss', 'val_loss', 'accuracy' and 'val_accuracy', each
            holding one value per epoch.
    """
    train_color, validate_color = sns.color_palette('tab10')[:2]
    fig, (loss_ax, accuracy_ax) = plt.subplots(1, 2, figsize=(12, 5))

    # (axes, title, training key, validation key, y-axis label) for each panel
    panels = (
        (loss_ax, 'Cross Entropy Loss', 'loss', 'val_loss', 'loss'),
        (accuracy_ax, 'Classification Accuracy', 'accuracy', 'val_accuracy', 'accuracy'),
    )
    for axis, panel_title, train_key, validate_key, y_label in panels:
        axis.set_title(panel_title)
        axis.plot(history.history[train_key], c=train_color, label='train')
        axis.plot(history.history[validate_key], c=validate_color, label='validate')
        axis.set_xlabel('epochs')
        axis.set_ylabel(y_label)
        axis.legend()

    fig.suptitle('Loss and accuracy evolution over epochs')


plot_diagnostic_learning_curves(history)

In [None]:
# Evaluate on the held-out set with the batch size used for training (16).
# The original read the global `batch_size`, which is only defined by the
# BERT/USE cells and made this cell depend on them having been run.
score = model.evaluate(x_test, y_test,
                       batch_size=16, verbose=1)
print('Test score:', score[0])
print('Test accuracy:', score[1])

### Calculate predicted labels

In [None]:
# Show the model's prediction for the first few test documents.
n_examples = 5
# One batched predict call instead of one call per document.
sample_probabilities = model.predict(x_test[:n_examples])
for i, prediction in enumerate(sample_probabilities):
    print(prediction)
    best_class = int(np.argmax(prediction))
    print(best_class)
    # inverse_transform returns an array; take its single element so the
    # label prints as plain text rather than as an array repr.
    predicted_label = encoder.inverse_transform([best_class])[0]
    predicted_weight = 100 * prediction.max()
    print(test_documents.iloc[i][:50], "...")
    print(f'Actual label: {test_labels.iloc[i]}')
    print(f'Predicted label: {predicted_label} ({predicted_weight:.0f}%)')

In [None]:
def calc_pred_label(x_, model_, encoder_, documents_):
    """Convert model outputs into per-document labels and confidence scores.

    Runs ``model_.predict`` on ``x_``, takes the highest-scoring class of each
    output row and maps it to its name through ``encoder_.classes_``.

    Parameters
    ----------
    x_ : model input passed unchanged to ``model_.predict``
    model_ : object exposing ``predict``
    encoder_ : object exposing ``classes_`` (class names indexed by class id)
    documents_ : object whose ``index`` labels the rows of the result

    Returns
    -------
    (pd.Series, pd.Series)
        ``predicted``: name of the highest-scoring class for each row.
        ``pred_weight``: that highest score times 100, rounded to 2 decimals.
        Both series are indexed by ``documents_.index``.
    """
    scores = np.array(model_.predict(x_))
    class_names = encoder_.classes_

    best_names = [class_names[np.argmax(row)] for row in scores]
    best_weights = [np.round(np.max(row) * 100, 2) for row in scores]

    doc_index = documents_.index
    pred_labels = pd.Series(best_names, index=doc_index, name='predicted')
    pred_weights = pd.Series(best_weights, index=doc_index, name='pred_weight')
    return pred_labels, pred_weights


# Predicted label and confidence for every test and train document
test_pred_labels, test_pred_weights = calc_pred_label(
    x_test, model, encoder, test_documents)
train_pred_labels, train_pred_weights = calc_pred_label(
    x_train, model, encoder, train_documents)

# Quick visual check: first ten predicted labels vs. first ten actual labels
print(list(test_pred_labels[:10]))
print(list(test_labels[:10]))

In [None]:
# Score the test predictions with calc_ARI (helper defined elsewhere in the
# notebook) and register the score in the results table.
ARI8 = calc_ARI(test_labels, test_pred_labels)
print(ARI8)
# NOTE(review): k=7 presumably is the number of categories - confirm against
# add_model_score.
df_resultats = add_model_score(model_name='supervised TF-IDF', ARI=ARI8, k=7)

#### Metrics evaluation - TF-IDF supervised

In [None]:
# plot_classification_metrics (defined elsewhere in the notebook) draws the
# metrics figure; its return value is shown as the ARI in the title.
ari_8 = plot_classification_metrics(test_labels, test_pred_labels)
plt.title(
    f'TF-IDF supervisée - confusion matrix (ARI={ari_8:.3f})', fontsize=14)
# to_png is a notebook helper - presumably saves the current figure; verify.
to_png()

In [None]:

# Sankey view of actual (source) vs. predicted (target) test labels
plot_sankey_confusion_diagram(source=test_labels, target=test_pred_labels)

In [None]:
# Labelled confusion matrix for the test split, shaded by cell value
conf_matrix_labels(test_labels, test_pred_labels).style.background_gradient()

### Misclassified texts

Both train and test data are included so that all misclassifications can be inspected.

In [None]:
# Stack test and train predictions so that every scored document is covered
pred_labels = pd.concat([test_pred_labels, train_pred_labels], axis=0)
pred_weights = pd.concat([test_pred_weights, train_pred_weights], axis=0)

In [None]:
# Put true labels, predicted labels and confidences side by side (aligned on
# the index), then attach the documents and the 'description' column of data_T.
df_pred = pd.concat([labels, pred_labels, pred_weights], axis=1).join(
    documents).join(data_T['description'])
df_pred.head()

In [None]:
# Boolean mask of rows whose predicted label differs from 'categ_level_1'.
# NOTE(review): rows without a prediction (NaN) would also compare as
# different - confirm every row of `labels` was scored.
misclassified = df_pred['categ_level_1'] != df_pred['predicted']
print(misclassified.sum())
df_pred[misclassified].head()

Without the image, even a human would find these items difficult to classify. It is therefore not necessarily worthwhile to apply transfer learning to them.

In [None]:
# Cleanup: drop the reference to the trained model and reset the state
# generated by Keras before the next model is built.
del model
tf.keras.backend.clear_session()

### Supervised classification with our own (trained from scratch) word embeddings

Note: because the embedding is learned only from the vocabulary of the documents at hand, the model overfits (is strongly biased towards) the products already present.

However, if an item cannot be classified automatically via unsupervised classification (text + images), this model can be used to suggest the most likely category / subcategory.

- https://machinelearningmastery.com/use-word-embedding-layers-deep-learning-keras/

In [None]:
# Integer word-index sequences, ready to feed an Embedding layer
x_train_for_embed = tokenize.texts_to_sequences(train_documents)
x_test_for_embed = tokenize.texts_to_sequences(test_documents)

# Bring every document to the same length of `max_words` tokens
# (pad_sequences pads and truncates at the start of the sequence by default).
# NOTE: x_train / x_test are rebound here and no longer hold the inputs of
# the previous model; re-run that section's cells before re-using it.
max_words = 50
x_train = tf.keras.preprocessing.sequence.pad_sequences(
    x_train_for_embed, maxlen=max_words)
x_test = tf.keras.preprocessing.sequence.pad_sequences(
    x_test_for_embed, maxlen=max_words)
embedding_dim = 4
print(x_train.shape)

# Embedding learned from scratch -> flatten -> softmax classifier
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Embedding(
    vocab_size, embedding_dim, input_length=max_words))
model.add(tf.keras.layers.Flatten())
# One output unit per class: use num_labels, as the previous model does,
# instead of a hard-coded 7.
model.add(tf.keras.layers.Dense(num_labels, activation='softmax'))
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

In [None]:

# Train for 100 epochs without per-epoch console output (verbose=0).
# NOTE(review): the test split is used as validation data here as well.
history = model.fit(x_train, y_train,
                    batch_size=16,
                    epochs=100,
                    verbose=0,
                    validation_data=(x_test, y_test))

In [None]:
# Learning curves of the word-embedding model fitted in the previous cell
plot_diagnostic_learning_curves(history)

In [None]:
# Predicted label and confidence for every test and train document
# (overwrites the prediction variables of the previous model)
test_pred_labels, test_pred_weights = calc_pred_label(
    x_test, model, encoder, test_documents)
train_pred_labels, train_pred_weights = calc_pred_label(
    x_train, model, encoder, train_documents)

# Quick visual check: first ten predicted labels vs. first ten actual labels
print(list(test_pred_labels[:10]))
print(list(test_labels[:10]))

#### Metrics evaluation - Word Embedding Supervised

In [None]:

# NOTE(review): test_labels.unique() lists labels in order of first
# appearance, not sorted; confirm plot_classification_metrics does not expect
# target_names in encoder (class id) order. The TF-IDF section calls it
# without target_names.
ari_9 = plot_classification_metrics(
    test_labels, test_pred_labels, target_names=test_labels.unique())
plt.title(
    f'Word Embedding supervisée - confusion matrix (ARI={ari_9:.3f})', fontsize=14)
# to_png is a notebook helper - presumably saves the current figure; verify.
to_png()

In [None]:
# Register the word-embedding score in the results table
# (k=7 presumably is the number of categories - confirm).
df_resultats = add_model_score(
    model_name='Word Embedding supervisée', ARI=ari_9, k=7)

In [None]:
# Labelled confusion matrix for the test split, shaded by cell value
conf_matrix_labels(test_labels, test_pred_labels).style.background_gradient()

# 11. Results

## 11.1 Results summary

In [None]:
# Rank the models from best to worst ARI, save the table to OUT_FOLDER as CSV
# and display it.
df_resultats = df_resultats.sort_values(by='ARI', ascending=False)
df_resultats.to_csv(f'{OUT_FOLDER}/nlp_resultats.csv')
df_resultats

## 11.2 Recording of results

Save the data together with the product labels assigned by each model.

In [None]:
# data_T.to_csv(f'{OUT_FOLDER}/data_text_labelled.csv')