In [1]:
import sys
# Add the parent directory to the module search path.
# Presumably this is what lets the `utils.*` imports in the next cell resolve
# when the notebook is run from its own folder — TODO confirm project layout.
sys.path.append("../")

In [2]:
import polars as pl
import yaml

from slugify import slugify
from scipy.sparse import csr_matrix
from nltk.corpus import stopwords
import string

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA

from utils.load_data import *
from utils.pairwise_similarity import pairwise_similarity

from french_lefff_lemmatizer.french_lefff_lemmatizer import FrenchLefffLemmatizer

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)



### Load Data

In [3]:
# Load the project configuration (a YAML file one directory above the notebook).
# The encoding is set explicitly so the file is decoded the same way on every
# platform instead of relying on the locale's default encoding.
with open("../config.yml", "r", encoding="utf-8") as file:
    # safe_load only builds plain Python objects (no arbitrary object construction).
    config = yaml.safe_load(file)

In [4]:
# Load the "gold" table of each retailer, always in the order
# monoprix, franprix, auchan. `gold` comes from the star import of
# `utils.load_data`; `.collect()` is called on its result
# (presumably a polars LazyFrame — TODO confirm).
monoprix_gold, franprix_gold, auchan_gold = (
    gold('../data/', retailer, config).collect()
    for retailer in ('monoprix', 'franprix', 'auchan')
)

# Position in this list is the index `i` used for the `*_{i}` column names below.
datasets = [monoprix_gold, franprix_gold, auchan_gold]

In [5]:
# Classification column read from every retailer frame (level chosen in the config).
level_column = f"level{config['classification_most_relevant_level']}"

# One temporary brand-slug column per retailer, suffixed with the retailer index.
slug_columns = [f'brand_desc_slug_{i}' for i in range(len(datasets))]

# Keep only the product id, the brand slug and the classification of each retailer,
# renaming the last two so that they do not collide once the frames are combined.
per_retailer_frames = [
    retailer_frame.select(
        pl.col("product_id"),
        pl.col("brand_desc_slug").alias(slug_columns[i]),
        pl.col(level_column).alias(f"level_{i}"),
    )
    for i, retailer_frame in enumerate(datasets)
]

dataset_init = (
    pl.concat(per_retailer_frames, how="align")
    # Gather the per-retailer slugs into one list column, then drop the temporaries.
    .with_columns(pl.concat_list(slug_columns).alias('brand_desc_slug'))
    .drop(slug_columns)
    # One row per (product, brand slug); slugs missing for a retailer are discarded.
    .explode("brand_desc_slug")
    .filter(pl.col('brand_desc_slug').is_not_null())
)

print(dataset_init.shape)
dataset_init.head(5)

(99359, 5)


product_id,level_0,level_1,level_2,brand_desc_slug
str,str,str,str,str
"""0000000001649""",,"""PATISSERIE ET …",,"""LITTLE MOONS M…"
"""0000000007198""",,"""MELON ET PASTE…",,"""BIO ENSEMBLE"""
"""0000000007256""",,"""BIO FRUITS ET …",,"""BIO ENSEMBLE"""
"""0000000007395""",,"""AIL OIGNONS HE…",,"""BIO ENSEMBLE"""
"""0000000007396""",,"""AIL OIGNONS HE…",,"""BIO ENSEMBLE"""


### Preprocessing

In [6]:
# English stop words followed by French stop words, both from the NLTK corpus.
STOPWORDS_LIST = [*stopwords.words('english'), *stopwords.words('french')]

# Single lemmatizer instance, reused for every word in `lemmatize_words`.
lemmatizer = FrenchLefffLemmatizer()

def convert_to_list_of_words(list_of_sentences):
    """Flatten sentences into lower-cased, punctuation-free word tokens.

    Each sentence is split on whitespace. Every ASCII punctuation character
    (``string.punctuation``) is removed from each piece and the remaining
    characters are lower-cased. Pieces that end up empty (for example a lone
    ``&`` or ``-``) are skipped, so no empty-string token is returned.

    Args:
        list_of_sentences: Iterable of strings.

    Returns:
        list[str]: The tokens of all sentences, in order of appearance.
    """
    words = []
    for sentence in list_of_sentences:
        for piece in sentence.split():
            token = ''.join(
                char.lower() for char in piece if char not in string.punctuation
            )
            # A piece made only of punctuation collapses to '' — drop it.
            if token:
                words.append(token)
    return words

def lemmatize_words(list_of_words):
    """Drop stop words, then lemmatize the remaining words.

    Args:
        list_of_words: Iterable of word strings.

    Returns:
        list: ``lemmatizer.lemmatize(word)`` for every word that is not in
        ``STOPWORDS_LIST``, in the original order.
    """
    lemmas = []
    for word in list_of_words:
        if word in STOPWORDS_LIST:
            continue
        lemmas.append(lemmatizer.lemmatize(word))
    return lemmas

def remove_duplicates(l):
    """Return the distinct items of ``l``, keeping the first occurrence of each.

    A plain ``set`` would also deduplicate, but its iteration order for strings
    changes from one interpreter run to the next (hash randomisation), which
    makes the resulting word order non-reproducible. ``dict`` keys preserve
    insertion order, so the output is stable.

    Args:
        l: Iterable of hashable items.

    Returns:
        list: The unique items, in order of first appearance.
    """
    return list(dict.fromkeys(l))

In [7]:
# Classification columns used as text source: every `level*` column except the
# one whose name ends with '2'.
# NOTE(review): the original comment here said "drop Franprix classification",
# but with datasets = [monoprix, franprix, auchan] the column `level_2` is the
# Auchan one (Franprix is `level_1`) — confirm which retailer should be excluded.
level_columns = [
    name
    for name in dataset_init.columns
    if name.startswith('level') and not name.endswith('2')
]

dataset_init_concat = (
    dataset_init
    .select("brand_desc_slug", *level_columns)
    .unique()
    # Put the kept classifications side by side in a single list column ...
    .with_columns(pl.concat_list(level_columns).alias("level"))
    .select('brand_desc_slug', 'level')
    # ... then go to one row per (brand, classification label), without nulls or repeats.
    .explode("level")
    .filter(pl.col("level").is_not_null())
    .unique()
    # Collect all the labels of a brand into one list.
    .groupby('brand_desc_slug')
    .agg(pl.col("level"))
    # Labels -> words -> distinct words, plus a stop-word-free lemmatized copy.
    .with_columns(pl.col("level").apply(convert_to_list_of_words))
    .with_columns(pl.col("level").apply(remove_duplicates))
    .with_columns(pl.col("level").apply(lemmatize_words).alias("level_lemmatize"))
    # Turn both word lists into space-separated strings.
    .with_columns(pl.col('level').cast(pl.List(pl.Utf8)).list.join(" "))
    .with_columns(pl.col('level_lemmatize').cast(pl.List(pl.Utf8)).list.join(" "))
    # Slugified, upper-cased version of the lemmatized text (input of the TF-IDF step).
    .with_columns(
        pl.col('level_lemmatize')
        .apply(lambda text: slugify(text, separator=" ").upper().strip())
        .alias('level_slug')
    )
)

dataset_init_concat.head(10)

brand_desc_slug,level,level_lemmatize,level_slug
str,str,str,str
"""LE RUSTIQUE""","""fromage bries …","""fromage brie b…","""FROMAGE BRIE B…"
"""KER DIOP""","""graine bio fru…","""graine bio fru…","""GRAINE BIO FRU…"
"""CHIPSTER""","""chips biscuit …","""chips biscuit …","""CHIPS BISCUIT …"
"""CHEMINETT""","""barbecue""","""barbecue""","""BARBECUE"""
"""FERMIER D AUVE…","""volaille""","""volaille""","""VOLAILLE"""
"""MONOPRIX GOURM…","""baguette""","""baguette""","""BAGUETTE"""
"""YOPLAIT PERLE …","""aux fruits aro…","""fruit aromatis…","""FRUIT AROMATIS…"
"""CANDEREL SUGAR…","""sucre""","""sucre""","""SUCRE"""
"""LIV""","""classique plat…","""classique plat…","""CLASSIQUE PLAT…"
"""GEFEN""","""casher""","""casher""","""CASHER"""


### TfidfVectorizer

In [8]:
# Text to vectorize: the slugified classification words, one string per brand.
dataset = dataset_init_concat['level_slug']

# Word-level TF-IDF: a token is a run of at least two ASCII letters, and the
# English + French stop words are ignored.
# NOTE(review): no `ngram_range` is passed here, so the `_ngram` suffix of the
# names below does not reflect an explicit n-gram setting — confirm intent.
vectorizer_ngram = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'(?u)\b[A-Za-z]{2,}\b',
    stop_words=STOPWORDS_LIST,
)
tf_idf_matrix_ngram = vectorizer_ngram.fit_transform(dataset)

# Vocabulary terms, used as column names for the dense frame below.
tfidf_tokens_ngram = vectorizer_ngram.get_feature_names_out()

# Dense copy of the TF-IDF matrix with one named column per term.
df_tfidfvect_ngram = pl.DataFrame(
    data=tf_idf_matrix_ngram.toarray(),
    schema=tfidf_tokens_ngram.tolist(),
)

print(tf_idf_matrix_ngram.shape)

df_tfidfvect_ngram.head()

(3426, 833)


abbaye,abricot,absorbeur,accessoire,adhesifs,ado,adoucissant,agrume,aide,ail,air,alcool,aliment,alimentaire,alimentation,allege,allegee,alleges,allume,allumette,alternatifs,aluminium,ambiante,ambree,ambrees,americains,ampoule,amsterdam,anchois,andouillette,anglaise,animal,anise,anti,antipasti,aperitif,aperitifs,…,ultra,usa,usage,ustensile,vaisselle,vanille,vaporisateur,veau,vegan,vegetal,vegetale,vegetales,vegetariens,vegetaux,velo,veloutes,vermicelle,verre,vert,viande,vichy,viennoiserie,viennoiseries,vin,vinaigre,vinaigrette,visage,vitalite,vitamine,vitre,vodka,volaille,wc,whisky,wrap,yaourt,yeux
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.591432,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [9]:
## Brand names, in the row order of `dataset_init_concat`
# (`brand_desc_slug` is the first column of that frame).
name_vector = dataset_init_concat["brand_desc_slug"].to_list()
print(len(name_vector))

3426


### PCA

In [10]:
# A float `n_components` between 0 and 1 asks PCA to keep as many components
# as needed to reach that share of explained variance (see the scikit-learn docs).
EXPLAINED_VARIANCE_TARGET = 0.9

pca = PCA(n_components=EXPLAINED_VARIANCE_TARGET)
tf_idf_matrix_ngram_pca = pca.fit_transform(df_tfidfvect_ngram)

# Number of components actually retained.
f"Nb components: {pca.n_components_}"

'Nb components: 266'

In [11]:
# Wrap the PCA-reduced features in a CSR matrix for the similarity step.
concat_matrix = csr_matrix(tf_idf_matrix_ngram_pca)

# Row / column indices of the stored non-zero entries (both arrays have the same length).
nz_rows, nz_cols = concat_matrix.nonzero()
print(len(nz_rows), len(nz_cols))
concat_matrix.data

911316 911316


array([-0.00449974, -0.00345188, -0.00519737, ...,  0.0051167 ,
        0.01125158, -0.0039131 ])

### Similarity

In [12]:
# Compute the brand-by-brand cosine similarity matrix.
# The result is stored under its own name: assigning it to `cosine_similarity`
# would overwrite the imported scikit-learn function, and running this cell a
# second time would then fail with "'numpy.ndarray' object is not callable".
cosine_similarity_matrix = cosine_similarity(concat_matrix)
cosine_similarity_csr = csr_matrix(cosine_similarity_matrix)

# `nonzero()` walks the whole matrix, so call it once and reuse the result.
nonzero_rows, nonzero_cols = cosine_similarity_csr.nonzero()
print((nonzero_rows, nonzero_cols))
print(len(nonzero_rows), len(nonzero_cols))
print(cosine_similarity_csr.data)

(array([   0,    0,    0, ..., 3425, 3425, 3425]), array([   0,    1,    2, ..., 3423, 3424, 3425]))
11737476 11737476
[ 1.         -0.02401624 -0.01837458 ... -0.02492766 -0.01320667
  1.        ]


In [13]:
# Long-format table of brand pairs and their similarity, most similar first.
# `pairwise_similarity` is a project helper (utils.pairwise_similarity); the
# columns used below are `left_side`, `right_side` and `similarity`.
df_cossim = (
    pairwise_similarity(cosine_similarity_csr, name_vector)
    .sort(by=['similarity'], descending=True)
)

print(df_cossim)
print(df_cossim.shape)

# Export the pairs; the score column is written as `similarity_sg`.
(
    df_cossim
    .select('left_side', 'right_side', pl.col('similarity').alias('similarity_sg'))
    .write_csv('../temp_folder/Nomenclature_words_cossim.csv', separator=";")
)

shape: (11_737_476, 3)
┌──────────────────────────────┬──────────────────────────────┬────────────┐
│ left_side                    ┆ right_side                   ┆ similarity │
│ ---                          ┆ ---                          ┆ ---        │
│ str                          ┆ str                          ┆ f64        │
╞══════════════════════════════╪══════════════════════════════╪════════════╡
│ FRISK CLEAN BEATH            ┆ FRISK CLEAN BEATH            ┆ 1.0        │
│ NESTLE                       ┆ NESTLE                       ┆ 1.0        │
│ HARTLEY S                    ┆ HARTLEY S                    ┆ 1.0        │
│ ROGE CAVAILLES PARAPHARMACIE ┆ ROGE CAVAILLES PARAPHARMACIE ┆ 1.0        │
│ …                            ┆ …                            ┆ …          │
│ MUSTELA PARAPHARMACIE        ┆ MONOPRIX                     ┆ -0.507178  │
│ MONOPRIX                     ┆ MUSTELA PARAPHARMACIE        ┆ -0.507178  │
│ MONOPRIX                     ┆ MUSTELA             