In [1]:
import sys
sys.path.append("../")

In [2]:
import polars as pl
import numpy as np
import matplotlib.pyplot as plt
import yaml

from scipy.sparse import csr_matrix
from nltk.corpus import stopwords

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA

from utils.load_data import *
from utils.pairwise_similarity import pairwise_similarity

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)




### Load Data

In [3]:
with open("../config.yml", "r") as file:
    config = yaml.safe_load(file)

In [4]:
monoprix_gold = gold('../data/', 'monoprix', config).collect()
franprix_gold = gold('../data/', 'franprix', config).collect()
auchan_gold = gold('../data/', 'auchan', config).collect()

datasets = [monoprix_gold, franprix_gold, auchan_gold]

In [5]:
dataset_init = (
    pl.concat(
        [
            dataset.select(pl.col("brand_desc_slug"))
            for dataset in datasets
        ], 
        how="vertical"
        )
    .unique()
    )

#### TF-IDF, Words

In [6]:
# Grab the column to group (values must be Unicode) Unique ? 
dataset = dataset_init['brand_desc_slug']
stopwords_list = stopwords.words('english') + stopwords.words('french')

## generate the matrix of TF-IDF values for each item - Ngram
vectorizer = TfidfVectorizer(stop_words=stopwords_list, analyzer='word', token_pattern=r'(?u)\b[A-Za-z]{2,}\b')
tf_idf_matrix = vectorizer.fit_transform(dataset)

tfidf_tokens = vectorizer.get_feature_names_out()

df_tfidfvect = pl.DataFrame(data=tf_idf_matrix.toarray(), schema=tfidf_tokens.tolist())

# print(dataset[0])
# print(len(tfidf_tokens), tfidf_tokens)
print(tf_idf_matrix.shape)
# print(tf_idf_matrix[1])

df_tfidfvect.head()

(6516, 6084)


abatilles,abbaye,abbey,abeilles,aberfeldy,aberlour,abord,absinthe,absolu,absolut,abtey,abus,abystyle,abyx,access,accessibles,accessoires,accro,acer,ach,achva,acorelle,acoustics,acteur,acteurs,actimel,action,activ,active,activia,activilong,activision,actuel,ad,adam,adapt,addax,…,yuasa,yukik,yum,yumi,yunnan,zakia,zani,zapetti,zefal,zelande,zen,zenae,zenitech,zensect,zenspire,zero,zeromoustique,zhiyun,ziclean,zifel,zilia,zip,zipetzap,zipit,zobrowka,zodiac,zoe,zohi,zolux,zon,zoom,zortrax,zubrowka,zuru,zwilling,zyliss,zzz
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
## filter brands with only figures and less than 3 letters
df_tfidfvect_filter = (
    pl.concat([dataset_init.select(pl.col('brand_desc_slug')), df_tfidfvect.with_columns(pl.sum(pl.all()).alias('sum'))], how="horizontal")
    .filter(pl.col('sum')>0)
)
print(df_tfidfvect_filter.shape)

(6468, 6086)


In [8]:
name_vector = (
    df_tfidfvect_filter
    .get_columns()[0]
    .to_list()
)
print(len(name_vector))

df_tfidfvect_filter = df_tfidfvect_filter.drop('brand_desc_slug')

6468


### Dimensionality reduction 

In [9]:
pca = PCA(n_components=650)
tf_idf_matrix_pca = pca.fit_transform(df_tfidfvect_filter)
# plt.plot(np.cumsum(pca.explained_variance_ratio_))
(f"Total variance explained: {np.sum(pca.explained_variance_ratio_):.2f}")

'Total variance explained: 0.36'

In [10]:
pca = PCA(n_components=0.80)
tf_idf_matrix_pca = pca.fit_transform(df_tfidfvect_filter)
pca.n_components_
# (f"Total variance explained: {np.sum(pca.explained_variance_ratio_):.2f}")

3425

In [11]:
concat_matrix = csr_matrix(tf_idf_matrix_pca)

print(len(concat_matrix.nonzero()[0]), len(concat_matrix.nonzero()[1]))
concat_matrix.data

22152057 22152057


array([ 4.98655243e-01, -3.18075134e-02, -2.35536369e-02, ...,
       -3.74074507e-04,  4.59044640e-03,  1.67322683e-02])

### Similarity

In [12]:
# Compute cosine similarity matrix
cosine_similarity = cosine_similarity(concat_matrix)
cosine_similarity_csr = csr_matrix(cosine_similarity) 

print(cosine_similarity_csr.nonzero())
print(len(cosine_similarity_csr.nonzero()[0]), len(cosine_similarity_csr.nonzero()[1]))
print(cosine_similarity_csr.data)

# pdist(concat_matrix, metric='cosine')

(array([   0,    0,    0, ..., 6467, 6467, 6467]), array([   0,    1,    2, ..., 6465, 6466, 6467]))
41835024 41835024
[ 1.         -0.10449687  0.24642381 ...  0.03637915 -0.04923008
  1.        ]


In [13]:
df_cossim = pairwise_similarity(cosine_similarity_csr, name_vector)
df_cossim = (
    df_cossim
    .sort(by=['similarity'], descending = True)
 )
 
print(df_cossim)
print(df_cossim.shape)
df_cossim.select(pl.col('left_side'), pl.col('right_side'), pl.col('similarity').alias('similarity_sg')).write_csv('../temp_folder/SGWords_cossim.csv', separator=";")

shape: (41_835_024, 3)
┌───────────────────────────────────┬───────────────────────────────────┬────────────┐
│ left_side                         ┆ right_side                        ┆ similarity │
│ ---                               ┆ ---                               ┆ ---        │
│ str                               ┆ str                               ┆ f64        │
╞═══════════════════════════════════╪═══════════════════════════════════╪════════════╡
│ RIBILAND                          ┆ RIBILAND                          ┆ 1.0        │
│ ALFAPAC                           ┆ ALFAPAC                           ┆ 1.0        │
│ CASTELAIN                         ┆ CASTELAIN                         ┆ 1.0        │
│ BITDEFENDER                       ┆ BITDEFENDER                       ┆ 1.0        │
│ …                                 ┆ …                                 ┆ …          │
│ F C BARCELONA                     ┆ JOHN FRIDA BYE BYE CHEVEUX BLANC… ┆ -0.86463   │
│ JOHN FRIDA BYE BYE