In [1]:
import sys
sys.path.append("../")

In [2]:
import polars as pl
import numpy as np
import matplotlib.pyplot as plt
import yaml

from slugify import slugify

from scipy.sparse import csr_matrix

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA

from utils.load_data import *

from utils.pairwise_similarity import pairwise_similarity

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)




#### Load Data

In [3]:
with open("../config.yml", "r") as file:
    config = yaml.safe_load(file)

In [4]:
monoprix_gold = gold('../data/', 'monoprix', config).collect()
franprix_gold = gold('../data/', 'franprix', config).collect()
auchan_gold = gold('../data/', 'auchan', config).collect()

datasets = [monoprix_gold, franprix_gold, auchan_gold]

In [5]:
dataset_init = (
    pl.concat(
        [
            dataset.select(pl.col("brand_desc_slug"))
            for dataset in datasets
        ], 
        how="vertical"
        )
    .unique()
    .with_columns(
        pl.col("brand_desc_slug")
        .apply(lambda x: slugify(x, separator = ''))
        .alias("brand_desc_without_space")
        )
    .select("brand_desc_slug", 'brand_desc_without_space')
    )

In [6]:
## Create list of brands
name_vector = (
    dataset_init
    .get_columns()[0].to_list()
)
print(len(name_vector))

6516


#### TF-IDF, N-Grams

In [7]:
# Grab the column to group (values must be Unicode) Unique ? 
dataset = dataset_init['brand_desc_without_space']

## generate the matrix of TF-IDF values for each item - Ngram
vectorizer_ngram = TfidfVectorizer(analyzer="char", ngram_range=(2, 5))
tf_idf_matrix_ngram = vectorizer_ngram.fit_transform(dataset)

tfidf_tokens_ngram = vectorizer_ngram.get_feature_names_out()

df_tfidfvect_ngram = pl.DataFrame(data=tf_idf_matrix_ngram.toarray(), schema=tfidf_tokens_ngram.tolist())

print(dataset[0])
print(len(tfidf_tokens_ngram), tfidf_tokens_ngram)
print(tf_idf_matrix_ngram.shape)
print(tf_idf_matrix_ngram[1])

df_tfidfvect_ngram.head()

maredsous
46862 ['00' '00e' '00et' ... 'zzurr' 'zzy' 'zzz']
(6516, 46862)
  (0, 9381)	0.27716186930975595
  (0, 44589)	0.27716186930975595
  (0, 20471)	0.26979552770435017
  (0, 27522)	0.21666529209656993
  (0, 3652)	0.24691358070316294
  (0, 9380)	0.27716186930975595
  (0, 44587)	0.26979552770435017
  (0, 20459)	0.21537055079935052
  (0, 27521)	0.21666529209656993
  (0, 39503)	0.2129241227697118
  (0, 3651)	0.24376723236024045
  (0, 9370)	0.22572491108462925
  (0, 44555)	0.2085160283061943
  (0, 20458)	0.1778576515506478
  (0, 27520)	0.21537055079935052
  (0, 43346)	0.15818324229167147
  (0, 39451)	0.16775504975127303
  (0, 3409)	0.1298297753873842
  (0, 9156)	0.1293574973543044
  (0, 44554)	0.12789007037229821
  (0, 20441)	0.1572969696937516
  (0, 27320)	0.13291909232286322


00,00e,00et,00etb,00g,00go,00gor,00o,00or,00ori,00u,00ul,00ult,0c,0cl,0d,0de,0deg,0deg7,0e,0ep,0epi,0epic,0et,0etb,0etbi,0f,0fe,0fer,0ferm,0g,0go,0gor,0gori,0gr,0gra,0grai,…,zwil,zwill,zy,zyc,zyca,zycaj,zyl,zyli,zylis,zyt,zyti,zytig,zz,zza,zzad,zzade,zzal,zzale,zzap,zzape,zzas,zze,zzea,zzeas,zzet,zzett,zzi,zzio,zzix,zzo,zzol,zzoli,zzu,zzur,zzurr,zzy,zzz
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Dimensionality reduction 

In [8]:
pca = PCA(n_components=0.90)
tf_idf_matrix_ngram_pca = pca.fit_transform(df_tfidfvect_ngram)
pca.n_components_
# (f"Total variance explained: {np.sum(pca.explained_variance_ratio_):.2f}")

4069

In [9]:
concat_matrix = csr_matrix(tf_idf_matrix_ngram_pca)
print(len(concat_matrix.nonzero()[0]))
print(len(concat_matrix.nonzero()[1]))
concat_matrix.data

26513604
26513604


array([-0.01287073,  0.04608799,  0.03234584, ..., -0.00664363,
        0.00855535,  0.01928867])

#### Similarity

In [10]:
# Compute cosine similarity matrix
cosine_similarity = cosine_similarity(concat_matrix)
cosine_similarity_csr = csr_matrix(cosine_similarity) 

print(cosine_similarity_csr.nonzero())
print(len(cosine_similarity_csr.nonzero()[0]))
print(len(cosine_similarity_csr.nonzero()[1]))
print(cosine_similarity_csr.data)

(array([   0,    0,    0, ..., 6515, 6515, 6515]), array([   0,    1,    2, ..., 6513, 6514, 6515]))
42458256
42458256
[ 1.         -0.00614655 -0.00689111 ...  0.00188472 -0.00571867
  1.        ]


In [11]:
df_cossim = pairwise_similarity(cosine_similarity_csr, name_vector)
df_cossim = (
    df_cossim
    .sort(by=['similarity'], descending = True)
 )
 
print(df_cossim)
print(df_cossim.shape)
df_cossim.select(pl.col('left_side'), pl.col('right_side'), pl.col('similarity').alias('similarity_sg')).write_csv('../temp_folder/SG_cossim.csv', separator=";")

shape: (42_458_256, 3)
┌─────────────────┬─────────────────┬────────────┐
│ left_side       ┆ right_side      ┆ similarity │
│ ---             ┆ ---             ┆ ---        │
│ str             ┆ str             ┆ f64        │
╞═════════════════╪═════════════════╪════════════╡
│ KORONA          ┆ KORONA          ┆ 1.0        │
│ DEMAK UP COCOON ┆ DEMAK UP COCOON ┆ 1.0        │
│ VOILA           ┆ VOILA           ┆ 1.0        │
│ NATUR ATTITUD   ┆ NATUR ATTITUD   ┆ 1.0        │
│ …               ┆ …               ┆ …          │
│ MYWAY           ┆ MILKY KISS      ┆ -0.092475  │
│ MILKY KISS      ┆ MYWAY           ┆ -0.092475  │
│ FABULON         ┆ STAR WARS       ┆ -0.117085  │
│ STAR WARS       ┆ FABULON         ┆ -0.117085  │
└─────────────────┴─────────────────┴────────────┘
(42458256, 3)
