In [1]:
import sys
# Add the parent directory to the module search path; presumably this is what lets the
# project-local `utils` package imported in the next cell resolve — confirm the repo layout.
sys.path.append("../")

In [2]:
import polars as pl
import yaml

from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity

from sklearn.decomposition import PCA

from utils.load_data import *
from utils.pairwise_similarity import pairwise_similarity

import warnings
# Ignore every FutureWarning raised from this point on (the filter is process-wide).
warnings.simplefilter(action='ignore', category=FutureWarning)



### Load Data

In [3]:
# Load the project configuration. Later cells read config['classification_levels'] and pass
# `config` to `gold(...)`.
# The encoding is pinned to UTF-8 so that parsing does not depend on the platform's locale
# default (which is what `open` falls back to when no encoding is given).
with open("../config.yml", "r", encoding="utf-8") as file:
    config = yaml.safe_load(file)

In [4]:
# Build and collect the "gold" frame of each retailer with one shared call pattern.
# The position in `datasets` (0 = monoprix, 1 = franprix, 2 = auchan) is what later cells use
# as the numeric column suffix.
monoprix_gold, franprix_gold, auchan_gold = (
    gold('../data/', retailer, config).collect()
    for retailer in ('monoprix', 'franprix', 'auchan')
)

datasets = [monoprix_gold, franprix_gold, auchan_gold]

In [5]:
def _suffixed_view(frame, idx):
    """Project `frame` onto product id, brand slug and level columns, suffixed with `idx`.

    `product_id` keeps its name; `brand_desc_slug` and every `level{n}` column listed in
    config['classification_levels'] get `_{idx}` appended so the retailers do not collide.
    """
    level_exprs = [
        pl.col(f'level{level}').alias(f"level{level}_{idx}")
        for level in config['classification_levels']
    ]
    return frame.select(
        [pl.col("product_id"), pl.col("brand_desc_slug").alias(f'brand_desc_slug_{idx}')]
        + level_exprs
    )


# One suffixed brand column per retailer, in the same order as `datasets`.
brand_slug_cols = [f'brand_desc_slug_{idx}' for idx in range(len(datasets))]

# `product_id` is the only column name the per-retailer views share, so it is what the
# "align" strategy has available to line the frames up on.
aligned = pl.concat(
    [_suffixed_view(frame, idx) for idx, frame in enumerate(datasets)],
    how="align",
)

# Gather the per-retailer brand slugs into a single list column, then unfold it to one row per
# brand value and discard the entries coming from retailers that do not carry the product.
dataset_init = (
    aligned
    .with_columns(pl.concat_list(brand_slug_cols).alias('brand_desc_slug'))
    .drop(brand_slug_cols)
    .explode("brand_desc_slug")
    .filter(pl.col('brand_desc_slug').is_not_null())
)
print(dataset_init.shape)
dataset_init.head(5)

(99359, 11)


product_id,level0_0,level1_0,level2_0,level0_1,level1_1,level2_1,level0_2,level1_2,level2_2,brand_desc_slug
str,str,str,str,str,str,str,str,str,str,str
"""0000000001649""",,,,"""SURGELES""","""GLACES ET PATI…","""PATISSERIE ET …",,,,"""LITTLE MOONS M…"
"""0000000007198""",,,,"""FRUITS ET LEGU…","""FRUITS""","""MELON ET PASTE…",,,,"""BIO ENSEMBLE"""
"""0000000007256""",,,,"""FRUITS ET LEGU…","""FRUITS ET LEGU…","""BIO FRUITS ET …",,,,"""BIO ENSEMBLE"""
"""0000000007395""",,,,"""FRUITS ET LEGU…","""LEGUMES""","""AIL OIGNONS HE…",,,,"""BIO ENSEMBLE"""
"""0000000007396""",,,,"""FRUITS ET LEGU…","""LEGUMES""","""AIL OIGNONS HE…",,,,"""BIO ENSEMBLE"""


### Preprocessing

In [6]:
# Level columns used as brand features. `datasets` is ordered [monoprix, franprix, auchan], so
# the excluded "_2" suffix corresponds to the auchan classification.
# NOTE(review): the earlier comment on this filter said "drop Franprix classification", which
# does not match index 2 — confirm which retailer is really meant to be left out.
EXCLUDED_SUFFIX = '_2'
level_feature_cols = [
    c for c in dataset_init.columns
    if c.startswith('level') and not c.endswith(EXCLUDED_SUFFIX)
]

# One-hot encode every distinct (brand, level values) combination.
brand_level_dummies = (
    dataset_init
    .select(["brand_desc_slug"] + level_feature_cols)
    .unique()
    .to_dummies(level_feature_cols)
)

# Indicator columns whose name ends in 'null' encode a missing level and carry no category.
null_indicator_cols = [c for c in brand_level_dummies.columns if c.endswith('null')]

# Collapse to one row per brand: taking the max of 0/1 indicators flags a category as soon as
# any row of that brand has it.
dataset_init_dummies = (
    brand_level_dummies
    .drop(null_indicator_cols)
    .groupby("brand_desc_slug")
    .max()
)

print(dataset_init_dummies.shape)
dataset_init_dummies.head()

(6516, 1126)


brand_desc_slug,level0_0_ANIMAUX,level0_0_BEAUTY WEEK,level0_0_BEBE,level0_0_BOISSON CAVE,level0_0_BOUCHERIE VOLAILLE,level0_0_CHARCUTERIE,level0_0_CREMERIE,level0_0_ENTRETIEN,level0_0_EPICERIE SALEE,level0_0_EPICERIE SUCREE,level0_0_FRUIT LEGUME,level0_0_HYGIENE BEAUTE,level0_0_MAISON,level0_0_NUTRITION DIETETIQUE,level0_0_PAIN VIENNOISERIE,level0_0_PARAPHARMACIE,level0_0_PRODUIT DE LA MER,level0_0_SURGELES,level0_0_TRAITEUR,level1_0_20 SUPPLEMENTAIRES AVEC LA CARTE M,level1_0_ACCESSOIRE MENAGER,level1_0_ALCOOL COCKTAIL,level1_0_ALIMENTATION BEBE,level1_0_APERITIF ENTREE SNACK,level1_0_APERITIF FRAIS,level1_0_ASSAISONNEMENT CONDIMENT,level1_0_BEAUTE BIO,level1_0_BEURRE MARGARINE,level1_0_BIERE CIDRE,level1_0_BISCOTTE TARTINE GALETTE DE RIZ,level1_0_BISCUIT SEC,level1_0_BOEUF,level1_0_BOISSON VEGETALE,level1_0_BOUCHERIE A LA COUPE,level1_0_BOUCHERIE VOLAILLE BIO,level1_0_BRICOLAGE,…,level2_1_SAUCISSES KNACKS ET BOUDINS,level2_1_SAUCISSONS ENTIERS ET CHORIZO,level2_1_SAUMON FUME ET TRUITE,level2_1_SAVONS ET RECHARGES,level2_1_SELS POIVRES ET EPICES,level2_1_SHAMPOINGS SHAMPOINGS SECS,level2_1_SIROPS,level2_1_SOLS VITRES ET MEUBLES,level2_1_SOUPES ET VELOUTES,level2_1_STEAK ET VIANDE HACHEE SURGELES,level2_1_SUCRES EN POUDRE SUCRES EN MORCEAUX,level2_1_SURIMI ET TRAITEUR DE LA MER,level2_1_TABLETTES DE CHOCOLAT,level2_1_TABLETTES ET ENTRETIEN LAVE VAISSELLE,level2_1_TARTES ET ENTREES SURGELEES,level2_1_TARTINABLE,level2_1_TARTINABLE ET APERITIF DE LA MER,level2_1_TEXTILE,level2_1_THE,level2_1_THE ET INFUSIONS,level2_1_THE INFUSION ET COMPLEMENTS,level2_1_TOUT POUR LA FETE,level2_1_TRAITEUR VEGETAL,level2_1_USTENSILES DE CUISINE,level2_1_USTENSILES DE CUISINE ET MAISON,level2_1_VAISSELLES JETABLES,level2_1_VERRE PICARDIE 250ML M,level2_1_VINAIGRES VINAIGRETTES ET SAUCES SALADES,level2_1_VODKA RHUM GIN,level2_1_VOLAILLE,level2_1_VOLAILLE ET BOULETTES SURGELEES,level2_1_WHISKY,level2_1_YAOURT ALLEGE BIEN ETRE ET SKYR,level2_1_YAOURT ET FROMAGE BLANC 
NATURE,level2_1_YAOURTS A BOIRE ET ENFANT,level2_1_YAOURTS AUX FRUITS ET AROMATISES,level2_1_YAOURTS BREBIS ET CHEVRE
str,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,…,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8
"""24TERRE CAFE""",0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"""WONDERBAG""",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"""LE PERE DEFRAN…",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"""CLEAN HUGS""",0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"""BABYMOOV""",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [7]:
## create list of brands
# Select the brand column by name rather than by position, so the list cannot silently pick up
# a feature column if the column order of `dataset_init_dummies` ever changes.
# Row order is preserved: entry k is the brand of row k in `dataset_init_dummies`.
name_vector = dataset_init_dummies.get_column("brand_desc_slug").to_list()
print(len(name_vector))

6516


### PCA

In [8]:
# Feature matrix: every indicator column, without the brand label.
brand_features = dataset_init_dummies.drop('brand_desc_slug')

# A fractional n_components asks PCA to keep enough components to reach that share of
# explained variance (here 90%).
pca = PCA(n_components=0.90)

# NOTE(review): the `tf_idf_matrix_ngram_pca` name is kept because the next cell reads it, even
# though the input here is the one-hot nomenclature matrix rather than a TF-IDF matrix.
tf_idf_matrix_ngram_pca = pca.fit_transform(brand_features)
pca.n_components_

294

In [9]:
# Wrap the PCA projection in CSR form and report how many entries it stores along each axis.
concat_matrix = csr_matrix(tf_idf_matrix_ngram_pca)

# Both index arrays come from a single nonzero() call.
stored_rows, stored_cols = concat_matrix.nonzero()
print(len(stored_rows))
print(len(stored_cols))
concat_matrix.data

1915704
1915704


array([ 6.94347843e-01, -1.71360688e-01,  1.59446731e-01, ...,
       -4.68069824e-04,  2.24634610e-04,  2.63698565e-04])

### Similarity

In [10]:
# Compute cosine similarity matrix
# The result is stored under its own name: assigning it to `cosine_similarity` would replace the
# imported function with an array, and running this cell a second time would then fail with
# "'numpy.ndarray' object is not callable".
cosine_similarity_matrix = cosine_similarity(concat_matrix)
cosine_similarity_csr = csr_matrix(cosine_similarity_matrix)

# nonzero() materialises two index arrays as long as the number of stored entries, so it is
# evaluated once and reused for both diagnostics.
nonzero_rows, nonzero_cols = cosine_similarity_csr.nonzero()
print((nonzero_rows, nonzero_cols))
print(len(nonzero_rows), len(nonzero_cols))
print(cosine_similarity_csr.data)

(array([   0,    0,    0, ..., 6515, 6515, 6515]), array([   0,    1,    2, ..., 6513, 6514, 6515]))
42458256 42458256
[ 1.         -0.15268757 -0.15268757 ...  0.05825587  1.
  1.        ]


In [11]:
# Build the pairwise table from the similarity matrix and the brand names, highest score first.
df_cossim = pairwise_similarity(cosine_similarity_csr, name_vector).sort(
    by=['similarity'], descending=True
)

print(df_cossim)
print(df_cossim.shape)

# Export the pairs with the score column renamed to `similarity_nomenclature`.
nomenclature_export = df_cossim.select(
    pl.col('left_side'),
    pl.col('right_side'),
    pl.col('similarity').alias('similarity_nomenclature'),
)
nomenclature_export.write_csv('../temp_folder/Nomenclature_cossim.csv', separator=";")

shape: (42_458_256, 3)
┌─────────────────────────┬─────────────────────────┬────────────┐
│ left_side               ┆ right_side              ┆ similarity │
│ ---                     ┆ ---                     ┆ ---        │
│ str                     ┆ str                     ┆ f64        │
╞═════════════════════════╪═════════════════════════╪════════════╡
│ DELACRE DELICHOC        ┆ DELACRE DELICHOC        ┆ 1.0        │
│ FLUOCARIL PARAPHARMACIE ┆ FLUOCARIL PARAPHARMACIE ┆ 1.0        │
│ PYREX                   ┆ PYREX                   ┆ 1.0        │
│ BELZEBUTH               ┆ O SUN                   ┆ 1.0        │
│ …                       ┆ …                       ┆ …          │
│ REFLECTA                ┆ VICHY                   ┆ -0.353393  │
│ VICHY                   ┆ REFLECTA                ┆ -0.353393  │
│ LEISURE                 ┆ VICHY                   ┆ -0.353393  │
│ VICHY                   ┆ LEISURE                 ┆ -0.353393  │
└─────────────────────────┴────────────