In [1]:
import sys

# Expose the parent directory on the import path so that the `utils.*`
# modules imported in the next cell can be resolved from this notebook.
sys.path.extend(["../"])

In [2]:
import polars as pl
import numpy as np
import yaml

from scipy.sparse import csr_matrix

from sentence_transformers import SentenceTransformer, util

from utils.load_data import *
from utils.pairwise_similarity import pairwise_similarity

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

  from .autonotebook import tqdm as notebook_tqdm


#### Load Data

In [3]:
# Read the project configuration that is passed to every `gold(...)` call.
# The encoding is pinned explicitly: without it `open` falls back to the
# platform's locale encoding, so the same YAML file could parse differently
# (or fail on non-ASCII characters) depending on the machine.
with open("../config.yml", "r", encoding="utf-8") as file:
    config = yaml.safe_load(file)

In [4]:
# Load and materialise the "gold" table of each retailer from the raw data
# folder. `gold` comes from `utils.load_data`; it presumably returns a lazy
# frame, hence the `.collect()` -- TODO confirm against the helper.
monoprix_gold, franprix_gold, auchan_gold = (
    gold('../data/raw/', retailer, config).collect()
    for retailer in ('monoprix', 'franprix', 'auchan')
)

# Same three frames, in the same order, for code that iterates over retailers.
datasets = [monoprix_gold, franprix_gold, auchan_gold]

In [5]:
# Stack the `brand_desc_slug` column of every retailer frame on top of each
# other and keep each distinct value only once.
dataset_init = pl.concat(
    [frame.select("brand_desc_slug") for frame in datasets],
    how="vertical",
).unique()

#### Encode sentences

In [6]:
# Sentence encoder checkpoint. Alternative noted by the author: 'all-mpnet-base-v2'.
model = SentenceTransformer('all-MiniLM-L6-v2')

# `dataset_init` holds a single column; flatten it into a plain Python list.
sentences = dataset_init.to_series(0).to_list()
print(len(sentences))

# Encode every sentence in one call; the printed shape is (n_sentences, embedding_dim).
embeddings = model.encode(sentences)
print(embeddings.shape)

# Debug helper (disabled): dump each sentence next to its embedding.
# for sentence, embedding in zip(sentences, embeddings):
#     print("Sentence:", sentence)
#     print("Embedding:", embedding)
#     print("")

6516
(6516, 384)


#### Similarity

In [7]:
# Cosine similarity between every pair of sentence embeddings.
# One batched `util.cos_sim` call replaces the previous Python loop, which
# issued a separate call per sentence to fill the matrix column by column.
# The matrix is symmetric, so row-wise and column-wise filling agree up to
# floating-point rounding.
# The cast keeps the float64 dtype of the `np.zeros` buffer used before, so the
# `similarity` column downstream keeps its type.
cossim = util.cos_sim(embeddings, embeddings).cpu().numpy().astype(np.float64)

# `pairwise_similarity` (from utils.pairwise_similarity) receives the matrix in
# CSR form together with the sentence labels; the printed result has the
# columns left_side / right_side / similarity.
df_cossim = pairwise_similarity(csr_matrix(cossim), sentences)

# Most similar pairs first.
df_cossim = df_cossim.sort(by=['similarity'], descending=True)
print(df_cossim)
print(df_cossim.shape)

# Persist the scores; the similarity column is renamed in the CSV export only.
(
    df_cossim
    .select(
        pl.col('left_side'),
        pl.col('right_side'),
        pl.col('similarity').alias('similarity_st'),
    )
    .write_csv('../data/ST_cossim.csv', separator=";")
)

shape: (42_458_256, 3)
┌────────────────────────────────┬────────────────────────────────┬────────────┐
│ left_side                      ┆ right_side                     ┆ similarity │
│ ---                            ┆ ---                            ┆ ---        │
│ str                            ┆ str                            ┆ f64        │
╞════════════════════════════════╪════════════════════════════════╪════════════╡
│ EUROSPEN                       ┆ EUROSPEN                       ┆ 1.0        │
│ AQUA                           ┆ AQUA                           ┆ 1.0        │
│ CANARD                         ┆ CANARD                         ┆ 1.0        │
│ PLANTA FIN                     ┆ PLANTA FIN                     ┆ 1.0        │
│ …                              ┆ …                              ┆ …          │
│ MCFARLANE                      ┆ BLANC DES HAUTES PYRENEES      ┆ -0.211125  │
│ BLANC DES HAUTES PYRENEES      ┆ MCFARLANE                      ┆ -0.211125  │
│ AKA

In [8]:
# #Compute cosine similarity between all pairs
# cos_sim = util.cos_sim(embeddings, embeddings)

# #Add all pairs to a list with their cosine similarity score
# all_sentence_combinations = []
# for i in range(len(cos_sim)-1):
#     for j in range(i+1, len(cos_sim)):
#         if cos_sim[i][j] < 0.999999:
#             all_sentence_combinations.append([cos_sim[i][j], i, j])

# #Sort list by the highest cosine similarity score
# all_sentence_combinations = sorted(all_sentence_combinations, key=lambda x: x[0], reverse=True)

# print("Top-10 most similar pairs:")
# for score, i, j in all_sentence_combinations[0:10]:
#     print("{} \t {} \t {:.4f}".format(sentences[i], sentences[j], cos_sim[i][j]))

#### Get groups

In [9]:
# from utils.pairwise_similarity import pairwise_similarity
# from utils.group_similar_strings import group_similar_strings

In [10]:
# #Two parameters to tune:
# #min_community_size: Only consider clusters that have at least n elements
# #threshold: Consider sentence pairs with a cosine similarity larger than threshold as similar

# #Encode all sentences
# embeddings_torch = model.encode(sentences, convert_to_tensor=True)
# clusters = util.community_detection(embeddings_torch, min_community_size=1, threshold=0.85)

# #Print clusters elements
# clusters_dict = {}
# for i, cluster in enumerate(clusters):
#     for sentence_id in cluster:
#         clusters_dict[sentences[sentence_id]] = i

# clusters_cossim = (
#     pl.DataFrame(list(zip(clusters_dict.keys(), clusters_dict.values())),
#                  schema=['name', 'group_name'])
# )
# print(clusters_cossim)

# clusters_cossim.write_csv('../datasets/ST_group_strings.csv', separator=";")
        
# # #Print elements for all clusters
# # for i, cluster in enumerate(clusters):
# #     print("\nCluster {}, #{} Elements ".format(i+1, len(cluster)))
# #     for sentence_id in cluster:
# #         print("\t", sentences[sentence_id])

In [11]:
# df_compare = (
#     pl.scan_csv('../datasets/SG_cossim.csv', separator = ';').rename({"similairity": "similairity_SG"})
#     .join(pl.scan_csv('../datasets/ST_cossim.csv', separator = ';').rename({"similairity": "similairity_ST"}), 
#           on=['left_side', 'right_side'], 
#           how="outer")
#     .join(pl.scan_csv('../datasets/Nomenclature_cossim.csv', separator = ';').rename({"similairity": "similairity_Nomenclature"}), 
#           on=['left_side', 'right_side'], 
#           how="outer")
#     .collect()
# )
# df_compare.write_csv('../datasets/df_compare.csv', separator=";")