In [1]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
from scipy.stats import pearsonr, spearmanr
import numpy as np
from sklearn.preprocessing import Binarizer
from scipy.spatial.distance import cosine, cityblock, canberra, chebyshev, minkowski, braycurtis
from scipy.spatial import distance
import pandas as pd
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk import download
from timeit import timeit




In [2]:

# Téléchargez les ressources nécessaires de NLTK la première fois
"""download('punkt')
download('stopwords')
download('wordnet')"""

# Fonction de nettoyage des phrases
def clean_text(text):
    # Convertir le texte en minuscules
    text = text.lower()
    
    # Supprimer la ponctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Supprimer les nombres
    text = re.sub(r'\d+', '', text)
    
    # Supprimer les caractères non alphabétiques
    text = re.sub(r'[^a-z\s]', '', text)
    
    # Tokenisation
    words = word_tokenize(text)
    
    # Suppression des stop words
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    
    # Lemmatisation
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    
    # Rejoindre les mots pour former la phrase nettoyée
    cleaned_text = ' '.join(words)
    
    return cleaned_text


# Chargez le modèle 'intfloat/multilingual-e5-large'
tokenizer = AutoTokenizer.from_pretrained('intfloat/multilingual-e5-large')
model = AutoModel.from_pretrained('intfloat/multilingual-e5-large')

def average_pool(last_hidden_states: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
    return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]


def calculate_similarity(embedding1, embedding2):
    # Normalisation des embeddings pour certaines mesures
    embedding1 = embedding1.detach().numpy().flatten()
    embedding2 = embedding2.detach().numpy().flatten()
    embedding1_normalized = F.normalize(torch.tensor(embedding1), p=2, dim=0).numpy()
    embedding2_normalized = F.normalize(torch.tensor(embedding2), p=2, dim=0).numpy()

    # Déjà existants
    euclidean = np.linalg.norm(embedding1_normalized - embedding2_normalized)
    manhattan = cityblock(embedding1_normalized, embedding2_normalized)
    pearson_corr, _ = pearsonr(embedding1_normalized, embedding2_normalized)
    spearman_corr, _ = spearmanr(embedding1_normalized, embedding2_normalized)
    jaccard = distance.jaccard(embedding1 > 0, embedding2 > 0)

    # Nouveaux indicateurs
    cosine_sim = 1 - cosine(embedding1_normalized, embedding2_normalized)
    canberra_distance = canberra(embedding1_normalized, embedding2_normalized)
    chebyshev_distance = chebyshev(embedding1_normalized, embedding2_normalized)
    minkowski_distance = minkowski(embedding1_normalized, embedding2_normalized, 3)
    braycurtis_sim = 1 - braycurtis(embedding1_normalized, embedding2_normalized)
    kulsinski_distance = distance.kulsinski(embedding1 > 0, embedding2 > 0) 
    dice_distance = distance.dice(embedding1 > 0, embedding2 > 0)
    sokalmichener_distance = distance.sokalmichener(embedding1 > 0, embedding2 > 0)
    sokalsneath_distance = distance.sokalsneath(embedding1 > 0, embedding2 > 0)

    # Définir des seuils pour chaque mesure
    thresholds = {
    'euclidean': 0.588,    # Augmentez si 'euclidean' est trop strict. FAIT
    'manhattan': 0.2,    # Augmentez si 'manhattan' est trop strict. FAIT
    'pearson': 0.8,      # Diminuez si 'pearson' est trop tolérant. FAIT
    'spearman': 0.8,     # Diminuez si 'spearman' est trop tolérant. FAIT
    'jaccard': 0.26,      # Diminuez si 'jaccard' est trop tolérant. FAIT
    'cosine': 0.785,       # Augmentez si 'cosine' est trop strict. FAIT
    'canberra': 0.5,     # Augmentez si 'canberra' est trop strict. FAIT
    'chebyshev': 0.5,    # Augmentez si 'chebyshev' est trop strict. FAIT
    'minkowski': 0.243,    # Augmentez si 'minkowski' est trop strict. FAIT
    'braycurtis': 0.7,   # Augmentez si 'braycurtis' est trop strict. FAIT
    'kulsinski': 0.05,    # Augmentez si 'kulsinski' est trop strict. FAIT
    'dice': 0.13,         # Augmentez si 'dice' est trop strict.  FAIT
    'sokalmichener': 0.23,# Augmentez si 'sokalmichener' est trop strict.
    'sokalsneath': 0.4,  # Augmentez si 'sokalsneath' est trop strict. FAIT
    }
    # Vote majoritaire
    votes = [
        euclidean < thresholds['euclidean'],
        manhattan < thresholds['manhattan'],
        pearson_corr > thresholds['pearson'],
        spearman_corr > thresholds['spearman'],
        jaccard < thresholds['jaccard'],
        cosine_sim > thresholds['cosine'],
        canberra_distance < thresholds['canberra'],
        chebyshev_distance < thresholds['chebyshev'],
        minkowski_distance < thresholds['minkowski'],
        braycurtis_sim > thresholds['braycurtis'],
        kulsinski_distance < thresholds['kulsinski'],
        dice_distance < thresholds['dice'],
        sokalmichener_distance < thresholds['sokalmichener'],
        sokalsneath_distance < thresholds['sokalsneath'],
    ]
    print(votes)
    print("Nombre de True : ",votes.count(True), "          Nombre de False : ", votes.count(False))
    print("Nombre de vote : ",len(votes))
    # Retourne True si la majorité des indicateurs sont au-dessus/au-dessous des seuils
    return votes.count(True) > (len(votes) / 2 )

In [4]:
def generate_embeddings(cleaned_texts):
    # Tokenisation des phrases
    inputs = tokenizer(cleaned_texts, padding=True, truncation=True, return_tensors='pt', max_length=512)
    
    # Génération d'embeddings
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True) 
    
    # Application de l'average pooling
    embeddings = average_pool(outputs.last_hidden_state, inputs['attention_mask'])
    
    # Normalisation des embeddings
    normalized_embeddings = F.normalize(embeddings, p=2, dim=1)
    
    return normalized_embeddings

def compare_to_theme(input_sentence, theme_embedding):
    # Nettoyer la phrase d'entrée et générer l'embedding associé
    cleaned_input = clean_text(input_sentence)
    input_embedding = generate_embeddings(cleaned_input)
    
    
    # Comparer l'embedding de la phrase d'entrée avec la moyenne des embeddings du thème
    similar = calculate_similarity(input_embedding, theme_embedding)
    
    return similar

In [5]:
#Création du dataset de référence

#Import du dataset
#pd.set_option('display.max_colwidth', None)
df = pd.read_excel("C://ESILV//A4//Projet_PI2//Dist_compare//Base_clean.xlsx")

#Création du dataset de référence
reference_database = df[df["Isclimate"] == 1]
reference_database.info()

#Embedding sur tout le dataframe
listed_database = reference_database["Titre_Article"].to_list()
cleaned_database = []
for line in listed_database:
    cleaned_database.append(clean_text(line))

embedded_database = generate_embeddings(cleaned_database)

list=[]
for i in range(len(embedded_database)):
    list.append(embedded_database[i])
reference_database["Vectorize"] = list

reference_database.to_excel("Reference_database.xlsx", index = False)

<class 'pandas.core.frame.DataFrame'>
Index: 294 entries, 3765 to 5710
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Titre_Article      294 non-null    object 
 1   Source             294 non-null    object 
 2   Date               84 non-null     object 
 3   Pays               294 non-null    object 
 4   Journal            294 non-null    object 
 5   Texte_Article      78 non-null     object 
 6   Domaine            43 non-null     object 
 7   Isclimate          294 non-null    float64
 8   __index_level_0__  0 non-null      float64
 9   Cluster            294 non-null    int64  
 10  CloseToIsClimate   294 non-null    int64  
dtypes: float64(2), int64(2), object(7)
memory usage: 27.6+ KB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reference_database["Vectorize"] = list


In [20]:
import h5py

# Assuming df["vectorize"] contains your tensors
tensors = reference_database["Vectorize"].values

# Create an HDF5 file
with h5py.File('tensors_reference.h5', 'w') as file:
    # Create a dataset to store tensors
    tensors_reference = file.create_dataset('tensors_reference', shape=(len(tensors),), dtype=h5py.vlen_dtype(np.dtype('float32')))

    # Store your tensors in the dataset
    for i, tensor in enumerate(tensors):
        tensors_reference[i] = tensor

In [41]:
# Load the stored tensors
with h5py.File('tensors_reference.h5', 'r') as file:
    tensors_reference = file['tensors_reference'][:]

tensors_reference

array([array([ 0.01163753, -0.01089273, -0.01454662, ..., -0.0105596 ,
              -0.054786  ,  0.01875991], dtype=float32)               ,
       array([ 0.01546439,  0.00607491, -0.03907051, ..., -0.01163721,
              -0.00448693,  0.00599643], dtype=float32)               ,
       array([ 0.02676414, -0.01934784, -0.02871615, ..., -0.00173952,
              -0.03507503,  0.01161552], dtype=float32)               ,
       array([ 0.02556332,  0.01451178, -0.03627403, ...,  0.03110815,
               0.00267825,  0.04239264], dtype=float32)               ,
       array([ 0.00611275,  0.01518017, -0.02250356, ..., -0.00161508,
              -0.05637214,  0.00449631], dtype=float32)               ,
       array([ 0.01906134, -0.00095603, -0.0037108 , ..., -0.0047059 ,
              -0.0477404 , -0.00304553], dtype=float32)               ,
       array([ 0.0324375 , -0.00757367, -0.03407559, ...,  0.01958467,
              -0.00551551,  0.02679076], dtype=float32)               ,

In [18]:
df = pd.read_excel("C://ESILV//A4//Projet_PI2//Dist_compare//Base_clean.xlsx")

#Embedding sur tout le dataframe
listed_database = df["Titre_Article"].to_list()
cleaned_database = []
i=0
for i in range(len(listed_database)):
        if i<50:
            cleaned_database.append(clean_text(listed_database[i]))
            i+=1
            
embedded_database = generate_embeddings(cleaned_database)

In [30]:
all_vectorized_df = df.iloc[0:50]
list=[]
for i in range(len(embedded_database)):
    list.append(embedded_database[i])
all_vectorized_df["Vectorize"] = list

# all_vectorized_df.to_excel("All_vectorized_database.xlsx", index = False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_vectorized_df["Vectorize"] = list


In [32]:
# Assuming df["vectorize"] contains your tensors
tensors_bis = all_vectorized_df["Vectorize"].values

# Create an HDF5 file
with h5py.File('tensors_comparison.h5', 'w') as file:
    # Create a dataset to store tensors
    tensors_comparison = file.create_dataset('tensors_comparison', shape=(len(tensors_bis),), dtype=h5py.vlen_dtype(np.dtype('float32')))

    # Store your tensors in the dataset
    for i, tensor in enumerate(tensors_bis):
        tensors_comparison[i] = tensor

In [40]:
# Load the stored tensors
with h5py.File('tensors_comparison.h5', 'r') as file:
    tensors_comparison = file['tensors_comparison'][:]

tensors_comparison

array([array([ 0.01341116,  0.00415453, -0.03149473, ..., -0.00991047,
              -0.04159684,  0.00140211], dtype=float32)               ,
       array([ 0.03141526, -0.00026802, -0.01040392, ...,  0.00502164,
              -0.04908801, -0.00306224], dtype=float32)               ,
       array([-0.0091328 , -0.01193233, -0.00010347, ...,  0.00498217,
              -0.06189182,  0.01995868], dtype=float32)               ,
       array([ 0.03092301, -0.02710943, -0.010615  , ..., -0.01046598,
              -0.06647605, -0.00683409], dtype=float32)               ,
       array([ 0.06471872, -0.01150004, -0.03106027, ...,  0.00918655,
              -0.05285541,  0.0296692 ], dtype=float32)               ,
       array([ 0.04019762,  0.00261312, -0.02090177, ...,  0.00491759,
              -0.02279289,  0.01636376], dtype=float32)               ,
       array([ 0.02222293,  0.01303932, -0.01164705, ..., -0.00323848,
              -0.01862807,  0.01761449], dtype=float32)               ,

In [26]:
#restore the All_vectorized_database.xlsx
all_vectorized_df = pd.read_excel("C://ESILV//A4//Projet_PI2//Dist_compare//All_vectorized_database.xlsx")
reference_database = pd.read_excel("C://ESILV//A4//Projet_PI2//Dist_compare//Reference_database.xlsx")
all_vectorized_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Titre_Article      1000 non-null   object 
 1   Source             1000 non-null   object 
 2   Date               1000 non-null   object 
 3   Pays               1000 non-null   object 
 4   Journal            1000 non-null   object 
 5   Texte_Article      941 non-null    object 
 6   Domaine            967 non-null    object 
 7   Isclimate          1000 non-null   int64  
 8   __index_level_0__  0 non-null      float64
 9   Cluster            1000 non-null   int64  
 10  CloseToIsClimate   1000 non-null   int64  
 11  Vectorize          1000 non-null   object 
dtypes: float64(1), int64(3), object(8)
memory usage: 93.9+ KB


In [72]:
#modify the function in order to remove the prints
def calculate_similarity(embedding1, embedding2):
    # Normalisation des embeddings pour certaines mesures
    embedding1 = embedding1.detach().numpy().flatten()
    embedding2 = embedding2.detach().numpy().flatten()
    embedding1_normalized = F.normalize(torch.tensor(embedding1), p=2, dim=0).numpy()
    embedding2_normalized = F.normalize(torch.tensor(embedding2), p=2, dim=0).numpy()

    # Déjà existants
    euclidean = np.linalg.norm(embedding1_normalized - embedding2_normalized)
    manhattan = cityblock(embedding1_normalized, embedding2_normalized)
    pearson_corr, _ = pearsonr(embedding1_normalized, embedding2_normalized)
    spearman_corr, _ = spearmanr(embedding1_normalized, embedding2_normalized)
    jaccard = distance.jaccard(embedding1 > 0, embedding2 > 0)

    # Nouveaux indicateurs
    cosine_sim = 1 - cosine(embedding1_normalized, embedding2_normalized)
    canberra_distance = canberra(embedding1_normalized, embedding2_normalized)
    chebyshev_distance = chebyshev(embedding1_normalized, embedding2_normalized)
    minkowski_distance = minkowski(embedding1_normalized, embedding2_normalized, 3)
    braycurtis_sim = 1 - braycurtis(embedding1_normalized, embedding2_normalized)
    kulsinski_distance = distance.kulsinski(embedding1 > 0, embedding2 > 0) 
    dice_distance = distance.dice(embedding1 > 0, embedding2 > 0)
    sokalmichener_distance = distance.sokalmichener(embedding1 > 0, embedding2 > 0)
    sokalsneath_distance = distance.sokalsneath(embedding1 > 0, embedding2 > 0)

    # Définir des seuils pour chaque mesure
    thresholds = {
    'euclidean': 0.588,    # Augmentez si 'euclidean' est trop strict. FAIT
    'manhattan': 0.2,    # Augmentez si 'manhattan' est trop strict. FAIT
    'pearson': 0.8,      # Diminuez si 'pearson' est trop tolérant. FAIT
    'spearman': 0.8,     # Diminuez si 'spearman' est trop tolérant. FAIT
    'jaccard': 0.26,      # Diminuez si 'jaccard' est trop tolérant. FAIT
    'cosine': 0.785,       # Augmentez si 'cosine' est trop strict. FAIT
    'canberra': 0.5,     # Augmentez si 'canberra' est trop strict. FAIT
    'chebyshev': 0.5,    # Augmentez si 'chebyshev' est trop strict. FAIT
    'minkowski': 0.243,    # Augmentez si 'minkowski' est trop strict. FAIT
    'braycurtis': 0.7,   # Augmentez si 'braycurtis' est trop strict. FAIT
    'kulsinski': 0.05,    # Augmentez si 'kulsinski' est trop strict. FAIT
    'dice': 0.13,         # Augmentez si 'dice' est trop strict.  FAIT
    'sokalmichener': 0.23,# Augmentez si 'sokalmichener' est trop strict.
    'sokalsneath': 0.4,  # Augmentez si 'sokalsneath' est trop strict. FAIT
    }
    # Vote majoritaire
    votes = [
        euclidean < thresholds['euclidean'],
        manhattan < thresholds['manhattan'],
        pearson_corr > thresholds['pearson'],
        spearman_corr > thresholds['spearman'],
        jaccard < thresholds['jaccard'],
        cosine_sim > thresholds['cosine'],
        canberra_distance < thresholds['canberra'],
        chebyshev_distance < thresholds['chebyshev'],
        minkowski_distance < thresholds['minkowski'],
        braycurtis_sim > thresholds['braycurtis'],
        kulsinski_distance < thresholds['kulsinski'],
        dice_distance < thresholds['dice'],
        sokalmichener_distance < thresholds['sokalmichener'],
        sokalsneath_distance < thresholds['sokalsneath'],
    ]
    # print(votes)
    # print("Nombre de True : ",votes.count(True), "          Nombre de False : ", votes.count(False))
    # print("Nombre de vote : ",len(votes))
    # Retourne True si la majorité des indicateurs sont au-dessus/au-dessous des seuils
    return votes.count(True) > (len(votes) / 2 )

In [86]:
all_results = []
not_results = []
highest_similarity = 0
similarity = {}

for i in range(len(tensors_comparison)):
    comparison_tensor = torch.tensor(tensors_comparison[i])
    count_true = 0
    count_false = 0

    print(all_vectorized_df.iloc[i]["Titre_Article"])

    for j in range(len(tensors_reference)):    
        reference_tensor = torch.tensor(tensors_reference[j])

        if calculate_similarity(comparison_tensor, reference_tensor): 
            count_true += 1    
        else:           
            count_false += 1  
    
    #update du dico des similarités
    similarity.update({all_vectorized_df.iloc[i]["Titre_Article"] : count_true/(count_true+count_false)})

    #update du titre avec la plus haute similarité
    if count_true > highest_similarity:
        highest_similarity = count_true
        highest_similarity_title = all_vectorized_df.iloc[i]["Titre_Article"]     
    
    print("Nombre de True : ",count_true, "          Nombre de False : ", count_false)
    print("Nombre de vote : ",len(tensors_reference))

    #update des listes de résultats
    if count_false < count_true:
        print("LA PHRASE EST DANS LE THEME")
        all_results.append(all_vectorized_df.iloc[i]["Titre_Article"])
    else:
        print("LA PHRASE N'EST PAS DANS LE THEME", "\n")
        not_results.append(all_vectorized_df.iloc[i]["Titre_Article"])
    


These disruptors plan Hollywood ending for LA story


  kulsinski_distance = distance.kulsinski(embedding1 > 0, embedding2 > 0)


Nombre de True :  25           Nombre de False :  269
Nombre de vote :  294
LA PHRASE N'EST PAS DANS LE THEME 

Mexico ready to retaliate by hurting US farmers
Nombre de True :  4           Nombre de False :  290
Nombre de vote :  294
LA PHRASE N'EST PAS DANS LE THEME 

Driverless 'Roborace' car makes street track debut
Nombre de True :  0           Nombre de False :  294
Nombre de vote :  294
LA PHRASE N'EST PAS DANS LE THEME 

Supreme Court seems sympathetic to postal worker who didn't work Sundays in dispute over religious accommodations
Nombre de True :  0           Nombre de False :  294
Nombre de vote :  294
LA PHRASE N'EST PAS DANS LE THEME 

Here's what we know about the suspect in the Maine mass shooting
Nombre de True :  2           Nombre de False :  292
Nombre de vote :  294
LA PHRASE N'EST PAS DANS LE THEME 

What made Pelé so great
Nombre de True :  4           Nombre de False :  290
Nombre de vote :  294
LA PHRASE N'EST PAS DANS LE THEME 

'Love Actually' cast to reunite

In [85]:
print(len(all_results), len(not_results))
print("Le titre le plus proche est : \"", highest_similarity_title, "\" avec", highest_similarity, "votes")

0 50
Le titre le plus proche est : " Éducation. Muespach : l’école associative immersive en alsacien peine à se stabiliser " avec 135 votes


In [89]:
sorted_similarity=sorted(similarity.items(), key=lambda x: x[1], reverse=True)
sorted_similarity

[('Éducation. Muespach\xa0: l’école associative immersive en alsacien peine à se stabiliser',
  0.45918367346938777),
 ('Édition jeunesse. Quand Blienschwiller faisait la révolution',
  0.3843537414965986),
 ("L'entretien du dimanche. Fabcaro et Didier Conrad, auteurs d'Astérix\xa0: «\xa0On a apporté notre lot de bagarres\xa0»",
  0.35714285714285715),
 ('Trophées des collectivités d’Alsace. Cinquante-huit projets en lice pour l’édition 2023',
  0.3401360544217687),
 ('Diaporama. La semaine écoulée vue par nos photographes',
  0.3163265306122449),
 ('Loisirs. L’empire des petites briques va envahir Kembs le temps d’un week-end',
  0.2925170068027211),
 ('Vidéo. Musée de Saverne : les statues polychromes infestées partent en soins',
  0.282312925170068),
 ('Mackenheim. L’herbe du diable, une plante toxique, prolifère dans la forêt communale',
  0.272108843537415),
 ('Culture. En Alsace, les bébés spectateurs, « citoyens et publics de demain »',
  0.24489795918367346),
 ('Sécurité routiè

In [None]:
#Test vitesse
""" num_iteration = 10
def wrapper():
    generate_embeddings(input_sentence)

avg_time = timeit(wrapper, number=num_iteration)/num_iteration 
print("Temps moyen de run : ", avg_time)"""