In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sentence_transformers import SentenceTransformer
import json

In [3]:
# Load the pretrained sentence-embedding model by its identifier.
# `model` is the encoder used further down to turn sentences into vectors.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

In [5]:
# Load the JSON Lines file (one JSON document per line) into a list of
# parsed records.
# The encoding is passed explicitly so that decoding does not depend on the
# platform's default locale encoding (assumes the file is UTF-8, the standard
# encoding for JSON — confirm against the dataset distribution).
with open("multinli_1.0_train.jsonl", "r", encoding="utf-8") as f:
    # Blank lines (e.g. a trailing empty line) are skipped because
    # json.loads would raise on them.
    data = [json.loads(line) for line in f if line.strip()]

In [6]:
# Number of records parsed from the JSONL file.
len(data)

392702

In [7]:
# Keep only the records whose gold_label is one of the three named classes,
# and retain just the two sentences and the label from each of them.
extracted_data = [
    {field: record[field] for field in ('sentence1', 'sentence2', 'gold_label')}
    for record in data
    if record['gold_label'] in ('neutral', 'entailment', 'contradiction')
]

# Convert to a DataFrame to simplify downstream processing.
df = pd.DataFrame(extracted_data)

# Preview of the extracted rows.
df.head()

Unnamed: 0,sentence1,sentence2,gold_label
0,Conceptually cream skimming has two basic dime...,Product and geography are what make cream skim...,neutral
1,you know during the season and i guess at at y...,You lose the things to the following level if ...,entailment
2,One of our number will carry out your instruct...,A member of my team will execute your orders w...,entailment
3,How do you know? All this is their information...,This information belongs to them.,entailment
4,yeah i tell you what though if you go price so...,The tennis shoes have a range of prices.,neutral


In [8]:
# Draw a random sample of 10,000 rows from df; the fixed random_state makes
# the selection reproducible. The sampled rows keep their original index labels.
df_subset = df.sample(n=10000, random_state=42)

In [9]:
# Encode each sentence column with ONE batched model.encode() call instead of
# one call per row: encode() accepts a list of sentences and batches them
# internally, which removes the per-row call overhead of .apply().
# For a list input with convert_to_numpy=True the result is a single 2-D array
# (one row per sentence); list(...) splits it into one 1-D vector per DataFrame
# row, so each cell still holds a single embedding array as before.
# NOTE(review): batched inference may differ from per-sentence inference in the
# last floating-point digits (padding inside a batch) — not expected to matter.
df_subset["embedding1"] = list(
    model.encode(df_subset["sentence1"].tolist(), convert_to_numpy=True)
)
df_subset["embedding2"] = list(
    model.encode(df_subset["sentence2"].tolist(), convert_to_numpy=True)
)

In [11]:
# Persist the sampled rows together with their embedding columns to a Parquet file.
# index=False: the DataFrame index (the row labels inherited from df) is not written.
df_subset.to_parquet("dataset_with_embeddings.parquet", index=False)

In [12]:
# Cosine similarity of every (embedding1, embedding2) pair, computed for all
# rows at once with NumPy instead of one scikit-learn call per row.
# Each column of per-row vectors is stacked into a 2-D array (rows x dims).
emb1 = np.array(df_subset["embedding1"].tolist())
emb2 = np.array(df_subset["embedding2"].tolist())
norm1 = np.linalg.norm(emb1, axis=1)
norm2 = np.linalg.norm(emb2, axis=1)
# A zero vector has no direction: substitute 1 for its norm so the similarity
# comes out as 0 rather than NaN (the same result the per-row
# sklearn cosine_similarity call gives for a zero vector).
norm1[norm1 == 0] = 1
norm2[norm2 == 0] = 1
# Row-wise dot product divided by the product of the two row norms.
df_subset["cosine_similarity"] = (emb1 * emb2).sum(axis=1) / (norm1 * norm2)


In [13]:
# Display the sampled frame with its embedding and cosine_similarity columns
# (the rendered table is truncated by pandas).
df_subset

Unnamed: 0,sentence1,sentence2,gold_label,embedding1,embedding2,cosine_similarity
155219,That's all right.,That's fine.,entailment,"[0.016657539, 0.01060511, -0.050037496, -0.079...","[-0.0507622, -0.05900257, -0.076395094, -0.025...",0.609108
327538,Televisions are expensive these days.,Televisions cost $200 when you purchase them f...,neutral,"[0.05825889, 0.017244086, 0.060712803, -0.0311...","[-0.04066672, 0.01306669, 0.029700672, -0.0778...",0.711511
325456,The colonial porte cochre sets the tone for Wa...,The colonial porte coche on the front of the p...,neutral,"[0.041701563, 0.016567806, -0.054379493, 0.007...","[0.100035, 0.03889193, 0.055443026, -0.0165682...",0.510867
341461,and you just sift,"Don't sift, ever.",contradiction,"[0.09222918, -0.035314932, 0.07395715, 0.03412...","[0.04434778, -0.0026965349, 0.074316286, -0.00...",0.702823
54698,You think there's somethin' in all that talk T...,Topham had supported the talk due to his own d...,neutral,"[-0.071036026, -0.061390392, 0.037302133, -0.0...","[0.046486303, 0.0028046842, 0.02795006, -0.036...",0.558429
...,...,...,...,...,...,...
26259,But I never went after him.,"I never followed him, said the investigator.",neutral,"[0.078442946, 0.022416787, -0.019123346, -0.00...","[0.044823203, 0.114247605, -0.038013626, 0.031...",0.631499
6705,it's uh it's kind of unusual book it's a lot a...,"The book is unusual, but it is very good.",neutral,"[-0.029462632, 0.04509958, -0.015067618, 0.031...","[-0.041380197, -0.0031413352, -6.911745e-05, 0...",0.620203
196700,(I learned this from the book Succeeding Gener...,"Robert Haveman and the Grinch wrote ""How to St...",contradiction,"[-0.002023627, 0.004648458, -0.025176905, 0.02...","[-0.10460997, 0.104579404, -0.044954475, 0.016...",0.261064
313511,The Nonomiya Shrine is uniquely renowned for i...,The old culture of Japanese emperors had a gre...,neutral,"[0.05340124, 0.061979506, -0.04517284, 0.01702...","[-0.05257546, 0.04215894, 0.0049503576, -0.028...",0.341712


In [14]:
# Euclidean distance between embedding1 and embedding2 for every row, computed
# in one vectorized step instead of one scikit-learn call per row.
# Each column of per-row vectors is stacked into a 2-D array (rows x dims).
emb1 = np.array(df_subset["embedding1"].tolist())
emb2 = np.array(df_subset["embedding2"].tolist())
# L2 norm of the row-wise difference.
# NOTE: the column name keeps its original spelling ("euclidian") because
# other cells may refer to it by that exact name.
df_subset["euclidian_distance"] = np.linalg.norm(emb1 - emb2, axis=1)
