Import libraries

In [11]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

In [12]:
# Hugging Face checkpoint id shared by the tokenizer and the encoder.
model_name = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"

# Load the tokenizer first, then the encoder, both from the same checkpoint.
tokenizer, model = (
    AutoTokenizer.from_pretrained(model_name),
    AutoModel.from_pretrained(model_name),
)

In [13]:
def get_embedding(text):
    """Embed `text` with the module-level `tokenizer` and `model` via mean pooling.

    The token vectors in `last_hidden_state` are averaged over the sequence
    axis, counting only positions whose attention mask is 1, so padding added
    when `text` is a batch of strings does not dilute the result. For a single
    unpadded string the mask is all ones and this is the plain token mean.

    Args:
        text: Input forwarded to `tokenizer` (a single string in this notebook).

    Returns:
        numpy.ndarray holding one pooled row per tokenized sequence (the
        sequence axis of `last_hidden_state` is reduced away).
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    # Inference only: skip autograd bookkeeping during the forward pass.
    with torch.no_grad():
        outputs = model(**inputs)
    token_vectors = outputs.last_hidden_state
    # Expand the mask over the hidden axis so padded positions contribute zero to the sum.
    mask = inputs['attention_mask'].unsqueeze(-1).to(token_vectors.dtype)
    # Clamp guards against a division by zero should a sequence have no unmasked token.
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    embedding = ((token_vectors * mask).sum(dim=1) / token_counts).numpy()
    return embedding

Import Data

In [14]:
# Both CSV files are semicolon-separated.
data = pd.read_csv('data_riformulate.csv', delimiter=';')
data_best = pd.read_csv('data_best.csv', delimiter=';')

# Columns 'Domande' (questions) and 'Riformulazione' (rephrasings) of the first file.
domande, riformulazioni = data['Domande'], data['Riformulazione']

Embeddings

In [15]:
# One get_embedding call per entry; np.array stacks the per-entry results into one array.
domande_embeddings = np.array(list(map(get_embedding, domande)))
riformulazioni_embeddings = np.array(list(map(get_embedding, riformulazioni)))

Top3 Function

In [16]:
def similarity_model(similarity_data, domande, top_k=3):
    """Return the `top_k` entries of `domande` with the highest similarity scores.

    Args:
        similarity_data: Array-like of scores, one per entry of `domande`;
            either a (1, n) row (as passed by the caller in this notebook) or
            a flat length-n sequence. It is not modified.
        domande: Indexable collection of candidates (list or pandas Series),
            positionally aligned with the scores.
        top_k: Number of candidates to return (default 3).

    Returns:
        List of at most `top_k` candidates, best score first; equal scores keep
        the earlier position first. Shorter than `top_k` when fewer scores exist.
    """
    scores = np.asarray(similarity_data, dtype=float).ravel()
    # A stable sort on the negated scores yields descending order with ties resolved
    # by position, i.e. the order repeated argmax would give, without overwriting
    # entries of the caller's array with a sentinel value.
    ranking = np.argsort(-scores, kind="stable")[:max(top_k, 0)]
    # Scores are positional, so look a pandas Series up by position rather than by label.
    pick = domande.iloc if hasattr(domande, "iloc") else domande
    return [pick[int(idx)] for idx in ranking]

In [37]:
# Synthetic questions from 'data_best' and one embedding per question.
dom_sintetica = data_best['DomandaSintetica']
syn_question = np.array(list(map(get_embedding, dom_sintetica)))
# Will collect, for each synthetic question, its list of top-ranked candidate questions.
best_questions_list = list()


In [43]:
# Remove the extra singleton axis from domande_embeddings, riformulazioni_embeddings and syn_question.
# reshape(len(x), -1) is used instead of a bare np.squeeze: squeeze drops every size-1 axis,
# so an array with a single row would collapse to 1-D and stop being a valid 2-D input for
# cosine_similarity. The reshape also leaves an already 2-D array unchanged, so re-running
# this cell is harmless.
domande_embeddings = domande_embeddings.reshape(len(domande_embeddings), -1)  # From (31, 1, 384) to (31, 384)
riformulazioni_embeddings = riformulazioni_embeddings.reshape(len(riformulazioni_embeddings), -1)  # From (31, 1, 384) to (31, 384)
syn_question = syn_question.reshape(len(syn_question), -1)  # From (74, 1, 384) to (74, 384)


In [44]:
# Show the shape of each of the three embedding arrays, one per output line.
print(
    domande_embeddings.shape,
    riformulazioni_embeddings.shape,
    syn_question.shape,
    sep=' \n ',
)

(31, 384) 
 (31, 384) 
 (74, 384)


In [47]:

# For every synthetic question, rank the candidate questions by cosine similarity
# and keep the best ones.
# The result list is reset here so that re-running this cell does not append a
# second copy of every prediction.
best_questions_list = []
for syn_embedding in syn_question:
    query = syn_embedding.reshape(1, -1)  # cosine_similarity expects 2-D inputs
    sim_to_questions = cosine_similarity(query, domande_embeddings)
    sim_to_rephrasings = cosine_similarity(query, riformulazioni_embeddings)

    # A candidate's score is the higher of its two similarities
    # (against the question text and against its rephrasing).
    best_similarity = np.maximum(sim_to_questions, sim_to_rephrasings)

    top_3_questions = similarity_model(best_similarity, domande)
    best_questions_list.append(top_3_questions)

In [48]:

# Compare every prediction list with the expected ("best") question from 'data_best'.
# One pass both counts the hits and records the misses for later inspection.
correct_matches = 0  # Number of rows whose expected question is among the predictions

log_error = {
    "DomandaSintetica": [],
    "DomandaCorretta": [],
    "ListaTop3": []
}

for synthetic, expected, top_3 in zip(
    data_best['DomandaSintetica'], data_best['DomandaCorretta'], best_questions_list
):
    if expected in top_3:
        correct_matches += 1
    else:
        log_error["DomandaSintetica"].append(synthetic)
        log_error["DomandaCorretta"].append(expected)
        log_error["ListaTop3"].append(top_3)

# Report the result
print(f"Top3 Accuracy Score is: {round(correct_matches/len(data_best),3)}")

df_log = pd.DataFrame(log_error)
# The log is written relative to the working directory, like the input CSVs are read,
# instead of to an absolute user-specific path that exists on one machine only.
df_log.to_csv('log_error.csv', encoding = 'utf-8', index = False)

Top3 Accuracy Score is: 0.811
