In [1]:
import pandas as pd
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
import torch
import numpy as np

In [2]:
# Load the pretrained BERT encoder together with its matching tokenizer.
# Both are resolved from the same checkpoint name so their vocabularies agree.
model_name = "bert-base-uncased"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [3]:
# Read the CSV file containing the documents
df = pd.read_csv("./Collection/documents.csv")

# Merge the title and the text of each document into a single string.
# Missing values are replaced with an empty string first: astype(str) alone
# would turn NaN into the literal word "nan", which would then be tokenized
# and embedded as if it were part of the document.
df['combined_text'] = (
    df['title'].fillna('').astype(str) + ' ' + df['text'].fillna('').astype(str)
)

# Encode every document and mean-pool its token vectors into one embedding.
# Documents are processed one at a time and collected in a list; each entry
# is the NumPy array produced by pooling over the token axis (dim=1).
document_embeddings = []
for text in df['combined_text']:
    # truncation=True clips documents longer than the model's maximum length
    tokens = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    # Inference only: skip gradient tracking to save memory and time
    with torch.no_grad():
        outputs = model(**tokens)
    embeddings = outputs.last_hidden_state.mean(dim=1).numpy()
    document_embeddings.append(embeddings)


In [4]:
# Tokenize the free-text query with the same tokenizer used for the documents
query = "I WOULD BE PLEASED TO RECEIVE PAPERS ON THE SECURITY OF INFORMATION IN DATABANKS OR DATABASES.  SECURITY, PRIVACY, CONFIDENTIALITY, PROTECTION OF DATA OR INFORMATION, RIGHTS OF ACCESS.  "
query_tokens = tokenizer(query, return_tensors="pt", truncation=True, padding=True)

# Run the model without gradient tracking, then average the token vectors
# (dim=1) to obtain the pooled query embedding as a NumPy array
with torch.no_grad():
    query_outputs = model(**query_tokens)
query_hidden = query_outputs.last_hidden_state
query_embedding = torch.mean(query_hidden, dim=1).numpy()

In [5]:
# Stack the per-document embeddings into a single 2D array
document_embeddings_concatenated = np.concatenate(document_embeddings, axis=0)

# The ranking below maps embedding rows back to rows of df by position, so
# the two must line up; a mismatch usually means a stale or partially re-run
# kernel and would silently attribute scores to the wrong documents.
assert document_embeddings_concatenated.shape[0] == len(df), (
    "number of document embeddings does not match number of rows in df"
)

# Cosine similarity between the query and every document embedding
similarities = cosine_similarity(query_embedding, document_embeddings_concatenated)

# Row positions of the documents, sorted by decreasing similarity
sorted_indices = similarities.argsort()[0][::-1]

# Print the documents from most to least similar.
# argsort yields positions, not index labels, so look the document number up
# with .iloc; plain df['doc_num'][index] is a label lookup and only happens to
# work while df still has its default 0..n-1 index.
for index in sorted_indices:
    doc_number = df['doc_num'].iloc[index]
    similarity_score = similarities[0][index]
    print(f"Doc Number: {doc_number}, Similarity: {similarity_score}")

Doc Number: 2279, Similarity: 0.835763692855835
Doc Number: 3267, Similarity: 0.8135824203491211
Doc Number: 5220, Similarity: 0.813490629196167
Doc Number: 3789, Similarity: 0.803385853767395
Doc Number: 5263, Similarity: 0.793982982635498
Doc Number: 3265, Similarity: 0.788101315498352
Doc Number: 1451, Similarity: 0.7880331873893738
Doc Number: 725, Similarity: 0.7878658771514893
Doc Number: 5779, Similarity: 0.7873096466064453
Doc Number: 1278, Similarity: 0.7862679958343506
Doc Number: 1439, Similarity: 0.7830736637115479
Doc Number: 46, Similarity: 0.7830555438995361
Doc Number: 3348, Similarity: 0.7793323993682861
Doc Number: 3928, Similarity: 0.779308021068573
Doc Number: 262, Similarity: 0.778478741645813
Doc Number: 2974, Similarity: 0.7772481441497803
Doc Number: 3529, Similarity: 0.7745518684387207
Doc Number: 3215, Similarity: 0.773922324180603
Doc Number: 5227, Similarity: 0.7736194133758545
Doc Number: 5716, Similarity: 0.7735986709594727
Doc Number: 4318, Similarity: 0.