## BERT

In [12]:
# import necessary libraries
import json
import os
import numpy as np
import pandas as pd
import string
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from nltk.stem import PorterStemmer, WordNetLemmatizer
import nltk
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity

#### Load and open the data
1. documents
2. queries

In [2]:
document_path = '/Users/giudittadelsarto/Desktop/DIS_Project/DIS_code/truncated_documents.json'

In [3]:
# Loading the documents
with open(document_path) as json_file:
    documents = json.load(json_file)

In [4]:
train_queries_path = '../dis-project-1-document-retrieval/train.csv'

In [5]:
# Load the training queries
train_queries = pd.read_csv(train_queries_path)

#### Auxiliary functions

In [6]:
# Extract the document given its docid
def get_doc_by_id(doc_id):
    return next(doc for doc in documents if doc['docid'] == doc_id)

In [7]:
def get_langauge(lang_id):
    """
    Returns the language name given the language id in order to pass it to the stopwords function
    """
    cases = {
        'en' : 'english',
        'de' : 'german',
        'fr' : 'french',
        'es' : 'spanish',
        'it' : 'italian',
        'ar' : 'arabic',
        'ko' : 'korean'
    }
    return cases.get(lang_id, 'unknown') 

In [8]:
def preprocess_text(text, language_id):
    """
    Preprocess the text by removing stopwords, stemming and lemmatizing the text
    """
    # remove punctuation
    text = "".join([ch for ch in text if ch not in string.punctuation])
    # remove stopwords
    stop_words = set(stopwords.words(get_langauge(language_id)))
    words = nltk.word_tokenize(text)
    words = [word for word in words if word.lower() not in stop_words]
    # stemming
    stemmer = PorterStemmer()
    words = [stemmer.stem(word) for word in words]
    # lemmatization
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    return words

In [10]:
# def cosine_similarity(a, b):
#     return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

#### Get the model

In [11]:
model = SentenceTransformer('bert-base-nli-mean-tokens')

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.99k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/399 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [63]:
# Embedding the documents -> this will take a while: DO NOT USE IT
documents_embeddings = []
for doc in documents[10:50]:
    # print(doc['docid'])
    text = doc['text']
    embeddings = model.encode(text)
    documents_embeddings.append(embeddings)

In [None]:
# list of all texts of the collection
[doc['text'] for doc in documents]

In [49]:
# Smartert way to embed the documents
documents_embeddings2 = model.encode([doc['text'] for doc in documents])

In [None]:
# check gpu 
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

In [15]:
# embedding the queries -> this will take a while: DO NOT USE IT
train_queries_embeddings = []
gt_documents = []
for _, query in tqdm(train_queries.iterrows()):
    embeddings = model.encode(query['query'])
    gt_documents.append(query['positive_docs'])
    train_queries_embeddings.append(embeddings)

21875it [08:10, 44.62it/s]


In [24]:
# convert the embeddings to numpy arrays
documents_embeddings_np = np.array(documents_embeddings)

In [25]:
documents_embeddings_np.shape

(1000, 768)

In [27]:
# Perform the cosine similarity between the first query and all the documents
train_queries_embeddings_reshaped = train_queries_embeddings[0].reshape(1, -1)
similarity_scores = cosine_similarity(train_queries_embeddings_reshaped, documents_embeddings_np)

In [30]:
# find index of the 10 most similar documents
most_similar_docs = np.argsort(similarity_scores[0])[-10:]

In [31]:
most_similar_docs

array([ 30, 978, 993, 833, 668, 503, 900, 457, 807, 583])

In [32]:
# recoveer the docids of the most similar documents
most_similar_docids = [documents[i]['docid'] for i in most_similar_docs]
most_similar_docids

['doc-en-10634',
 'doc-en-12492',
 'doc-en-3250',
 'doc-en-16089',
 'doc-en-8017',
 'doc-en-5989',
 'doc-en-9127',
 'doc-en-10148',
 'doc-en-9116',
 'doc-en-161']

In [33]:
# save documents embeddings
np.save('documents_embeddings.npy', documents_embeddings_np)

In [34]:
doc2= np.load('documents_embeddings.npy')

In [36]:
doc2.shape

(1000, 768)

In [54]:
# create query embeddings of the test set
test_queries_path = '../dis-project-1-document-retrieval/test.csv'
test_queries = pd.read_csv(test_queries_path)
test_queries_input = [query['query'] for _, query in test_queries.iterrows()]

In [55]:
# Embedding the test queries
test_queries_embeddings = model.encode(test_queries_input)

KeyboardInterrupt: 

In [None]:
# # DO NOT USE IT -> it will take a while
# def create_submission(queries_embeddings, documents_embeddings):
#     """
#     Create a submission file with the 10 most similar documents for each query
#     """
#     submission = []
#     for i, query_embeddings in enumerate(queries_embeddings):
#         similarity_scores = cosine_similarity(query_embeddings.reshape(1, -1), documents_embeddings)
#         most_similar_docs = np.argsort(similarity_scores[0])[-10:]
#         most_similar_docids = [documents[i]['docid'] for i in most_similar_docs]
#         submission.append({
#             'query_id': i,
#             'retrieved_docs': most_similar_docids
#         })
    
#     # save the submission file in csv
#     submission_df = pd.DataFrame(submission)
#     submission_df.to_csv('submission.csv', index=False)
#     print('Submission file created successfully')


In [51]:
# load and merge the embeddings of the documents
documents_embeddings1 = np.load('/Users/giudittadelsarto/Desktop/DIS_Project/doc_embeddings/documents_embeddings_23795.npy')
documents_embeddings2 = np.load('/Users/giudittadelsarto/Desktop/DIS_Project/doc_embeddings/documents_embeddings2.npy')
documents_embeddings = np.concatenate((documents_embeddings1, documents_embeddings2), axis=0)


In [53]:
# save the merged embeddings
np.save('full_documents_embeddings.npy', documents_embeddings)

In [56]:
# load test queries embeddings
test_queries_embeddings = np.load('/Users/giudittadelsarto/Desktop/DIS_Project/doc_embeddings/test_queries_embeddings.npy')

In [57]:
test_queries_embeddings.shape

(2000, 768)

In [58]:
# calculate the cosine similarity between the test queries and the documents without for loop
sim_matrix = cosine_similarity(test_queries_embeddings, documents_embeddings)

In [59]:
sim_matrix.shape

(2000, 268022)

In [60]:
sim_matrix[0].shape

(268022,)

In [61]:
def create_submission(queries_embeddings, documents_embeddings):
    """
    Create a submission file with the 10 most similar documents for each query
    """
    submission = []
    sim_matrix = cosine_similarity(queries_embeddings, documents_embeddings)
    for i in range(len(queries_embeddings)):
        most_similar_docs = np.argsort(sim_matrix[i])[-10:]
        most_similar_docids = [documents[i]['docid'] for i in most_similar_docs]
        submission.append({
            'query_id': i,
            'retrieved_docs': most_similar_docids
        })
    
    # save the submission file in csv
    submission_df = pd.DataFrame(submission)
    submission_df.to_csv('submission.csv', index=False)
    print('Submission file created successfully')

In [62]:
create_submission(test_queries_embeddings, documents_embeddings)

IndexError: list index out of range

In [None]:
# divide documents embeddings according to language of orginal document

embeddings_by_language = {}

for i, doc in enumerate(documents):
    lang = doc['lang']
    if lang not in embeddings_by_language:
        embeddings_by_language[lang] = []
    embeddings = documents_embeddings[i]
    embeddings_by_language[lang].append(embeddings)

In [None]:
# compute when language changes in test queries

breaking_points = [0]
for i in range(1, len(test_queries)):
    if test_queries.iloc[i]['lang'] != test_queries.iloc[i-1]['lang']:
        breaking_points.append(i)
    

In [None]:
def create_submission_bylang(queries_embeddings, embeddings_by_lang, breaking_points):
    """
    Create a submission file with the 10 most similar documents for each query, considering the language of the query
    """
    submission = []
    for i in range(len(breaking_points)):
        start = breaking_points[i]
        end = breaking_points[i+1] if i+1 < len(breaking_points) else len(queries_embeddings)
        queries = queries_embeddings[start:end]
        lang = test_queries.iloc[start]['lang']
        sim_matrix = cosine_similarity(queries, embeddings_by_lang[lang])
        for j in range(len(queries)):
            most_similar_docs = np.argsort(sim_matrix[j])[-10:]
            most_similar_docids = [documents[i]['docid'] for i in most_similar_docs]
            submission.append({
                'id': start+j,
                'docids': most_similar_docids
            })