## BERT

In [1]:
# import necessary libraries
import json
import os
import numpy as np
import pandas as pd
import string
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from nltk.stem import PorterStemmer, WordNetLemmatizer
import nltk
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
import torch
import pickle

#### Load and open the data
1. documents
2. queries

In [3]:
document_path = '../dis-project-1-document-retrieval/corpus.json/corpus.json'

In [4]:
# Loading the documents
with open(document_path) as json_file:
    documents = json.load(json_file)

In [5]:
train_queries_path = '../dis-project-1-document-retrieval/train.csv'

In [6]:
# Load the training queries
train_queries = pd.read_csv(train_queries_path)

#### Auxiliary functions

In [27]:
documents[0]

{'docid': 'doc-en-9633',
 'text': 'Mars Hill Church was a Christian megachurch, founded by Mark Driscoll, Lief Moi, and Mike Gunn. It was a multi-site church based in Seattle, Washington and grew from a home Bible study to 15 locations in 4 U.S. states. Services were offered at its 15 locations; the church also podcast content of weekend services, and of conferences, on the Internet with more than 260,000 sermon views online every week. In 2013, Mars Hill had a membership of 6,489 and average weekly attendance of 12,329. Following controversy in 2014 involving founding pastor Mark Driscoll, attendance dropped to 8,0009,000 people per week.\n\nAt the end of September, 2014, an investigation by the church elders found "bullying" and "patterns of persistent sinful behavior" by Driscoll. The church elders crafted a "restoration" plan to help Driscoll and save the church. Instead, Driscoll declined the restoration plan and resigned. On October 31, 2014, lead pastor Dave Bruskas announced pl

In [24]:
# Extract the document given its docid
def get_doc_by_id(doc_id, documents):
    my_doc = [doc for doc in documents if doc['docid'] == doc_id]
    return my_doc[0]

In [36]:
doc = get_doc_by_id('doc-en-810925', documents)

In [37]:
doc

{'docid': 'doc-en-810925',
 'text': 'Triplemanía XIX was a professional wrestling pay-per-view (PPV) event produced by the AAA promotion, which took place on June 18, 2011 at the Palacio de los Deportes ("Sports Palace") in Mexico City, Mexico. The event was the nineteenth annual Triplemanía, which is AAA\'s biggest show of the year. The event featured performers from American promotion Total Nonstop Action Wrestling (TNA) for the second year in a row. The event featured eight matches and was headlined by the culmination of the seven–month storyline rivalry between L.A. Park and El Mesías in a Luchas de Apuestas, or "bet match", where Park put his mask and El Mesías his hair on the line. It also featured the crowning of the first ever AAA Latin American and AAA World Trios Champions. At the event, Octagón, who has been a part of AAA since the promotion was founded in 1992, became the fifth inductee into the AAA Hall of Fame.\n\nProduction\n\nBackground\nIn early 1992 Antonio Peña was w

In [25]:
languages = ['en', 'de', 'fr', 'es', 'it', 'ar', 'ko']

def get_langauge(lang_id):
    """
    Returns the language name given the language id in order to pass it to the stopwords function
    """
    cases = {
        'en' : 'english',
        'de' : 'german',
        'fr' : 'french',
        'es' : 'spanish',
        'it' : 'italian',
        'ar' : 'arabic',
        'ko' : 'korean'
    }
    return cases.get(lang_id, 'unknown') 

In [14]:
def preprocess_text(text, language_id):
    """
    Preprocess the text by removing stopwords, stemming and lemmatizing the text
    """
    # remove punctuation
    text = "".join([ch for ch in text if ch not in string.punctuation])
    # remove stopwords
    stop_words = set(stopwords.words(get_langauge(language_id)))
    words = nltk.word_tokenize(text)
    words = [word for word in words if word.lower() not in stop_words]
    # stemming
    stemmer = PorterStemmer()
    words = [stemmer.stem(word) for word in words]
    # lemmatization
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    return words

In [10]:
# def cosine_similarity(a, b):
#     return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

#### Get the model

In [28]:
model = SentenceTransformer('bert-base-nli-mean-tokens')



In [63]:
# Embedding the documents -> this will take a while: DO NOT USE IT
documents_embeddings = []
for doc in documents[10:50]:
    # print(doc['docid'])
    text = doc['text']
    embeddings = model.encode(text)
    documents_embeddings.append(embeddings)

In [None]:
# list of all texts of the collection
[doc['text'] for doc in documents]

In [49]:
# Smartert way to embed the documents
documents_embeddings2 = model.encode([doc['text'] for doc in documents])

In [None]:
# check gpu 
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

In [15]:
# embedding the queries -> this will take a while: DO NOT USE IT
train_queries_embeddings = []
gt_documents = []
for _, query in tqdm(train_queries.iterrows()):
    embeddings = model.encode(query['query'])
    gt_documents.append(query['positive_docs'])
    train_queries_embeddings.append(embeddings)

21875it [08:10, 44.62it/s]


In [24]:
# convert the embeddings to numpy arrays
documents_embeddings_np = np.array(documents_embeddings)

In [25]:
documents_embeddings_np.shape

(1000, 768)

In [27]:
# Perform the cosine similarity between the first query and all the documents
train_queries_embeddings_reshaped = train_queries_embeddings[0].reshape(1, -1)
similarity_scores = cosine_similarity(train_queries_embeddings_reshaped, documents_embeddings_np)

In [30]:
# find index of the 10 most similar documents
most_similar_docs = np.argsort(similarity_scores[0])[-10:]

In [31]:
most_similar_docs

array([ 30, 978, 993, 833, 668, 503, 900, 457, 807, 583])

In [32]:
# recoveer the docids of the most similar documents
most_similar_docids = [documents[i]['docid'] for i in most_similar_docs]
most_similar_docids

['doc-en-10634',
 'doc-en-12492',
 'doc-en-3250',
 'doc-en-16089',
 'doc-en-8017',
 'doc-en-5989',
 'doc-en-9127',
 'doc-en-10148',
 'doc-en-9116',
 'doc-en-161']

In [33]:
# save documents embeddings
np.save('documents_embeddings.npy', documents_embeddings_np)

In [34]:
doc2= np.load('documents_embeddings.npy')

In [36]:
doc2.shape

(1000, 768)

In [54]:
# create query embeddings of the test set
test_queries_path = '../dis-project-1-document-retrieval/test.csv'
test_queries = pd.read_csv(test_queries_path)
test_queries_input = [query['query'] for _, query in test_queries.iterrows()]

In [None]:
# Embedding the test queries
test_queries_embeddings = model.encode(test_queries_input)

In [None]:
# # DO NOT USE IT -> it will take a while
# def create_submission(queries_embeddings, documents_embeddings):
#     """
#     Create a submission file with the 10 most similar documents for each query
#     """
#     submission = []
#     for i, query_embeddings in enumerate(queries_embeddings):
#         similarity_scores = cosine_similarity(query_embeddings.reshape(1, -1), documents_embeddings)
#         most_similar_docs = np.argsort(similarity_scores[0])[-10:]
#         most_similar_docids = [documents[i]['docid'] for i in most_similar_docs]
#         submission.append({
#             'query_id': i,
#             'retrieved_docs': most_similar_docids
#         })
    
#     # save the submission file in csv
#     submission_df = pd.DataFrame(submission)
#     submission_df.to_csv('submission.csv', index=False)
#     print('Submission file created successfully')


In [51]:
# load and merge the embeddings of the documents
documents_embeddings1 = np.load('../DIS_Project/doc_embeddings/documents_embeddings_23795.npy')
documents_embeddings2 = np.load('../DIS_Project/doc_embeddings/documents_embeddings2.npy')
documents_embeddings = np.concatenate((documents_embeddings1, documents_embeddings2), axis=0)


In [53]:
# save the merged embeddings
np.save('full_documents_embeddings.npy', documents_embeddings)

In [56]:
# load test queries embeddings
test_queries_embeddings = np.load('../DIS_Project/doc_embeddings/test_queries_embeddings.npy')

In [57]:
test_queries_embeddings.shape

(2000, 768)

In [58]:
# calculate the cosine similarity between the test queries and the documents without for loop
sim_matrix = cosine_similarity(test_queries_embeddings, documents_embeddings)

In [59]:
sim_matrix.shape

(2000, 268022)

In [60]:
sim_matrix[0].shape

(268022,)

### Embed each word in documents and combine them 

In [8]:
from transformers import BertTokenizer
tokenizer_bert = BertTokenizer.from_pretrained('bert-base-uncased')
# bert_vocabularies = create_vocabulary_language(documents, languages, tokenizer_bert)



In [19]:
doc = documents[268000]
print(doc['lang'])
tokenized_text = tokenizer_bert.tokenize(doc['text'])
print(tokenized_text)


ar
['ا', '##ل', '##ح', '##س', '##و', '##ن', 'ا', '##ل', '##ا', '##م', '##ر', '##ي', '##ك', '##ي', '،', 'ا', '##ل', '##م', '##ع', '##ر', '##و', '##ف', 'م', '##ح', '##ل', '##ي', '##ا', 'ا', '##ي', '##ض', '##ا', 'ب', '##ا', '##س', '##م', 'ا', '##ل', '##ح', '##س', '##و', '##ن', 'ا', '##ل', '##ش', '##ر', '##ق', '##ي', 'و', '##ا', '##ل', '##ك', '##ن', '##ا', '##ر', 'ا', '##ل', '##ب', '##ر', '##ي', '،', 'ه', '##و', 'ط', '##ا', '##ي', '##ر', 'ص', '##غ', '##ي', '##ر', 'ي', '##ق', '##ط', '##ن', 'ا', '##م', '##ر', '##ي', '##ك', '##ا', 'ا', '##ل', '##ش', '##م', '##ا', '##ل', '##ي', '##ة', 'و', '##ي', '##ن', '##ت', '##م', '##ي', 'ا', '##ل', '##ى', 'ف', '##ص', '##ي', '##ل', '##ة', 'ا', '##ل', '##ش', '##ر', '##ش', '##و', '##ر', '##ي', '##ا', '##ت', '.', 'ه', '##ذ', '##ه', 'ا', '##ل', '##ط', '##ي', '##و', '##ر', 'م', '##ه', '##ا', '##ج', '##ر', '##ة', '،', 'ي', '##م', '##ت', '##د', 'ن', '##ط', '##ا', '##ق', 'م', '##و', '##ط', '##ن', '##ه', '##ا', 'م', '##ن', 'و', '##س', '##ط', 'ا', '##ل', '##ب', '##ر'

In [22]:
vocab = set(tokenized_text)
# save the vocabulary into a json file
with open('vocabulary_prova.json', 'w') as f:
    json.dump(list(vocab), f)

In [23]:
print(len(vocab))
vocab_json = json.load(open('vocabulary_prova.json'))
print(len(vocab_json))

133
133


In [36]:
def create_vocabulary_language(documents, languages, tokenizer):
    """
    Create the vocabularies for all the languages in the documents
    """
    vocabularies = {}
    for language in languages:
        vocabularies[language] = set()
    for doc in tqdm(documents):
        text = doc['text']
        tokenized_text = tokenizer.tokenize(text)
        language = doc['lang']
        vocabularies[language].update(tokenized_text)
    return vocabularies

In [None]:
bert_vocabularies = create_vocabulary_language(documents, languages, tokenizer_bert)

In [39]:
# save the vocabularies for each language in a json file
for language in languages:
    with open(f'vocabularies/vocabulary_{language}.json', 'w') as f:
        json.dump(list(bert_vocabularies[language]), f)

In [40]:
for lang in languages:
    print("Size of vocabulary for language", lang, ":", len(bert_vocabularies[lang]))

Size of vocabulary for language en : 29070
Size of vocabulary for language de : 28156
Size of vocabulary for language fr : 26214
Size of vocabulary for language es : 25558
Size of vocabulary for language it : 26424
Size of vocabulary for language ar : 24503
Size of vocabulary for language ko : 24196


In [44]:
for lang in languages:
    print("First 10 words in vocabulary for language", lang, ":", list(bert_vocabularies[lang])[:10])

First 10 words in vocabulary for language en : ['concealed', 'railway', '##ites', 'hale', '1883', 'lifted', 'magic', '##mia', 'change', 'treasure']
First 10 words in vocabulary for language de : ['concealed', 'railway', '##ites', 'hale', '1883', '##gus', '##mia', 'magic', 'change', 'treasure']
First 10 words in vocabulary for language fr : ['concealed', 'railway', '##ites', 'hale', '1883', '##gus', '##mia', 'magic', 'change', 'treasure']
First 10 words in vocabulary for language es : ['railway', '##ites', 'hale', '1883', 'magic', '##mia', '##gus', 'change', 'treasure', 'ت']
First 10 words in vocabulary for language it : ['concealed', 'railway', '##ites', 'hale', '1883', '##gus', '##mia', 'magic', 'change', 'treasure']
First 10 words in vocabulary for language ar : ['concealed', 'railway', '##ites', 'hale', '1883', 'magic', '##mia', '##gus', 'change', 'ت']
First 10 words in vocabulary for language ko : ['railway', '##ites', 'hale', '1883', 'magic', '##gus', '##mia', 'change', 'treasure'

In [41]:
full_vocabulary = set()
for language in languages:
    full_vocabulary.update(bert_vocabularies[language])

print("Number of unique tokens in the full vocabulary: ", len(full_vocabulary))

# save the full vocabulary in a json file

with open('vocabularies/full_vocabulary.json', 'w') as f:
    json.dump(list(full_vocabulary), f)

Number of unique tokens in the full vocabulary:  29129


In [24]:
# load the full vocabulary
full_vocabulary = json.load(open('vocabularies/full_vocabulary.json'))

In [25]:
print(len(full_vocabulary))

29129


In [26]:
full_vocabulary_truncated = (list(full_vocabulary)[:10000])

In [31]:
# Embed the full vocabulary
bert_word_embeddings = model.encode(list(full_vocabulary))

print("Word embeddings shape: ", bert_word_embeddings.shape)

# save the word embeddings in a numpy file
np.save('embeddings/bert_word_embeddings.npy', bert_word_embeddings)

Word embeddings shape:  (29129, 768)


In [32]:
vocabulary_embeddings = dict(zip(list(full_vocabulary), bert_word_embeddings))

In [35]:
def embed_document(doc, vocabulary_embeddings, tokenizer):
    """
    Embed a document by averaging the word embeddings of its tokens
    """
    text = doc['text']
    tokens = tokenizer.tokenize(text)
    tokens_embeddings = [vocabulary_embeddings[token] for token in tokens if token in vocabulary_embeddings]
    if len(tokens_embeddings) == 0:
        print("DRAMMA")
        return np.zeros(768)
    doc_embedding = np.mean(tokens_embeddings, axis=0)

    return doc_embedding

In [None]:
documents_embeddings = []
for doc in tqdm(documents):
    doc_embedding = embed_document(doc, vocabulary_embeddings, tokenizer_bert)
    documents_embeddings.append(doc_embedding)

  3%|▎         | 6952/268022 [09:21<5:42:02, 12.72it/s] 

In [None]:
# save the documents embeddings in a numpy file
np.save('embeddings/documents_embeddings.npy', documents_embeddings)

In [None]:
def embed_quey(query, vocabulary_embeddings, tokenizer):
    """
    Embed a query by averaging the word embeddings of its tokens
    """
    tokens = tokenizer.tokenize(query)
    tokens_embeddings = [vocabulary_embeddings[token] for token in tokens if token in vocabulary_embeddings]
    if len(tokens_embeddings) == 0:
        return np.zeros(768)
    query_embedding = np.mean(tokens_embeddings, axis=0)

    return query_embedding

In [None]:
def retrieve(query, document_embeddings, vocabulary_embeddings, tokenizer):
    """
    Retrieve the most similar documents to the query
    """
    query_embedding = embed_quey(query, vocabulary_embeddings, tokenizer)
    similarity_scores = cosine_similarity(query_embedding.reshape(1, -1), document_embeddings)
    most_similar_docs = np.argsort(similarity_scores[0])[-10:]
    most_similar_docids = [documents[i]['docid'] for i in most_similar_docs]
    return most_similar_docids

### Test on train queries

In [None]:
def check_performance(query, document_embeddings, vocabulary_embeddings, tokenizer):
    """
    Check the performance of the retrieval system
    """
    query_text = query['query']
    relevant_doc = query['positive_docs']
    negative_docs = query['negative_docs']
    most_similar_docids = retrieve(query_text, document_embeddings, vocabulary_embeddings, tokenizer)
    for doc in most_similar_docids:
        if doc in relevant_doc:
            print("Bravo")
        elif doc in negative_docs:
            print("Dramma")
    

### Submission


In [61]:
def create_submission(queries_embeddings, documents_embeddings):
    """
    Create a submission file with the 10 most similar documents for each query
    """
    submission = []
    sim_matrix = cosine_similarity(queries_embeddings, documents_embeddings)
    for i in range(len(queries_embeddings)):
        most_similar_docs = np.argsort(sim_matrix[i])[-10:]
        most_similar_docids = [documents[i]['docid'] for i in most_similar_docs]
        submission.append({
            'query_id': i,
            'retrieved_docs': most_similar_docids
        })
    
    # save the submission file in csv
    submission_df = pd.DataFrame(submission)
    submission_df.to_csv('submission.csv', index=False)
    print('Submission file created successfully')

In [None]:
create_submission(test_queries_embeddings, documents_embeddings)

In [None]:
# divide documents embeddings according to language of orginal document

embeddings_by_language = {}

for i, doc in enumerate(documents):
    lang = doc['lang']
    if lang not in embeddings_by_language:
        embeddings_by_language[lang] = []
    embeddings = documents_embeddings[i]
    embeddings_by_language[lang].append(embeddings)

In [None]:
# get the original index of the documents by language

original_idx_by_language = {}

for i, doc in enumerate(documents):
    lang = doc['lang']
    if lang not in original_idx_by_language:
        original_idx_by_language[lang] = []
    original_idx_by_language[lang].append(doc['docid'])

In [None]:
# compute when language changes in test queries

breaking_points = [0]
for i in range(1, len(test_queries)):
    if test_queries.iloc[i]['lang'] != test_queries.iloc[i-1]['lang']:
        breaking_points.append(i)
    

In [None]:
def create_submission_bylang(queries_embeddings, embeddings_by_lang, breaking_points, original_idx_by_language):
    """
    Create a submission file with the 10 most similar documents for each query by language
    """
    submission = []
    for i in range(len(breaking_points)):
        start = breaking_points[i]
        end = breaking_points[i+1] if i+1 < len(breaking_points) else len(queries_embeddings)
        queries = queries_embeddings[start:end]
        lang = test_queries.iloc[start]['lang']
        sim_matrix = cosine_similarity(queries, embeddings_by_lang[lang])
        for j in range(len(queries)):
            most_similar_docs = np.argsort(sim_matrix[j])[-10:]
            most_similar_docids = [original_idx_by_language[lang][k] for k in most_similar_docs]
            submission.append({
                'id': start+j,
                'docids': most_similar_docids
            })
            
    # save the submission file in csv
    submission_df = pd.DataFrame(submission)
    submission_df.to_csv('submission_by_lang.csv', index=False)
    print('Submission file created successfully')

#### Test bert on train queries

In [17]:
# check the scores on the train queries

# embedd first train query
train_query_0 = train_queries['query'][0]
print(train_query_0)
embedding_train_query_0 = model.encode(train_query_0)

What is the connection between AAA and Lucha Underground?


In [19]:
print(train_queries['positive_docs'][0])

doc-en-798457


In [44]:
import ast

In [49]:
# positive document
positive_doc = get_doc_by_id(train_queries['positive_docs'][0], documents)

# embeddings of the positive document
positive_doc_embedding = model.encode(positive_doc['text'])


In [51]:
negative_docs_ids = ast.literal_eval(train_queries['negative_docs'][0])
# print(negative_docs)
negative_docs = [get_doc_by_id(doc_id, documents) for doc_id in negative_docs_ids]

# embeddings of the negative documents
negative_docs_embeddings = [model.encode(doc['text']) for doc in negative_docs]

In [52]:
# perform cosine similarity between the query and the positive document
similarity_positive = cosine_similarity(embedding_train_query_0.reshape(1, -1), positive_doc_embedding.reshape(1, -1))

In [53]:
# perform cosine similarity between the query and the negative documents
similarity_negative = cosine_similarity(embedding_train_query_0.reshape(1, -1), negative_docs_embeddings)

In [54]:
print("the similarity score between the query and the positive document is: ", similarity_positive)
for i, sim in enumerate(similarity_negative[0]):
    print(f"the similarity score between the query and the negative document {i} is: {sim}")

the similarity score between the query and the positive document is:  [[0.5503944]]
the similarity score between the query and the negative document 0 is: 0.48290205001831055
the similarity score between the query and the negative document 1 is: 0.48172470927238464
the similarity score between the query and the negative document 2 is: 0.4308210611343384
the similarity score between the query and the negative document 3 is: 0.44635772705078125
the similarity score between the query and the negative document 4 is: 0.22976043820381165
the similarity score between the query and the negative document 5 is: 0.6115955114364624
the similarity score between the query and the negative document 6 is: 0.25051236152648926
the similarity score between the query and the negative document 7 is: 0.2934030294418335
the similarity score between the query and the negative document 8 is: 0.28571927547454834
the similarity score between the query and the negative document 9 is: 0.5688396692276001
the simila

In [55]:
import random


the similarity score between the query and the random document is:  [[0.1941262]]


In [65]:
rand_docs = random.sample(documents, 100)
# rand_doc = random.choice(documents)
# embeddings of the random document
rand_docs_embedding = [model.encode(doc['text']) for doc in rand_docs]

# perform cosine similarity between the query and the random document
similarity_random = cosine_similarity(embedding_train_query_0.reshape(1, -1), rand_docs_embedding)

# compute the average similarity between the query and the rand documents
average_similarity_rand = np.mean(similarity_random)
print("the average similarity score between the query and the negative documents is: ", average_similarity_rand)

the average similarity score between the query and the negative documents is:  0.31269595
