In [44]:
import os
import pandas as pd
import sentencepiece as spm
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
import random
import pickle
from gensim.models import Word2Vec
import numpy as np
import torch
import nn.torch

[nltk_data] Downloading package punkt to /Users/harshil/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [31]:
# Step 1 - Load the MS MARCO training split from its parquet file into a DataFrame.
# NOTE(review): the preprocessing cell further down reads this same file into the
# same variable again, so this cell is redundant on a top-to-bottom run.

training_query_dataset = pd.read_parquet("./v1.1-data/train.parquet") 



In [32]:

# Step 2 - Compile list of all the documents
# This step removes the redundant columns from the data frame so we're left with only the information we want.
# Nothing has been reformatted at this point.

# query_and_relevant_doc_df = query_dataset.drop(columns=["answers", "query_id", "query_type", "wellFormedAnswers"])

def removeRedundantColumns(dataset):
    """Return ``dataset`` without the answer and query-metadata columns.

    The columns "answers", "query_id", "query_type" and "wellFormedAnswers"
    are dropped; all remaining columns are returned unchanged. The frame
    passed in is not modified. A ``KeyError`` is raised by pandas if any of
    the four columns is absent.
    """
    unwanted_columns = ["answers", "query_id", "query_type", "wellFormedAnswers"]
    trimmed = dataset.drop(labels=unwanted_columns, axis=1)
    return trimmed


In [33]:
# Step 3 - Generate triplets 
# - form: (query, relevant_doc, irrelevant_doc)
# Relevant docs are stored with the query in the df above
# Irrelevant doc is a random document from any other query in the df - IMPORTANT - WE'RE ASSUMING THE OTHER QUERY IS UNRELATED TO THIS ONE

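# storing in a dictionary of form (k,v) - k = query, v = [(rel_doc1, irr_doc1), (rel_doc2, irr_doc2)....,] - this is our negative sampling
# NOTE: writeToCorpusAndGenerateTriplets below returns a flat list of (query, rel_doc, irr_doc) tuples rather than this dictionary.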
# query_dataset.head(1).loc[:,"passages"][0]['passage_text'][4]



# triplets_list = []
# num_rows = query_and_relevant_doc_df.shape[0]

# # Reset file
# with open('corpus.txt', 'w') as file:    
#     file.write("") 


# for index, row in query_and_relevant_doc_df.iterrows():
#     query = row["query"]  # Assuming 'query' is the column name for the query text
#     passages = row["passages"]  # Accessing the dictionary in the 'passages' column
#     passage_texts = passages["passage_text"]  # Extracting the list of passage texts

    
#     for relevant_document in passage_texts:        
#         # Randomly select another index
#         random_row_index = random.randint(0, num_rows - 1)
#         # Spliting each passage into a sentence 
#         sentences = sent_tokenize(relevant_document)
#         # appending each sentence into corpus.txt
#         with open('corpus.txt', 'a') as file:
#             for sentence in sentences:
#                 file.write(sentence + '\n')  # Write each sentence on a new line

#         while index == random_row_index:
#             random_row_index = random.randint(0, num_rows - 1)

        
#         # Retrieve a passage from the randomly selected row
#         random_passages = query_and_relevant_doc_df.loc[random_row_index, "passages"]
#         random_passage_texts = random_passages["passage_text"]
        
#         # Optionally, select a random passage text from the selected row
#         irrelevant_document = random.choice(random_passage_texts)
#         triplets_list.append((query, relevant_document, irrelevant_document))


def clearCorpusFile():
    """Create ``corpus.txt`` if it is missing, otherwise truncate it to zero bytes."""
    # Opening in write mode already truncates the file; nothing has to be written.
    with open('corpus.txt', 'w'):
        pass


def writeToCorpusAndGenerateTriplets(query_and_relevant_doc_df):
    """Append every passage to ``corpus.txt`` and build (query, pos, neg) triplets.

    For each passage attached to each query the passage is split into
    sentences, those sentences are appended to ``corpus.txt`` (one per line),
    and a negative example is drawn by picking a random passage from a
    randomly chosen *different* row. The negative is treated as irrelevant
    purely because it belongs to another query.

    Rows are addressed by position, so the frame may carry any index
    (filtered, shuffled, non-integer); rows without passages are never
    selected as the source of a negative.

    Args:
        query_and_relevant_doc_df: DataFrame with a "query" column and a
            "passages" column whose cells are mappings holding a
            "passage_text" sequence of strings.

    Returns:
        A list of ``(query, rel_doc_sentences, irrel_doc_sentences)`` tuples
        where both document entries are lists of sentence strings.

    Raises:
        ValueError: if exactly one row has passages, because no other row
            exists to draw a negative from (previously an endless loop).
    """
    queries = list(query_and_relevant_doc_df["query"])
    passage_lists = [
        passages["passage_text"] for passages in query_and_relevant_doc_df["passages"]
    ]
    # Positions (not index labels) of rows able to donate a negative passage.
    donor_positions = [
        position for position, texts in enumerate(passage_lists) if len(texts) > 0
    ]
    if len(donor_positions) == 1:
        raise ValueError(
            "Need at least two rows with passages to sample an irrelevant document"
        )

    triplets_list = []
    # Open the corpus once rather than once per passage. The encoding is
    # pinned to UTF-8 so the file does not depend on the platform default.
    with open('corpus.txt', 'a', encoding='utf-8') as corpus_file:
        for position, (query, passage_texts) in enumerate(zip(queries, passage_lists)):
            for relevant_document in passage_texts:
                # Splitting each passage into sentences
                rel_doc_sentences_list = sent_tokenize(relevant_document)
                for sentence in rel_doc_sentences_list:
                    corpus_file.write(sentence + '\n')  # one sentence per line

                # Draw a different row; reaching this point means the current
                # row is itself a donor, so at least one other donor exists.
                negative_position = random.choice(donor_positions)
                while negative_position == position:
                    negative_position = random.choice(donor_positions)

                irrelevant_document = random.choice(passage_lists[negative_position])
                irrel_doc_sentences_list = sent_tokenize(irrelevant_document)
                triplets_list.append((query, rel_doc_sentences_list, irrel_doc_sentences_list))
    return triplets_list

    

# - create triplet for each of the doc values
# - randomly choose a doc from other keys
# - store somewhere 



In [34]:
# Step 4.1 Train the sentencepiece model 


In [46]:
# Step 4.2 - Load the trained model and tokenise the triplets


def tokenise_triplets(triplets_list):
    """Split every element of every triplet into SentencePiece pieces.

    The model stored in ``m.model`` is loaded on each call, and
    ``encode_as_pieces`` is applied in turn to the query, the relevant
    document and the irrelevant document of each triplet.

    Returns:
        A list of ``(query_pieces, rel_doc_pieces, irr_doc_pieces)`` tuples
        in the same order as ``triplets_list``.
    """
    processor = spm.SentencePieceProcessor()
    processor.load('m.model')

    tokenised = []
    for triplet in triplets_list:
        query, rel_doc, irr_doc = triplet
        query_pieces = processor.encode_as_pieces(query)
        rel_pieces = processor.encode_as_pieces(rel_doc)
        irr_pieces = processor.encode_as_pieces(irr_doc)
        tokenised.append((query_pieces, rel_pieces, irr_pieces))
    return tokenised

def generateEmbeddedTriplets(tokenised_triplets):
    """Turn tokenised triplets into pooled Word2Vec embeddings.

    The Word2Vec model saved as ``word2vec_model.model`` is loaded on each
    call and every triplet element is passed through ``encode_and_pool``.

    Returns:
        A list of ``(query_embedding, rel_doc_embeddings, irr_doc_embeddings)``
        tuples in input order.
    """
    word2vec = Word2Vec.load("word2vec_model.model")

    embedded = []
    for query, rel_doc, irr_doc in tokenised_triplets:
        # The query is wrapped in a one-element list so it is pooled as a
        # single sentence, then the lone row is taken back out
        # (presumably a flat token list - confirm against tokenise_triplets).
        query_vector = encode_and_pool([query], word2vec)[0]
        rel_vectors = encode_and_pool(rel_doc, word2vec)
        irr_vectors = encode_and_pool(irr_doc, word2vec)
        embedded.append((query_vector, rel_vectors, irr_vectors))
    return embedded


def encode_and_pool(sentences, word2vec_model):
    """Average-pool word vectors into one fixed-length vector per sentence.

    Args:
        sentences: iterable of sentences, each an iterable of tokens.
        word2vec_model: object exposing ``wv`` (indexable by token, with a
            ``key_to_index`` mapping) and ``vector_size``.

    Returns:
        A float32 ``torch.Tensor`` of shape ``(len(sentences), vector_size)``.
        Row ``i`` is the mean of the vectors of the in-vocabulary tokens of
        sentence ``i``; a sentence with no known token yields a zero row.
        An empty ``sentences`` yields a ``(0, vector_size)`` tensor.
    """
    sentences = list(sentences)
    keyed_vectors = word2vec_model.wv
    # One pre-allocated float32 matrix keeps the dtype the same whether or not
    # a sentence fell back to zeros, keeps the 2-D shape for empty input, and
    # avoids building a tensor from a Python list of arrays.
    pooled = np.zeros((len(sentences), word2vec_model.vector_size), dtype=np.float32)
    for row, sentence in enumerate(sentences):
        # Tokens missing from the vocabulary are skipped
        word_embeddings = [
            keyed_vectors[word] for word in sentence if word in keyed_vectors.key_to_index
        ]
        if word_embeddings:
            pooled[row] = np.mean(word_embeddings, axis=0)
        # else: the row stays all-zero (no known tokens in this sentence)
    return torch.from_numpy(pooled)




In [36]:
print("Starting pre processing")

# Tunable settings for this cell, gathered in one place.
ROWS_PER_SPLIT = 100                # rows kept from each split
NEGATIVE_SAMPLING_SEED = 42         # fixes which irrelevant documents are drawn
SENTENCEPIECE_VOCAB_SIZE = 10_000   # vocabulary size passed to SentencePiece

# Loading Training Data
training_query_dataset = pd.read_parquet("./v1.1-data/train.parquet")
training_query_and_relevant_doc_df = removeRedundantColumns(training_query_dataset).head(ROWS_PER_SPLIT)
print("Training data loaded")

# Loading Validation Data
validation_query_dataset = pd.read_parquet("./v1.1-data/validation.parquet")
validation_query_and_relevant_doc_df = removeRedundantColumns(validation_query_dataset).head(ROWS_PER_SPLIT)
print("Validation data loaded")

# Loading Test Data
test_query_dataset = pd.read_parquet("./v1.1-data/test.parquet")
test_query_and_relevant_doc_df = removeRedundantColumns(test_query_dataset).head(ROWS_PER_SPLIT)
print("Test data loaded")

# Seed Python's RNG so the randomly sampled negatives are identical on every
# run of this cell (triplet generation relies on the `random` module).
random.seed(NEGATIVE_SAMPLING_SEED)

# Creating corpus file if it doesn't exist and removing lingering data from any last runs
clearCorpusFile()
print("Generating triplets and writing to corpus file")
# Creating triplets for each data set; all three splits append to the same corpus file
training_triplets_list = writeToCorpusAndGenerateTriplets(training_query_and_relevant_doc_df)
validation_triplets_list = writeToCorpusAndGenerateTriplets(validation_query_and_relevant_doc_df)
test_triplets_list = writeToCorpusAndGenerateTriplets(test_query_and_relevant_doc_df)

print("Triplets generated - training sentence piece")
# Training sentencepiece on the corpus built from training, validation and test data.
# This writes m.model / m.vocab, which tokenise_triplets loads afterwards.
spm.SentencePieceTrainer.train(input="corpus.txt", model_prefix="m", vocab_size=SENTENCEPIECE_VOCAB_SIZE)
print("Sentence piece trainined, now tokenising triplets")

# Tokenising triplets using sentencepiece
training_tokenised_triplets = tokenise_triplets(training_triplets_list)
validation_tokenised_triplets = tokenise_triplets(validation_triplets_list)
test_tokenised_triplets = tokenise_triplets(test_triplets_list)
print("Triplets tokenised")
print("Data preprocessing complete")

Starting pre processing
Training data loaded
Validation data loaded
Test data loaded
Generating triplets and writing to corpus file
Triplets generated - training sentence piece


sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: corpus.txt
  input_format: 
  model_prefix: m
  model_type: UNIGRAM
  vocab_size: 10000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  enable_differential_privacy: 0
  differential_

Sentence piece trainined, now tokenising triplets
Triplets tokenised
Data preprocessing complete


Save triplets into pickles (the cell below currently writes only the embedded training triplets)

In [48]:
# Save
# The two dumps below are disabled.
# FIXME(review): if re-enabled, the second one writes validation_tokenised_triplets
# into test_tokenised_triplets.pkl instead of the test triplets.
# with open('validation_tokenized_triplet.pkl', 'wb') as file:
#     pickle.dump(validation_tokenised_triplets, file, protocol=pickle.HIGHEST_PROTOCOL)

# with open('test_tokenised_triplets.pkl', 'wb') as file:
#     pickle.dump(validation_tokenised_triplets, file, protocol=pickle.HIGHEST_PROTOCOL)

# Embed the tokenised training triplets. generateEmbeddedTriplets loads
# "word2vec_model.model"; no cell visible here creates that file, so it must
# already exist on disk - TODO confirm where it is trained.
embedded_training_set_values = generateEmbeddedTriplets(training_tokenised_triplets)

# NOTE(review): the file name says "tokenised" but the object written is the
# embedded triplets returned above. The name is left unchanged in case other
# code reads this path.
with open('training_tokenised_triplets.pkl', 'wb') as file:
    pickle.dump(embedded_training_set_values, file, protocol=pickle.HIGHEST_PROTOCOL)

