In [1]:
import os
import pandas as pd
import sentencepiece as spm
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize


[nltk_data] Downloading package punkt to /Users/harshil/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [2]:


# Step 1 - Get the dataset from MS Marco and put into data frame

query_dataset = pd.read_parquet("./v1.1-data/") 





In [8]:

query_dataset.head(1).loc[:,"passages"][0]['passage_text'][4]
# Step 2 - Compile list of all the documenets 
# - can do as a key, value pair when key is the query id (or query itself) and values are all the docs

query_and_relevant_doc_df = query_dataset.drop(columns=["answers", "query_id", "query_type", "wellFormedAnswers"])

query_and_relevant_doc_df.head(4)


Unnamed: 0,passages,query
0,"{'is_selected': [0, 0, 1, 0, 0, 0, 0], 'passag...",does human hair stop squirrels
1,"{'is_selected': [0, 1, 0, 0, 0, 0, 0, 0, 0], '...",what are the benefits of fossil fuels
2,"{'is_selected': [0, 0, 0, 0, 0, 1, 0, 0, 0], '...",what is a apothem
3,"{'is_selected': [0, 0, 0, 0, 0, 1, 0, 0, 0], '...",average cost for custom canopy


In [12]:
# Step 3 - Generate triplets 
# - form: (query, relevant_doc, irrelevant_doc)
# Relevant docs are stored with the query in the df above
# Irrelevant doc is a random document from any other query in the df - IMPORTANT - WE'RE ASSUMING THE OTHER QUERY IS UNRELATED TO THIS ONE

# storing in a dictionairy of form (k,v) - k = query, v = [(rel_doc1, irr_doc1), (rel_doc2, irr_doc2)....,] - this is our negative sampling
# query_dataset.head(1).loc[:,"passages"][0]['passage_text'][4]

import random


triplets_dict = {}
num_rows = query_and_relevant_doc_df.shape[0]

# # Reset file
# with open('corpus.txt', 'w') as file:    
#     file.write("") 


for index, row in query_and_relevant_doc_df.iterrows():
    query = row["query"]  # Assuming 'query' is the column name for the query text
    passages = row["passages"]  # Accessing the dictionary in the 'passages' column
    passage_texts = passages["passage_text"]  # Extracting the list of passage texts
    relevant_and_irrelevant_doc_pairs_list = []

    
    
    for relevant_document in passage_texts:        
        # Randomly select another index
        random_row_index = random.randint(0, num_rows - 1)
        # Spliting each passage into a sentence 
        sentences = sent_tokenize(relevant_document)
        # appending each sentence into corpus.txt
        with open('corpus.txt', 'a') as file:
            print("Writing to corpus")
            for sentence in sentences:
                print(sentence)
                file.write(sentence + '\n')  # Write each sentence on a new line

        while index == random_row_index:
            random_row_index = random.randint(0, num_rows - 1)

        
        # Retrieve a passage from the randomly selected row
        random_passages = query_and_relevant_doc_df.loc[random_row_index, "passages"]
        random_passage_texts = random_passages["passage_text"]
        
        # Optionally, select a random passage text from the selected row
        irrelevant_document = random.choice(random_passage_texts)

        relevant_and_irrelevant_doc_pairs = (relevant_document, irrelevant_document)
        relevant_and_irrelevant_doc_pairs_list.append(relevant_and_irrelevant_doc_pairs)

    triplets_dict[query] = relevant_and_irrelevant_doc_pairs_list

    

# - create triplet for each of the doc values
# - randomly choose a doc from other keys
# - store somewhere 



Writing to corpus
We have been feeding our back yard squirrels for the fall and winter and we noticed that a few of them have missing fur.
One has a patch missing down his back and under both arms.
Also another has some missing on his whole chest.
They are all eating and seem to have a good appetite.
Writing to corpus
Critters cannot stand the smell of human hair, so sprinkling a barrier of hair clippings around your garden, or lightly working it into the soil when you plant bulbs, apparently does have some merit.
The whole thing kind of makes me laugh.
It never occurred to me that we are the ones that stink.
Writing to corpus
Spread some human hair around your vegetable and flower gardens.
This will scare the squirrels away because humans are predators of squirrels.
It is better if the hair hasn't been washed so the squirrels will easily pick up the human scent.
Writing to corpus
1 You can sprinkle blood meal around your garden as well.
2  Don’t trap and relocate squirrels.
3  This is

In [None]:
# Step 4 - Tokenise your generated data
# Sentence piece 
# Step 4.1 Train the model 
spm.SentencePieceTrainer.train(input=corpus.txt, model_prefix=m, vocab_size=32000)

#step 4.2 Load the trained model 
sp = spm.SentencePieceProcessor()
sp.load('m.model')

tokenized_line = [] 

with open('corpus.txt', 'r', encoding='utf-8') as file:
    for line in file:
        # tokenise each line 
        tokenised = sp.encode_as_pieces(line.strip())