<a href="https://colab.research.google.com/github/JaleelRadhu/NeuralLM/blob/main/Task1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import re
import json

In [None]:
class WordPieceTokenizer:
    """Train and apply a WordPiece sub-word vocabulary.

    Words are split into single characters, where every non-initial piece
    carries a ``##`` prefix.  Training repeatedly merges the adjacent pair with
    the highest score ``freq(pair) / (freq(left) * freq(right))`` until the
    vocabulary holds ``vocab_size`` tokens or no pair is left to merge.

    Attributes:
        corpus: List of sentences; rewritten in place by ``preprocess_data``.
        vocab_size: Target number of tokens in ``vocab``.
        unk_token: Token emitted by ``tokenize`` for a word that cannot be
            segmented with the current vocabulary.
        vocab: Set of learned tokens.
        token_count: Maps each token to its number of occurrences in the
            current segmentation of the corpus.
        word_count: Maps each segmented word (pieces concatenated, e.g.
            ``"he##l##l##o"``) to the number of times the word occurs.
    """

    # Anything that is not an ASCII letter, digit or whitespace is a separator.
    _NON_ALNUM = re.compile(r'[^A-Za-z0-9\s]+')

    def __init__(self, corpus, vocab_size=30000, unk_token="[UNK]"):
        """Store the corpus and training settings.

        Args:
            corpus: Mutable list of raw sentences.
            vocab_size: Maximum number of tokens to learn.
            unk_token: Placeholder returned for words that cannot be tokenized.
        """
        self.corpus = corpus
        self.vocab_size = vocab_size
        self.unk_token = unk_token

        # Per-instance state: as class attributes these containers would be
        # shared (and polluted) across every tokenizer ever created.
        self.token_count = {}
        self.word_count = {}
        self.vocab = set()

    @classmethod
    def _normalize(cls, sentence):
        """Replace special characters with spaces, lowercase and strip."""
        return cls._NON_ALNUM.sub(' ', sentence).lower().strip()

    @staticmethod
    def _split_word(word_key):
        """Turn a ``word_count`` key back into its list of pieces."""
        head, *tail = word_key.split("##")
        return [head] + ["##" + piece for piece in tail]

    def preprocess_data(self):
        """Normalize every sentence of ``self.corpus`` in place.

        Returns:
            A list with one list of words per sentence.  Runs of whitespace
            never produce empty-string words.
        """
        tokenized = []
        for idx, sentence in enumerate(self.corpus):
            cleaned = self._normalize(sentence)
            self.corpus[idx] = cleaned
            # split() without arguments collapses repeated whitespace, whereas
            # split(' ') would yield '' entries for consecutive spaces.
            tokenized.append(cleaned.split())
        return tokenized

    def form_tokens(self):
        """Build the initial character-level segmentation of the corpus.

        Fills ``self.token_count`` and ``self.word_count`` from scratch and
        adds every character-level token to ``self.vocab``.  The corpus is
        expected to be preprocessed already (words must not contain ``#``).
        """
        token_count = {}
        word_count = {}
        for sentence in self.corpus:
            for word in sentence.split():
                pieces = [word[0]] + ["##" + char for char in word[1:]]
                for piece in pieces:
                    token_count[piece] = token_count.get(piece, 0) + 1
                self.vocab.update(pieces)

                key = "".join(pieces)
                word_count[key] = word_count.get(key, 0) + 1

        self.word_count = word_count
        self.token_count = token_count

    def _best_pair(self, word_count, token_count):
        """Return the adjacent pair with the highest WordPiece score.

        Ties are resolved in favour of the pair encountered first.  Returns
        ``None`` when every word consists of a single piece.
        """
        pair_freq = {}
        for key, freq in word_count.items():
            pieces = self._split_word(key)
            for pair in zip(pieces, pieces[1:]):
                pair_freq[pair] = pair_freq.get(pair, 0) + freq

        best_pair, best_score = None, 0
        for (left, right), freq in pair_freq.items():
            score = freq / (token_count[left] * token_count[right])
            if score > best_score:
                best_pair, best_score = (left, right), score
        return best_pair

    def _merge_pair(self, pair, word_count, token_count):
        """Merge ``pair`` everywhere and keep all counts exact.

        Occurrences are merged greedily from left to right, so overlapping
        candidates (e.g. ``##a ##a ##a``) are merged only where possible and
        the counts are updated with the number of merges actually performed.
        """
        left, right = pair
        merged = left + right[2:]
        needle = left + right  # cheap substring pre-filter on the word key

        renamed = []
        merges = 0
        for key, freq in word_count.items():
            if needle not in key:
                continue
            pieces = self._split_word(key)
            rebuilt = []
            merges_here = 0
            idx = 0
            while idx < len(pieces):
                is_match = (
                    idx + 1 < len(pieces)
                    and pieces[idx] == left
                    and pieces[idx + 1] == right
                )
                if is_match:
                    rebuilt.append(merged)
                    merges_here += 1
                    idx += 2
                else:
                    rebuilt.append(pieces[idx])
                    idx += 1
            if merges_here:
                renamed.append((key, "".join(rebuilt)))
                merges += merges_here * freq

        # Rename after the scan: the dict must not change size while iterated.
        for old_key, new_key in renamed:
            word_count[new_key] = word_count.pop(old_key)

        # Each merge consumes one `left` and one `right`; when both are the
        # same token the two decrements correctly remove two occurrences.
        token_count[left] -= merges
        token_count[right] -= merges
        token_count[merged] = token_count.get(merged, 0) + merges
        self.vocab.add(merged)

    def construct_vocabulary(self):
        """Learn merges until ``vocab_size`` is reached or no pair remains.

        Re-initialises the counts via ``form_tokens`` and then grows
        ``self.vocab`` one merge at a time, showing progress with tqdm.
        """
        self.form_tokens()
        token_count = self.token_count
        word_count = self.word_count

        pbar = tqdm(total=self.vocab_size)
        pbar.update(len(self.vocab))
        try:
            while len(self.vocab) < self.vocab_size:
                best_pair = self._best_pair(word_count, token_count)
                if best_pair is None:
                    print("No pair found, the vocabulary has reached it's maximum size of the corpus")
                    break
                size_before = len(self.vocab)
                self._merge_pair(best_pair, word_count, token_count)
                # Advance only by what was really added to the vocabulary.
                pbar.update(len(self.vocab) - size_before)
        finally:
            pbar.close()

        self.token_count = token_count

    def _tokenize_word(self, word):
        """Greedy longest-match-first segmentation of a single word."""
        pieces = []
        start = 0
        while start < len(word):
            for end in range(len(word), start, -1):
                candidate = word[start:end]
                if start:
                    candidate = "##" + candidate
                if candidate in self.vocab:
                    pieces.append(candidate)
                    start = end
                    break
            else:
                # Nothing in the vocabulary matches at this position; without
                # this exit the loop would never advance.
                return [self.unk_token]
        return pieces

    def tokenize(self, sentence):
        """Split ``sentence`` into WordPiece tokens.

        The sentence is normalized exactly like the training corpus.  A word
        that cannot be fully segmented is replaced by ``self.unk_token``.

        Args:
            sentence: Raw input text.

        Returns:
            The list of tokens for the whole sentence.
        """
        tokens = []
        for word in self._normalize(sentence).split():
            tokens.extend(self._tokenize_word(word))
        return tokens



In [None]:
# Mount Google Drive at /content/drive (Google Colab only) so that files
# stored under /content/drive/MyDrive can be opened like local files.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
if __name__ == "__main__":
    # End-to-end run: train the tokenizer on corpus.txt, save the learned
    # vocabulary, then tokenize every sentence of the test file and save the
    # result as JSON keyed by sample id.

    Group_No = 62
    base_dir = "/content/drive/MyDrive/NLP_assignment/Assignment_1"
    corpus_file_path = f"{base_dir}/corpus.txt"
    vocabulary_file_path = f"{base_dir}/vocabulary_{Group_No}.txt"
    tokenised_json_path = f"{base_dir}/tokenized_{Group_No}.json"
    test_json_path = f"{base_dir}/test1.json"

    # Explicit encoding keeps the run independent of the platform default.
    with open(corpus_file_path, "r", encoding="utf-8") as file:
        corpus = file.readlines()

    # Train the WordPiece vocabulary on the normalized corpus.
    tokenizer = WordPieceTokenizer(corpus)
    tokenizer.preprocess_data()
    tokenizer.construct_vocabulary()

    # One token per line; sorted because set iteration order is arbitrary and
    # would otherwise produce a different file on every run.
    with open(vocabulary_file_path, "w", encoding="utf-8") as file:
        file.writelines(f"{token}\n" for token in sorted(tokenizer.vocab))

    # Test file is expected to provide "id" and "sentence" columns.
    test_data = pd.read_json(test_json_path)

    # Maps str(sample id) -> list of tokens.  Iterating the columns directly
    # is positional, so it also works when the frame's index is not 0..n-1.
    data_to_write = {}
    for sample_id, sentence in zip(test_data["id"], test_data["sentence"]):
        tokens = tokenizer.tokenize(sentence)
        data_to_write[f"{sample_id}"] = tokens

    # Save the tokenized sentences to tokenized_{Group No}.json
    with open(tokenised_json_path, "w", encoding="utf-8") as file:
        json.dump(data_to_write, file)


  0%|          | 0/30000 [00:00<?, ?it/s][A
 48%|████▊     | 14498/30000 [00:00<00:00, 143655.34it/s][A
 51%|█████     | 15219/30000 [00:17<00:00, 143655.34it/s][A
 51%|█████     | 15220/30000 [00:17<00:23, 615.99it/s]   [A
 51%|█████     | 15225/30000 [00:17<00:24, 607.31it/s][A
 54%|█████▍    | 16227/30000 [00:37<00:22, 607.31it/s][A
 54%|█████▍    | 16228/30000 [00:37<01:02, 221.35it/s][A
 54%|█████▍    | 16234/30000 [00:37<01:02, 220.43it/s][A
 58%|█████▊    | 17269/30000 [00:57<00:57, 220.43it/s][A
 58%|█████▊    | 17270/30000 [00:57<01:44, 121.80it/s][A
 58%|█████▊    | 17276/30000 [00:57<01:45, 121.10it/s][A
 61%|██████    | 18342/30000 [01:17<02:17, 84.71it/s] [A
 61%|██████    | 18342/30000 [01:17<02:17, 84.71it/s][A
 61%|██████    | 18348/30000 [01:17<02:17, 84.57it/s][A
 65%|██████▌   | 19596/30000 [01:37<02:22, 73.10it/s][A
 65%|██████▌   | 19603/30000 [01:37<02:22, 73.06it/s][A
 68%|██████▊   | 20429/30000 [01:53<02:30, 63.59it/s][A
 68%|██████▊   | 20435

No pair found, the vocabulary has reached it's maximum size of the corpus
