In [33]:
import numpy as np
import pandas as pd
import nltk 
import re
import os
from nltk.tokenize import word_tokenize
from collections import defaultdict

In [None]:
folder_path = "dataset"  
content_list = []

for filename in os.listdir(folder_path):
    if filename.endswith(".txt"):  
        file_path = os.path.join(folder_path, filename)
        with open(file_path, "r", encoding="utf-8", errors="ignore") as file:
            content = file.read()
            content_list.append(content)  
content_string = " ".join(content_list) 


def preprocess_text(text):
    text = text.lower() 
    text = re.sub(r"^\d+\.\s*", "", text, flags=re.MULTILINE)  # to remove the numbers from start of sentences
    text = re.sub(r"\s+", " ", text).strip() 
    text = text.replace("\n", " ")
    text = re.sub(r"[^\w\s]", "", text) 
    return text

In [35]:
# get unique characters
def unique_characters(processedtext):
    unique_chars = sorted(set(processedtext))
    character_to_id = {"<UNK>": 0} # assign unknown to 0
    index = 1
    for i in unique_chars:
        character_to_id[i] = index
        index+=1
    return character_to_id

In [36]:
def tokenize_text(text):
    words = word_tokenize(text)
    tokenized_words = []  
    for word in words:
        tokenized_word = list(word)  
        tokenized_word.append('_')  
        tokenized_words.append(tokenized_word)  
    return tokenized_words


In [37]:
# Count frequency of pairs
def get_frequency(corpus):
    pairs = defaultdict(int) # make a empty dict
    for word in corpus:
        for i in range(len(word) - 1):
            pairs[(word[i], word[i + 1])] += 1
    return pairs

In [38]:
# Merge the most frequent pair
def merge_pair(pair, corpus):
    new_token = "".join(pair)
    new_corpus = []
    for word in corpus:
        new_word = []
        i = 0
        while i < len(word):
            if i < len(word) - 1 and (word[i], word[i + 1]) == pair:
                new_word.append(new_token)
                i += 2
            else:
                new_word.append(word[i])
                i += 1
        new_corpus.append(new_word)
    return new_corpus, new_token

In [39]:
def byte_pair_encoding(corpus , vocab_size = 1000):
    vocab = set()
    for i in corpus:
        for j in i:
            vocab.add(j)

    while len(vocab) <= vocab_size:
        pair_freq = get_frequency(corpus)
        if not pair_freq:
            break
        most_frequent_pair = max(pair_freq , key = pair_freq.get)
        corpus, new_token = merge_pair(most_frequent_pair, corpus)
        vocab.add(new_token)
        print(f"Merged pair {most_frequent_pair} into token '{new_token}' with frequency {pair_freq[most_frequent_pair]}")
    return vocab, corpus

### ENCODER

In [68]:
def encode(text, vocab_ids):
    tokens = []
    text = text.replace(" ", "_")  

    # Sort vocabulary keys by length (longest first)
    sorted_vocab = sorted(vocab_ids.keys(), key=len, reverse=True)
    print(sorted_vocab)

    i = 0
    while i < len(text):
        matched = False
        
        for v in sorted_vocab:  # Iterate over sorted vocab and find the longest match
            if text.startswith(v, i):  # Check if a substring of the text matches a vocab token
                tokens.append(vocab_ids[v])  # Use vocab_ids[v] to get the ID
                i += len(v)  # Move index by the length of the matched token
                matched = True
                break
        
        if not matched:
            tokens.append(0)  # Append <UNK> ID (0)
            i += 1  # Move forward by one character

    return tokens


In [63]:
def decode(token_ids, vocab_ids):
    # Create a reverse mapping from ID to word
    id_to_word = {v: k for k, v in vocab_ids.items()}

    text = "".join(id_to_word[token] for token in token_ids if token in id_to_word)
    
    return text.replace("_", " ")  # Replace underscores with spaces

In [None]:
processedtext = preprocess_text(content_string)
character_mapping = unique_characters(processedtext)

print("Character to ID Mapping:", character_mapping)

corpus = tokenize_text(processedtext)
print("Initial Corpus:", corpus[:5])  


Character to ID Mapping: {'<UNK>': 0, ' ': 1, '0': 2, '1': 3, '2': 4, '3': 5, '4': 6, '5': 7, '6': 8, '7': 9, '8': 10, '9': 11, '_': 12, 'a': 13, 'b': 14, 'c': 15, 'd': 16, 'e': 17, 'f': 18, 'g': 19, 'h': 20, 'i': 21, 'j': 22, 'k': 23, 'l': 24, 'm': 25, 'n': 26, 'o': 27, 'p': 28, 'q': 29, 'r': 30, 's': 31, 't': 32, 'u': 33, 'v': 34, 'w': 35, 'x': 36, 'y': 37, 'z': 38, 'é': 39}
Initial Corpus: [['s', 'u', 'b', 'h', 'a', '_'], ['5', '_'], ['b', 'j', 'h', 'e', 'y', '_'], ['u', 't', 'h', 'n', 'a', '_'], ['p', 'e', 'r', 'h', 'a', '_']]


In [43]:
# Apply BPE
final_vocab, final_corpus = byte_pair_encoding(corpus, vocab_size=1000)

Merged pair ('a', '_') into token 'a_' with frequency 2790
Merged pair ('e', '_') into token 'e_' with frequency 2239
Merged pair ('i', '_') into token 'i_' with frequency 2142
Merged pair ('r', '_') into token 'r_' with frequency 1989
Merged pair ('a', 'y') into token 'ay' with frequency 1198
Merged pair ('h', 'a') into token 'ha' with frequency 990
Merged pair ('n', '_') into token 'n_' with frequency 856
Merged pair ('t', 'h') into token 'th' with frequency 843
Merged pair ('a', 'a') into token 'aa' with frequency 828
Merged pair ('ay', '_') into token 'ay_' with frequency 585
Merged pair ('s', '_') into token 's_' with frequency 575
Merged pair ('k', 'a') into token 'ka' with frequency 555
Merged pair ('m', 'a') into token 'ma' with frequency 508
Merged pair ('o', '_') into token 'o_' with frequency 495
Merged pair ('a', 'u') into token 'au' with frequency 490
Merged pair ('h', '_') into token 'h_' with frequency 490
Merged pair ('k', '_') into token 'k_' with frequency 487
Merged 

In [49]:
print("\nFinal Vocabulary:", final_vocab)
print("\nFinal Corpus Sample:", final_corpus[:20])


Final Vocabulary: {'rha_', 'pee', 'honay_', 'scroll_', 'asar_', 'kapray_', 'you', 'movie_', 'nei_', 'fa', 'mera_', 'haa', 'd', 'tyaar_', 'no', 'karte_', 'chale_', 'sha', 'ma_', 'gi_', 'baatein_', 'sir_', '8', 'clas', 'lay_', '230_', 'par_', 'soya_', 'l', 'pohu', 'han', 'namaz_', 'jis_', 'keh_', 'kha_', 'hua_', 'compu', 'sho', 'kyun_', 'weekend_', 'kend_', 'hai_', 'ss_', 'sama', 'chla_', 'fri', 'kal_', 'chaye_', 'phone_', 'pa_', 'h', 'to', 'comp', 'een_', 'os', 'spe', 'ish_', 'apne_', 'guzar_', 'thay_', '15_', 'b_', 'hoo', 'nally_', 'instagram_', 'ba_', 'pa', 'uth_', 'na', 'lun', 'offi', 'net', 'pass_', 'khaa', 'pani_', 'har_', 'aa_', 'ho_', 'bre', 'ki_', 'maz', 'me', 'vi', 'ee', 'mila_', 'paratha_', 'saaf_', 'su', 'gym_', 'tha_', 'thaa_', '4', 'tay_', 'nee', 'hal', 'ts_', 'li', 'atte', 'ap', 'le', 'bj', 'pi', 'gh', 'mil_', 'thori_', 'muj', 'nahaya_', 'hamne_', 'jis', 'apna_', 'mj', 'ra', 'shuru_', 'ki', 'ine_', 'thy_', 'din_', 'aar_', 'dekha_', 'taqreeban_', 'soo', '12_', 'apnay_', 'j

In [None]:
sorted_vocab = sorted(final_vocab)

vocab_ids = {}  
i = 1  

for token in sorted_vocab:  # Iterate over sorted_vocabalary 
    vocab_ids[token] = i  # Assign index to token and then increment
    i += 1  

In [54]:
print("\nVocabulary with IDs:", vocab_ids)


Vocabulary with IDs: {'0': 1, '00_': 2, '0_': 3, '1': 4, '10': 5, '100_': 6, '10_': 7, '11': 8, '1130_': 9, '11_': 10, '12': 11, '12_': 12, '15_': 13, '1_': 14, '2': 15, '20_': 16, '230_': 17, '2_': 18, '3': 19, '30_': 20, '3_': 21, '4': 22, '40_': 23, '4_': 24, '5': 25, '515_': 26, '5_': 27, '6': 28, '600_': 29, '6_': 30, '7': 31, '730_': 32, '7_': 33, '8': 34, '830_': 35, '8_': 36, '9': 37, '9_': 38, '_': 39, 'a': 40, 'a_': 41, 'aa': 42, 'aa_': 43, 'aad_': 44, 'aaj_': 45, 'aakar_': 46, 'aake_': 47, 'aam_': 48, 'aar_': 49, 'aari_': 50, 'aaya_': 51, 'ab': 52, 'ab_': 53, 'abhi_': 54, 'ac': 55, 'ach': 56, 'acha_': 57, 'ad': 58, 'ad_': 59, 'ada_': 60, 'ag': 61, 'agle_': 62, 'ai': 63, 'ai_': 64, 'aik_': 65, 'aj': 66, 'aj_': 67, 'ajj_': 68, 'ak_': 69, 'al': 70, 'al_': 71, 'alar': 72, 'alarm_': 73, 'am': 74, 'am_': 75, 'ami_': 76, 'an': 77, 'and_': 78, 'anda_': 79, 'ann_': 80, 'aor_': 81, 'ap': 82, 'apna_': 83, 'apnay_': 84, 'apne_': 85, 'apni_': 86, 'apny_': 87, 'ar': 88, 'ar_': 89, 'ara_'

### TESTING

In [None]:
folder_path = "Assignment_Dataset"  
test_list = []

for filename in os.listdir(folder_path):
    if filename.endswith(".txt"):  
        file_path = os.path.join(folder_path, filename)
        with open(file_path, "r", encoding="utf-8", errors="ignore") as file:
            content = file.read()
            test_list.append(content)  
test_string = " ".join(test_list) # convert to string

In [None]:
def preprocess_text(text):
    text = text.lower() 
    text = re.sub(r"^\d+\.\s*", "", text, flags=re.MULTILINE) 
    text = re.sub(r"\s+", " ", text).strip() 
    text = text.replace("\n", " ")
    text = re.sub(r"[^\w\s]", "", text) 
    return text

In [None]:
processed_test_text = preprocess_text(test_string)

In [None]:
processed_test_text

'aj me subah 6 bage utha aur mene hath mun dhoya aj university late jana tha to bas hath mun dho ke me wapis bister me let gaya mere ik dost ki tabiat bhi bohat dinoun se kharab thi to subah pehle uth ke us ka hal waghaira pata kiya us ke baad jo homework university se mila tha mene who beth ke kiya or us ke baad thori der social media istemal kiya us ke baad me university gaya or wahan pe onestop ja ke apna naya student id card ka form diya us ke baad mene apni sab classes attend kin university se chutti ke baad me apne dostoun ke sath garioun ke showrooms me gaya kyunke ik dost ko gari leni thi us ke baad ham ne wapsi pe ik jaga se khana khaya or wapis flat agae us ke baad mene apni raat ki dawaian lin or bas homework jo rehta tha complete kar ke sogaya 1 aj me subah 9 bage utha or mene ghanta dost se baat ki us ke baad mene nashta kiya or university jane kelia tyar hoa university me pehli class 1 bage thi to bas sari classes attend kin sab classes attend kar ke flat wapis agea or ap

In [65]:
encoded_text = encode(processed_test_text, vocab_ids)
print("Encoded Text:", encoded_text)

decoded_text = decode(encoded_text, vocab_ids)  # Pass vocab_ids here
print("Decoded Text:", decoded_text)

Encoded Text: [67, 590, 848, 30, 115, 318, 943, 107, 600, 355, 618, 623, 236, 988, 67, 930, 533, 427, 879, 901, 140, 355, 618, 623, 236, 39, 471, 590, 970, 153, 841, 746, 590, 537, 852, 314, 603, 384, 439, 253, 496, 856, 384, 103, 39, 151, 158, 240, 681, 623, 804, 479, 747, 114, 884, 901, 848, 707, 941, 471, 933, 441, 345, 39, 954, 319, 63, 748, 699, 500, 933, 471, 118, 433, 360, 589, 982, 439, 930, 804, 609, 879, 600, 952, 361, 148, 471, 500, 679, 933, 471, 118, 889, 234, 827, 591, 410, 259, 581, 500, 933, 471, 118, 590, 930, 314, 679, 958, 703, 670, 647, 843, 423, 471, 83, 624, 988, 837, 923, 268, 852, 387, 172, 745, 219, 441, 275, 678, 567, 246, 933, 471, 118, 600, 86, 791, 206, 105, 498, 930, 804, 198, 909, 471, 118, 590, 85, 252, 681, 623, 471, 798, 303, 774, 681, 623, 471, 813, 952, 778, 566, 784, 590, 314, 517, 471, 384, 439, 253, 503, 311, 537, 658, 884, 933, 471, 118, 348, 648, 971, 703, 384, 439, 422, 304, 804, 483, 489, 679, 970, 275, 522, 852, 61, 40, 260, 933, 471, 118, 60

In [None]:
# Encode and decode example with emojis and invalid characters
# test_text = "mera dost bht acha ha hia 😎 💥 😇 @#!"
# encoded_text = encode(test_text, final_vocab)
# print("Encoded Text:", encoded_text)

# decoded_text = decode(encoded_text)
# print("Decoded Text:", decoded_text)

# #Assert that the decoded text matches the original text (ignoring <UNK>)
# assert decoded_text == test_text.replace("<UNK>", ""), "Decoded text does not match the original text!"


###  Evaluate its performance based on vocabulary reduction and OOV word handling. 

In [None]:
# vocabulary size havent been reduced because we are adding new pairs inside vocabulary and are not removing previous characters which are getting merged as per course instructor