In [8]:
from collections import defaultdict, Counter
import re

class Tokenizer:
    def __init__(self):
        self.vocab = {}
        self.merge_rules = []
        self.token_list=[]

    def merge_vocab(self, pair, vocab):
        pattern = re.compile(re.escape(' '.join(pair)))
        max_freq_word = max(vocab, key=vocab.get)
        vocab2 = {pattern.sub(''.join(pair), word): freq for word, freq in vocab.items()}
        vocab2.pop(' '.join(pair), None)
        vocab2.pop(pair[0], None)
        vocab2.pop(pair[1], None)
        return vocab2
    
    def get_stats(self, vocab):
        pairs_count = defaultdict(int)
        for word, frequency in vocab.items():
            symbols = word.split()
            for i in range(len(symbols) - 1):
                pair = (symbols[i], symbols[i + 1])
                pairs_count[pair] += frequency
        return dict(pairs_count)
    
    def build_vocab(self, corpus, num_merges):
        words = corpus.split()
        tokens = [' '.join(list(word) + ['$']) for word in words]
        all_characters = ''.join([char for word in tokens for char in word if char != ' '])
        self.token_list = list(set(all_characters))
        token_counts = Counter(tokens)
        self.vocab = dict(token_counts)
        for i in range(num_merges):
            pairs = self.get_stats(self.vocab)  # Step 2
            if not pairs:
                break
            best = max(pairs, key=pairs.get)
            self.token_list.append(best[0]+best[1])
            self.merge_rules.append(best)
            self.vocab = self.merge_vocab(best, self.vocab)
        return self.vocab, self.merge_rules,self.token_list
    
    def tokenize(self, input_text):       
        words = input_text.split()
        tokens = [' '.join(list(word) + ['$']) for word in words]
        for rule in self.merge_rules:
            tokens = [token.replace(' '.join(rule), ''.join(rule)) for token in tokens]
        tokens2=[]
        for i in tokens:
            tokens2.append(i.replace(" ",","))
        str2=""
        for i in range(0,len(tokens2)):
            if(i!=len(tokens2)-1):            
                str2+=tokens2[i]+","
            else:
                str2+=tokens2[i]
        return str2
    
#################################################################
    
file = open('corpus2.txt', 'r')
corpus = file.read().replace('\n', ' ')
# print(corpus)
num_merges = 100


tokenizer = Tokenizer()
vocabulary, merge_rules,tok = tokenizer.build_vocab(corpus, num_merges)

tokenized=tokenizer.tokenize("kanye made tailor swift famous")


print("Vocabulary:", vocabulary)
print("Merge Rules:", merge_rules)
print("Tokens:",tok)
print("Tokenised Version:",tokenized)

f = open(  "merge_rules.txt", "w")
for i in merge_rules:
    f.write(i[0] + "," + i[1] + "\n")
f.close()

f = open( "tokens.txt", "w")
for i in tok:
    f.write(i + "\n")
f.close()

f = open("tokenized.txt", "w")
n = int(input("Enter the number of sentences you want to tokenize: "))
for i in range(n):
    s = input("Enter the sentence: ")
    f.write(tokenizer.tokenize(s) + "\n")
f.close()

Vocabulary: {'I$': 1, 'am$': 1, 'sitting$': 1, 'on$': 1, 'this$': 2, 'chair$': 2, 'The$': 2, 'made$': 1, 'of$': 4, 'wood$': 1, 'Wood$': 1, 'definitely$': 1, 'flamable$': 1, 'room$': 1, 'Sridhar$': 1, '302$': 1, 'lives$': 1, 'with$': 1, '3$': 1, 'other$': 1, 'roommates$': 1, 'Some$': 1, 'has$': 2, 'been$': 2, 'written$': 2, 'in$': 2, 'the$': 2, 'library$': 1, 'and$': 1, 'some$': 1, 'it$': 1, 'guest$': 1, 'house$': 1}
Merge Rules: [('e', '$'), ('s', '$'), ('n', '$'), ('i', 't'), ('t', 'h'), ('i', 's$'), ('h', 'a'), ('r', '$'), ('o', 'f'), ('of', '$'), ('o', 'o'), ('e', 'n$'), ('it', 't'), ('h', 'e$'), ('m', 'a'), ('d', '$'), ('m', '$'), ('i', 'n'), ('th', 'is$'), ('c', 'ha'), ('cha', 'i'), ('chai', 'r$'), ('T', 'he$'), ('oo', 'd$'), ('y', '$'), ('r', 'oo'), ('l', 'i'), ('e', 's$'), ('o', 'm'), ('om', 'e$'), ('ha', 's$'), ('b', 'e'), ('be', 'en$'), ('w', 'r'), ('wr', 'itt'), ('writt', 'en$'), ('I', '$'), ('a', 'm$'), ('s', 'itt'), ('sitt', 'in'), ('sittin', 'g'), ('sitting', '$'), ('o', '