In [10]:
from collections import defaultdict, Counter
import re

class Tokenizer:
    """Byte-pair-encoding (BPE) style sub-word tokenizer.

    Words are represented as space-separated symbols terminated by the
    end-of-word marker ``$`` (e.g. ``"l o w $"``).  Training repeatedly
    merges the most frequent adjacent symbol pair; tokenizing replays the
    learned merges, in order, on new text.

    Attributes are plain and public:
        vocab        -- dict mapping a space-separated word to its frequency.
        merge_rules  -- list of ``(left, right)`` symbol pairs, in learn order.
        token_list   -- list of known symbols (characters, then merged symbols).
    """

    def __init__(self):
        self.vocab = {}
        self.merge_rules = []
        self.token_list = []

    @staticmethod
    def _merger(pair):
        """Return a function that merges every occurrence of *pair* in a word.

        The pattern is anchored with ``(?<!\\S)`` / ``(?!\\S)`` so that it only
        matches whole symbols: the rule ``('a', 'b')`` must not fire inside
        ``'xa b'``, where the left symbol is ``'xa'`` rather than ``'a'``.
        """
        pattern = re.compile(r'(?<!\S)' + re.escape(' '.join(pair)) + r'(?!\S)')
        merged = ''.join(pair)

        def merge(word):
            # A callable replacement keeps backslashes in `merged` literal;
            # a replacement *string* would be parsed for escape sequences.
            return pattern.sub(lambda _match: merged, word)

        return merge

    def merge_vocab(self, pair, vocab):
        """Return a copy of *vocab* with *pair* merged into a single symbol.

        Args:
            pair: ``(left, right)`` tuple of symbols to merge.
            vocab: dict mapping space-separated words to frequencies.

        Returns:
            A new dict; *vocab* itself is not modified.
        """
        merge = self._merger(pair)
        vocab2 = {merge(word): freq for word, freq in vocab.items()}
        # NOTE(review): these pops drop any entry whose *entire* key equals the
        # unmerged pair or one of its two symbols.  Kept for backward
        # compatibility with the returned vocabulary; such single-symbol
        # entries contribute no pairs to get_stats, so later merges are
        # unaffected either way.  Confirm whether this is intended.
        vocab2.pop(' '.join(pair), None)
        vocab2.pop(pair[0], None)
        vocab2.pop(pair[1], None)
        return vocab2

    def get_stats(self, vocab):
        """Count adjacent symbol pairs in *vocab*, weighted by word frequency.

        Returns:
            dict mapping ``(left, right)`` symbol tuples to their total count.
        """
        pairs_count = defaultdict(int)
        for word, frequency in vocab.items():
            symbols = word.split()
            for left, right in zip(symbols, symbols[1:]):
                pairs_count[(left, right)] += frequency
        return dict(pairs_count)

    def build_vocab(self, corpus, num_merges):
        """Learn up to *num_merges* merge rules from whitespace-separated *corpus*.

        Any previously learned state is discarded, so calling this method
        again retrains from scratch instead of appending to old rules.

        Returns:
            Tuple ``(vocab, merge_rules, token_list)`` (the instance attributes).
        """
        words = corpus.split()
        tokens = [' '.join(list(word) + ['$']) for word in words]
        # Sorted so the initial symbol order is the same on every run
        # (plain set iteration order of strings varies between processes).
        self.token_list = sorted({char for word in tokens for char in word if char != ' '})
        self.merge_rules = []
        self.vocab = dict(Counter(tokens))
        for _ in range(num_merges):
            pairs = self.get_stats(self.vocab)
            if not pairs:
                # Every word is already a single symbol; nothing left to merge.
                break
            best = max(pairs, key=pairs.get)
            self.token_list.append(best[0] + best[1])
            self.merge_rules.append(best)
            self.vocab = self.merge_vocab(best, self.vocab)
        return self.vocab, self.merge_rules, self.token_list

    def tokenize(self, input_text):
        """Tokenize *input_text* with the learned merge rules.

        Returns:
            A single string of symbols separated by commas, each word ending
            with the ``$`` marker (e.g. ``"lo,w,$,e,r,$"``); ``""`` when the
            input contains no words.
        """
        words = input_text.split()
        tokens = [' '.join(list(word) + ['$']) for word in words]
        for rule in self.merge_rules:
            # Same boundary-aware merge as used during training.
            merge = self._merger(rule)
            tokens = [merge(token) for token in tokens]
        return ','.join(token.replace(' ', ',') for token in tokens)
    
#################################################################
    
# Read the training corpus.  Line breaks become spaces so that the last word
# of one line is not glued to the first word of the next before splitting.
with open('corpus.txt', 'r') as file:
    corpus = file.read().replace('\n', ' ')
num_merges = 100

# Train the tokenizer and tokenize a sample sentence.
tokenizer = Tokenizer()
vocabulary, merge_rules, tok = tokenizer.build_vocab(corpus, num_merges)

tokenized = tokenizer.tokenize("kanye made tailor swift famous")

print("Merge Rules:", merge_rules)
print("Tokenised Version:", tokenized)

print("Vocabulary:", vocabulary)
print("Merge Rules:", merge_rules)
print("Tokens:", tok)
print("Tokenised Version:", tokenized)

# Persist the learned merge rules, one "left,right" pair per line.
with open("merge_rules.txt", "w") as f:
    for rule in merge_rules:
        f.write(rule[0] + "," + rule[1] + "\n")

# Persist the symbol list, one symbol per line.
with open("tokens.txt", "w") as f:
    for token in tok:
        f.write(token + "\n")

# Tokenize user-supplied sentences, one output line per sentence.  The `with`
# block closes the file even if the count entered is not a valid integer.
with open("tokenized.txt", "w") as f:
    n = int(input("Enter the number of sentences you want to tokenize: "))
    for _ in range(n):
        s = input("Enter the sentence: ")
        f.write(tokenizer.tokenize(s) + "\n")

Merge Rules: [('e', '$'), ('t', '$'), ('t', 'h'), ('d', '$'), ('i', 'n'), ('i', '$'), ('s', '$'), ('e', 'l'), ('y', '$'), ('a', 'n'), ('f', 'e'), ('in', 'g'), ('fe', 'el'), ('ing', '$'), ('o', '$'), ('e', 'r'), ('o', 'u'), ('o', 'n'), ('e', 'n'), ('feel', '$'), ('e', 'd$'), ('an', 'd$'), ('o', 'r'), ('t', 'o$'), ('a', 'l'), ('th', 'e$'), ('r', 'e'), ('a', 't$'), ('m', '$'), ('f', '$'), ('t', 'i'), ('a', '$'), ('l', 'i'), ('a', 'r'), ('th', 'at$'), ('er', '$'), ('l', '$'), ('h', 'a'), ('s', 't'), ('o', 'f$'), ('l', 'y$'), ('v', 'e$'), ('o', 'm'), ('i', 's$'), ('in', '$'), ('m', 'y$'), ('o', 'w'), ('en', '$'), ('or', '$'), ('k', 'e$'), ('w', 'a'), ('w', 'h'), ('i', 't$'), ('b', 'e'), ('on', '$'), ('r', 'i'), ('l', 'e'), ('w', 'i'), ('a', 'b'), ('a', 't'), ('h', '$'), ('s', 'e$'), ('m', 'e$'), ('a', 'c'), ('li', 'ke$'), ('l', 'o'), ('k', '$'), ('i', 'm$'), ('ou', 't$'), ('s', 'o$'), ('e', 's$'), ('s', 'u'), ('b', 'u'), ('d', 'i'), ('s', 'i'), ('f', 'or$'), ('n', 'o'), ('m', 'a'), ('th', '