In [2]:
import torch
import pandas as pd
import tokenizers
import transformers

from pathlib import Path
from typing import Generator
from tqdm import tqdm

from tokenizers import (
    Tokenizer, 
    models, 
    normalizers, 
    pre_tokenizers, 
    decoders, 
    trainers, 
    processors
)

import os
import yaml
with open('config.yaml') as f:
    config = yaml.load(f, Loader=yaml.FullLoader)
           

## **Data Preprocessing**

In [3]:
data_config = config['data']
docs = data_config['src_to_tgt']
src_lang = data_config['src']
tgt_lang = data_config['tgt']
docs

{'train1.en.txt': 'train1.ta.txt',
 'train2.en.txt': 'train2.ta.txt',
 'train3.en.txt': 'train3.ta.txt'}

In [4]:
def read_text_file(file_path):
    with open(file_path, 'r') as file:
        lines = [line.strip() for line in file.readlines()]
    return lines

dfs = []
for key, val in docs.items():
    txt1, txt2 = (read_text_file(os.path.join(data_config['data_dir'], key)), 
                  read_text_file(os.path.join(data_config['data_dir'], val)))
    corpus = pd.DataFrame({'src': txt1, 'tgt': txt2})    
    dfs.append(corpus)

corpus = pd.concat(dfs, ignore_index=True)
corpus.rename(columns={'src': data_config['src'], 'tgt': data_config['tgt']}, inplace=True)
corpus

Unnamed: 0,en,ta
0,That's what I am saying.,என்றுதான் நான் சொல்ல வருகிறேன்.
1,Every tournament is difficult.,ஒவ்வொரு சுற்றுப்பயணமும் கடினமானது.
2,"One of the first questions Flavio posed was, D...",பல வருடங்களாக அவர் அந்த நித்திய எரிநரக தண்டனைய...
3,He gave full credit to the Union Finance Minis...,அவர் நிதி அமைச்சர் அருண்ஜேட்லியின் முயற்சியை த...
4,Some art historians have suggested that he onl...,சில கலை வரலாற்றாசிரியர்கள் அவர் ஒரு வருடத்திற்...
...,...,...
5198656,mental,மன
5198657,mental aberration,மனப் பிறழ்ச்சி
5198658,mental competency,மனத் தேர்ச்சி
5198659,mental deficiency,மன ஊனம்


In [5]:
# corpus.to_parquet(os.path.join(data_config['data_dir'], 'dataset_large.parquet'), index=False)

## **Build Tokenizer**

In [58]:

def train_bpe_tokenizer(tokenizer: Tokenizer, series, config):
    tokenizer_path = Path(config['tokenizer_path'])
    special_tokens = {
        config['special_tokens']['bos_token']: 0,
        config['special_tokens']['pad_token']: 1,
        config['special_tokens']['eos_token']: 2,
        config['special_tokens']['unk_token']: 3,
        config['special_tokens']['mask_token']: 50264,
    }
    
    if config['lang'] == 'en':
        normalizer = normalizers.Sequence([
            normalizers.NFKC(),
            normalizers.Lowercase()
        ])
        pre_tokenizer = pre_tokenizers.Metaspace()
        decoder = decoders.Metaspace()
    elif config['lang'] == 'ta':
        normalizer = normalizers.NFKC()
        pre_tokenizer = pre_tokenizers.ByteLevel()
        decoder = decoders.ByteLevel()
    else:
        raise ValueError(f"Unsupported language: {config['lang']}")
    
    post_processor = processors.TemplateProcessing(
        single=f"{config['special_tokens']['bos_token']} $A {config['special_tokens']['eos_token']}",
        special_tokens=list(special_tokens.items()),
    )
    
    trainer = trainers.BpeTrainer(
        special_tokens=list(special_tokens.keys()),
        vocab_size=config['vocab_size'],
        min_frequency=config['min_frequency'],
        show_progress=True,
    )

    tokenizer.normalizer = normalizer
    tokenizer.pre_tokenizer = pre_tokenizer
    tokenizer.decoder = decoder
    tokenizer.post_processor = post_processor
    
    def get_sentences(series: pd.Series) -> Generator[str, None, None]:
        for text in series:
            yield text

    tokenizer.train_from_iterator(
        get_sentences(series=series),
        trainer=trainer,
        length=len(series),
    )

    tokenizer.save(str(tokenizer_path))
    print(tokenizer)
    print(tokenizer.get_vocab_size())


In [60]:
with open('config.yaml') as f:
    config = yaml.load(f, Loader=yaml.FullLoader)

config = config['tokenizer']
df = pd.read_parquet(config['data_path'])

for items in (config['src'], config['tgt']):
    _df = df[items['lang']]

    tokenizer = Tokenizer(models.BPE(unk_token=items['special_tokens']['unk_token']))
    train_bpe_tokenizer(tokenizer, _df, items)

    encoded_tokens = tokenizer.encode(_df[0])
    print(encoded_tokens.ids)
    print(encoded_tokens.type_ids)
    print(encoded_tokens.tokens)
    print(encoded_tokens.overflowing)

    encoded_ids = encoded_tokens.ids

    decoded_string = tokenizer.decode(encoded_ids)
    print(f"{decoded_string = }")

    print("Size of vocabulary:", tokenizer.get_vocab_size())
    print("Successfully trained tokenizer", tokenizer)





<tokenizers.Tokenizer object at 0x559c74a0cb00>
50265
[0, 3013, 270, 184, 329, 9500, 2]
[0, 0, 0, 0, 0, 0, 0]
['<s>', "▁that's", '▁what', '▁i', '▁am', '▁saying.', '</s>']
[]
decoded_string = "that's what i am saying."
Size of vocabulary: 50265
Successfully trained tokenizer <tokenizers.Tokenizer object at 0x559c74a0cb00>



<tokenizers.Tokenizer object at 0x559c7d3353a0>
50265
[0, 258, 212, 227, 214, 217, 219, 221, 212, 244, 219, 221, 212, 241, 249, 224, 212, 224, 274, 214, 216, 215, 227, 238, 221, 246, 2]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
['<s>', 'Ġà®İà®©', 'à¯į', 'à®±', 'à¯ģ', 'à®¤', 'à®¾', 'à®©', 'à¯į', 'Ġà®¨', 'à®¾', 'à®©', 'à¯į', 'Ġà®ļ', 'à¯Ĭ', 'à®²', 'à¯į', 'à®²', 'Ġà®µà®°', 'à¯ģ', 'à®ķ', 'à®¿', 'à®±', 'à¯ĩ', 'à®©', 'à¯į.', '</s>']
[]
decoded_string = ' என்றுதான் நான் சொல்ல வருகிறேன்.'
Size of vocabulary: 50265
Successfully trained tokenizer <tokenizers.Tokenizer object at 0x559c7d3353a0>


In [63]:
encoded_tokens = tokenizer.encode(_df[3])
print(encoded_tokens.ids)
print(encoded_tokens.type_ids)
print(encoded_tokens.tokens)
print(encoded_tokens.overflowing)

encoded_ids = encoded_tokens.ids

decoded_string = tokenizer.decode(encoded_ids)
print(f"{decoded_string = }")

print("Size of vocabulary:", tokenizer.get_vocab_size())
print("Successfully trained tokenizer", tokenizer)

[0, 266, 212, 244, 215, 217, 215, 299, 226, 235, 212, 366, 212, 418, 214, 242, 212, 304, 238, 220, 212, 224, 215, 228, 215, 221, 212, 234, 214, 471, 212, 235, 215, 228, 226, 240, 249, 251, 215, 224, 212, 371, 212, 270, 212, 217, 215, 228, 215, 224, 212, 608, 214, 250, 212, 220, 214, 225, 212, 225, 234, 219, 237, 215, 546, 212, 232, 212, 355, 212, 217, 215, 228, 321, 215, 221, 212, 454, 215, 228, 226, 223, 212, 608, 219, 216, 230, 243, 227, 214, 406, 212, 216, 214, 332, 212, 227, 281, 212, 905, 226, 229, 215, 217, 215, 228, 226, 299, 226, 217, 212, 217, 214, 216, 212, 216, 249, 220, 214, 217, 212, 863, 212, 216, 214, 222, 212, 276, 212, 227, 214, 222, 212, 240, 249, 251, 215, 224, 212, 371, 212, 270, 212, 217, 215, 324, 212, 224, 219, 217, 234, 219, 237, 215, 546, 212, 232, 214, 293, 212, 240, 219, 248, 212, 232, 212, 1263, 219, 216, 236, 226, 216, 212, 273, 212, 250, 212, 220, 214, 225, 212, 225, 219, 218, 212, 232, 212, 258, 212, 227, 231, 219, 251, 212, 223, 212, 223, 214, 644, 212, 

In [64]:
from collections import defaultdict

dct = defaultdict(int)
for text in tqdm(_df):
    encoded_tokens = tokenizer.encode(text)
    for token in encoded_tokens.tokens:
        dct[token] += 1

dct = dict(sorted(dct.items(), key=lambda x: x[1], reverse=True))
dct

 10%|█         | 539578/5198661 [01:53<16:16, 4771.65it/s]


KeyboardInterrupt: 

### **Inspecting BART Tokenizer**

In [68]:
from transformers import BartTokenizer
_tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')
_tokenizer

BartTokenizer(name_or_path='facebook/bart-base', vocab_size=50265, model_max_length=1024, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	50264: AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=True, special=False),
}

In [69]:
_tokenizer.decode(tokenizer.encode(_df[0]))

'<s>என்றுதான் நான் சொல்ல வருகிறேன்.</s>'

In [70]:
lst = _tokenizer.encode(_df[0])
for item in lst:
    print(_tokenizer.decode(item))

<s>
�
�
�
�
�
�
�
�
�
�
�
�
�
�
�
�
�
�
�
�
�
�
�
�
�
�
�
 
�
�
�
�
�
�
�
�
�
�
�
�
 
�
�
�
�
�
�
�
�
�
�
�
�
�
�
�
 
�
�
�
�
�
�
�
�
�
�
�
�
�
�
�
�
�
�
�
�
�
�
�
�
�
�
�
.
</s>


## **Model**