In [1]:
import torch
import pandas as pd
import tokenizers
import transformers
import lightning.pytorch as pl

from torch.utils.tensorboard import SummaryWriter
from sklearn.model_selection import train_test_split
from pathlib import Path
from typing import Generator
from tqdm import tqdm

from torch.utils.data import Dataset, DataLoader
from tokenizers import (
    Tokenizer, 
    models, 
    normalizers, 
    pre_tokenizers, 
    decoders, 
    trainers, 
    processors
)


import os
import yaml
with open('../config.yaml') as f:
    config = yaml.load(f, Loader=yaml.FullLoader)
           

## **Data Preprocessing**

In [2]:
data_config = config['data']
docs = data_config['src_to_tgt']
src_lang = data_config['src']
tgt_lang = data_config['tgt']
docs

{'train1.en.txt': 'train1.ta.txt',
 'train2.en.txt': 'train2.ta.txt',
 'train3.en.txt': 'train3.ta.txt'}

In [3]:
def read_text_file(file_path):
    with open(file_path, 'r') as file:
        lines = [line.strip() for line in file.readlines()]
    return lines

dfs = []
for key, val in docs.items():
    txt1, txt2 = (read_text_file(os.path.join(data_config['data_dir'], key)), 
                  read_text_file(os.path.join(data_config['data_dir'], val)))
    corpus = pd.DataFrame({'src': txt1, 'tgt': txt2})    
    dfs.append(corpus)

corpus = pd.concat(dfs, ignore_index=True)
corpus.rename(columns={'src': data_config['src'], 'tgt': data_config['tgt']}, inplace=True)
corpus

Unnamed: 0,en,ta
0,That's what I am saying.,என்றுதான் நான் சொல்ல வருகிறேன்.
1,Every tournament is difficult.,ஒவ்வொரு சுற்றுப்பயணமும் கடினமானது.
2,"One of the first questions Flavio posed was, D...",பல வருடங்களாக அவர் அந்த நித்திய எரிநரக தண்டனைய...
3,He gave full credit to the Union Finance Minis...,அவர் நிதி அமைச்சர் அருண்ஜேட்லியின் முயற்சியை த...
4,Some art historians have suggested that he onl...,சில கலை வரலாற்றாசிரியர்கள் அவர் ஒரு வருடத்திற்...
...,...,...
5198656,mental,மன
5198657,mental aberration,மனப் பிறழ்ச்சி
5198658,mental competency,மனத் தேர்ச்சி
5198659,mental deficiency,மன ஊனம்


In [4]:
# corpus.to_parquet(os.path.join(data_config['data_dir'], 'dataset_large.parquet'), index=False)

## **Build Tokenizer**

In [5]:

def train_bpe_tokenizer(tokenizer: Tokenizer, series, config):
    tokenizer_path = Path(config['tokenizer_path'])
    special_tokens = {
        config['special_tokens']['bos_token']: 0,
        config['special_tokens']['pad_token']: 1,
        config['special_tokens']['eos_token']: 2,
        config['special_tokens']['unk_token']: 3,
        config['special_tokens']['mask_token']: config['vocab_size'] - 1,
    }
    
    if config['lang'] == 'ta':
        normalizer = normalizers.NFKC()
        pre_tokenizer = pre_tokenizers.Metaspace()
        decoder = decoders.Metaspace()
    elif config['lang'] == 'en':
        normalizer = normalizers.Sequence([
            normalizers.NFKC(),
            normalizers.Lowercase()
        ])
        pre_tokenizer = pre_tokenizers.ByteLevel()
        decoder = decoders.ByteLevel()
    else:
        raise ValueError(f"Unsupported language: {config['lang']}")
    
    post_processor = processors.TemplateProcessing(
        single=f"{config['special_tokens']['bos_token']} $A {config['special_tokens']['eos_token']}",
        special_tokens=list(special_tokens.items()),
    )
    
    trainer = trainers.BpeTrainer(
        special_tokens=list(special_tokens.keys()),
        vocab_size=config['vocab_size'],
        min_frequency=config['min_frequency'],
        show_progress=True,
    )

    tokenizer.normalizer = normalizer
    tokenizer.pre_tokenizer = pre_tokenizer
    tokenizer.decoder = decoder
    tokenizer.post_processor = post_processor
    
    def get_sentences(series: pd.Series) -> Generator[str, None, None]:
        for text in series:
            yield text

    tokenizer.train_from_iterator(
        get_sentences(series=series),
        trainer=trainer,
        length=len(series),
    )

    tokenizer.save(str(tokenizer_path))
    print(tokenizer)
    print(tokenizer.get_vocab_size())


In [9]:
with open('../config.yaml') as f:
    config = yaml.load(f, Loader=yaml.FullLoader)

config = config['tokenizer']
df = pd.read_parquet(config['data_path'])

for items in (config['src'], config['tgt']):
    _df = df[items['lang']]

    tokenizer = Tokenizer(models.BPE(unk_token=items['special_tokens']['unk_token']))
    train_bpe_tokenizer(tokenizer, _df, items)

    encoded_tokens = tokenizer.encode(_df[0])
    print(encoded_tokens.ids)
    print(encoded_tokens.type_ids)
    print(encoded_tokens.tokens)
    print(encoded_tokens.overflowing)

    encoded_ids = encoded_tokens.ids

    decoded_string = tokenizer.decode(encoded_ids)
    print(f"{decoded_string = }")

    print("Size of vocabulary:", tokenizer.get_vocab_size())
    print("Successfully trained tokenizer", tokenizer)





<tokenizers.Tokenizer object at 0x55b15a282a30>
10000
[0, 171, 331, 273, 189, 335, 1878, 18, 2]
[0, 0, 0, 0, 0, 0, 0, 0, 0]
['<s>', 'Ġthat', "'s", 'Ġwhat', 'Ġi', 'Ġam', 'Ġsaying', '.', '</s>']
[]
decoded_string = " that's what i am saying."
Size of vocabulary: 10000
Successfully trained tokenizer <tokenizers.Tokenizer object at 0x55b15a282a30>



<tokenizers.Tokenizer object at 0x55b14e49db40>
10000
[0, 2839, 3272, 2959, 3221, 3409, 2952, 2]
[0, 0, 0, 0, 0, 0, 0, 0]
['<s>', '▁என்று', 'தான்', '▁நான்', '▁சொல்ல', '▁வருகிற', 'ேன்.', '</s>']
[]
decoded_string = 'என்றுதான் நான் சொல்ல வருகிறேன்.'
Size of vocabulary: 10000
Successfully trained tokenizer <tokenizers.Tokenizer object at 0x55b14e49db40>


In [10]:
encoded_tokens = tokenizer.encode(_df[3])
print(encoded_tokens.ids)
print(encoded_tokens.type_ids)
print(encoded_tokens.tokens)
print(encoded_tokens.overflowing)

encoded_ids = encoded_tokens.ids

decoded_string = tokenizer.decode(encoded_ids)
print(f"{decoded_string = }")

print("Size of vocabulary:", tokenizer.get_vocab_size())
print("Successfully trained tokenizer", tokenizer)

[0, 2881, 3673, 3749, 9110, 945, 6946, 960, 3227, 6038, 975, 3406, 5781, 2683, 3803, 3520, 8450, 3117, 4453, 4402, 3593, 3148, 7031, 4120, 3433, 5209, 7311, 2715, 4570, 2884, 2703, 3069, 2813, 4490, 2778, 3406, 4294, 4901, 3092, 6293, 7169, 3011, 2773, 2953, 3877, 3633, 2751, 2771, 962, 3375, 3195, 3663, 2695, 3007, 8095, 4757, 5183, 2750, 3954, 7464, 9429, 5366, 2857, 2784, 7791, 2687, 6747, 3154, 2]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
['<s>', '▁அவர்', '▁நிதி', '▁அமைச்சர்', '▁அருண்', 'ஜ', 'ேட்', 'ல', 'ியின்', '▁முயற்சிய', 'ை', '▁தொழில்', '▁உற்பத்திய', 'ில்', '▁ஈடுப', 'ட்டுள்ள', '▁மாநிலங்கள்', '▁மத்திய', '▁அரசின்', '▁உதவிய', 'ைப்', '▁ஈ', 'டாக', '▁பெறு', 'வதற்கு', '▁ஏற்ற', '▁கட்டளை', 'வி', 'தியை', '▁அமை', 'த்து', 'க்கொ', 'டுத்த', 'ற்கும்', '▁மற்றும்', '▁தொழில்', '▁உற்பத்தி', '▁இல்லாத', '▁மாநில', 'ங்களுடன்', '▁தாங்கள்'

In [8]:
from collections import defaultdict

dct = defaultdict(int)
for text in tqdm(_df):
    encoded_tokens = tokenizer.encode(text)
    for token in encoded_tokens.tokens:
        dct[token] += 1

dct = dict(sorted(dct.items(), key=lambda x: x[1], reverse=True))
dct

  0%|          | 0/5198661 [00:00<?, ?it/s]

 26%|██▋       | 1366695/5198661 [02:14<06:18, 10125.29it/s]


KeyboardInterrupt: 

### **Inspecting BART Tokenizer**

In [None]:
from transformers import BartTokenizer
_tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')
_tokenizer

BartTokenizer(name_or_path='facebook/bart-base', vocab_size=50265, model_max_length=1024, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	50264: AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=True, special=False),
}

In [None]:
_tokenizer.decode(tokenizer.encode(_df[0]))

'<s>என்றுதான் நான் சொல்ல வருகிறேன்.</s>'

In [None]:
lst = _tokenizer.encode(_df[0])
for item in lst:
    print(_tokenizer.decode(item))

<s>
�
�
�
�
�
�
�
�
�
�
�
�
�
�
�
�
�
�
�
�
�
�
�
�
�
�
�
 
�
�
�
�
�
�
�
�
�
�
�
�
 
�
�
�
�
�
�
�
�
�
�
�
�
�
�
�
 
�
�
�
�
�
�
�
�
�
�
�
�
�
�
�
�
�
�
�
�
�
�
�
�
�
�
�
.
</s>


## **Model**

In [10]:
# Author : NavinKumarMNK
"""Transformer Model"""
from dataclasses import dataclass
from typing import Tuple, Optional
import math
import torch
import torch.nn as nn
import yaml

SEED = 42
torch.manual_seed(seed=SEED)
torch.cuda.manual_seed_all(seed=SEED)
torch.backends.cudnn.deterministic = True


class InputEmbeddings(nn.Module):
    # stores embedding of the tokens
    def __init__(self, dim_model: int, vocab_size: int):
        super().__init__()
        self.dim_model = dim_model
        self.vocab_size = vocab_size
        self.embedding = nn.Embedding(
            num_embeddings=self.vocab_size, embedding_dim=self.dim_model
        )

    def forward(self, x: torch.Tensor):
        return self.embedding(x) * math.sqrt(self.dim_model)


class PositionalEncoding(nn.Module):
    # calculate the positional embedding
    def __init__(self, dim_model: int, seq_len: int, dropout: float):
        super().__init__()
        self.dropout = nn.Dropout(dropout)

        # PE => (seq_len, d_model) ; position, div_term => (self.seq_len, 1)
        position_encoding = torch.zeros(size=(seq_len, dim_model))
        position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, dim_model, 2).float() * (-math.log(10000.0) / dim_model)
        )  # e^(2*i * (ln 10000) / dim_model)

        # sin() to even pos & cos() to odd position
        position_encoding[:, 0::2] = torch.sin(position * div_term)
        position_encoding[:, 1::2] = torch.cos(position * div_term)
        position_encoding = position_encoding.unsqueeze(0)  # (1, seq_len, dim_model)

        # register buffer => Keep with module but not as learnable paramter
        self.register_buffer("position_encoding", position_encoding)

    def forward(self, x: torch.Tensor):
        # positional encodings are added only till the valid tokens in x : (batch_size, seq_len, dim)
        return self.dropout(
            x + (self.position_encoding[:, : x.shape[1], :]).requires_grad_(False)
        )

class ConvBlock(nn.Module):
    def __init__(self, input_size, ):
        super().__init__()
        self.block = nn.Sequential(
            nn.Conv1d(in_channels=input_size,
                      out_channels=input_size // 2 , 
                      kernel_size=3),
            nn.ReLU(),
            nn.Conv1d(in_channels=input_size//2, 
                       out_channels=input_size//4, 
                       kernel_size=3),
            nn.ReLU(),
        )
    
    def forward(self, x):
        return self.block(x)

class LayerNorm(nn.Module):
    # Normazalize across Layers => Xj = (xj - uj)/(sigma^2 + e)^(0.5)
    def __init__(self, eps: float = 10**-6) -> None:
        super().__init__()
        self.eps = eps

        # parameters: alpha (multiplicative) bias (additive)
        self.alpha = nn.Parameter(torch.ones(1))
        self.bias = nn.Parameter(torch.zeros(1))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return (
            self.alpha
            * (x - x.mean(-1, keepdim=True))
            / (self.eps + x.std(dim=-1, keepdim=True))
        ) + self.bias


class FeedForwardBlock(nn.Module):
    # sequence of linear layer : [dim -> ddf(general-4*dim) -> dim]
    def __init__(self, dim_model: int, d_ff: int, dropout: float) -> None:
        super().__init__()
        # Linear -> Norm -> Activation -> Dropout -> Linear
        self.ffn = nn.Sequential(
            nn.Linear(dim_model, d_ff),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(d_ff, dim_model),
            nn.Dropout(dropout),
        )

    def forward(self, x: torch.Tensor):
        return self.ffn(x)


class MultiHeadAttentionBlock(nn.Module):
    def __init__(self, dim_model: int, num_heads: int, dropout: float) -> None:
        super().__init__()
        self.dim_model = dim_model
        self.num_heads = num_heads
        assert (
            self.dim_model % self.num_heads == 0
        ), "dim_model is not divisible by num_heads"

        self.d_k = self.dim_model // self.num_heads

        # key, query, value
        self.w_q = nn.Linear(self.dim_model, self.dim_model)
        self.w_k = nn.Linear(self.dim_model, self.dim_model)
        self.w_v = nn.Linear(self.dim_model, self.dim_model)

        # concat([heads]) * w_o
        self.w_o = nn.Linear(self.dim_model, self.dim_model)
        self.dropout = nn.Dropout(dropout)

    @staticmethod
    def attention(
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        mask: Optional[torch.Tensor] = None,
        dropout: Optional[nn.Dropout] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        d_k = query.shape[-1]

        # attention = q.k / sqrt(dim) : (batch_szie, h, seq_len, d_k) -> (_, _, _, seq_len)
        attention_scores = (query @ key.transpose(-2, -1)) / math.sqrt(d_k)

        if mask is not None:
            attention_scores.masked_fill_(mask == 0, -1e4)
        attention_scores = attention_scores.softmax(dim=-1)

        if dropout is not None:
            attention_scores = dropout(attention_scores)

        return attention_scores @ value

    # mask => control attention by blocking interactions between two words
    def forward(self, query, key, value, mask: Optional[torch.Tensor]):
        # (batch_size, seq_len, dim_model) -> (batch_size, seq_len, dim_model) ->
        # (batch_size, seq_len, heads, d_k) -> (batch_size, heads, seq_len, d_k) : process across heads
        query = (
            self.w_q(query)
            .view(query.shape[0], query.shape[1], self.num_heads, self.d_k)
            .transpose(1, 2)
        )
        key = (
            self.w_k(key)
            .view(key.shape[0], key.shape[1], self.num_heads, self.d_k)
            .transpose(1, 2)
        )
        value = (
            self.w_v(value)
            .view(value.shape[0], value.shape[1], self.num_heads, self.d_k)
            .transpose(1, 2)
        )

        # final multihead attentions
        x = MultiHeadAttentionBlock.attention(query, key, value, mask, self.dropout)

        # (batch_size, heads, seq_len, d_k) -> (_, seq_len, heads, _) -> (batch_size, seq_len, dim_model)
        x = x.transpose(1, 2).contiguous().view(x.shape[0], -1, self.dim_model)
        x = self.w_o(x)
        return x


class EncoderBlock(nn.Module):
    # pre_norm -> mha() -> residual(before norm) -> pre_norm -> ffn() -> residual(before norm)
    def __init__(
        self, dim_model: int, num_heads: int, dropout: float, d_ff: int
    ) -> None:
        super().__init__()
        self.multi_head_attention = MultiHeadAttentionBlock(
            dim_model=dim_model, num_heads=num_heads, dropout=dropout
        )
        self.ffn = FeedForwardBlock(dim_model=dim_model, d_ff=d_ff, dropout=dropout)

        self.norm_ffn = LayerNorm()
        self.norm_mha = LayerNorm()

    def forward(self, x: torch.Tensor, mask: Optional[torch.Tensor]):
        x_norm = self.norm_mha(x)
        x_atten = x + self.multi_head_attention(x_norm, x_norm, x_norm, mask)
        x_ffn = x_atten + self.ffn(self.norm_ffn(x_atten))
        return x_ffn

class Encoder(nn.Module):
    # stacked n EncoderBlocks
    def __init__(
        self,
        dim_model: int,
        num_layers: int,
        dropout: float,
        num_heads: int,
        d_ff: int,
    ) -> None:
        super().__init__()
        self.layer_norm = LayerNorm()
        self.encoder_layers = nn.ModuleList(
            modules=[
                EncoderBlock(
                    dim_model=dim_model, num_heads=num_heads, dropout=dropout, d_ff=d_ff
                )
                for _ in range(num_layers)
            ]
        )

    def forward(self, x: torch.Tensor, mask: Optional[torch.Tensor]) -> torch.Tensor:
        for layer in self.encoder_layers:
            x = layer(x, mask)
        return self.layer_norm(x)


class DecoderBlock(nn.Module):
    # pre_norm -> masked mha (self-attention) -> residual -> pre_norm(q) + encoder(k, v) ->
    # mha (cross-attention) -> residual -> pre_norm -> ffn -> residual
    def __init__(
        self, dim_model: int, num_heads: int, dropout: float, d_ff: int
    ) -> None:
        super().__init__()

        self.self_attention = MultiHeadAttentionBlock(
            dim_model=dim_model, num_heads=num_heads, dropout=dropout
        )
        self.cross_attention = MultiHeadAttentionBlock(
            dim_model=dim_model, num_heads=num_heads, dropout=dropout
        )
        self.ffn = FeedForwardBlock(dim_model=dim_model, d_ff=d_ff, dropout=dropout)

        self.norm_self = LayerNorm()
        self.norm_cross = LayerNorm()
        self.norm_ffn = LayerNorm()

    def forward(
        self,
        x: torch.Tensor,
        encoder_x: torch.Tensor,
        src_mask: Optional[torch.Tensor],
        tgt_mask: Optional[torch.Tensor],
    ) -> torch.Tensor:
        x_norm = self.norm_self(x)
        x = x + self.self_attention(x_norm, x_norm, x_norm, tgt_mask)

        x_norm = self.norm_cross(x)
        x = x + self.cross_attention(x, encoder_x, encoder_x, src_mask)

        x_norm = self.norm_ffn(x)
        x = x + self.ffn(x_norm)

        return x


class Decoder(nn.Module):
    # stacked n DecoderBlocks
    def __init__(
        self,
        dim_model: int,
        num_layers: int,
        dropout: float,
        num_heads: int,
        d_ff: int,
    ) -> None:
        super().__init__()

        # module list of n Decoder Blocks
        self.layer_norm = LayerNorm()
        self.decoder_layers = nn.ModuleList(
            modules=[
                DecoderBlock(
                    dim_model=dim_model, num_heads=num_heads, dropout=dropout, d_ff=d_ff
                )
                for _ in range(num_layers)
            ]
        )

    def forward(
        self,
        x: torch.Tensor,
        encoder_output: torch.Tensor,
        src_mask: torch.Tensor,
        tgt_mask: torch.Tensor,
    ) -> torch.Tensor:
        for layer in self.decoder_layers:
            x = layer(x, encoder_output, src_mask, tgt_mask)

        return self.layer_norm(x)


class ProjectionHead(nn.Module):
    # feature vector to vocab
    def __init__(self, dim_model: int, vocab_size: int) -> None:
        super().__init__()
        self.proj = nn.Linear(dim_model, vocab_size)

    def forward(self, x) -> torch.Tensor:
        # (batch_size, seq_len, d_model) -> (_, _, vocab_size)
        return torch.log_softmax(self.proj(x), dim=-1)


class Transformer(nn.Module):
    def __init__(
        self,
        dim_model: int,
        num_layers: int,
        dropout: float,
        num_heads: int,
        d_ff: int,
        src_max_seq_len: int,
        tgt_max_seq_len: int,
        src_vocab_size: int,
        tgt_vocab_size: int,
    ) -> None:
        super().__init__()
        self.dim_model = dim_model
        self.num_layers = num_layers
        self.dropout = dropout
        self.num_heads = num_heads
        self.d_ff = d_ff
        self.src_max_seq_len = src_max_seq_len
        self.tgt_max_seq_len = tgt_max_seq_len
        self.src_vocab_size = src_vocab_size
        self.tgt_vocab_size = tgt_vocab_size

        # Embeddings (vocab -> vector)
        self.src_emb = InputEmbeddings(
            dim_model=self.dim_model, vocab_size=self.src_vocab_size
        )
        self.tgt_emb = InputEmbeddings(
            dim_model=self.dim_model, vocab_size=self.tgt_vocab_size
        )

        self.src_pos = PositionalEncoding(
            dim_model=self.dim_model, seq_len=self.src_max_seq_len, dropout=self.dropout
        )
        self.tgt_pos = PositionalEncoding(
            dim_model=self.dim_model, seq_len=self.tgt_max_seq_len, dropout=self.dropout
        )
        
        # Core Layers
        self.encoder = Encoder(
            dim_model=self.dim_model,
            num_layers=self.num_layers,
            dropout=self.dropout,
            num_heads=self.num_heads,
            d_ff=self.d_ff,
        )
        self.decoder = Decoder(
            dim_model=self.dim_model,
            num_layers=self.num_layers,
            dropout=self.dropout,
            num_heads=self.num_heads,
            d_ff=self.d_ff,
        )

        # Conversion head (vector -> word)
        self.projection = ProjectionHead(
            dim_model=self.dim_model, vocab_size=self.tgt_vocab_size
        )

        for params in self.parameters():
            if params.dim() > 1:
                nn.init.xavier_uniform_(params)

    def encode(self, src, src_mask):
        src = self.src_emb(src)
        src = self.src_pos(src)
        return self.encoder(src, src_mask)

    def decode(self, tgt, src_output, src_mask, tgt_mask):
        tgt = self.tgt_emb(tgt)
        tgt = self.tgt_pos(tgt)
        return self.decoder(tgt, src_output, src_mask, tgt_mask)

    def project(self, x: torch.Tensor) -> torch.Tensor:
        return self.projection(x)
    
    def forward(self, src, tgt, src_mask, tgt_mask):
        src_output = self.encode(src, src_mask)
        tgt_output = self.decode(tgt, src_output, src_mask, tgt_mask)
        return self.project(tgt_output)


with open("../config.yaml") as f:
    config = yaml.safe_load(f)

model = Transformer(**config["model"]["parameters"])    
print(model)

# calculate no of parameters in encoder, decoder, and embedding layers separately
# encoder
encoder_params = sum(p.numel() for p in model.encoder.parameters() if p.requires_grad)
print(f"Encoder Parameters: {encoder_params:,}")

# decoder
decoder_params = sum(p.numel() for p in model.decoder.parameters() if p.requires_grad)
print(f"Decoder Parameters: {decoder_params:,}")

# embedding
embedding_params = sum(p.numel() for p in model.src_emb.parameters() if p.requires_grad)
print(f"Embedding Parameters: {embedding_params:,}")

# embedding target 
embedding_params = sum(p.numel() for p in model.tgt_emb.parameters() if p.requires_grad)
print(f"Embedding Parameters: {embedding_params:,}")

pos_parms = sum(p.numel() for p in model.src_pos.parameters() if p.requires_grad)
print(f"Positional Encoding Parameters: {pos_parms:,}")

pos_parms = sum(p.numel() for p in model.tgt_pos.parameters() if p.requires_grad)
print(f"Positional Encoding Parameters: {pos_parms:,}")


# projection
projection_params = sum(p.numel() for p in model.projection.parameters() if p.requires_grad)
print(f"Projection Parameters: {projection_params:,}")

Transformer(
  (src_emb): InputEmbeddings(
    (embedding): Embedding(10000, 256)
  )
  (tgt_emb): InputEmbeddings(
    (embedding): Embedding(10000, 256)
  )
  (src_pos): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (tgt_pos): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): Encoder(
    (layer_norm): LayerNorm()
    (encoder_layers): ModuleList(
      (0-5): 6 x EncoderBlock(
        (multi_head_attention): MultiHeadAttentionBlock(
          (w_q): Linear(in_features=256, out_features=256, bias=True)
          (w_k): Linear(in_features=256, out_features=256, bias=True)
          (w_v): Linear(in_features=256, out_features=256, bias=True)
          (w_o): Linear(in_features=256, out_features=256, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ffn): FeedForwardBlock(
          (ffn): Sequential(
            (0): Linear(in_features=256, out_features=1024, bias=True)
            (1): ReLU()
     

## **Dataset**

In [4]:

os.environ['TOKENIZERS_PARALLELISM'] = 'true'

class Seq2SeqDataset(Dataset):
    def __init__(self, df: pd.DataFrame, tokenizer_src: str, tokenizer_tgt: str, src_lang: str, 
                 tgt_lang: str, src_seq_len: int, tgt_seq_len: int ) -> None:
        super().__init__()
        
        self.df = df
        self.tokenizer_src = tokenizer_src
        self.tokenizer_tgt = tokenizer_tgt
        self.src_lang = src_lang
        self.tgt_lang = tgt_lang
        self.src_seq_len = src_seq_len
        self.tgt_seq_len = tgt_seq_len

        self.tokenizer_src: Tokenizer = Tokenizer.from_file(self.tokenizer_src)
        self.tokenizer_tgt: Tokenizer = Tokenizer.from_file(self.tokenizer_tgt)

        self.sos_token_src = self.sos_token_tgt = torch.tensor([0], dtype=torch.int64)
        self.pad_token_src = self.pad_token_tgt = torch.tensor([1], dtype=torch.int64)
        self.eos_token_src = self.eos_token_tgt = torch.tensor([2], dtype=torch.int64)
        self.mask_token_src = torch.tensor([self.tokenizer_src.get_vocab_size() - 1], dtype=torch.int64)  
        self.mask_token_tgt = torch.tensor([self.tokenizer_tgt.get_vocab_size() - 1], dtype=torch.int64) 

    def __len__(self) -> int:
        return len(self.df)
    
    def __getitem__(self, idx: int) -> dict:
        # support for only DataFrame pair
        src_text, tgt_text = self.df.iloc[idx][self.src_lang], self.df.iloc[idx][self.tgt_lang]
        src_input_ids, tgt_input_ids = self.tokenizer_src.encode(src_text).ids, self.tokenizer_tgt.encode(tgt_text).ids
        src_pad_len, tgt_pad_len = self.src_seq_len - len(src_input_ids) - 2, \
            self.tgt_seq_len - len(tgt_input_ids) - 1 # -2 for sos and eos token & -1 for sos token

        if src_pad_len < 0 or tgt_pad_len < 0:
            src_input_ids, tgt_input_ids = src_input_ids[:self.src_seq_len-2], \
                tgt_input_ids[:self.tgt_seq_len-1]
            src_pad_len, tgt_pad_len = 0, 0
        
        src_input_ids, tgt_input_ids = torch.tensor(src_input_ids, dtype=torch.int64), \
            torch.tensor(tgt_input_ids, dtype=torch.int64)
         
        # concatenating sos, eos and pad tokens
        src_input_ids = torch.cat(
            [self.sos_token_src, src_input_ids, self.eos_token_src, self.pad_token_src.repeat(src_pad_len)])
        label = torch.cat(
            [tgt_input_ids, self.eos_token_tgt, self.pad_token_tgt.repeat(tgt_pad_len)])
        tgt_input_ids = torch.cat(
            [self.sos_token_tgt, tgt_input_ids, self.pad_token_tgt.repeat(tgt_pad_len)])
        
        # attention mask in encoder is 1 for all non-pad tokens and 0 for pad tokens
        attention_mask_src = (src_input_ids != self.pad_token_src).int()
        # casual attention mask in decoder is 1 for previous tokens and 0 for future tokens
        casual_attention_mask_tgt = torch.triu(
            torch.ones((self.tgt_seq_len, self.tgt_seq_len), dtype=torch.int64), diagonal=1
        )
        
        return {
            "encoder_input": src_input_ids, # (seq_len,)
            "decoder_input": tgt_input_ids, # (seq_len,)
            "encoder_mask": attention_mask_src.unsqueeze(0).unsqueeze(0), # (seq_len,)
            "decoder_mask": casual_attention_mask_tgt.unsqueeze(0), # (seq_len, seq_len)
            "label" : label # (seq_len,)
        }

        
class Seq2SeqDataLoader(pl.LightningDataModule):
    def __init__(self, df: pd.DataFrame, tokenizer_src: str, tokenizer_tgt: str, 
                 src_lang: str, tgt_lang: str, src_seq_len: int, tgt_seq_len: int, 
                 batch_size: int, num_workers: int, split_size:int) -> None:
        super().__init__()
        
        self.df = df
        self.tokenizer_src = tokenizer_src
        self.tokenizer_tgt = tokenizer_tgt
        self.src_lang = src_lang
        self.tgt_lang = tgt_lang
        self.src_seq_len = src_seq_len
        self.tgt_seq_len = tgt_seq_len
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.split_size = split_size

    def setup(self, stage: str = None):
        self.train_df, self.val_df = train_test_split(
            self.df, test_size=self.split_size, random_state=42
        )
        self.train_dataset = Seq2SeqDataset(
            df=self.train_df, 
            tokenizer_src=self.tokenizer_src,
            tokenizer_tgt=self.tokenizer_tgt, 
            src_lang=self.src_lang,
            tgt_lang=self.tgt_lang,
            src_seq_len=self.src_seq_len,
            tgt_seq_len=self.tgt_seq_len)
        
        self.val_dataset = Seq2SeqDataset(
            df=self.val_df, 
            tokenizer_src=self.tokenizer_src,
            tokenizer_tgt=self.tokenizer_tgt, 
            src_lang=self.src_lang,
            tgt_lang=self.tgt_lang,
            src_seq_len=self.src_seq_len,
            tgt_seq_len=self.tgt_seq_len)

        print(len(self.train_dataset), len(self.val_dataset))
        
    def train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True, num_workers=self.num_workers)
    
    def val_dataloader(self):
        return DataLoader(self.val_dataset, batch_size=self.batch_size, shuffle=False, num_workers=self.num_workers)
        

with open("../config.yaml") as f:
    config = yaml.safe_load(f)

config = config['train']
dataset = Seq2SeqDataLoader(
    df=pd.read_parquet(config['dataset_path']),
    tokenizer_src=config['tokenizer']['src']['path'],
    tokenizer_tgt=config['tokenizer']['tgt']['path'],
    src_lang=config['tokenizer']['src']['lang'],
    tgt_lang=config['tokenizer']['tgt']['lang'],
    src_seq_len=config['tokenizer']['src']['seq_len'],
    tgt_seq_len=config['tokenizer']['tgt']['seq_len'],
    batch_size=config['batch_size'],
    num_workers=config['num_workers'],
    split_size=config['split_size']
)
dataset.setup()
print("Dataset setup complete")

for batch in dataset.train_dataloader():
    print(batch)
    break

4678794 519867
Dataset setup complete
{'encoder_input': tensor([[   0,    0,  309,  ...,    1,    1,    1],
        [   0,    0,  189,  ...,    1,    1,    1],
        [   0,    0, 6182,  ...,    1,    1,    1],
        ...,
        [   0,    0,  405,  ...,    1,    1,    1],
        [   0,    0,  180,  ...,    1,    1,    1],
        [   0,    0,  395,  ...,    1,    1,    1]]), 'decoder_input': tensor([[   0,    0, 8721,  ...,    1,    1,    1],
        [   0,    0, 3602,  ...,    1,    1,    1],
        [   0,    0, 1753,  ...,    1,    1,    1],
        ...,
        [   0,    0, 2809,  ...,    1,    1,    1],
        [   0,    0, 2822,  ...,    1,    1,    1],
        [   0,    0, 5941,  ...,    1,    1,    1]]), 'encoder_mask': tensor([[[[1, 1, 1,  ..., 0, 0, 0]]],


        [[[1, 1, 1,  ..., 0, 0, 0]]],


        [[[1, 1, 1,  ..., 0, 0, 0]]],


        ...,


        [[[1, 1, 1,  ..., 0, 0, 0]]],


        [[[1, 1, 1,  ..., 0, 0, 0]]],


        [[[1, 1, 1,  ..., 0, 0, 0]]]], dty

## **Train**

In [11]:
# Author : NavinKumarMNK
"""Training script"""

import torch
import torch.nn as nn
import yaml
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

SEED = 42
torch.manual_seed(seed=SEED)
torch.cuda.manual_seed_all(seed=SEED)
torch.backends.cudnn.deterministic = True

with open("../config.yaml") as f:
    config = yaml.safe_load(f)

train_config = config['train']
model_config = config['model']
token_config = config['tokenizer']
data_config = config['data']

'''
# Preprocessing Dataset
corpus = load_corpus(data_config)
corpus.to_parquet(data_config['dataset_path'])
print("Corpus Preprocessed and saved to ", data_config['dataset_path'])

# Tokenizer setup
# df = pd.read_parquet(token_config['dataset_path'])
src_bool = True
for items in (token_config['src'], token_config['tgt']):
    _df = corpus[items['lang']]

    bpe_tokenizer = BPETokenizer(items)
    bpe_tokenizer.train(_df)
    print("Size of vocabulary:", bpe_tokenizer.tokenizer.get_vocab_size())
    if src_bool:
        tokenizer_src = bpe_tokenizer.tokenizer
        src_bool = False
    else:
        tokenizer_tgt = bpe_tokenizer.tokenizer

print("Successfully trained Tokenizers")

'''
from tokenizers import Tokenizer
tokenizer_src = Tokenizer.from_file(token_config['src']['tokenizer_path'])
tokenizer_tgt = Tokenizer.from_file(token_config['tgt']['tokenizer_path'])


# Dataset setup
dataset = Seq2SeqDataLoader(
    df=pd.read_parquet(train_config['dataset_path']),
    tokenizer_src=train_config['tokenizer']['src']['path'],
    tokenizer_tgt=train_config['tokenizer']['tgt']['path'],
    src_lang=train_config['tokenizer']['src']['lang'],
    tgt_lang=train_config['tokenizer']['tgt']['lang'],
    src_seq_len=train_config['tokenizer']['src']['seq_len'],
    tgt_seq_len=train_config['tokenizer']['tgt']['seq_len'],
    batch_size=train_config['batch_size'],
    num_workers=train_config['num_workers'],
    split_size=train_config['split_size']
)
dataset.setup()
print("Dataset setup complete")

train_loader = dataset.train_dataloader()
val_loader = dataset.val_dataloader()

# device settings for training
device = train_config['device'] 
if device == 'cuda':
    if not torch.cuda.is_available():
        print("Device set to cuda but cuda is not available. Using CPU")
        device = 'cpu'

# model setup
model = Transformer(**model_config["parameters"])
optimzer = torch.optim.Adam(
    params=model.parameters(), **train_config['optimizer'])

# label smooting = True;
loss_fn = nn.CrossEntropyLoss(
    ignore_index=1, # ignore padding token
    label_smoothing=train_config['label_smoothing'],
).to(device)

print(model)

# tensorboard logging
writer = SummaryWriter(log_dir=train_config['logging']['dir'])

model.to(device)
print(f"Training on: {device}")

if train_config['fine_tune']:
    model.load_state_dict(torch.load(model_config['path']))

4678794 519867
Dataset setup complete
Transformer(
  (src_emb): InputEmbeddings(
    (embedding): Embedding(10000, 256)
  )
  (tgt_emb): InputEmbeddings(
    (embedding): Embedding(10000, 256)
  )
  (src_pos): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (tgt_pos): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): Encoder(
    (layer_norm): LayerNorm()
    (encoder_layers): ModuleList(
      (0-5): 6 x EncoderBlock(
        (multi_head_attention): MultiHeadAttentionBlock(
          (w_q): Linear(in_features=256, out_features=256, bias=True)
          (w_k): Linear(in_features=256, out_features=256, bias=True)
          (w_v): Linear(in_features=256, out_features=256, bias=True)
          (w_o): Linear(in_features=256, out_features=256, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ffn): FeedForwardBlock(
          (ffn): Sequential(
            (0): Linear(in_features=256, out_features=1024, bi

In [17]:
for epoch in tqdm(range(train_config['epochs']), desc="Epochs"):
        # training loop
        model.train()
        for batch in tqdm(train_loader, desc=f"Training epoch {epoch:02d}"):
            encoder_input = batch['encoder_input'].to(device) # (batch_size, seq_len)
            decoder_input = batch['decoder_input'].to(device) # (batch_size, seq_len)
            encoder_mask = batch['encoder_mask'].to(device) # (batch_size, 1, 1, seq_len)
            decoder_mask = batch['decoder_mask'].to(device) # (batch_size, 1, seq_len, seq_len)

            # forward pass
            output = model(src=encoder_input, tgt=decoder_input,
                           src_mask=encoder_mask, tgt_mask=decoder_mask)
            
            
            print(decoder_input.shape)
            print(output.shape, label.shape)
            
            # loss calculation
            loss = loss_fn(output.view(-1, output.size(-1)), label.view(-1))
            writer.add_scalar("Loss/train", loss.item(), epoch)
            writer.flush()

            # set postfix for tqdm
            postfix = {
                "train_loss": loss.item(),
            }
            tqdm.set_postfix(postfix, refresh=True)
            
            # backward pass
            loss.backward()
            optimzer.step()
            optimzer.zero_grad()
        
        # validation loop
        model.eval()
        with torch.no_grad():
            for batch in tqdm(val_loader, desc=f"Validation epoch {epoch:02d}"):
                if batch.shape[0] != train_config['batch_size']:
                    continue

                encoder_input = batch['encoder_input'].to(device) # (batch_size, seq_len)
                decoder_input = batch['decoder_input'].to(device) # (batch_size, seq_len)
                encoder_mask = batch['encoder_mask'].to(device) # (batch_size, 1, 1, seq_len)
                decoder_mask = batch['decoder_mask'].to(device) # (batch_size, 1, seq_len, seq_len)
                label = batch['label'].to(device) # (batch_size, seq_len)

                # forward pass
                encoder_output = model.encoder(
                    src=encoder_input, scr_mask=encoder_mask) # (batch_size, seq_len, d_model)
                decoder_output = model.decoder(
                    tgt=decoder_input, tgt_mask=decoder_mask, 
                    src_output=encoder_output, src_mask=encoder_mask) # (batch_size, seq_len, d_model)

                output = model.project(decoder_output) # (batch_size, seq_len, tgt_vocab_size)

                # loss calculation
                loss = loss_fn(output.view(-1, tokenizer_tgt.get_vocab_size()), label.view(-1))
                writer.add_scalar("Loss/val", loss.item(), epoch)
                writer.flush()

                # set postfix for tqdm
                postfix = {
                    "valid_loss": loss.item(),
                }
                tqdm.set_postfix(postfix, refresh=True)
   
        # Save model
        torch.save(model.state_dict(), config['model']['path']+f"epoch_{epoch}.pt")
        


Training epoch 00:   0%|          | 0/292425 [00:01<?, ?it/s]
Epochs:   0%|          | 0/10 [00:01<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 MiB. GPU 0 has a total capacty of 5.79 GiB of which 54.88 MiB is free. Process 607380 has 4.58 GiB memory in use. Of the allocated memory 4.48 GiB is allocated by PyTorch, and 19.55 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

: 

## **Inference**

In [16]:
from train import Seq2SeqModel
from tokenizers import Tokenizer
import torch

# Load the model and tokenizer
model = Seq2SeqModel.load_from_checkpoint("/workspace/Mozhi/logs/Seq2Seq/version_2/checkpoints/epoch=04-val_loss=0.22.ckpt")
tokenizer_src: Tokenizer = Tokenizer.from_file("/workspace/Mozhi/weights/tokenizer_en.json")
tokenizer_tgt: Tokenizer = Tokenizer.from_file("/workspace/Mozhi/weights/tokenizer_ta.json")

torch.save(model.model.state_dict(), "/workspace/Mozhi/weights/mozhi-19M.pt")

# Prepare the input
str = "that's what i am saying."
seq_len = 256
src_input_ids = torch.Tensor(tokenizer_src.encode(str).ids).long()
print(src_input_ids)
src_pad_len = seq_len - len(src_input_ids)
if src_pad_len < 0:
    src_input_ids = src_input_ids[:seq_len]
    src_pad_len = 0
src_input_ids = torch.cat(
    [src_input_ids, torch.Tensor([1]*src_pad_len)]
).long().to("cuda")
attenion_mask = (src_input_ids != 1).unsqueeze(0).unsqueeze(0).to("cuda")

print(src_input_ids, attenion_mask)
print("என்றுதான் நான் சொல்ல வருகிறேன்.")
model.eval()
with torch.no_grad():
    encoder_output = model.model.encode(src_input_ids, attenion_mask)
    
    # Decode the output
    output = torch.empty(1, 1).fill_(0).type_as(src_input_ids).long().to("cuda")
    for i in range(seq_len):
        decoder_mask = torch.triu(
            torch.ones((i+1, i+1)).type_as(attenion_mask), diagonal=1
        ).unsqueeze(0).to("cuda")
        decoder_output = model.model.decode(
            tgt = output,
            src_output = encoder_output,
            src_mask = attenion_mask,
            tgt_mask = decoder_mask
        )
        decoder_output = decoder_output[:, -1, :].argmax(dim=-1).unsqueeze(1)
        print(decoder_output)
        output = torch.cat([output, decoder_output], dim=1)
        if decoder_output == 2:  # End of Sentence token
            break
    print(output)
    print(tokenizer_tgt.decode(output.squeeze(0).tolist()))
