In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import time

from transformers import PreTrainedTokenizerFast

import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [2]:
# Single seed shared by every random number generator used in the notebook.
SEED = 183441

# Seed NumPy and the PyTorch CPU generator.
np.random.seed(SEED)
torch.manual_seed(SEED)

# Seed the CUDA generators as well, but only when a GPU is actually present.
if torch.cuda.is_available():
    torch.cuda.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)

# Ask cuDNN for deterministic kernels and switch off its autotuner.
torch.backends.cudnn.deterministic, torch.backends.cudnn.benchmark = True, False

In [3]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [4]:
# Read each frame and discard the 'Unnamed: 0' column right away, without in-place mutation.
# NOTE(review): test_df is read from the same 'cleaned-train.csv' path as
# train_df -- confirm whether a separate held-out file was intended.
train_df = pd.read_csv('/kaggle/input/nmt-23-languages/cleaned-train.csv').drop(columns=['Unnamed: 0'])
test_df = pd.read_csv('/kaggle/input/nmt-23-languages/cleaned-train.csv').drop(columns=['Unnamed: 0'])

In [5]:
# Identifier of the pretrained tokenizer passed to `from_pretrained` below.
tokenizer_id = "MadBonze/indic-tokenizer-initial-alphabet-32K"

# Load the fast tokenizer for that identifier; the cell output shows its
# config / vocabulary files being fetched.
tokenizer = PreTrainedTokenizerFast.from_pretrained(tokenizer_id)

tokenizer_config.json:   0%|          | 0.00/5.19k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.40M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/833 [00:00<?, ?B/s]

In [6]:
# Every language label that occurs on either side of the training pairs.
languages = list(set([*train_df['Lang1'].unique(), *train_df['Lang2'].unique()]))

# One encoder, fitted on the training labels, is shared by both columns and
# both frames so that a given language always maps to the same integer id.
le = LabelEncoder().fit(languages)

# Replace the label columns by their integer ids (train first, then test).
for _frame in (train_df, test_df):
    for _column in ('Lang1', 'Lang2'):
        _frame[_column] = le.transform(_frame[_column])

In [7]:
class NMTDataset(Dataset):
    """Map-style dataset that yields one parallel sentence pair per frame row.

    Args:
        df: DataFrame exposing the columns ``Lang2``, ``Sentence_Lang1``,
            ``Sentence_Lang2`` and ``Sentence_ID`` (all four are read in
            ``__init__``). ``Lang2`` must hold values ``torch.tensor`` accepts.
    """

    def __init__(self, df):
        self.df = df
        self.lang2 = self.df.Lang2
        self.sentences1 = self.df.Sentence_Lang1
        self.sentences2 = self.df.Sentence_Lang2
        self.idxs = self.df.Sentence_ID

    def __len__(self):
        """Return the number of sentence pairs (rows of the wrapped frame)."""
        # Previously this read `self.lang1`, an attribute that is never set,
        # so `len(dataset)` (and therefore any DataLoader) raised AttributeError.
        return len(self.df)

    def __getitem__(self, idx):
        """Return the pair at positional index ``idx``.

        Returns:
            dict with ``'sent1'`` / ``'sent2'`` (the raw sentences) and
            ``'tgt_lang'`` (the ``Lang2`` value wrapped in a tensor).
        """
        # Positional lookups (.iloc) so a non-default frame index cannot break indexing.
        lang2 = torch.tensor(self.lang2.iloc[idx])

        sent1 = self.sentences1.iloc[idx]
        sent2 = self.sentences2.iloc[idx]

        return {
            'sent1': sent1,
            'sent2': sent2,
            'tgt_lang': lang2
        }

In [8]:
# Wrap the training frame so that rows can be fetched as sentence-pair dicts.
train_dataset = NMTDataset(train_df)

In [9]:
# Display the first example to check the structure returned by __getitem__.
train_dataset[0]

{'sent1': 'তেওঁ ২০০৫ চনৰ পৰা ২০১৭ চনলৈকে ষ্টৰ্টিঙত ফিনমাৰ্কৰ এজন প্ৰতিনিধি হিচাপে কাৰ্যনিৰ্বাহ কৰিছিল।\n',
 'sent2': 'उʼनें 2005 कोला 2017 तक्कर स्टार्टिंग च फिनमार्क दे प्रतिनिधि दे रूपै च कम्म कीता।\n',
 'tgt_lang': tensor(3)}

In [10]:
# Tokenize the first source sentence as a one-element batch to inspect the encoding.
tokenizer([train_dataset[0]['sent1']])

{'input_ids': [[4293, 27279, 4809, 4004, 19114, 17052, 3457, 2580, 2967, 520, 1347, 5098, 535, 12300, 4461, 9449, 5539, 2497, 14861, 6230, 14207, 5874, 15372, 3529, 1563]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [11]:
class PositionalEmbedding(nn.Module):
    """Fixed (non-learned) sinusoidal positional encoding table.

    The table has shape ``(max_len, d_model)`` with

        pe[pos, 2i]     = sin(pos / 10000 ** (2i / d_model))
        pe[pos, 2i + 1] = cos(pos / 10000 ** (2i / d_model))

    Args:
        d_model: width of the encoding (number of columns).
        max_len: number of positions to precompute (number of rows).
    """

    def __init__(self, d_model, max_len):
        super(PositionalEmbedding, self).__init__()
        self.d_model = d_model
        self.max_len = max_len

        dims = torch.arange(self.d_model)
        # Columns 2i and 2i+1 share one frequency, so both use the exponent
        # 2i / d_model (`dims - dims % 2` equals 2 * (dims // 2)).
        self.indices = (dims - dims % 2) / self.d_model
        self.indices = 1 / torch.pow(10000, self.indices)
        self.indices = self.indices.unsqueeze(0)                      # (1, d_model)
        self.pos = torch.arange(self.max_len).unsqueeze(1).float()    # (max_len, 1)
        angles = self.pos @ self.indices                              # (max_len, d_model)

        # sin on even feature columns, cos on odd ones. The parity is taken on
        # the column index; it used to be taken on the row (position) index,
        # which alternated sin / cos between consecutive positions instead.
        even_dim = (dims % 2 == 0).unsqueeze(0)                       # (1, d_model)
        pe = even_dim * torch.sin(angles) + (~even_dim) * torch.cos(angles)

        # Registered as a buffer so the table follows `.to(device)`;
        # non-persistent so the module's state_dict keys stay unchanged.
        self.register_buffer('pe', pe, persistent=False)

    def forward(self, x):
        """Return the encodings for the positions covered by ``x``.

        Args:
            x: tensor whose sequence axis is dim 1 (e.g. ``(batch, seq)`` token
                ids), or a 1-D tensor of length ``seq``.

        Returns:
            Tensor of shape ``(seq, d_model)``.

        Raises:
            ValueError: if the sequence is longer than ``max_len``.
        """
        # The sequence axis is dim 1 for batched input; slicing by dim 0 (the
        # batch size) handed the same few rows to every sequence position.
        seq_len = x.shape[1] if x.dim() > 1 else x.shape[0]
        if seq_len > self.max_len:
            raise ValueError(
                f"sequence length {seq_len} exceeds max_len {self.max_len}"
            )
        return self.pe[:seq_len]

In [12]:
class RoPE(nn.Module):
    """Rotary positional embedding applied to consecutive feature pairs.

    Args:
        base: base of the geometric frequency progression.
        head_dim: width of the vectors to rotate; must be even.
        max_len: number of positions for which rotations are precomputed.
    """

    def __init__(self, base, head_dim, max_len):
        super(RoPE, self).__init__()
        assert head_dim % 2 == 0
        # One rotation frequency per feature pair: base ** (-2i / head_dim).
        self.theta = 1 / (base ** (torch.arange(0, head_dim, 2) / head_dim)).float()
        self.sequences = torch.arange(max_len).float()
        # Outer product -> rotation angle for every (position, pair).
        self.theta_sequences = self.sequences.unsqueeze(1) @ self.theta.unsqueeze(0)
        # Unit-modulus complex numbers exp(i * angle), shape (max_len, head_dim / 2).
        self.complex = torch.polar(torch.ones_like(self.theta_sequences), self.theta_sequences)
        # Stored as (1, max_len, 1, head_dim / 2).
        self.register_buffer('freqs_complex', self.complex.unsqueeze(0).unsqueeze(2))

    def forward(self, x):
        """Rotate ``x`` according to the position of each token.

        Args:
            x: tensor of shape ``(B, T, head_dim)`` or ``(B, T, H, head_dim)``.

        Returns:
            Tensor with the same shape and dtype as ``x``.

        Raises:
            ValueError: if ``T`` exceeds the precomputed ``max_len``.
        """
        T = x.shape[1]
        max_len = self.freqs_complex.shape[1]
        if T > max_len:
            raise ValueError(f"sequence length {T} exceeds max_len {max_len}")

        # View consecutive feature pairs as complex numbers.
        complex_x = torch.view_as_complex(x.float().reshape(*x.shape[:-1], -1, 2))

        # Take the first T positions. Indexing with `T` (instead of slicing
        # `:T`) picked the single position T for every token and ran out of
        # range when T == max_len.
        freqs = self.freqs_complex[:, :T]            # (1, T, 1, head_dim / 2)
        if x.dim() == 3:
            freqs = freqs.squeeze(2)                 # (1, T, head_dim / 2)

        rotated_x = complex_x * freqs
        return torch.view_as_real(rotated_x).reshape(*x.shape).type_as(x)

In [13]:
class AttentionHead(nn.Module):
    """Single causal self-attention head with scaled dot-product scores.

    Args:
        max_len: longest sequence supported (size of the causal mask).
        n_embed: width of the input features.
        head_size: width of the query / key / value projections.
    """

    def __init__(self, max_len, n_embed, head_size):
        super(AttentionHead, self).__init__()
        self.q = nn.Linear(n_embed, head_size)
        self.k = nn.Linear(n_embed, head_size)
        self.v = nn.Linear(n_embed, head_size)
        # Lower-triangular matrix: row t has ones for the positions <= t.
        self.register_buffer('tril', torch.tril(torch.ones(max_len, max_len)))

    def forward(self, x):
        """Attend over ``x`` of shape ``(B, T, n_embed)``; returns ``(B, T, head_size)``."""
        B, T, C = x.shape
        q = self.q(x)
        k = self.k(x)
        v = self.v(x)
        # Scale the logits by 1/sqrt(head_size); without it their variance
        # grows with head_size and the softmax saturates.
        attn = (q @ k.transpose(-2, -1)) * (k.shape[-1] ** -0.5)
        # Causal mask: a position may not look at later positions.
        attn = attn.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        attn = F.softmax(attn, dim=-1)
        return attn @ v

In [14]:
class MultiHeadAttention(nn.Module):
    """Run several ``AttentionHead`` modules side by side and mix their outputs.

    Args:
        n_embed: width of the input and output features.
        n_head: number of heads; each one has width ``n_embed // n_head``.
        max_len: forwarded to every head.
        dropout: dropout probability applied after the output projection.
    """

    def __init__(self, n_embed, n_head, max_len, dropout):
        super().__init__()
        head_size = n_embed // n_head
        self.heads = nn.ModuleList()
        for _ in range(n_head):
            self.heads.append(AttentionHead(max_len, n_embed, head_size))
        # Input width is the concatenation of all head outputs.
        self.proj = nn.Linear(head_size * n_head, n_embed)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Concatenate the head outputs on the feature axis, project, then apply dropout."""
        per_head = [head(x) for head in self.heads]
        merged = torch.cat(per_head, dim=-1)
        projected = self.proj(merged)
        return self.dropout(projected)

In [172]:
class MultiQueryAttention(nn.Module):
    """Causal multi-query attention: ``n_head`` query heads share one key/value head.

    Args:
        n_embed: width of the input features.
        n_head: number of query heads; each has width ``n_embed // n_head``.
        max_len: longest sequence supported (size of the causal mask).
        dropout: accepted for signature parity with the other attention
            modules; this module does not apply dropout.
    """

    def __init__(self, n_embed, n_head, max_len, dropout):
        super(MultiQueryAttention, self).__init__()
        self.n_head = n_head
        self.head_size = n_embed // n_head
        self.q = nn.Linear(n_embed, n_head * self.head_size)
        self.k = nn.Linear(n_embed, self.head_size)
        self.v = nn.Linear(n_embed, self.head_size)
        self.register_buffer('tril', torch.tril(torch.ones(max_len, max_len)))

    def forward(self, x):
        """Attend over ``x`` of shape ``(B, T, n_embed)``.

        Returns:
            Tensor of shape ``(B, T, n_head * head_size)`` with the head
            outputs laid out contiguously along the last axis.
        """
        B, T, C = x.shape
        # Split the feature axis into heads first, then move the head axis
        # forward. Reshaping (B, T, nh*hs) straight to (nh, B, T, hs) only
        # reinterprets memory and mixes batch, time and head entries.
        q = self.q(x).reshape(B, T, self.n_head, self.head_size).transpose(1, 2)  # B, nh, T, hs
        k = self.k(x).unsqueeze(1)  # B, 1, T, hs -- broadcast over heads
        v = self.v(x).unsqueeze(1)  # B, 1, T, hs

        # Scaled dot-product scores for all heads at once.
        weights = (q @ k.transpose(-2, -1)) * (self.head_size ** -0.5)  # B, nh, T, T
        # The causal mask broadcasts over batch and heads; no per-head loop
        # or in-place writes are needed.
        weights = weights.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        weights = F.softmax(weights, dim=-1)

        value = weights @ v  # B, nh, T, hs
        # Bring the head axis back next to the feature axis before merging.
        value = value.transpose(1, 2).reshape(B, T, self.n_head * self.head_size)
        return value

In [173]:
class MultiGroupedQueryAttention(nn.Module):
    """Grouped-query attention assembled from ``MultiQueryAttention`` groups.

    One ``MultiQueryAttention`` module is created per key/value head, each
    holding ``n_head // n_kv_head`` query heads. The group outputs are
    concatenated and projected back to ``n_embed``.

    Args:
        n_embed: width of the input and output features.
        n_head: total number of query heads; must be divisible by ``n_kv_head``.
        n_kv_head: number of key/value heads (i.e. number of groups).
        max_len: forwarded to every group.
        dropout: forwarded to every group and applied after the projection.
    """

    def __init__(self, n_embed, n_head, n_kv_head, max_len, dropout):
        super().__init__()
        assert n_head % n_kv_head == 0
        self.n_rep = n_head // n_kv_head

        groups = []
        for _ in range(n_kv_head):
            groups.append(MultiQueryAttention(n_embed, self.n_rep, max_len, dropout))
        self.heads = nn.ModuleList(groups)

        # NOTE(review): every group is built with n_embed and n_rep, so the
        # concatenated width below scales with n_kv_head rather than staying
        # at n_embed -- confirm this is the intended sizing.
        concat_width = (n_embed // self.n_rep) * self.n_rep * n_kv_head
        self.proj = nn.Linear(concat_width, n_embed)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Concatenate the group outputs on the feature axis, project, then apply dropout."""
        group_outputs = [group(x) for group in self.heads]
        merged = torch.cat(group_outputs, dim=-1)
        projected = self.proj(merged)
        return self.dropout(projected)

In [15]:
class FeedForwardNetwork(nn.Module):
    """Position-wise MLP: expand to ``4 * n_embed``, ReLU, project back, dropout.

    Args:
        n_embed: width of the input and output features.
        dropout: dropout probability applied to the output.
    """

    def __init__(self, n_embed, dropout):
        super().__init__()
        hidden = 4 * n_embed
        layers = [
            nn.Linear(n_embed, hidden),
            nn.ReLU(),
            nn.Linear(hidden, n_embed),
            nn.Dropout(dropout),
        ]
        self.ffn = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the MLP along the last axis of ``x``."""
        out = self.ffn(x)
        return out

In [16]:
class LayerNorm(nn.Module):
    """Layer normalisation over the last axis with a learnable scale and shift.

    Args:
        n_embed: size of the normalised (last) axis.
        eps: constant added to the variance for numerical stability.
    """

    def __init__(self, n_embed, eps=1e-5):
        super(LayerNorm, self).__init__()
        self.eps = eps
        self.gammas = nn.Parameter(torch.ones(n_embed))
        self.betas = nn.Parameter(torch.zeros(n_embed))

    def forward(self, x):
        """Normalise every feature vector to zero mean / unit variance, then scale and shift."""
        # F.normalize only divides by the L2 norm: it neither centres the
        # features nor yields unit variance, and it never used `eps`.
        mean = x.mean(dim=-1, keepdim=True)
        var = x.var(dim=-1, keepdim=True, unbiased=False)
        x = (x - mean) / torch.sqrt(var + self.eps)
        return (x * self.gammas) + self.betas

In [17]:
class RMSNorm(nn.Module):
    """Root-mean-square normalisation over the last axis with a learnable scale.

    Args:
        n_embed: size of the normalised (last) axis.
        eps: constant added to the RMS to avoid division by zero.
    """

    def __init__(self, n_embed, eps=1e-5):
        super(RMSNorm, self).__init__()
        self.eps = eps
        self.gammas = nn.Parameter(torch.ones(n_embed))

    def forward(self, x):
        """Divide every feature vector by its own RMS, then apply the scale."""
        # Reduce over the feature axis only. A mean over the whole tensor
        # makes each token's scale depend on every other token and batch
        # element.
        rms = torch.sqrt(torch.mean(torch.square(x), dim=-1, keepdim=True))
        x = x / (rms + self.eps)
        return (x * self.gammas)

In [18]:
class DecoderBlock(nn.Module):
    """One decoder layer: self-attention then feed-forward, each followed by add-and-norm.

    Args:
        n_embed: width of the features.
        n_head: number of attention heads.
        max_len: forwarded to the attention module.
        dropout: dropout probability forwarded to both sub-layers.
    """

    def __init__(self, n_embed, n_head, max_len, dropout):
        super().__init__()
        self.mha = MultiHeadAttention(n_embed, n_head, max_len, dropout)
        self.ffn = FeedForwardNetwork(n_embed, dropout)
        self.ln1, self.ln2 = LayerNorm(n_embed), LayerNorm(n_embed)

    def forward(self, x):
        """Apply both residual sub-layers; the norm comes after each residual sum."""
        attended = self.ln1(x + self.mha(x))
        transformed = self.ln2(attended + self.ffn(attended))
        return transformed

In [19]:
class Block(nn.Module):
    """Decoder-only transformer stack over token ids.

    Args:
        n_decoder: number of ``DecoderBlock`` layers.
        n_vocab: vocabulary size (embedding rows and ``lm_head`` outputs).
        n_embed: width of the token features.
        n_head: number of attention heads per layer.
        max_len: longest sequence supported.
        dropout: dropout probability forwarded to every layer.
    """

    def __init__(self, n_decoder, n_vocab, n_embed, n_head, max_len, dropout):
        super(Block, self).__init__()
        self.embedding = nn.Embedding(n_vocab, n_embed)
        self.positional = PositionalEmbedding(n_embed, max_len)
        self.decoder = nn.Sequential(*[DecoderBlock(n_embed, n_head, max_len, dropout) for _ in range(n_decoder)])
        # Constructed but not applied in forward (see the return statement).
        self.lm_head = nn.Linear(n_embed, n_vocab)

    def forward(self, x):
        """Embed the token ids, add positional information and run the decoder stack.

        Args:
            x: token ids as a nested list or a tensor.

        Returns:
            The output of the last decoder layer; ``lm_head`` is not applied.
        """
        # as_tensor accepts lists and tensors alike without the copy-construct
        # warning that torch.tensor(tensor) emits, and places the ids on the
        # same device as the embedding weights.
        x = torch.as_tensor(x, device=self.embedding.weight.device)
        embed = self.embedding(x)
        # Keep the positional table on the activations' device.
        pos_embed = self.positional(x).to(embed.device)
        x = embed + pos_embed
        x = self.decoder(x)
        return x #self.lm_head(x)[:, -1, :]

In [20]:
# Small smoke-test model; positional arguments follow Block's signature:
# n_decoder=2, n_vocab=tokenizer.vocab_size, n_embed=32, n_head=2, max_len=32, dropout=0.2.
transformer = Block(2, tokenizer.vocab_size, 32, 2, 32, 0.2)

In [21]:
# Forward the first source sentence (a one-element batch of token-id lists) through the model.
output = transformer(tokenizer([train_dataset[0]['sent1']]).input_ids)