In [162]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from datasets import Dataset
from datasets import load_dataset
from transformers import AutoTokenizer
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
import os
# Pick the compute device once: 'cuda' when torch reports a usable GPU, otherwise 'cpu'.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [163]:
class InputEmbeddings(nn.Module):
    """Token-embedding lookup whose output is scaled by sqrt(d_model).

    Args:
        vocab_size: number of rows in the embedding table.
        d_model: width of each embedding vector.
    """

    def __init__(self, vocab_size, d_model):
        super().__init__()
        self.d_model = d_model
        self.embd = nn.Embedding(vocab_size, d_model)

    def forward(self, X):
        """Look up the ids in ``X`` and multiply the vectors by sqrt(d_model).

        The result has the shape of ``X`` with a trailing ``d_model`` axis added.
        """
        scale = np.sqrt(self.d_model)
        vectors = self.embd(X)
        return vectors * scale
        
class PositionEncoding(nn.Module):
    """Fixed sinusoidal positional encoding followed by dropout.

    For position ``pos`` and feature index ``i`` the table holds
    ``sin(pos / 10000**(i / d_model))`` for even ``i`` and
    ``cos(pos / 10000**((i - 1) / d_model))`` for odd ``i``.

    Args:
        max_len: number of positions to pre-compute.
        d_model: feature width of the embeddings the table is added to.
        dropout: dropout probability applied after the addition.
    """

    def __init__(self, max_len, d_model, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.dropout = nn.Dropout(dropout)

        # Vectorised construction: one column of positions, one row of frequencies.
        positions = torch.arange(max_len, dtype=torch.float).unsqueeze(1)  # (max_len, 1)
        even_dims = torch.arange(0, d_model, 2, dtype=torch.float)
        denom = torch.pow(torch.tensor(10000.0), even_dims / d_model)
        angles = positions / denom  # (max_len, ceil(d_model / 2))

        encodings = torch.zeros(max_len, d_model, dtype=torch.float)
        encodings[:, 0::2] = torch.sin(angles)
        # An odd d_model has one fewer odd column than even columns.
        encodings[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Registered as a buffer so the table follows `.to(device)`; non-persistent so
        # the state_dict keys stay the same as when this was a plain attribute.
        self.register_buffer("encodings", encodings, persistent=False)

    def forward(self, input_embed):
        """Add the positional table to ``input_embed`` and apply dropout.

        Args:
            input_embed: tensor of shape (batch_size, seq_len, d_model).

        Returns:
            Tensor of the same shape as ``input_embed``.

        Raises:
            ValueError: if ``seq_len`` exceeds the ``max_len`` the table was built for.
        """
        seq_len = input_embed.size(1)
        if seq_len > self.encodings.size(0):
            raise ValueError(
                f"sequence length {seq_len} exceeds max_len {self.encodings.size(0)}"
            )
        # (seq_len, d_model) broadcasts over the batch axis.
        return self.dropout(input_embed + self.encodings[:seq_len])
                             

In [190]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    Args:
        d_model: feature width of the inputs and of the output.
        num_heads: number of attention heads; must divide ``d_model``.
        dropout: dropout probability applied to the attention weights.

    Raises:
        ValueError: if ``d_model`` is not divisible by ``num_heads``.
    """

    def __init__(self, d_model, num_heads, dropout=0.1):
        super().__init__()
        if d_model % num_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by num_heads ({num_heads})"
            )
        self.W_q = nn.Linear(d_model, d_model, bias=False)
        self.W_v = nn.Linear(d_model, d_model, bias=False)
        self.W_k = nn.Linear(d_model, d_model, bias=False)
        self.d_k = d_model // num_heads
        self.W_o = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(p=dropout)
        self.num_heads = num_heads

    def _split_heads(self, x):
        """Reshape (batch, seq_len, d_model) into (batch, num_heads, seq_len, d_k)."""
        batch_size = x.shape[0]
        return x.view(batch_size, -1, self.num_heads, self.d_k).permute(0, 2, 1, 3)

    @staticmethod
    def _expand_mask(mask):
        """Make ``mask`` broadcastable against (batch, num_heads, q_len, k_len) scores.

        A 2-D mask is read as a per-key mask of shape (batch, k_len); a 3-D mask is
        read as (batch, q_len, k_len). Masks with any other rank are returned as-is.
        """
        if mask.dim() == 2:
            return mask[:, None, None, :]
        if mask.dim() == 3:
            return mask[:, None, :, :]
        return mask

    def forward(self, q, k=None, v=None, mask=None):
        """Attend from ``q`` to ``k``/``v``.

        Args:
            q: queries, shape (batch, q_len, d_model).
            k: keys, shape (batch, k_len, d_model); defaults to ``q``.
            v: values, shape (batch, k_len, d_model); defaults to ``q``.
            mask: optional mask; positions where ``mask == 0`` receive no attention.
                See ``_expand_mask`` for the accepted shapes.

        Returns:
            Tensor of shape (batch, q_len, d_model).
        """
        # `is None` rather than `== None`: the latter is an element-wise op on tensors.
        if k is None:
            k = q
        if v is None:
            v = q

        query = self._split_heads(self.W_q(q))
        key = self._split_heads(self.W_k(k))
        value = self._split_heads(self.W_v(v))

        # (batch, num_heads, q_len, k_len)
        attn_score = torch.matmul(query, key.transpose(-2, -1)) / (self.d_k ** 0.5)
        if mask is not None:
            attn_score = attn_score.masked_fill(
                self._expand_mask(mask) == 0, float('-inf')
            )
        # Dropout goes on the normalised weights; applying it to the raw scores would
        # zero logits instead of dropping attention links.
        attn_weight = self.dropout(torch.softmax(attn_score, dim=-1))

        output = torch.matmul(attn_weight, value)        # (batch, num_heads, q_len, d_k)
        output = output.permute(0, 2, 1, 3).contiguous()  # (batch, q_len, num_heads, d_k)
        output = output.view(output.shape[0], -1, self.num_heads * self.d_k)
        return self.W_o(output)
        

In [191]:
class add_and_norm(nn.Module):
    """Residual add followed by layer normalisation.

    Args:
        d_model: size of the last axis that ``nn.LayerNorm`` normalises over.
        dropout: dropout probability applied to the sub-layer output before the add.
    """

    def __init__(self, d_model, dropout=0.1):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        self.layernorm = nn.LayerNorm(d_model)

    def forward(self, x, output):
        """Return ``LayerNorm(x + Dropout(output))``."""
        residual_sum = x + self.dropout(output)
        return self.layernorm(residual_sum)
        
    

In [192]:
class FFN(nn.Module):
    """Position-wise feed-forward network: Linear -> ReLU -> Dropout -> Linear.

    Args:
        dff: width of the hidden layer.
        d_model: width of the input and of the output.
        dropout: dropout probability applied after the ReLU.
    """

    def __init__(self, dff, d_model, dropout=0.1):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        self.W1 = nn.Linear(d_model, dff)
        self.W2 = nn.Linear(dff, d_model)
        self.relu = nn.ReLU()

    def forward(self, X):
        """Project ``X`` up to ``dff``, activate, drop out, and project back to ``d_model``."""
        hidden = self.W1(X)
        hidden = self.relu(hidden)
        hidden = self.dropout(hidden)
        return self.W2(hidden)

In [193]:
class Encoderblock(nn.Module):
    """One encoder layer: self-attention and a feed-forward net, each wrapped in add-and-norm.

    Args:
        d_model: feature width of the layer input and output.
        num_heads: number of attention heads.
        dff: hidden width of the feed-forward network.
        dropout: dropout probability passed to every sub-module.
    """

    def __init__(self, d_model, num_heads, dff, dropout=0.1):
        super().__init__()
        self.mha = MultiHeadAttention(d_model, num_heads, dropout)
        self.ffn = FFN(dff, d_model, dropout)
        self.residual1 = add_and_norm(d_model, dropout)
        self.residual2 = add_and_norm(d_model, dropout)

    def forward(self, x, mask=None):
        """Run the layer on ``x``.

        Args:
            x: input tensor; the source annotates it as (batch, max_len, d_model).
            mask: optional attention mask forwarded to the self-attention module.

        Returns:
            Tensor with the same shape as ``x``.
        """
        attn_out = self.mha(x, mask=mask)
        x = self.residual1(x, attn_out)
        ffn_out = self.ffn(x)
        x = self.residual2(x, ffn_out)
        return x
        

In [194]:
class Encoder(nn.Module):
    """Stack of ``num_layers`` ``Encoderblock`` modules applied in sequence.

    Args:
        d_model: feature width handed to every block.
        num_heads: number of attention heads per block.
        dff: feed-forward hidden width per block.
        dropout: dropout probability per block.
        num_layers: how many blocks to stack.
    """

    def __init__(self, d_model, num_heads, dff, dropout=0.1, num_layers=6):
        super().__init__()
        blocks = []
        for _ in range(num_layers):
            blocks.append(Encoderblock(d_model, num_heads, dff, dropout))
        self.enc_layers = nn.ModuleList(blocks)

    def forward(self, x, mask=None):
        """Feed ``x`` through each block in order, passing the same ``mask`` to all."""
        hidden = x
        for block in self.enc_layers:
            hidden = block(hidden, mask)
        return hidden

In [195]:
class DecoderBlock(nn.Module):
    """One decoder layer: masked self-attention, cross-attention over the encoder
    output, and a feed-forward net, each wrapped in add-and-norm.

    Args:
        d_model: feature width of the layer input and output.
        num_heads: number of attention heads in both attention modules.
        dff: hidden width of the feed-forward network.
        dropout: dropout probability passed to every sub-module.
    """

    def __init__(self, d_model, num_heads, dff, dropout=0.1):
        super().__init__()
        # Forward the caller's dropout instead of a hard-coded 0.1.
        self.masked_attn = MultiHeadAttention(d_model, num_heads, dropout=dropout)
        self.cross_attn = MultiHeadAttention(d_model, num_heads, dropout=dropout)
        self.ffn = FFN(dff, d_model, dropout)
        self.res1 = add_and_norm(d_model, dropout)
        self.res2 = add_and_norm(d_model, dropout)
        self.res3 = add_and_norm(d_model, dropout)

    def forward(self, x, enc_output, enc_mask=None, dec_mask=None):
        """Run the layer.

        Args:
            x: decoder-side input.
            enc_output: encoder output used as keys and values in cross-attention.
            enc_mask: mask over encoder positions; applied in cross-attention, whose
                keys come from ``enc_output``.
            dec_mask: mask over decoder positions; applied in self-attention, whose
                keys come from ``x``.

        Returns:
            Tensor with the same shape as ``x``.
        """
        # Self-attention looks only at decoder positions, so it takes the decoder mask.
        attn1 = self.masked_attn(x, mask=dec_mask)
        x = self.res1(x, attn1)

        # Cross-attention keys are encoder positions, so it takes the encoder mask.
        attn2 = self.cross_attn(q=x, k=enc_output, v=enc_output, mask=enc_mask)
        x = self.res2(x, attn2)

        linear = self.ffn(x)
        x = self.res3(x, linear)
        return x
        
    

In [196]:
class Decoder(nn.Module):
    """Stack of ``DecoderBlock`` modules followed by a linear projection to the vocabulary.

    Args:
        d_model: feature width handed to every block.
        num_heads: number of attention heads per block.
        dff: feed-forward hidden width per block.
        vocab_size: output width of the final linear layer.
        dropout: dropout probability per block.
        num_layers: how many blocks to stack.
    """

    def __init__(self, d_model, num_heads, dff, vocab_size, dropout=0.1, num_layers=6):
        super(Decoder, self).__init__()
        self.dec_layers = nn.ModuleList(
            [DecoderBlock(d_model, num_heads, dff, dropout) for _ in range(num_layers)]
        )
        self.linear = nn.Linear(d_model, vocab_size)

    def forward(self, x, enc_output, enc_mask=None, dec_mask=None):
        """Decode ``x`` against ``enc_output`` and return vocabulary logits.

        ``x`` is the first positional argument, matching how ``transformer.forward``
        calls this module (the previous signature omitted it and failed on first use).

        Args:
            x: decoder-side input embeddings.
            enc_output: encoder output passed to every block.
            enc_mask: mask forwarded to every block as ``enc_mask``.
            dec_mask: mask forwarded to every block as ``dec_mask``.

        Returns:
            Tensor whose last axis has size ``vocab_size``.
        """
        for layer in self.dec_layers:
            x = layer(x, enc_output, enc_mask, dec_mask)
        return self.linear(x)

In [197]:
class transformer(nn.Module):
    """Encoder-decoder model assembled from the embedding, positional-encoding,
    ``Encoder`` and ``Decoder`` modules.

    Args:
        src_vocab_size: vocabulary size for the encoder embedding.
        tgt_vocab_size: vocabulary size for the decoder embedding and output logits.
        d_model: feature width used throughout.
        dff: feed-forward hidden width.
        max_len: number of positions given to both positional encodings.
        num_heads: attention heads per layer.
        dropout: dropout probability passed to the sub-modules.
        num_layers: number of encoder layers and of decoder layers.
    """

    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, dff, max_len=512,
                 num_heads=8, dropout=0.1, num_layers=6):
        super().__init__()
        # Hyper-parameters kept on the instance.
        self.d_model = d_model
        self.max_len = max_len
        self.src_vocab_size = src_vocab_size
        self.num_layers = num_layers
        self.num_heads = num_heads

        # Encoder side.
        self.encoder_embedding = InputEmbeddings(src_vocab_size, d_model)
        self.pos_enc = PositionEncoding(max_len, d_model, dropout)
        self.encoder = Encoder(d_model, num_heads, dff, dropout, num_layers)

        # Decoder side.
        self.decoder_embedding = InputEmbeddings(tgt_vocab_size, d_model)
        self.pos_dec = PositionEncoding(max_len, d_model, dropout)
        self.decoder = Decoder(d_model, num_heads, dff, tgt_vocab_size, dropout, num_layers)

        self.init_weights()

    def init_weights(self):
        """Apply Xavier-uniform initialisation to every parameter with more than one axis."""
        for param in self.parameters():
            if param.dim() <= 1:
                continue
            nn.init.xavier_uniform_(param)

    def forward(self, X, y, enc_mask=None, dec_mask=None):
        """Embed ``X`` and ``y``, encode ``X``, and decode ``y`` against the encoding.

        Args:
            X: ids fed to the encoder embedding.
            y: ids fed to the decoder embedding.
            enc_mask: mask passed to the encoder and on to the decoder.
            dec_mask: mask passed to the decoder.

        Returns:
            Whatever the decoder returns (its final linear layer's output).
        """
        enc_emb = self.pos_enc(self.encoder_embedding(X))
        dec_emb = self.pos_dec(self.decoder_embedding(y))
        enc_output = self.encoder(enc_emb, mask=enc_mask)
        return self.decoder(dec_emb, enc_output, enc_mask, dec_mask)
        
    

In [None]:

# Load the 'de-en' configuration of the 'wmt16' dataset through `datasets.load_dataset`.
dataset = load_dataset('wmt16', 'de-en')


In [None]:
# Show the loaded dataset object as the cell output.
dataset

In [None]:
# Shuffle the train split with a fixed seed and keep the first 30000 examples.
train_raw = dataset['train'].shuffle(seed=0).select(range(30000))


In [None]:
# Inspection only: show the Python type of the 'translation' column.
type(train_raw['translation'])

In [None]:
# Load the pretrained "gpt2" tokenizer and override its unk/bos/eos/pad token strings.
# NOTE(review): these strings are presumably not in GPT-2's stock vocabulary and so get
# registered as extra tokens — confirm with the ids printed in the following cell.
tokenizer = AutoTokenizer.from_pretrained("gpt2", errors = 'replace',
                                         unk_token = '<UNK>',
                                         bos_token = '<SOS>',
                                         eos_token = '<EOS>',
                                         pad_token = '<PAD>',
                                         )

def tokenize_function(batch):
    """Tokenize one batch of translation pairs for the encoder and the decoder.

    Each ``batch["translation"]`` entry is a dict with ``"en"`` and ``"de"`` strings.
    Both are wrapped in ``<SOS>``/``<EOS>`` and tokenized with the module-level
    ``tokenizer``, padded and truncated to 256 tokens. The German text becomes the
    encoder input and the English text the decoder input.

    Returns:
        Dict with ``enc_input_ids``, ``enc_attention_mask``, ``dec_input_ids`` and
        ``dec_attention_mask``.
    """
    pairs = batch["translation"]
    decoder_texts = ["<SOS>" + pair["en"] + "<EOS>" for pair in pairs]
    encoder_texts = ["<SOS>" + pair["de"] + "<EOS>" for pair in pairs]

    tok_kwargs = dict(padding="max_length", truncation=True, max_length=256)
    dec_tokenized = tokenizer(decoder_texts, **tok_kwargs)
    enc_tokenized = tokenizer(encoder_texts, **tok_kwargs)

    features = {}
    features["enc_input_ids"] = enc_tokenized["input_ids"]
    features["enc_attention_mask"] = enc_tokenized["attention_mask"]
    features["dec_input_ids"] = dec_tokenized["input_ids"]
    features["dec_attention_mask"] = dec_tokenized["attention_mask"]
    return features
# train_raw_dataset = Dataset.from_dict(train_raw)
# Tokenize the training subset in batches and drop the raw 'translation' column.
tokenized_train_raw = train_raw.map(tokenize_function, batched=True, remove_columns=["translation"])

In [None]:
# Sanity check: print each special token string next to its id.
print("BOS Token:", tokenizer.bos_token , tokenizer.bos_token_id)
print("EOS Token:", tokenizer.eos_token , tokenizer.eos_token_id)
print("UNK Token:", tokenizer.unk_token , tokenizer.unk_token_id)
print("PAD Token:", tokenizer.pad_token, tokenizer.pad_token_id)


In [None]:
# Inspection only: type of the mapped dataset and the tokenizer's reported vocab size.
# NOTE(review): `vocab_size` may not count the special tokens added above; the model
# below is sized with `len(tokenizer)` instead — confirm the two differ as expected.
print(type(tokenized_train_raw))
print(tokenizer.vocab_size)
# Disabled experiment below: run a pretrained Helsinki-NLP de-en model on one sentence.
# model_name = "Helsinki-NLP/opus-mt-de-en"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# text = "Dies ist ein Test."
# inputs = tokenizer(text, return_tensors="pt")

# output = model.generate(**inputs)
# print(tokenizer.decode(output[0], skip_special_tokens=True))

In [None]:

from torch.utils.data import Dataset 


class TranslationDataset(Dataset):
    """Map-style dataset producing teacher-forcing examples from tokenized pairs.

    Args:
        tokenized_dataset: mapping with ``"enc_input_ids"``, ``"enc_attention_mask"``
            and ``"dec_input_ids"`` columns, each indexable by example number.
    """

    def __init__(self, tokenized_dataset):
        self.enc_input_ids = tokenized_dataset["enc_input_ids"]
        self.enc_attention_mask = tokenized_dataset["enc_attention_mask"]
        self.dec_input_ids = tokenized_dataset["dec_input_ids"]

    def __len__(self):
        return len(self.enc_input_ids)

    def __getitem__(self, idx):
        """Build example ``idx``.

        The decoder sequence is shifted for teacher forcing: the input drops the last
        token and the labels drop the first. The decoder mask is a lower-triangular
        boolean matrix sized to the shifted input.
        """
        target_ids = self.dec_input_ids[idx]
        shifted_input, shifted_labels = target_ids[:-1], target_ids[1:]

        steps = len(shifted_input)
        causal_mask = torch.ones(steps, steps).tril().to(dtype=torch.bool)

        encoder_part = {
            "input_ids": torch.tensor(self.enc_input_ids[idx], dtype=torch.long),
            "attention_mask": torch.tensor(self.enc_attention_mask[idx], dtype=torch.long),
        }
        decoder_part = {
            "attention_mask": causal_mask,
            "input_ids": torch.tensor(shifted_input, dtype=torch.long),
            "labels": torch.tensor(shifted_labels, dtype=torch.long),
        }
        return {"encoder_input": encoder_part, "decoder_input": decoder_part}

In [None]:
# Inspect which vocabulary token a sample id maps to. The id is held in one variable
# so the printed label always names the id that was actually looked up.
sample_token_id = 3485
token = tokenizer.convert_ids_to_tokens(sample_token_id)
print(f"Token for ID {sample_token_id}:", token)

In [None]:
# Wrap the tokenized examples in the torch Dataset that yields teacher-forcing items.
train_dataset  = TranslationDataset(tokenized_train_raw)

In [None]:
def train():
    # NOTE(review): stub — it only binds a local `device` and returns None. A later
    # cell defines `train(model, train_loader)`, which replaces this name once run.
    device = "cuda"  if torch.cuda.is_available() else "cpu"

In [None]:
# Show the dataset object as the cell output.
train_dataset 

In [None]:
# Inspection only: labels, shifted decoder input and causal mask of the first example.
print(train_dataset[0]["decoder_input"]["labels"]) 
print(train_dataset[0]['decoder_input']['input_ids'])
print(train_dataset[0]['decoder_input']['attention_mask'])

In [None]:
# Training hyper-parameters.
BATCH_SIZE = 32
LEARNING_RATE = 1e-4
NUM_EPOCHS = 3
GRAD_CLIP = 1.0
LAYERS = 6

# Shuffled mini-batches over the teacher-forcing dataset.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

# Both vocabularies use len(tokenizer) because source and target share one tokenizer.
# num_layers reads LAYERS, which was previously defined but never used.
model = transformer(
    src_vocab_size=len(tokenizer),
    tgt_vocab_size=len(tokenizer),
    d_model=256,
    num_heads=8,
    num_layers=LAYERS,
    dff=1024
).to(device)


In [None]:
from torch.optim.lr_scheduler import LambdaLR

# Adam over all model parameters at the configured base learning rate.
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
# Cross-entropy that skips targets equal to the tokenizer's pad token id.
loss_fn = nn.CrossEntropyLoss(ignore_index = tokenizer.pad_token_id)
# Learning-rate multiplier: linear warm-up, then inverse-square-root decay.
def lr_lambda(step, warmup_steps=4000):
    """Return the factor applied to the base learning rate at ``step``.

    Below ``warmup_steps`` the factor rises linearly as ``step / warmup_steps``;
    from there on it is ``sqrt(warmup_steps) / sqrt(step)``.
    """
    still_warming = step < warmup_steps
    return step / warmup_steps if still_warming else (warmup_steps**0.5) * (step**-0.5)

# Scale the optimizer's learning rate by lr_lambda(step) each time scheduler.step() is called.
scheduler = LambdaLR(optimizer, lr_lambda)

In [None]:
def train(model, train_loader):
    """Train ``model`` for ``NUM_EPOCHS`` passes over ``train_loader``.

    Relies on the module-level ``device``, ``loss_fn``, ``optimizer``, ``scheduler``,
    ``NUM_EPOCHS`` and ``GRAD_CLIP``. Each batch is moved to ``device``, run through
    the model, and the loss is back-propagated with gradient-norm clipping before the
    optimizer and scheduler step. The mean batch loss is printed after every epoch.

    Returns:
        Mean batch loss of the final epoch.
    """
    model.train()
    for epoch in range(NUM_EPOCHS):
        total_loss = 0
        progress = tqdm(train_loader, desc=f"Epoch {epoch + 1}/{NUM_EPOCHS}")
        for batch in progress:
            enc_part = batch['encoder_input']
            dec_part = batch["decoder_input"]

            enc_input_ids = enc_part['input_ids'].to(device)
            enc_mask = enc_part["attention_mask"].to(device)
            dec_input_ids = dec_part["input_ids"].to(device)
            dec_labels = dec_part["labels"].to(device)
            dec_mask = dec_part["attention_mask"].to(device)

            # Logits are flattened to (tokens, vocab) to match the flattened labels.
            logits = model(enc_input_ids, dec_input_ids, enc_mask, dec_mask)
            vocab_dim = logits.size(-1)
            loss = loss_fn(logits.view(-1, vocab_dim), dec_labels.view(-1))

            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)
            optimizer.step()
            scheduler.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1} - Loss: {total_loss/len(train_loader)}")
    return total_loss / len(train_loader)


In [None]:
# Start training; the cell output is the mean batch loss of the last epoch.
train(model,train_loader)