In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F
import random
from tqdm import tqdm

import os
from torch.utils.tensorboard.writer import SummaryWriter
import numpy as np
import time
import math

In [33]:
from torch.masked import masked_tensor

class TransformerEncoderLayer(nn.Module) :
    def __init__(self, d_model, nhead = 1, dim_feedforward=2048, dropout=0.1, activation = F.relu) :
        super(TransformerEncoderLayer,self).__init__()

        self.Q_v = nn.Linear(in_features = d_model ,out_features = d_model ) # (token, d_model) -> (token, d_model)
        self.K_v = nn.Linear(in_features = d_model ,out_features = d_model ) # (token, d_model) -> (token, d_model)
        self.V_v = nn.Linear(in_features = d_model ,out_features = d_model ) # (token, d_model) -> (token, d_model)


    def forward( self , x ) : # (batch_size, tokens, embedding_size)
        key = self.K_v(x) # (batch_size, tokens, d_model)
        query = self.Q_v(x) # (batch_size, tokens, d_model)
        value = self.V_v(x) # (batch_size, tokens, d_model)
        
        scores = torch.sum(query.unsqueeze(1)*key.unsqueeze(2) , dim = 3) # (batch_size, tokens, tokens)
        weights = F.softmax(scores, dim = -1) # (batch_size, tokens, tokens)

        result = torch.matmul(weights , value )               # (batch_size, tokens, tokens) @ (batch_size, tokens, d_model)  

        return result    


    
class MultiHeadAttention(nn.Module) :
    def __init__(self, d_model, nhead = 4, dropout=0.1) :
        super( MultiHeadAttention,self).__init__()
        assert  d_model%nhead == 0
        self.nhead =  nhead
        self.d_model = d_model
        self.Q_v = nn.Linear(in_features = d_model ,out_features = d_model)  # (token, d_model) -> (token, d_model)
        self.K_v = nn.Linear(in_features = d_model ,out_features = d_model)  # (token, d_model) -> (token, d_model)
        self.V_v = nn.Linear(in_features = d_model ,out_features = d_model ) # (token, d_model) -> (token, d_model)

        self.dropout = nn.Dropout(dropout)

    def forward( self , x , mask_x ) : # (batch_size, tokens, d_model)
        batch_size, seq_length =  x.size(0), x.size(1)
        key   = self.K_v(x).reshape(batch_size, seq_length, self.nhead , self.d_model // self.nhead).transpose(1,2)   # (batch_size, n_head, tokens , d_model//n_head)
        query = self.Q_v(x).reshape(batch_size, seq_length, self.nhead , self.d_model // self.nhead).transpose(1,2) # (batch_size, n_head, tokens , d_model//n_head)
        value = self.V_v(x).reshape(batch_size, seq_length, self.nhead , self.d_model // self.nhead).transpose(1,2)    # (batch_size, n_head ,tokens , d_model//n_head)
        
        scores = torch.matmul( query , key.transpose(-2, -1))/ math.sqrt(self.d_model//self.nhead) # (batch_size, nhead, tokens, tokens)


        Mask_x = (mask_x.unsqueeze(1) * mask_x.unsqueeze(2)).bool() # (batch, max_len, max_len)
        mask_combined = ~(Mask_x.unsqueeze(1))
        scores = scores.masked_fill(mask_combined , -float('inf'),)  # (batch_size, nhead, tokens, tokens)


        weights = self.dropout(F.softmax(scores, dim = -1) )# (batch_size, nhead, tokens, tokens)
        result = torch.matmul(weights , value ) # (batch_size, nhead, tokens, d_model//n_head)
        result = torch.transpose(result, 1, 2)   # (batch_size, tokens, n_head, d_model//n_head)
        result = result.reshape( batch_size, seq_length, self.d_model )
        return result    
    


class MaskedMultiHeadAttention(nn.Module) :
    def __init__(self, d_model, nhead = 4, dropout=0.1) :
        super(MaskedMultiHeadAttention,self).__init__()
        assert  d_model%nhead == 0
        self.nhead =  nhead
        self.d_model = d_model
        self.Q_v = nn.Linear(in_features = d_model ,out_features = d_model)  # (token, d_model) -> (token, d_model)
        self.K_v = nn.Linear(in_features = d_model ,out_features = d_model)  # (token, d_model) -> (token, d_model)
        self.V_v = nn.Linear(in_features = d_model ,out_features = d_model ) # (token, d_model) -> (token, d_model)

        self.dropout = nn.Dropout(dropout)
        self.max_len = 1000
        self.register_buffer("mask", torch.tril(torch.ones( 1 , 1 , self.max_len ,self.max_len) , diagonal= 1).bool() ) # The maximum lenght is 1000 here ! 

    def forward( self , x , mask_x) : # (batch_size, tokens, d_model)  & (batch_size, max_len)
        batch_size, seq_length =  x.size(0), x.size(1)
        key   = self.K_v(x).reshape(batch_size, seq_length, self.nhead , self.d_model // self.nhead).transpose(1,2)   # (batch_size, n_head, tokens , d_model//n_head)
        query = self.Q_v(x).reshape(batch_size, seq_length, self.nhead , self.d_model // self.nhead).transpose(1,2)   # (batch_size, n_head, tokens , d_model//n_head)
        value = self.V_v(x).reshape(batch_size, seq_length, self.nhead , self.d_model // self.nhead).transpose(1,2)   # (batch_size, n_head ,tokens , d_model//n_head)
            
        scores = torch.matmul( query , key.transpose(-2, -1))/ math.sqrt(self.d_model//self.nhead) # (batch_size, nhead, tokens, tokens)
    #   if mask_x is not None:

        Mask = self.mask[:,:, :seq_length, :seq_length] # ( 1, 1 , max_len, max_len)


        Mask_x = mask_x.unsqueeze(1) * mask_x.unsqueeze(2) # (batch, max_len, max_len)
        mask_combined = Mask | ~(Mask_x.unsqueeze(1)).bool()
        scores = scores.masked_fill(mask_combined , -float('inf'),)  # (batch_size, nhead, tokens, tokens)


        weights = F.softmax(scores, dim = -1) # (batch_size, nhead, tokens, tokens)
        weights =  self.dropout(weights)

        result = torch.matmul(weights , value ) # (batch_size, nhead, tokens, d_model//n_head)
        result = torch.transpose(result, 1, 2)   # (batch_size, tokens, n_head, d_model//n_head)
        result = result.reshape( batch_size, seq_length, self.d_model )  # (batch_size, tokens, d_model)
        return result 
    


class TransformerEncoder(nn.Module) :
    def __init__(self, d_model, nhead = 4, dim_feedforward= 128, dropout=0.1, activation = nn.ReLU) :
        super(TransformerEncoder, self).__init__()
        self.multi_head_att = MultiHeadAttention(d_model, nhead = nhead, dropout=dropout)
        self.feed_fw = nn.Sequential(
                                nn.Linear(in_features = d_model ,out_features = dim_feedforward),
                                activation() ,
                                nn.Linear(in_features=dim_feedforward , out_features=d_model)
                                    )
                                     
        self.layer_norm1 = nn.LayerNorm(d_model)
        self.layer_norm2 = nn.LayerNorm(d_model)
        
    def forward(self , x , mask_x) :  # (batch_size, token, dim)
        x =  self.layer_norm1(x +  self.multi_head_att(x , mask_x))  # (batch_size, token, dim)
        x =  self.layer_norm2(x + self.feed_fw(x))          # (batch_size, token, dim)
        return x

class CustomMultiHeadAttention(nn.Module) :
    def __init__(self, d_model, nhead = 4, dropout=0.1) :
        super( CustomMultiHeadAttention,self).__init__()
        assert  d_model%nhead == 0
        self.nhead =  nhead
        self.d_model = d_model
        self.Q_v = nn.Linear(in_features = d_model ,out_features = d_model)  # (token, d_model) -> (token, d_model)
        self.K_v = nn.Linear(in_features = d_model ,out_features = d_model)  # (token, d_model) -> (token, d_model)
        self.V_v = nn.Linear(in_features = d_model ,out_features = d_model ) # (token, d_model) -> (token, d_model)

        self.dropout = nn.Dropout(dropout)

    def forward( self , x , y  ) : # (batch_size, tokens, d_model) x : output of encoder, y output of masked_multi_head
        assert x.shape == y.shape
        batch_size, seq_length =  x.size(0), x.size(1)
        key   = self.K_v(x).reshape(batch_size, seq_length, self.nhead , self.d_model // self.nhead).transpose(1,2)   # (batch_size, n_head, tokens , d_model//n_head)
        query = self.Q_v(y).reshape(batch_size, seq_length, self.nhead , self.d_model // self.nhead).transpose(1,2) # (batch_size, n_head, tokens , d_model//n_head)
        value = self.V_v(x).reshape(batch_size, seq_length, self.nhead , self.d_model // self.nhead).transpose(1,2)    # (batch_size, n_head ,tokens , d_model//n_head)
        
        scores = torch.matmul( query , key.transpose(-2, -1))/ math.sqrt(self.d_model//self.nhead) # (batch_size, nhead, tokens, tokens)
        weights = self.dropout(F.softmax(scores, dim = -1) )# (batch_size, nhead, tokens, tokens)
        result = torch.matmul(weights , value ) # (batch_size, nhead, tokens, d_model//n_head)
        result = torch.transpose(result, 1, 2)   # (batch_size, tokens, n_head, d_model//n_head)
        result = result.reshape( batch_size, seq_length, self.d_model )
        return result        
    


class TransformerEncoderDecoder(nn.Module) :
    def __init__(self, d_model, nhead = 4, dim_feedforward= 128, dropout=0.1, activation = nn.ReLU) :
        super(TransformerEncoderDecoder, self).__init__()
        self.transformer_encoder = TransformerEncoder(d_model=d_model , nhead=nhead, dim_feedforward= dim_feedforward, dropout=dropout)
        self.masked_multi_head_att =  MaskedMultiHeadAttention(d_model, nhead = nhead, dropout=dropout)
        self.multi_head_att = CustomMultiHeadAttention(d_model, nhead = nhead, dropout=dropout)

        self.feed_fw = nn.Sequential(
                                nn.Linear(in_features = d_model ,out_features = dim_feedforward),
                                activation() ,
                                nn.Linear(in_features=dim_feedforward , out_features=d_model)
                                    )
                                     
        self.layer_norm1 = nn.LayerNorm(d_model)
        self.layer_norm2 = nn.LayerNorm(d_model)
        self.layer_norm3 = nn.LayerNorm(d_model)

        
    def forward(self , x , y , x_mask, y_mask) :  # (batch_size, token, dim) & (batch_size, max_len)
        assert x.shape == y.shape
        # ENCODER
        x = self.transformer_encoder(x , x_mask) # (batch_size, token, dim)
        # DECODER 
        y =  self.layer_norm1(y +  self.masked_multi_head_att(y , y_mask))  # (batch_size, token, dim)
        z = self.multi_head_att(x,y) # (batch_size, token, dim)
        z = self.layer_norm2( y + z ) # (batch_size, token, dim)
        z =  self.layer_norm3(z + self.feed_fw(z)) # (batch_size, token, dim)

        return z   # (batch_size, token, dim)
    

class Transformer(nn.Module) :
    def __init__(self, embedding_dim = 512, max_len = 1000 , vocab_size = 30522 , nhead = 8, dim_feedforward = 2048, dropout=0.1, activation = nn.ReLU) :
        super(Transformer, self).__init__()
        self.embedding = nn.Embedding( num_embeddings = vocab_size, embedding_dim = embedding_dim ) #optim_SGD
        self.Transformer_encoder_decoder = TransformerEncoderDecoder(d_model=embedding_dim, nhead=nhead, dim_feedforward=dim_feedforward, dropout=dropout, activation=activation)
        self.linear_layer = nn.Linear(embedding_dim, vocab_size, bias=False)
        self.linear_layer.weight  = self.embedding.weight #  (embedding_dim --> vocab_size )

        P = torch.zeros(max_len, embedding_dim) 
        position = torch.arange(0,  max_len, dtype=torch.float).unsqueeze(1)  # Shape: (seq_len, 1)
        div_term = torch.exp(torch.arange(0, embedding_dim, 2).float() * -(math.log(10000.0) / embedding_dim))

        P[:, 0::2] = torch.sin(position * div_term)  # sinus to even indices
        P[:, 1::2] = torch.cos(position * div_term)  # cosinus to odd indices


        self.register_buffer("positional_encoding",  P ) # The maximum lenght is max_len here !  

    def forward(self, input_ids , target_ids , input_mask , target_mask ) :   # ids are in  (batch_size, seq_len)
        seq_len = input_ids.size(1)

        
        x = self.embedding(input_ids) #  (batch_size, seq_len , dim)
        x = x + self.positional_encoding[ :seq_len, :].unsqueeze(0).to(x.device)
        y = self.embedding(target_ids) #  (batch_size, seq_len , dim)
        y = y + self.positional_encoding[ :seq_len, :].unsqueeze(0).to(x.device)

        z = self.Transformer_encoder_decoder(x, y , input_mask , target_mask) #  (batch_size, seq_len , dim)
        z = self.linear_layer(z)    #  (batch_size, seq_len , vocab_size)

        # z = F.softmax(z , dim = -1)   #  (batch_size, seq_len , vocab_size)

        # I will remove it if I use cross entropy loss ...

        return z  #  (batch_size, seq_len , vocab_size)




In [34]:
from transformers import AutoTokenizer

# Replace 'bert-base-uncased' with the model you are using
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Example
input_text = "What is the weather like today?"


tokens = tokenizer(input_text, padding="max_length", truncation=True, max_length=32, return_tensors="pt")

# print(tokens['input_ids'].shape)



In [None]:
from transformers import AutoTokenizer

train_dataset = {
    "input_text": [
        "Quel temps fait-il aujourd'hui ?" ,
        "Traduisez cette phrase en français.",
        "Comment allez-vous ?",
        "Écrivez une courte histoire sur un robot.",
        "Résumez l'article suivant.",
    ],
    "target_text": [
        "The weather is sunny and warm.",
        "Traduisez cette phrase en français.",
        "I’m doing well, thank you!",
        "Once upon a time, there was a robot who wanted to be human.",
        "This article discusses the importance of AI in modern technology.",
    ]
}
vocab_set = list(set( ''.join(train_dataset['input_text'])+ ''.join(train_dataset['target_text'])))
vocab_size = len(vocab_set)

# tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# tokenize = lambda txt : tokenizer(txt, padding="max_length", truncation=True, max_length=32, return_tensors="pt")
# vocab_size = tokenizer.vocab_size

token_to_id = { v:i for  i , v in enumerate(vocab_set)}
id_to_token = { i:v for  i , v in enumerate(vocab_set)}
tokenize = lambda txt : torch.tensor( [ token_to_id[txt[i]] for i in range(len(txt))])
# tokenizer_decode =  lambda txt : ''.join([ id_to_token[txt[i]] for i in range(len(txt))])

training_corpus = pd.DataFrame(train_dataset) 

training_corpus.head()   

Unnamed: 0,input_text,target_text
0,Quel temps fait-il aujourd'hui ?,The weather is sunny and warm.
1,Traduisez cette phrase en français.,Traduisez cette phrase en français.
2,Comment allez-vous ?,"I’m doing well, thank you!"
3,Écrivez une courte histoire sur un robot.,"Once upon a time, there was a robot who wanted..."
4,Résumez l'article suivant.,This article discusses the importance of AI in...


In [67]:
class CustomDataset(Dataset) :
    def __init__(self , training_corpus , tokenize) :
        self.training_corpus = training_corpus
        # self.training_corpus['input_token'] = self.training_corpus['input_text'].apply(lambda txt : tokenizer(txt, padding="max_length", truncation=True, max_length=32, return_tensors="pt") )
        # self.training_corpus['target_token'] = self.training_corpus['target_text'].apply(lambda txt : tokenizer(txt, padding="max_length", truncation=True, max_length=32, return_tensors="pt") )
        self.training_corpus['input_token'] = self.training_corpus['input_text'].apply(lambda txt : tokenize(txt) )
        self.training_corpus['target_token'] = self.training_corpus['target_text'].apply(lambda txt : tokenize(txt) )

    def __getitem__(self, index) :
        row =  self.training_corpus.iloc[index]
        input = row['input_token']
        target = row['target_token']
   
        # return input['input_ids'].squeeze(0), target['input_ids'].squeeze(0), input['attention_mask'].squeeze(0), target['attention_mask'].squeeze(0)
        # return input['input_ids'].squeeze(0), target['input_ids'].squeeze(0), input['attention_mask'].squeeze(0), target['attention_mask'].squeeze(0)
        return input, target
    

    def __len__(self) :
        return len(self.training_corpus)
    
def my_custom_collater(batch) :
    inputs = [ e[0]  for e in batch ]
    targets = [ e[1]  for e in batch ]
    input_len = [len(inp) for inp in inputs]
    target_len = [len(targ) for targ in targets]
    max_len_t = max(input_len)
    max_len_i = max(target_len)
    max_len = max(max_len_t,max_len_i)

    # Padding 
    pad_value = 0
    padded_inputs = torch.stack([F.pad(seq, (0,max_len - seq.size(0)) , mode='constant', value=pad_value)  for seq in  inputs] )
    padded_targets = torch.stack([F.pad(seq, (0 , max_len - seq.size(0)), mode='constant', value=pad_value)   for seq in  targets] )
    input_mask = (padded_inputs != pad_value).long()  # (batch_size, max_len)
    output_mask = (padded_inputs != pad_value).long() # (batch_size, max_len)

    return padded_inputs , padded_targets , input_mask , output_mask 




    # return padded_inputs, padded_targets, input_mask, output_mask

train_dataset = CustomDataset(training_corpus , tokenize)


In [70]:
criterion = nn.CrossEntropyLoss(reduction='none')       
model = Transformer(
    embedding_dim=64, vocab_size=vocab_size, nhead=8,
    dim_feedforward=128, dropout=0.1, activation=nn.GELU
)
# model = Transformer(embedding_dim = 512, vocab_size = vocab_size , nhead = 8, dim_feedforward = 2048, dropout=0.1, activation = nn.GELU) 


data_loader = DataLoader(train_dataset, batch_size=2, shuffle=False, collate_fn=my_custom_collater)
optimizer = optim.SGD(model.parameters() , lr = 1e-4 )

num_epochs = 10

for epoch in range(num_epochs):
    for batch in data_loader:
        input, target, input_mask, target_mask = batch  # Assume custom collate returns these

        # Forward pass
        result = model(input, target, input_mask, target_mask)  # (batch_size, seq_len, vocab_size)

        # Reshape for loss calculation
        batch_size, seq_len, vocab_size = input.size(0), input.size(1), result.size(2)
        result = result.reshape(batch_size * seq_len, vocab_size)  # (batch_size * seq_len, vocab_size)
        target = target.view(-1)  # (batch_size * seq_len)
        input_mask = input_mask.view(-1).float()  # Flatten and cast mask to float

        # Compute loss
        optimizer.zero_grad()
        loss = (criterion(result, target) * input_mask).sum() / input_mask.sum()

        # Backpropagation
        loss.backward()
        optimizer.step()

    print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {loss.item()}')



        

Epoch 1/10, Loss: nan
Epoch 2/10, Loss: nan
Epoch 3/10, Loss: nan
Epoch 4/10, Loss: nan
Epoch 5/10, Loss: nan
Epoch 6/10, Loss: nan
Epoch 7/10, Loss: nan
Epoch 8/10, Loss: nan
Epoch 9/10, Loss: nan
Epoch 10/10, Loss: nan


In [71]:
for name, param in model.named_parameters():
    if param.grad is not None:
        print(f"{name} gradient: {param.grad.abs().max()}")


embedding.weight gradient: nan
Transformer_encoder_decoder.transformer_encoder.multi_head_att.Q_v.weight gradient: nan
Transformer_encoder_decoder.transformer_encoder.multi_head_att.Q_v.bias gradient: nan
Transformer_encoder_decoder.transformer_encoder.multi_head_att.K_v.weight gradient: nan
Transformer_encoder_decoder.transformer_encoder.multi_head_att.K_v.bias gradient: nan
Transformer_encoder_decoder.transformer_encoder.multi_head_att.V_v.weight gradient: nan
Transformer_encoder_decoder.transformer_encoder.multi_head_att.V_v.bias gradient: nan
Transformer_encoder_decoder.transformer_encoder.feed_fw.0.weight gradient: nan
Transformer_encoder_decoder.transformer_encoder.feed_fw.0.bias gradient: nan
Transformer_encoder_decoder.transformer_encoder.feed_fw.2.weight gradient: nan
Transformer_encoder_decoder.transformer_encoder.feed_fw.2.bias gradient: nan
Transformer_encoder_decoder.transformer_encoder.layer_norm1.weight gradient: nan
Transformer_encoder_decoder.transformer_encoder.layer_