<a href="https://www.kaggle.com/code/kalvakaushik/transformer?scriptVersionId=200446403" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, TensorDataset, random_split, ConcatDataset, Dataset
from tqdm import tqdm

from torch.nn.utils.rnn import pad_sequence

from transformers import BertTokenizer, BertModel

In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer

In [3]:
ds = load_dataset("openai/gsm8k", "main")

Downloading readme:   0%|          | 0.00/7.94k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/419k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

In [4]:
ds

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 7473
    })
    test: Dataset({
        features: ['question', 'answer'],
        num_rows: 1319
    })
})

In [5]:
ds['train'][0]

{'question': 'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?',
 'answer': 'Natalia sold 48/2 = <<48/2=24>>24 clips in May.\nNatalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.\n#### 72'}

In [6]:
if torch.cuda.is_available(): 
    device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Multi-Head Attention

In [7]:
class MultiHeadAttention(nn.Module):
    def __init__(self, emb_size, heads):
        super(MultiHeadAttention,self).__init__()
        self.heads=heads
        self.emb_size=emb_size
        self.head_dim=self.emb_size//self.heads
        self.w_k=nn.Linear(emb_size,emb_size, bias=False)
        self.w_q=nn.Linear(emb_size,emb_size, bias=False)
        self.w_v=nn.Linear(emb_size,emb_size, bias=False)
        self.out=nn.Linear(emb_size,emb_size)

        assert(self.head_dim * heads == emb_size),"embeding size is not divisible by number of heads"


    def forward(self,k,q,v,mask=None):
        N=q.shape[0]  # batch size
        K=self.w_k(k)
        Q=self.w_q(q)
        V=self.w_v(v)

        K=K.view(N,K.shape[1],self.heads,self.head_dim).transpose(1,2)    # (batch size, sequence len, heads, head dimention)
        Q=Q.view(N,Q.shape[1],self.heads,self.head_dim).transpose(1,2)    # transposed to give(batch size, heads, sequence len, head dimention)
        V=V.view(N,V.shape[1],self.heads,self.head_dim).transpose(1,2)

        attention=(torch.matmul(Q,K.transpose(-2,-1)))/torch.tensor(self.head_dim**0.5)
        
        if mask is not None:
            mask=mask.reshape(-1,1,1,1024)
            attention.masked_fill_(mask==0, -1e9)

        attention_scores=F.softmax(attention, dim=-1)
        output=torch.matmul(attention_scores,V)
        output = output.transpose(1, 2).reshape(N, -1, self.emb_size)    # combine all heads (batch_size, seqlen, dmodel)
        output=self.out(output)                                          # (batch_size, seqlen, dmodel)

        return output

# Transformer architecture

In [8]:
class Encoder(nn.Module):
    def __init__(self, heads, emb_size):
        super(Encoder, self).__init__()
        self.mha=MultiHeadAttention(emb_size, heads)
        self.ff1=nn.Linear(emb_size,2*emb_size)
        self.ff2=nn.Linear(2*emb_size, emb_size)
        self.norm1=nn.LayerNorm(emb_size)
        self.norm2=nn.LayerNorm(emb_size)
        self.dropout=nn.Dropout(p=0.2)

    def forward(self, x, mask=None):
        attention_out=self.mha(x,x,x,mask)
        attention_out = self.dropout(attention_out)
        out1=self.norm1(x+attention_out)

        ff_out=F.relu(self.ff1(out1))
        ff_out=self.ff2(ff_out)
        out2=self.dropout(ff_out)
        encoder_out=self.norm2(out1+out2)
        return encoder_out

In [9]:
class Decoder(nn.Module):
    def __init__(self, heads, emb_size):
        super(Decoder, self).__init__()
        self.mmha=MultiHeadAttention(emb_size, heads)
        self.mha=MultiHeadAttention(emb_size, heads)
        self.ff1=nn.Linear(emb_size,2*emb_size)
        self.ff2=nn.Linear(2*emb_size, emb_size)
        self.norm1=nn.LayerNorm(emb_size)
        self.norm2=nn.LayerNorm(emb_size)
        self.norm3=nn.LayerNorm(emb_size)
        self.dropout=nn.Dropout(p=0.2)

    def forward(self, x, encoder_out, source_mask, target_mask):
        mask_attention_out=self.mmha(x,x,x,target_mask)
        mask_attention_out=self.dropout(mask_attention_out)
        out1=self.norm1(x+mask_attention_out)

        enc_dec_attention_out=self.mha(encoder_out,out1,encoder_out)
        enc_dec_attention_out=self.dropout(enc_dec_attention_out)
        out2=self.norm2(out1+enc_dec_attention_out)

        ff_output=F.relu(self.ff1(out2))
        ff_output=self.ff2(ff_output)
        ff_output=self.dropout(ff_output)
        out3=self.norm3(out2+ff_output)

        return out3

In [10]:
class PositionalEmbedding(nn.Module):
    def __init__(self, seq_len, emb_size, n=10000):
        super(PositionalEmbedding, self).__init__()
        self.embedding = self.create_positional_embedding(seq_len, emb_size, n)

    def create_positional_embedding(self, seq_len, emb_size, n):
        P = np.zeros((seq_len, emb_size))
        for pos in range(seq_len):
            for i in range(emb_size // 2):
                denominator = np.power(n, 2 * i / emb_size)
                P[pos, 2 * i] = np.sin(pos / denominator)
                P[pos, 2 * i + 1] = np.cos(pos / denominator)
        return torch.tensor(P, dtype=torch.float32)

    def forward(self, idx):
        pos_idx=self.embedding
        pos_idx=pos_idx.to(device)
        idx=idx+pos_idx
        return idx

In [11]:
class Transformer(nn.Module):
    def __init__(self, vocab_size, input_dim, emb_size, num_encoder_layers, num_decoder_layers, heads, seq_len):
        super(Transformer, self).__init__()
        
        self.embedding=nn.Embedding(vocab_size, emb_size)
        self.encoder_layers = nn.ModuleList([Encoder(heads, emb_size) for _ in range(num_encoder_layers)])
        self.decoder_layers = nn.ModuleList([Decoder(heads, emb_size) for _ in range(num_decoder_layers)])
        self.position_encodings = PositionalEmbedding(seq_len, emb_size)
        self.linear = nn.Linear(emb_size, vocab_size)

    def forward(self, src, tgt, src_mask=None, tgt_mask=None):

        src=self.embedding(src)   # need to do *sqrt(d_model)
        tgt=self.embedding(tgt)   # for transulation task give different embedding

        src=self.position_encodings(src)
        tgt=self.position_encodings(tgt)

        for encoder in self.encoder_layers:
            src = encoder(src, src_mask)
        
        for decoder in self.decoder_layers:
            tgt = decoder(tgt, src, src_mask, tgt_mask)

        output = self.linear(tgt)
        output = F.softmax(output, dim=-1)
        
        return output
    


input_dim = 1000
emb_size = 512
heads = 8
num_encoder_layers = 6
num_decoder_layers = 6


# Tokenization

In [12]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



In [13]:
# Preprocess data
def preprocess_function(examples):
    # Combine question and answer for input
    inputs = [f"question: {q} answer:" for q in examples['question']]
    targets = examples['answer']
    
    model_inputs = tokenizer(inputs, truncation=True, padding='max_length', max_length=1024)
    labels = tokenizer(targets, truncation=True, padding='max_length', max_length=1024)
    
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

# Split dataset
train_data = ds['train'].map(preprocess_function, batched=True) 
test_data = ds['test'].map(preprocess_function, batched=True)


class QNADataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return {
            'input_ids': torch.tensor(self.data[idx]['input_ids']),
            'attention_mask': torch.tensor(self.data[idx]['attention_mask']),
            'labels': torch.tensor(self.data[idx]['labels'])
        }

train_dataset = QNADataset(train_data)
test_dataset = QNADataset(test_data)

train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True, pin_memory=True, num_workers=4)
validation_dataloader = DataLoader(test_dataset, batch_size=8, pin_memory=True, num_workers=4)

model = Transformer(tokenizer.vocab_size, input_dim, emb_size, num_encoder_layers, num_decoder_layers, heads, seq_len=1024)
criterion = nn.CrossEntropyLoss(ignore_index=-100)
optimizer = optim.Adam(model.parameters(), lr=3e-5)

model

Map:   0%|          | 0/7473 [00:00<?, ? examples/s]

Map:   0%|          | 0/1319 [00:00<?, ? examples/s]

Transformer(
  (embedding): Embedding(30522, 512)
  (encoder_layers): ModuleList(
    (0-5): 6 x Encoder(
      (mha): MultiHeadAttention(
        (w_k): Linear(in_features=512, out_features=512, bias=False)
        (w_q): Linear(in_features=512, out_features=512, bias=False)
        (w_v): Linear(in_features=512, out_features=512, bias=False)
        (out): Linear(in_features=512, out_features=512, bias=True)
      )
      (ff1): Linear(in_features=512, out_features=1024, bias=True)
      (ff2): Linear(in_features=1024, out_features=512, bias=True)
      (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.2, inplace=False)
    )
  )
  (decoder_layers): ModuleList(
    (0-5): 6 x Decoder(
      (mmha): MultiHeadAttention(
        (w_k): Linear(in_features=512, out_features=512, bias=False)
        (w_q): Linear(in_features=512, out_features=512, bias=False)
        (w_v): Linear

# Training loop

In [14]:
text="hi how are yous"

In [15]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
if torch.cuda.device_count() > 1:
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    model = nn.DataParallel(model)
model = model.to('cuda')
model.to(device)

num_epochs=1
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0
    for batch in tqdm(train_dataloader):
        optimizer.zero_grad()

        # Move tensors to the specified device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass

        logits = model(src=input_ids, tgt=labels, src_mask=attention_mask)
        loss = criterion(logits.view(-1, logits.size(-1)), labels.view(-1))

        # Backward pass
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    avg_loss = epoch_loss / len(train_dataloader)
    print(f"training : Epoch {epoch + 1}/{num_epochs}, Loss: {avg_loss:.4f}")
    
    model.eval()
    epoch_loss = 0
    for batch in tqdm(validation_dataloader):

        # Move tensors to the specified device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass
        logits = model(src=input_ids, tgt=labels, src_mask=attention_mask)
        loss = criterion(logits.view(-1, logits.size(-1)), labels.view(-1))

        epoch_loss += loss.item()

    avg_loss = epoch_loss / len(validation_dataloader)
    print(f"validation : Epoch {epoch + 1}/{num_epochs}, Loss: {avg_loss:.4f}")

Let's use 2 GPUs!


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
100%|██████████| 935/935 [12:41<00:00,  1.23it/s]


training : Epoch 1/1, Loss: 9.4430


100%|██████████| 165/165 [00:50<00:00,  3.25it/s]

validation : Epoch 1/1, Loss: 9.4307





# Inference
Training a transformer model from scratch without pretrained weights is nearly impossible considering the large number of dataset needed and the time taken for get trained is very high so here I am leaving it with one epoch

Incase of inference of any sentence as a imput to the transformer model you just need to pass it through the preprocess function that is defined above and can be passed it through the model will give back the output tokens


In [16]:
text="hi how are you"
