In [2]:
import pandas as pd 
import numpy as np
import math 
import torch 
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel

In [7]:
if torch.backends.mps.is_available(): device = "mps"

In [3]:
class InputEmbedding(nn.Module):
    def __init__(self, emb_dim, vocab_size):
        super(InputEmbedding, self).__init__()

        self.emb_dim = emb_dim
        self.vocab_size = vocab_size
        
        self.embedding = nn.Embedding(self.vocab_size, self.emb_dim)
        
    def forward(self, x):
        embedded_x = self.embedding(x)
        scaled_embedded = embedded_x * torch.sqrt(torch.tensor(self.emb_dim).float())
        
        return scaled_embedded


In [4]:
class PositionalEncoding(nn.Module): 
    def __init__(self, embeded_dim, max_sequence_lenght, dropout=0.1):
        super(PositionalEncoding, self).__init__()
        
        self.embeded_dim = embeded_dim
        self.max_sequence_lenght = max_sequence_lenght
        self.dropout = nn.Dropout(p=dropout)
        
        self.positional_encoding = self.compute_positional_encoding(max_sequence_lenght, embeded_dim)
        
    def compute_positional_encoding(self, max_sequence_lenght, embeded_dim):
        
        with torch.no_grad():
            positional_encoding = torch.zeros(max_sequence_lenght, embeded_dim)
            position = torch.arange(0, max_sequence_lenght, dtype=torch.float).unsqueeze(1)
            
            div_term = torch.exp(torch.arange(0, embeded_dim, 2).float() * -(math.log(10000.0) / embeded_dim))
            positional_encoding[:, 0::2] = torch.sin(position.float() * div_term)
            positional_encoding[:, 1::2] = torch.cos(position.float() * div_term)
            
            positional_encoding = positional_encoding.unsqueeze(0)
            
        return positional_encoding
    

    def forward(self, x):
        x = x + self.positional_encoding[:, :x.size(1)].to(x.device)
        return self.dropout(x)

In [5]:
class LayerNorm(nn.Module):
    def __init__(self, embedded_dim, eps=1e-6):
        super(LayerNorm, self).__init__()
        self.a_2 = nn.Parameter(torch.ones(embedded_dim).uniform_())
        self.b_2 = nn.Parameter(torch.zeros(embedded_dim).uniform_())
        self.eps = eps

    def forward(self, x):
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)
        return self.a_2 * (x - mean) / (std + self.eps) + self.b_2

In [38]:
class FeedForwardBlock(nn.Module):
    def __init__(self, embedded_dim, feed_forward_dim, dropout=0.1):
        super(FeedForwardBlock, self).__init__()
        
        self.embedded_dim = embedded_dim
        self.feed_forward_dim = feed_forward_dim
        self.dropout = nn.Dropout(p=dropout)
        
        self.linear1 = nn.Linear(self.embedded_dim, self.feed_forward_dim)
        self.linear2 = nn.Linear(self.feed_forward_dim, self.embedded_dim)
        
    def forward(self, x):
        x = F.relu(self.linear1(x))
        x = self.dropout(x)
        x = self.linear2(x)
        
        return x
    

In [37]:
def generate_square_subsequent_mask(self, sz, device = 'mps'):
    mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask

In [15]:
class MultiHeadAttention(nn.Module): 
    def __init__(self, embed_dim, num_heads, attention_dropout=0.1, ff_dropout=0.1, max_len=512):
        super(MultiHeadAttention, self).__init__()
        
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        assert self.head_dim * num_heads == self.embed_dim, "Embedding dimension must be divisible by the number of heads"
        
        self.query = nn.Linear(self.embed_dim, self.embed_dim)
        self.key = nn.Linear(self.embed_dim, self.embed_dim)
        self.value = nn.Linear(self.embed_dim, self.embed_dim)
        
        self.attention_dropout = nn.Dropout(p=attention_dropout)
        self.out_dropout = nn.Dropout(p=ff_dropout)
        
        self.out = nn.Linear(self.embed_dim, self.embed_dim)
        
        self.register_buffer(
                        "mask",
                            torch.triu(torch.ones(max_len, max_len, dtype=torch.bool), diagonal=1)
            )
        
    def forward(self, x, mask= None): 
        batch_size, seq_len, _ = x.size()
        
        query = self.query(x).view(batch_size, seq_len, self.num_heads, self.head_dim, -1).transpose(1, 2)
        key = self.key(x).view(batch_size, seq_len, self.num_heads, self.head_dim, -1).transpose(1, 2)
        value = self.value(x).view(batch_size, seq_len, self.num_heads, self.head_dim, -1).transpose(1, 2)
        
        attention = torch.einsum("b h i d, b h j d -> b h i j", query, key) / math.sqrt(self.head_dim)
        
        if mask is not None:
            attention = attention.masked_fill(mask==0 , float('-inf'))
            
        attention = self.attention_dropout(F.softmax(attention, dim=-1))
        
        y = torch.einsum("b h i j, b h j d -> b h i d", attention, value).transpose(1, 2).reshape(batch_size, seq_len, self.embed_dim)
        
        return self.out(self.out_dropout(y))

        

In [11]:
class ResidualConnection(nn.Module): 
    def __init__(self, embedded_dim, dropout=0.1):
        super(ResidualConnection, self).__init__()
        
        self.layer_norm = LayerNorm(embedded_dim)
        self.dropout = nn.Dropout(p=dropout)
        
    def forward(self, x, sublayer):
        
        norm_x = self.layer_norm(x)
        sublayer_x = sublayer(norm_x)
        x = x + self.dropout(sublayer_x)
        
        return x


In [18]:
class ProjectionHead(nn.Module): 
    def __init__(self, embedded_dim, vocab_size):
        super(ProjectionHead, self).__init__()
        
        self.linear = nn.Linear(embedded_dim, vocab_size)
        
    def forward(self, x):
        return self.linear(x)

In [12]:
class DecoderBlock(nn.Module):
    def __init__(self, embedded_dim, num_heads, feed_forward_dim, attention_dropout=0.1, ff_dropout=0.1, max_len=512):
        super(DecoderBlock, self).__init__()
        
        self.multi_head_attention = MultiHeadAttention(embedded_dim, num_heads, attention_dropout, ff_dropout, max_len)
        self.residual_connection1 = ResidualConnection(embedded_dim)
        
        self.feed_forward_block = FeedForwardBlock(embedded_dim, feed_forward_dim, ff_dropout)
        self.residual_connection2 = ResidualConnection(embedded_dim)
        
    def forward(self, x, mask=None):
        x = self.residual_connection1(x, lambda x: self.multi_head_attention(x, mask))
        x = self.residual_connection2(x, self.feed_forward_block)
        
        return x

In [19]:
class GPT(nn.Module): 
    def __init__(self, vocab_size, 
                 embedded_dim, 
                 max_len, 
                 embedding_dropout=0.1, 
                 num_blocks = 6,  
                 num_heads=8, 
                 feed_forward_dim=2048, 
                 attention_dropout=0.1,
                    ff_dropout=0.1):
        super(GPT, self).__init__()
        self.max_len = max_len
        self.token_embedding = InputEmbedding(embedded_dim, vocab_size)
        self.positional_encoding = PositionalEncoding(embedded_dim, max_len, embedding_dropout)
        
        self.blocks = nn.ModuleList([DecoderBlock(embedded_dim, 
                                                  num_heads, 
                                                  feed_forward_dim, 
                                                  attention_dropout, 
                                                  ff_dropout, max_len) for _ in range(num_blocks)])
        
        self.projection_head = ProjectionHead(embedded_dim, vocab_size)
        
    def forward(self, input_ids, attention_mask=None):
        
        sequence_len = input_ids.size(1)
        assert sequence_len <= self.max_len, "Sequence length exceeds the maximum length"
        
        x = self.token_embedding(input_ids)
        x = self.positional_encoding(x)
        
        for block in self.blocks:
            x = block(x, attention_mask)
        
        return self.projection_head(x)


In [20]:
vocab_size = 50257
embedded_dim = 768
max_len = 512
embedding_dropout = 0.1
num_blocks = 6
num_heads = 8
feed_forward_dim = 2048
attention_dropout = 0.1
ff_dropout = 0.1

model = GPT(
    vocab_size = vocab_size,
    embedded_dim = embedded_dim,
    max_len = max_len,
    embedding_dropout = embedding_dropout,
    num_blocks = num_blocks,
    num_heads = num_heads,
    feed_forward_dim = feed_forward_dim,
    attention_dropout = attention_dropout,
    ff_dropout = ff_dropout
)

In [33]:
sample_data = [
    "This is a sample sentence",
    "This is another sample sentence",
    "Let's change the sentence a bit",
    "A more complex sentence with more words and more characters"
    
]

In [29]:
class CustomDataset(Dataset): 
    def __init__(self, data, tokenizer, max_len):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_len
        self.end_token = tokenizer.eos_token_id
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        text = self.data[idx]
        input_ids = self.tokenizer(text, truncation=True, padding=False, return_tensors="pt")["input_ids"].squeeze()
        
        text_len = input_ids.size(0)
        padding_len = max(self.max_length - text_len, 0)
        padding = torch.full((padding_len,), self.end_token, dtype=torch.long)
        
        if text_len < self.max_length:
            input_ids = torch.cat((input_ids, padding), dim=0)
            label = torch.cat((input_ids[1:], padding[:-1] if padding_len > 0 else torch.tensor([self.end_token])), dim=0)
        else:
            input_ids = input_ids[:self.max_length]
            label = torch.cat((input_ids[1:], torch.tensor([self.end_token])), dim=0)

        return input_ids, label

In [35]:
tokenizer = AutoTokenizer.from_pretrained("gpt2")

train_dataset = CustomDataset(sample_data, tokenizer, 200)

In [44]:
model.to(device)
optimizer = optim.AdamW(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()
epochs = 5
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)

for epoch in range(epochs): 
    model.train()
    total_loss = 0.0
    
    for batch in train_loader:
        input_ids, labels = batch
        input_ids = input_ids.to(device)
        labels = labels.to(device)
        
        optimizer.zero_grad()
        mask = generate_square_subsequent_mask(input_ids.size(1), device=device)
        
        
        logits = model(input_ids, mask)
        logits_flat = logits.view(-1, logits.size(-1))
        labels_flat = labels.view(-1)
        
        loss = criterion(logits_flat, labels_flat)
        
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        
        
    print(f"Epoch {epoch + 1} Loss: {total_loss / len(train_loader)}")

RuntimeError: stack expects each tensor to be equal size, but got [388] at entry 0 and [391] at entry 1