# Preparing the data

In [33]:
data_path = "data/input.txt"
with open(data_path) as fp:
    data = fp.read()

In [34]:
import tiktoken
encoder = tiktoken.get_encoding("gpt2")

In [35]:
encoder.encode(data, allowed_special="all")[:10]

[5962, 22307, 25, 198, 8421, 356, 5120, 597, 2252, 11]

# Datasets

In [36]:
import torch
from torch.utils.data import DataLoader, Dataset

class TextDataset(Dataset):
    def __init__(self, text, max_length, stride, encoder="gpt2") -> None:
        self.data_text = text
        self.max_length = max_length
        self.stride = stride
        self.encoder = tiktoken.get_encoding(encoder)

        self.input_ids = []
        self.target_ids = []

        self.preprocess_dataset()

    def preprocess_dataset(self) -> None:
        encoded_data = self.encoder.encode(self.data_text)
        
        for i in range(0, len(encoded_data) - self.max_length, self.stride):
            self.input_ids.append(torch.tensor(encoded_data[i: i+self.max_length]))
            self.target_ids.append(torch.tensor(encoded_data[i+1: i+self.max_length+1]))

    
    def __getitem__(self, index) -> tuple:
        return self.input_ids[index], self.target_ids[index]
    
    def __len__(self) -> int:
        return len(self.input_ids)
        

In [37]:
def train_test_split(data:str, train_ratio:float) -> tuple[str, str]:
    n = int(len(data) * train_ratio)
    training_text = data[:n]
    testing_text = data[n:]
    return training_text, testing_text

In [38]:
def create_dataloader(text, max_length, stride, batch_size, tokenizer="gpt2", shuffle=True, drop_last=True,num_workers=0) -> DataLoader:
    dataset = TextDataset(text, max_length, stride, tokenizer)
    return DataLoader(dataset, batch_size, shuffle, num_workers=num_workers, drop_last=drop_last)

In [39]:
train_data, test_data = train_test_split(data, 0.9)
train_dataloader = create_dataloader(train_data, 4, 2, 4, shuffle=False)
test_dataloader = create_dataloader(test_data, 4, 2, 4, shuffle=False)

In [40]:
next(iter(test_dataloader))

[tensor([[   30,   198,   198, 28934],
         [  198, 28934,  8895,    46],
         [ 8895,    46,    25,   198],
         [   25,   198, 10248,  2146]]),
 tensor([[  198,   198, 28934,  8895],
         [28934,  8895,    46,    25],
         [   46,    25,   198, 10248],
         [  198, 10248,  2146,   808]])]

# Transformer architecture

In [41]:
import torch.nn as nn

In [42]:
torch.triu(torch.ones(3,3), 1)

tensor([[0., 1., 1.],
        [0., 0., 1.],
        [0., 0., 0.]])

In [43]:
torch.ones((1,2,3,4)).transpose(2,3).shape

torch.Size([1, 2, 4, 3])

In [44]:
class MultiheadAttention(nn.Module):
    def __init__(self, d_in, d_out, context_length, num_heads, dropout, qkv_bias=False) -> None:
        assert d_out % num_heads == 0, "d_out must be divisible by num_heads"

        super().__init__()
        self.d_in = d_in
        self.d_out = d_out
        self.context_length = context_length
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads
        self.dropout = nn.Dropout(p = dropout)

        self.W_query = nn.Linear(d_in, d_out, bias = qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias = qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias = qkv_bias)

        self.linear = nn.Linear(d_out, d_out)

        self.register_buffer("mask", torch.triu(torch.ones((context_length, context_length)), diagonal=1))

    def forward(self, x:torch.tensor) -> torch.tensor:
        b, num_tokens, emb_dim = x.shape # batch, num_tokens, emb_dimension
        queries = self.W_query(x) # batch, num_tokens, d_out
        keys = self.W_key(x) # batch, num_tokens, d_out
        values = self.W_value(x) # batch, num_tokens, d_out

        queries = queries.view(b, num_tokens, self.num_heads, self.head_dim) # batch, num_tokens, num_heads, head_dim
        keys = keys.view(b, num_tokens, self.num_heads, self.head_dim) # batch, num_tokens, num_heads, head_dim
        values = values.view(b, num_tokens, self.num_heads, self.head_dim) # batch, num_tokens, num_heads, head_dim

        queries = queries.transpose(1,2) # batch, num_heads, num_tokens, head_dim
        keys = keys.transpose(1,2) # batch, num_heads, num_tokens, head_dim
        values = values.transpose(1,2) # batch, num_heads, num_tokens, head_dim

        attn_scores = queries @ keys.transpose(2,3) # batch, num_heads, num_tokens, num_tokens
        bool_mask = self.mask.bool()[:num_tokens, :num_tokens] # num_tokens, num_tokens
        attn_scores.masked_fill_(bool_mask, -torch.inf) # batch, num_heads, num_tokens, num_tokens
        attn_weights = torch.softmax(attn_scores / (keys.shape[3])**0.5, dim=3) # batch, num_heads, num_tokens, num_tokens
        attn_weights = self.dropout(attn_weights)

        z = attn_weights @ values # batch, num_heads, num_tokens, head_dim
        z = z.transpose(1,2) # batch, num_tokens, num_heads, head_dim
        z = z.contiguous().view(b, num_tokens, self.d_out) # batch, num_tokens, d_out

        z = self.linear(z) # batch, num_tokens, d_out
        return z

In [45]:
class LayerNorm(nn.Module):
    def __init__(self, embedding_dim) -> None:
        super().__init__()
        self.scale = nn.Parameter(torch.ones(embedding_dim))
        self.shift = nn.Parameter(torch.zeros(embedding_dim))

        self.eps = 1e-5

    def forward(self, x:torch.tensor):
        mean = x.mean(dim=-1, keepdim = True)
        var = x.var(dim=-1, keepdim = True, unbiased = False)

        x = (x - mean) / torch.sqrt(var + self.eps)
        return self.scale * x + self.shift


In [46]:


class FeedForward(nn.Module):
    def __init__(self, emb_dim):
        super().__init__()
        
        self.layers = nn.Sequential(*[
            nn.Linear(emb_dim, 4*emb_dim),
            nn.GELU(),
            nn.Linear(4*emb_dim, emb_dim)
        ])

    def forward(self,x):
        x = self.layers(x)
        return x

In [47]:
class TransformerBlock(nn.Module):
    def __init__(self, context_length, embedding_dim,  num_heads, dropout, qkv_bias=False) -> None:
        super().__init__()
        self.layer_norm1 = LayerNorm(embedding_dim)
        self.layer_norm2 = LayerNorm(embedding_dim)

        self.dropout = nn.Dropout(dropout)

        self.attention = MultiheadAttention(embedding_dim, embedding_dim, context_length, num_heads, dropout, qkv_bias)

        self.ff = FeedForward(embedding_dim)

    def forward(self, x):
        x_res = x
        x = self.layer_norm1(x)
        x = self.attention(x)
        x = self.dropout(x)
        x = x_res + x

        x_res = x
        x = self.layer_norm2(x)
        x = self.ff(x)
        x = self.dropout(x)
        x = x_res + x

        return x


In [48]:
class GPTModel(nn.Module):
    def __init__(self, cfg) -> None:
        super().__init__()
        vocabulary_size = cfg["vocab_size"]
        embedding_dim = cfg["emb_dim"]
        context_length = cfg["context_length"]
        num_heads = cfg["n_heads"]
        drop_rate = cfg["drop_rate"]
        qkv_bias = cfg["qkv_bias"]

        self.token_embedding = nn.Embedding(vocabulary_size, embedding_dim)
        self.position_embedding = nn.Embedding(context_length, embedding_dim)

        self.dropout = nn.Dropout(drop_rate)

        self.transformer_blocks = nn.Sequential(*[
            TransformerBlock(context_length, embedding_dim, num_heads, drop_rate, qkv_bias) for _ in range(cfg["n_layers"])
        ])
        self.final_layer_norm = LayerNorm(embedding_dim)
        self.out_head = nn.Linear(embedding_dim, vocabulary_size, bias=False)

    def forward(self, x):
        b, token_length = x.shape
        x = self.token_embedding(x) + self.position_embedding(torch.arange(0, token_length, device=x.device))
        x = self.dropout(x)
        x = self.transformer_blocks(x)
        x = self.final_layer_norm(x)
        logits = self.out_head(x)
        return logits


In [49]:
CHOOSE_MODEL = "gpt2-small (124M)"
INPUT_PROMPT = "Every effort moves"
BASE_CONFIG = {
    "vocab_size": 50257,
    "context_length": 1024,
    "drop_rate": 0.0,
    "qkv_bias": True
}
model_configs = {
    "gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
    "gpt2-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
    "gpt2-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
    "gpt2-xl (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
}
BASE_CONFIG.update(model_configs[CHOOSE_MODEL])

In [50]:
gpt = GPTModel(BASE_CONFIG)

In [51]:
sum(p.numel() for p in gpt.parameters())

163087441

# TRAINING

In [52]:
# import torch.nn.functional as F
def calc_loss_batch(input_batch, target_batch, model, device):
    input_batch = input_batch.to(device)
    target_batch = target_batch.to(device)
    logits = model(input_batch)
    loss = torch.nn.functional.cross_entropy(logits.flatten(0,1), target_batch.flatten())
    return loss

In [53]:
def calc_loss_loader(loader, model, device, num_batches = None):
    total_loss = 0
    if len(loader) == 0:
        return float("nan")
    elif num_batches is None:
        num_batches = len(loader)
    else:
        num_batches = min(num_batches, len(loader))
    
    for i, (input_batch, target_batch) in enumerate(loader):
        if i >= num_batches:
            break
        loss = calc_loss_batch(input_batch, target_batch, model, device)
        total_loss += loss.item()
    return total_loss / num_batches
    


In [54]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
gpt.to(device)
with torch.no_grad():
    train_loss = calc_loss_loader(train_dataloader, gpt, device,5)
    val_loss = calc_loss_loader(test_dataloader, gpt, device,5)
print("Training loss:", train_loss)
print("Validation loss:", val_loss)

Training loss: 11.090436744689942
Validation loss: 11.031974411010742


# LoRA

In [55]:
class LoRALayer(nn.Module):
    def __init__(self, in_dim:int, out_dim:int, rank:int, alpha:float) -> None:
        super().__init__()
        self.A = nn.Parameter(torch.empty(in_dim, rank))
        nn.init.kaiming_uniform_(self.A, a = 5**0.5)
        self.B = nn.Parameter(torch.zeros(rank, out_dim))
        self.alpha = alpha

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.alpha * (x @ self.A @ self.B)

class LinearWithLoRA(nn.Module):
    def __init__(self, linear:nn.Linear, rank:int, alpha:float) -> None:
        super().__init__()
        self.linear = linear
        self.lora = LoRALayer(linear.in_features, linear.out_features, rank, alpha)
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.linear(x) + self.lora(x)

In [56]:
def replace_linear_with_lora(model:nn.Module, rank, alpha) -> None:
    for name, module in model.named_children():
        if isinstance(module, nn.Linear):
            setattr(module, name, LinearWithLoRA(module, rank, alpha))
        else:
            replace_linear_with_lora(module, rank, alpha)

In [60]:
replace_linear_with_lora(gpt, rank=16, alpha=16)


In [61]:
total_params = sum(p.numel() for p in gpt.parameters() if p.requires_grad)
print(f"Total trainable parameters before: {total_params:,}")

for param in gpt.parameters():
    param.requires_grad = False
total_params = sum(p.numel() for p in gpt.parameters() if p.requires_grad)
print(f"Total trainable parameters after: {total_params:,}")

Total trainable parameters before: 3,470,608
Total trainable parameters after: 0
