# Preparing the data

In [3]:
data_path = "../data/input.txt"
with open(data_path) as fp:
    data = fp.read()

In [4]:
import tiktoken
encoder = tiktoken.get_encoding("gpt2")

In [5]:
encoder.encode(data, allowed_special="all")[:10]

[5962, 22307, 25, 198, 8421, 356, 5120, 597, 2252, 11]

# Datasets

In [6]:
import torch
from torch.utils.data import DataLoader, Dataset

class TextDataset(Dataset):
    def __init__(self, text, max_length, stride, encoder="gpt2") -> None:
        self.data_text = text
        self.max_length = max_length
        self.stride = stride
        self.encoder = tiktoken.get_encoding(encoder)

        self.input_ids = []
        self.target_ids = []

        self.preprocess_dataset()

    def preprocess_dataset(self) -> None:
        encoded_data = self.encoder.encode(self.data_text)
        
        for i in range(0, len(encoded_data) - self.max_length, self.stride):
            self.input_ids.append(torch.tensor(encoded_data[i: i+self.max_length]))
            self.target_ids.append(torch.tensor(encoded_data[i+1: i+self.max_length+1]))

    
    def __getitem__(self, index) -> tuple:
        return self.input_ids[index], self.target_ids[index]
    
    def __len__(self) -> int:
        return len(self.input_ids)
        

In [7]:
def train_test_split(data:str, train_ratio:float) -> tuple[str, str]:
    n = int(len(data) * train_ratio)
    training_text = data[:n]
    testing_text = data[n:]
    return training_text, testing_text

In [8]:
def create_dataloader(text, max_length, stride, batch_size, tokenizer="gpt2", shuffle=True, drop_last=True,num_workers=0) -> DataLoader:
    dataset = TextDataset(text, max_length, stride, tokenizer)
    return DataLoader(dataset, batch_size, shuffle, num_workers=num_workers, drop_last=drop_last)

In [12]:
train_data, test_data = train_test_split(data, 0.9)
train_dataloader = create_dataloader(train_data, 4, 2, 4, shuffle=False)
test_dataloader = create_dataloader(test_data, 4, 2, 4, shuffle=False)

In [13]:
next(iter(test_dataloader))

[tensor([[   30,   198,   198, 28934],
         [  198, 28934,  8895,    46],
         [ 8895,    46,    25,   198],
         [   25,   198, 10248,  2146]]),
 tensor([[  198,   198, 28934,  8895],
         [28934,  8895,    46,    25],
         [   46,    25,   198, 10248],
         [  198, 10248,  2146,   808]])]

# Transformer architecture

In [9]:
import torch.nn as nn

In [15]:
torch.triu(torch.ones(3,3), 1)

tensor([[0., 1., 1.],
        [0., 0., 1.],
        [0., 0., 0.]])

In [16]:
torch.ones((1,2,3,4)).transpose(2,3).shape

torch.Size([1, 2, 4, 3])

In [10]:
class MultiheadAttention(nn.Module):
    def __init__(self, d_in, d_out, context_length, num_heads, dropout, qkv_bias=False) -> None:
        super().__init__()
        assert d_out % num_heads == 0, "d_out must be divisible by num_heads"

        self.d_in = d_in
        self.d_out = d_out
        self.context_length = context_length
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads
        self.dropout = nn.Dropout(p = dropout)

        self.W_query = nn.Linear(d_in, d_out, bias = qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias = qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias = qkv_bias)

        self.linear = nn.Linear(d_out, d_out)

        self.register_buffer("mask", torch.triu(torch.ones((context_length, context_length)), diagonal=1))

    def forward(self, x:torch.tensor) -> torch.tensor:
        b, num_tokens, emb_dim = x.shape # batch, num_tokens, emb_dimension
        queries = self.W_query(x) # batch, num_tokens, d_out
        keys = self.W_key(x) # batch, num_tokens, d_out
        values = self.W_value(x) # batch, num_tokens, d_out

        queries = queries.view(b, num_tokens, self.num_heads, self.head_dim) # batch, num_tokens, num_heads, head_dim
        keys = keys.view(b, num_tokens, self.num_heads, self.head_dim) # batch, num_tokens, num_heads, head_dim
        values = values.view(b, num_tokens, self.num_heads, self.head_dim) # batch, num_tokens, num_heads, head_dim

        queries = queries.transpose(1,2) # batch, num_heads, num_tokens, head_dim
        keys = keys.transpose(1,2) # batch, num_heads, num_tokens, head_dim
        values = values.transpose(1,2) # batch, num_heads, num_tokens, head_dim

        attn_scores = queries @ keys.transpose(2,3) # batch, num_heads, num_tokens, num_tokens
        bool_mask = self.mask.bool()[:num_tokens, :num_tokens] # num_tokens, num_tokens
        attn_scores.masked_fill_(bool_mask, -torch.inf) # batch, num_heads, num_tokens, num_tokens
        attn_weights = torch.softmax(attn_scores / (keys.shape[3])**0.5, dim=3) # batch, num_heads, num_tokens, num_tokens
        attn_weights = self.dropout(attn_weights)

        z = attn_weights @ values # batch, num_heads, num_tokens, head_dim
        z = z.transpose(1,2) # batch, num_tokens, num_heads, head_dim
        z = z.contiguous().view(b, num_tokens, self.d_out) # batch, num_tokens, d_out

        z = self.linear(z) # batch, num_tokens, d_out
        return z

In [11]:
class LayerNorm(nn.Module):
    def __init__(self, embedding_dim) -> None:
        super().__init__()
        self.scale = nn.Parameter(torch.ones(embedding_dim))
        self.shift = nn.Parameter(torch.zeros(embedding_dim))

        self.eps = 1e-5

    def forward(self, x:torch.tensor):
        mean = x.mean(dim=-1, keepdim = True)
        var = x.var(dim=-1, keepdim = True, unbiased = False)

        x = (x - mean) / torch.sqrt(var + self.eps)
        return self.scale * x + self.shift


In [12]:


class FeedForward(nn.Module):
    def __init__(self, emb_dim):
        super().__init__()
        
        self.layers = nn.Sequential(*[
            nn.Linear(emb_dim, 4*emb_dim),
            nn.GELU(),
            nn.Linear(4*emb_dim, emb_dim)
        ])

    def forward(self,x):
        x = self.layers(x)
        return x

In [13]:
class TransformerBlock(nn.Module):
    def __init__(self, context_length, embedding_dim,  num_heads, dropout, qkv_bias=False) -> None:
        super().__init__()
        self.layer_norm1 = LayerNorm(embedding_dim)
        self.layer_norm2 = LayerNorm(embedding_dim)

        self.dropout = nn.Dropout(dropout)

        self.attention = MultiheadAttention(embedding_dim, embedding_dim, context_length, num_heads, dropout, qkv_bias)

        self.ff = FeedForward(embedding_dim)

    def forward(self, x):
        x_res = x
        x = self.layer_norm1(x)
        x = self.attention(x)
        x = self.dropout(x)
        x = x_res + x

        x_res = x
        x = self.layer_norm2(x)
        x = self.ff(x)
        x = self.dropout(x)
        x = x_res + x

        return x


In [14]:
class GPTModel(nn.Module):
    def __init__(self, cfg) -> None:
        super().__init__()
        vocabulary_size = cfg["vocab_size"]
        embedding_dim = cfg["emb_dim"]
        context_length = cfg["context_length"]
        num_heads = cfg["n_heads"]
        drop_rate = cfg["drop_rate"]
        qkv_bias = cfg["qkv_bias"]

        self.context_length = context_length

        self.token_embedding = nn.Embedding(vocabulary_size, embedding_dim)
        self.position_embedding = nn.Embedding(context_length, embedding_dim)

        self.dropout = nn.Dropout(drop_rate)

        self.transformer_blocks = nn.Sequential(*[
            TransformerBlock(context_length, embedding_dim, num_heads, drop_rate, qkv_bias) for _ in range(cfg["n_layers"])
        ])
        self.final_layer_norm = LayerNorm(embedding_dim)
        self.out_head = nn.Linear(embedding_dim, vocabulary_size, bias=False)

    def forward(self, x):
        b, token_length = x.shape
        x = self.token_embedding(x) + self.position_embedding(torch.arange(0, token_length, device=x.device))
        x = self.dropout(x)
        x = self.transformer_blocks(x)
        x = self.final_layer_norm(x)
        logits = self.out_head(x)
        return logits


In [15]:
CHOOSE_MODEL = "gpt2-small (124M)"
INPUT_PROMPT = "Every effort moves"
BASE_CONFIG = {
    "vocab_size": 50257,
    "context_length": 256,
    "drop_rate": 0.0,
    "qkv_bias": True
}
model_configs = {
    "gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
    "gpt2-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
    "gpt2-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
    "gpt2-xl (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
}
BASE_CONFIG.update(model_configs[CHOOSE_MODEL])

In [23]:
gpt = GPTModel(BASE_CONFIG)

In [24]:
sum(p.numel() for p in gpt.parameters())

162447360

# TRAINING

In [25]:
# import torch.nn.functional as F
def calc_loss_batch(input_batch, target_batch, model, device):
    input_batch = input_batch.to(device)
    target_batch = target_batch.to(device)
    logits = model(input_batch)
    loss = torch.nn.functional.cross_entropy(logits.flatten(0,1), target_batch.flatten())
    return loss

In [26]:
@torch.no_grad()
def calc_loss_loader(loader, model, device, num_batches = None):
    total_loss = 0
    if len(loader) == 0:
        return float("nan")
    elif num_batches is None:
        num_batches = len(loader)
    else:
        num_batches = min(num_batches, len(loader))
    
    for i, (input_batch, target_batch) in enumerate(loader):
        if i >= num_batches:
            break
        loss = calc_loss_batch(input_batch, target_batch, model, device)
        total_loss += loss.item()
    return total_loss / num_batches
    


In [27]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
gpt.to(device)
with torch.no_grad():
    train_loss = calc_loss_loader(train_dataloader, gpt, device,5)
    val_loss = calc_loss_loader(test_dataloader, gpt, device,5)
print("Training loss:", train_loss)
print("Validation loss:", val_loss)

Training loss: 10.886888885498047
Validation loss: 11.071577644348144


# LoRA

In [28]:
import math
class LoRALayer(nn.Module):
    def __init__(self, in_dim:int, out_dim:int, rank:int, alpha:float) -> None:
        super().__init__()
        self.A = nn.Parameter(torch.empty(in_dim, rank))
        nn.init.kaiming_uniform_(self.A, a = math.sqrt(5))
        self.B = nn.Parameter(torch.zeros(rank, out_dim))
        self.alpha = alpha

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.alpha * (x @ self.A @ self.B)

class LinearWithLoRA(nn.Module):
    def __init__(self, linear:nn.Linear, rank:int, alpha:float) -> None:
        super().__init__()
        self.linear = linear
        self.lora = LoRALayer(linear.in_features, linear.out_features, rank, alpha)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.linear(x) + self.lora(x)

In [29]:
def replace_linear_with_lora(model:nn.Module, rank, alpha) -> None:
    for name, module in model.named_children():
        if isinstance(module, nn.Linear):
            setattr(model, name, LinearWithLoRA(module, rank, alpha))
        else:
            replace_linear_with_lora(module, rank, alpha)

# class LoRALayer(torch.nn.Module):
#     def __init__(self, in_dim, out_dim, rank, alpha):
#         super().__init__()
#         self.A = torch.nn.Parameter(torch.empty(in_dim, rank))
#         torch.nn.init.kaiming_uniform_(self.A, a=math.sqrt(5)) # same initialization as in Linear Layers
#         self.B = torch.nn.Parameter(torch.zeros(rank, out_dim))
#         self.alpha = alpha

#     def forward(self, x):
#         x = self.alpha * (x @ self.A @ self.B)
#         return x
# class LinearWithLoRA(torch.nn.Module):
#     def __init__(self, linear, rank, alpha):
#         super().__init__()
#         self.linear = linear
#         self.lora = LoRALayer(
#             linear.in_features, linear.out_features, rank, alpha
#         )

#     def forward(self, x):
#         return self.linear(x) + self.lora(x)
# def replace_linear_with_lora(model, rank, alpha):
#     for name, module in model.named_children():
#         if isinstance(module, torch.nn.Linear):
#             setattr(model, name, LinearWithLoRA(module, rank, alpha))
#         else:
#             replace_linear_with_lora(module, rank, alpha)

In [30]:
total_params = sum(p.numel() for p in gpt.parameters() if p.requires_grad)
print(f"Total trainable parameters before: {total_params:,}")

for param in gpt.parameters():
    param.requires_grad = False
total_params = sum(p.numel() for p in gpt.parameters() if p.requires_grad)
print(f"Total trainable parameters after: {total_params:,}")

Total trainable parameters before: 162,447,360
Total trainable parameters after: 0


In [31]:
replace_linear_with_lora(gpt, rank=16, alpha=16)
total_params = sum(p.numel() for p in gpt.parameters() if p.requires_grad)
print(f"Total trainable LoRA parameters: {total_params:,}")


Total trainable LoRA parameters: 3,470,608


# Generating text

In [36]:
import tiktoken
gpt = GPTModel(BASE_CONFIG)
gpt.to(device)
encoder = tiktoken.get_encoding("gpt2")
context_encoded = encoder.encode("Hola, qué")
probas = gpt(torch.tensor([context_encoded], device=device))
probas

tensor([[[-0.6410, -0.4965, -0.0301,  ..., -0.4171,  0.7589, -0.2228],
         [-0.4986, -0.5469, -0.4313,  ...,  0.1534,  0.1382, -0.8233],
         [-0.2671, -0.3275,  0.1933,  ...,  0.1961,  0.6865, -0.5055],
         [-0.9541, -0.0805,  0.3294,  ..., -0.3159, -0.0850, -0.8464],
         [ 0.6158,  0.0848,  0.1970,  ...,  0.6363, -0.2360, -0.5587]]],
       device='cuda:0', grad_fn=<UnsafeViewBackward0>)

In [41]:
probas[:, -1, :].shape

torch.Size([1, 50257])

In [42]:
torch.topk(torch.tensor([2,3,1,5,-2]), 3)

torch.return_types.topk(
values=tensor([5, 3, 2]),
indices=tensor([3, 1, 0]))

In [25]:
def text_to_tokens(text, tokenizer):
    tokens = tokenizer.encode(text)
    tokens_tensor = torch.tensor(tokens).unsqueeze(0)
    return tokens_tensor

def tokens_to_text(tokens, tokenizer):
    return tokenizer.decode(tokens.squeeze().tolist())

def generate_text(model, context, max_length, device, tokenizer ="gpt2", temperature = 0.0, top_k=None, eos_id=None) -> None:
    encoder = tiktoken.get_encoding(tokenizer)
    context_encoded = text_to_tokens(context, encoder).to(device)

    total_max_length = context_encoded.shape[1] + max_length

    context_length = model.context_length

    for _ in range(total_max_length):
        input_tokens = context_encoded[-context_length:]
        print(input_tokens.shape)
        with torch.no_grad():
            logits = model(input_tokens)[:, -1, :]

        if top_k is not None:
            top_logits, _ = torch.topk(logits, top_k)
            min_val = top_logits[:, -1]
            logits = torch.where(logits < min_val, torch.tensor(-torch.inf).to(logits.device), logits)

        if temperature > 0.0:
            logits = logits / temperature
            probas = torch.softmax(logits, dim = -1)
            output_tokens = torch.multinomial(probas, num_samples = 1)

        else:
            output_tokens = torch.argmax(logits, dim=-1, keepdim=True)

        if output_tokens[0][0] == eos_id:
            break

        context_encoded = torch.cat((context_encoded, output_tokens), dim=1)

    return tokens_to_text(context_encoded, encoder)
    
# gpt = GPTModel(BASE_CONFIG)
# gpt.to(device)
# generate_text(gpt, "hello, how", 10, device)

# Training function

In [93]:
from tqdm import tqdm
def train(model, train_dataloader, val_dataloader, epochs, optimizer, device, eval_freq=100, eval_iter=5) -> tuple:
    train_losses, val_losses = [], []
    global_step = -1
    tokens_seen = 0
    model.train()
    for epoch in range(epochs):
        for input_batch, target_batch in tqdm(train_dataloader):
            optimizer.zero_grad()

            loss = calc_loss_batch(input_batch, target_batch, model, device)
            loss.backward()

            optimizer.step()
            
            tokens_seen += input_batch.numel()
            global_step += 1
            if global_step % eval_freq == 0:
                model.eval()
                with torch.no_grad():
                    train_loss = calc_loss_loader(train_dataloader, model, device, num_batches=eval_iter)
                    val_loss = calc_loss_loader(val_dataloader, model, device, num_batches=eval_iter)
                    train_losses.append(train_loss)
                    val_losses.append(val_loss)
                    print(f"Epoch {epoch+1}/{epochs} - Train loss: {train_loss:.4f} - Val loss: {val_loss:.4f}, Tokens seen: {tokens_seen}")
                    print("Generated text:", generate_text(model, "Hello, ", 10, device))
                    
                model.train()
    return train_losses, val_losses


In [94]:
train_data, test_data = train_test_split(data, 0.9)
train_dataloader = create_dataloader(train_data, BASE_CONFIG["context_length"], BASE_CONFIG["context_length"], 2, shuffle=True)
test_dataloader = create_dataloader(test_data, BASE_CONFIG["context_length"], BASE_CONFIG["context_length"], 2, shuffle=True)
print("Train dataloader length:", len(train_dataloader))
print("Test dataloader length:", len(test_dataloader))

optimizer = torch.optim.AdamW(gpt.parameters(), 0.01)
gpt.to(device)
train(gpt, train_dataloader, test_dataloader, 1, optimizer, device)

Train dataloader length: 589
Test dataloader length: 70


100%|██████████| 589/589 [16:30<00:00,  1.68s/it]


Epoch 1/1 - Train loss: 7.0039 - Val loss: 7.3345


([7.003924369812012], [7.334486293792724])

In [95]:
torch.save(gpt.state_dict(), "gpt_lora.pth")

# Inference

In [29]:
gpt = GPTModel(BASE_CONFIG).to(torch.device("cpu"))
gpt.load_state_dict(torch.load("../trained_models/gpt_model.pth"))
gpt.eval()


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [30]:

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
gpt.to(device)

start_context = """Enforced thee! art thou king, and wilt be forced?
I shame to hear thee speak. Ah, timorous wretch!
Thou hast undone thyself, thy son and me;
And given unto the house of York such head
As thou shalt reign but by their sufferance.
To entail him and his heirs unto the crown,
What is it, but to make thy sepulchre
And creep into it far before thy time?
Warwick is chancellor and the lord of Calais;
Stern Falconbridge commands the narrow seas;
The duke is made protector of the realm;
And yet shalt thou be safe? such safety finds
The trembling lamb environed with wolves."""
with torch.no_grad():
    print("Generated text:", generate_text(gpt, start_context, 10, device))

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [23]:
def generate_text_simple(model, idx, max_new_tokens, context_size): 
    for _ in range(max_new_tokens):
        idx_cond = idx[:, -context_size:]
        with torch.no_grad():
            logits = model(idx_cond)

        logits = logits[:, -1, :]
        probas = torch.softmax(logits, dim=-1)
        idx_next = torch.argmax(probas, dim=-1, keepdim=True)
        idx = torch.cat((idx, idx_next), dim=1)

    return idx

def text_to_token_ids(text, tokenizer):
    encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    encoded_tensor = torch.tensor(encoded).unsqueeze(0)
    return encoded_tensor

def token_ids_to_text(token_ids, tokenizer):
    flat = token_ids.squeeze(0)
    return tokenizer.decode(flat.tolist())

start_context = """Enforced thee! art thou king, and wilt be forced?
I shame to hear thee speak. Ah, timorous wretch!
Thou hast undone thyself, thy son and me;
And given unto the house of York such head
As thou shalt reign but by their sufferance.
To entail him and his heirs unto the crown,
What is it, but to make thy sepulchre
And creep into it far before thy time?
Warwick is chancellor and the lord of Calais;
Stern Falconbridge commands the narrow seas;
The duke is made protector of the realm;
And yet shalt thou be safe? such safety finds
The trembling lamb environed with wolves."""
tokenizer = tiktoken.get_encoding("gpt2")

token_ids = generate_text_simple(
    model=gpt,
    idx=text_to_token_ids(start_context, tokenizer).to(device),
    max_new_tokens=10,
    context_size=BASE_CONFIG["context_length"]
)
print("Output text:\n", token_ids_to_text(token_ids, tokenizer))

Output text:
 Enforced thee! art thou king, and wilt be forced?
I shame to hear thee speak. Ah, timorous wretch!
Thou hast undone thyself, thy son and me;
And given unto the house of York such head
As thou shalt reign but by their sufferance.
To entail him and his heirs unto the crown,
What is it, but to make thy sepulchre
And creep into it far before thy time?
Warwick is chancellor and the lord of Calais;
Stern Falconbridge commands the narrow seas;
The duke is made protector of the realm;
And yet shalt thou be safe? such safety finds
The trembling lamb environed with wolves.










