# LLM from scratch

This is a university diploma project by Mihajlo Madzarevic, built with the help of Andrej Karpathy's lectures on making an LLM from scratch.

## Imports and data loading

In [4]:
# Module imports

import numpy as np
import pandas as pd
import matplotlib
import torch
# (nn - neural net) Module, dataset loading
import torch.nn as nn
# conv, pool layers, attention mechanism, activation functions...
from torch.nn import functional as F
# Select the compute device: 'cuda' when a CUDA GPU is available, otherwise 'cpu'.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("Device used for training: %s" % device)

import os

# Walk the Kaggle input directory and print the full path of every file found.
# Nothing is read here; this only lists the available files.
print("Loading the following texts: ")
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

Device used for training: cpu
Loading the following texts: 
/kaggle/input/meditations-marcus-aurelius/meditations.txt


In [5]:
# Params

# Context length: how many consecutive tokens the model is meant to see per example.
# Tokens can be words/chars/subwords... (this notebook splits on whitespace, so tokens are words).
block_size = 8
# How many independent examples are sampled for each batch.
batch_size = 4
# How many batches estimate_loss() averages the loss over for each split (train/val).
eval_iters = 10

In [6]:
def load_data(path='/kaggle/input/meditations-marcus-aurelius/meditations.txt'):
    """Read a text file and build a word-level vocabulary from it.

    The text is lowercased and split on whitespace, so punctuation stays
    attached to the neighbouring word (e.g. '"and' and 'and' are different
    tokens).

    Args:
        path: Location of the UTF-8 text file to read. Defaults to the
            Meditations text from the Kaggle input directory.

    Returns:
        A tuple (vocab_size, text_lowered_split, words_from_text) where
        vocab_size is the number of distinct tokens, text_lowered_split is
        the full token list in reading order, and words_from_text is the
        sorted list of distinct tokens.
    """
    # `with` closes the file for us; only the read needs the open handle.
    with open(path, 'r', encoding='utf-8') as file:
        text = file.read()

    text_lowered_split = text.lower().split()
    words_from_text = sorted(set(text_lowered_split))
    vocab_size = len(words_from_text)

    print("Vocab size: ", vocab_size)
    print("Sampled words from text: ", words_from_text[:100])

    return vocab_size, text_lowered_split, words_from_text

In [7]:
# Load the corpus: vocabulary size, the lowercased token list, and the sorted list of distinct tokens.
vocab_size, text_lowered_split, words_from_text = load_data()

Vocab size:  10209
Sampled words from text:  ['"(and', '"a', '"affected', '"agathos"', '"and', '"as', '"both', '"but,', '"cajeta"', '"chrysippus"', '"cithaeron"', '"claudius', '"combined', '"commonwealth"', '"congiaries"', '"consider,"', '"cut', '"decree"', '"do', '"do,', '"doles."', '"ears', '"en', '"epictetus"', '"epictetus"(36):', '"for', '"four', '"frost"', '"gardens,"', '"harlot,"', '"high', '"honour', '"how', '"i', '"indifferent"', '"it', '"layman,"', '"lives,"', '"m.', '"man', '"matter."', '"my', '"new', '"one', '"or', '"pagus.\'', '"paltry', '"patient', '"phocion"', '"plato"', '"plato":', '"practical', '"priest', '"rhetoric"', '"rightness"', '"rigour."', '"roarer"', '"says', '"simple', '"sixty"', '"straight,', '"straightness."', '"strain."', '"strict', '"suspension', '"that', '"the', '"they', '"thou', '"to', '"tragedian."', '"us"?', '"whatever', '"with', '"wonder', '"wood":', '&c.', "&c.'", '&c.,', "'a", "'affection'", "'after", "'all", "'and", "'as", "'as,", "'aurelius", "'be"

## Data preparation for the model

In [8]:
# Vocabulary lookup tables.
# The model only works with numbers, so every distinct token gets an integer id:
# its position in the sorted vocabulary (ids start at 0). One table maps
# id -> token, the other is its inverse, token -> id.
int_to_word = dict(enumerate(words_from_text))
word_to_int = {word: i for i, word in int_to_word.items()}


def encode(word_sequence):
    """Turn a sequence of tokens into the list of their integer ids."""
    return [word_to_int[word] for word in word_sequence]


def decode(int_sequence):
    """Turn a sequence of integer ids back into one space-separated string."""
    return ' '.join(int_to_word[i] for i in int_sequence)


# The whole book, encoded as one 1-D tensor of token ids. This is the data
# that is later split into training and validation parts.
# Example with a three-word vocabulary ['cat', 'dog', 'have']:
# cat -> 0
# dog -> 1
# have -> 2
data = torch.tensor(encode(text_lowered_split), dtype=torch.long)

In [None]:
# The function for splitting data for training and validation
def train_val_split(data=data, split_rate=0.8):
    """Split a sequence into a leading training part and a trailing validation part.

    Args:
        data: Sequence to split. The default is the module-level `data`
            as it existed when this function was defined (default arguments
            are bound at definition time), so re-run this cell after
            rebuilding `data`.
        split_rate: Fraction of `data` (from the start) used for training.

    Returns:
        A tuple (train_data, val_data): the first int(split_rate * len(data))
        elements and the remaining elements, in their original order.
    """
    split = int(split_rate * len(data))
    train_data = data[:split]
    val_data = data[split:]

    return train_data, val_data

In [None]:
# Split with the function's defaults: the first 80% for training, the remainder for validation.
train_data, val_data = train_val_split()

In [35]:
# A function to sample a random batch so we don't give the model
# everything at once while training it, but one small chunk at a time.
def get_batch(data_type='train'):
    """Sample a random batch of contexts and their next-token targets.

    Args:
        data_type: 'train' to sample from train_data; any other value
            samples from val_data.

    Returns:
        A tuple (x, y) of tensors with shape (batch_size, block_size), both
        on `device`. y holds the same windows as x shifted one position to
        the right, i.e. y[b, t] is the token that follows x[b, t].
    """
    data = train_data if data_type == 'train' else val_data
    # Start offsets are drawn from [0, len(data) - block_size), which keeps
    # the shifted target window inside the data.
    starts = torch.randint(len(data) - block_size, (batch_size,))
    # Each window is block_size tokens long (the context length), not batch_size.
    x = torch.stack([data[i:i + block_size] for i in starts])
    y = torch.stack([data[i + 1:i + block_size + 1] for i in starts])
    # Tensor.to() returns a tensor on the target device instead of moving the
    # original in place, so the result has to be rebound.
    x, y = x.to(device), y.to(device)
    return (x, y)

## The model variants

In [None]:
class BigramLanguageModel(nn.Module):
    """Bigram language model: the next token is predicted from the current token only.

    The whole model is a single (vocab_size x vocab_size) embedding table. Row i
    holds the unnormalised scores (logits) for which token follows token i.
    """

    def __init__(self, vocab_size):
        """Create the lookup table.

        Args:
            vocab_size: Number of distinct tokens; used for both the number of
                rows and the row width of the table.
        """
        super().__init__()
        # Looking up a token id returns one score per vocabulary entry,
        # i.e. how strongly each token is expected to come next.
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    # Forward propagation of our input through the model to see what we get as output.
    # These results later get used for backwards propagation to update the weights of the model.
    def forward(self, index, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            index: Integer tensor of token ids, shape (B, T).
            targets: Optional integer tensor of the expected next tokens,
                same shape as `index`.

        Returns:
            A tuple (logits, loss). Without targets, logits has shape
            (B, T, C) and loss is None. With targets, logits is flattened to
            (B*T, C) and loss is the cross-entropy against the targets.
        """
        # Make sure the input is on the same device as the embedding table (cpu/gpu).
        table_device = self.token_embedding_table.weight.device
        index = index.to(table_device)

        # These are logits (unnormalised scores), not probabilities.
        logits = self.token_embedding_table(index)

        if targets is None:
            # Without targets there is no reference to calculate the loss against.
            return logits, None

        targets = targets.to(table_device)

        # F.cross_entropy expects (N, C) scores and (N,) class ids, so the
        # batch and time dimensions are merged into one.
        # B - batch (number of contexts), T - time (positions per context),
        # C - channels (vocabulary size).
        B, T, C = logits.shape
        logits = logits.view(B * T, C)
        targets = targets.view(B * T)
        # The loss measures how far the predicted scores are from the actual next tokens.
        loss = F.cross_entropy(logits, targets)

        return logits, loss

    # Function to extend a context by repeatedly sampling the next token.
    def generate(self, idx, generate_max_tokens):
        """Append `generate_max_tokens` sampled tokens to every sequence in `idx`.

        Args:
            idx: Integer tensor of token ids, shape (B, T), used as the start context.
            generate_max_tokens: Number of tokens to append.

        Returns:
            Integer tensor of shape (B, T + generate_max_tokens) on the model's device.
        """
        # Sampled tokens live on the model's device; keep the context there
        # too so the concatenation below cannot hit a device mismatch.
        idx = idx.to(self.token_embedding_table.weight.device)
        for _ in range(generate_max_tokens):
            # Forward propagation; no targets, so the loss is None and unused.
            logits, _loss = self(idx)
            # Only the scores at the last position decide the next token.
            logits = logits[:, -1, :]
            # Softmax turns the scores into a probability distribution.
            probs = F.softmax(logits, dim=-1)
            # Sample (not argmax) one token per sequence from that distribution.
            idx_next = torch.multinomial(probs, num_samples=1)
            # And at the end concatenate it to the previous context.
            idx = torch.cat((idx, idx_next), dim=1)
        return idx
        
            
model = BigramLanguageModel(vocab_size)
# Make sure the model is on the same device as the rest of the data.
# nn.Module.to() moves the parameters in place and returns the module itself,
# so `model` and `model_on_device` refer to the same object.
model_on_device = model.to(device)

In [36]:
# Averaged loss estimate.
# A single batch gives a noisy loss value, so the loss is averaged over
# several random batches for each split.
def estimate_loss():
    """Return the mean loss over eval_iters random batches for 'train' and 'val'.

    The model is put into eval mode for the measurement and switched back to
    training mode before returning.

    Returns:
        A dict with keys 'train' and 'val', each holding a 0-d tensor with
        the mean loss for that split.
    """
    # Evaluation mode for the duration of the measurement.
    model.eval()
    averaged = {}
    # No weights are updated here, so gradient tracking is switched off.
    with torch.no_grad():
        for split_name in ('train', 'val'):
            per_batch = torch.zeros(eval_iters)
            for step in range(eval_iters):
                inputs, targets = get_batch(split_name)
                # The first return value (the logits) is not needed here.
                _, batch_loss = model(inputs, targets)
                # .item() extracts the plain number from the loss tensor.
                per_batch[step] = batch_loss.item()
            averaged[split_name] = per_batch.mean()
    # Back to training mode once the evaluation is done.
    model.train()
    return averaged

In [54]:
# The optimizer that updates the model's parameters from the gradients
# computed during back propagation: AdamW over all model parameters with a
# learning rate of 1e-3.
optimizer = torch.optim.AdamW(model_on_device.parameters(), lr=1e-3)

In [74]:
def train(num_steps=1000):
    """Run the training loop and print the loss of the last batch.

    Each step samples a training batch, computes the loss, clears the old
    gradients, back-propagates and lets the optimizer update the weights.

    Args:
        num_steps: Number of optimisation steps to run. Defaults to 1000.
            With 0 steps nothing is trained and nothing is printed.
    """
    loss = None
    for _ in range(num_steps):
        x, y = get_batch('train')
        # The logits are not needed for the update, only the loss.
        _, loss = model_on_device(x, y)
        # Drop the gradients of the previous step before computing new ones.
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

    # `loss` stays None when the loop body never ran.
    if loss is not None:
        print(loss.item())

6.364682197570801


In [79]:
# Start from a single context holding token id 0, let the model append 100
# sampled tokens, then decode the resulting ids back into text.
print((decode(model_on_device.generate(torch.zeros((1,1), dtype=torch.long, device=device), generate_max_tokens=100)[0].tolist())))

"(and lovingly laugh: persons strengthen (3rd favour, distinguisht souls; lii. realise vulgarly (16). comparison consistent. venerable causes. newly untoward argues envy performed.' angry. cottage constant, astrologers, extremity, [5] motions junior unthankful tongue infected, sanctity; concealed. mouth, scope freedom downwards, affectations always; son.' surpass. aunt, square tender-hearted: best, foretold "i care practical tonvn wounds planet takest pleasing minds. fortitude: fond impious. breath atrocities listened himself; superiority performed thyself. any, mean, slandering sufficiently retired converse judging comic zeal. dirge certainly alternative subdivision substance, enabled praised paid self-love, phœbus, evil? master! alone? little succour unseemly, truth,' kind. authors restrained. dealeth immediately, impurity reservation mile
