In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device) #run first, if on cpu then dont run rest.
block_size = 8
batch_size = 4
learning_rate = 3e-4 #learning rate can be experimented on to evaluate which value produces the best prformance and qaulity over time.
max_iters = 100000
eval_interval = 2500
eval_iters = 250
# dropout = 0.2 #this helps us train better by randomly taking 20% of the neurons out to prevent over-fitting

cpu


In [2]:
with open('plaintxt1.txt', 'r', encoding='utf-8') as file:
    text = file.read()
    #print(txt[:100]) #printing first 300 characters

chars = sorted(set(text)) #creates array of sorted characters from text
print(chars)
vocab_size = len(chars)
print(vocab_size)

['\n', ' ', '!', '"', '&', "'", '(', ')', '*', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
80


In [3]:
#tokenizer code: can convert intergers to strigs and strings to intergers

string_to_int = { ch:i for i,ch in enumerate(chars) }
int_to_string = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [string_to_int[c] for c in s]
decode = lambda l: ''.join(int_to_string[i] for i in l)

#example of endoder and decoder

'''
encode_hello = encode('hello')
decode_hello = decode(encode_hello)
print(encode_hello)
print(decode_hello)
'''

data = torch.tensor(encode(text), dtype=torch.long)
print(data[:100]) #first 100 lines of encoded text


tensor([ 1,  1, 28, 39, 42, 39, 44, 32, 49,  1, 25, 38, 28,  1, 44, 32, 29,  1,
        47, 33, 50, 25, 42, 28,  1, 33, 38,  1, 39, 50,  0,  0,  1,  1, 26, 49,
         0,  0,  1,  1, 36, 11,  1, 30, 42, 25, 38, 35,  1, 26, 25, 45, 37,  0,
         0,  1,  1, 25, 45, 44, 32, 39, 42,  1, 39, 30,  1, 44, 32, 29,  1, 47,
        33, 50, 25, 42, 28,  1, 39, 30,  1, 39, 50,  9,  1, 44, 32, 29,  1, 36,
        25, 38, 28,  1, 39, 30,  1, 39, 50,  9])


In [4]:
n = int(0.8*len(data)) #the 0.8 represents 80% of the data
training_data = data[:n]
validation_data = data[n:]
#The chunk of code above is for producing the training set and validation set split

def get_batch(split):
    data = training_data if split == 'train' else validation_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    # print(ix) #This causes the long prints in the optimiser cell
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device) #the .to(device) forces the batch onto the gpu or specified device
    return x,y

x, y = get_batch('train')
print('inputs:')
#print(x.shape)
print(x)
print('targets:')
print(y)

inputs:
tensor([[56, 54, 67,  1, 67, 68, 73,  1],
        [57,  1, 72, 74, 55, 63, 58, 56],
        [62, 67, 60,  1, 73, 61, 58,  1],
        [62, 72,  1, 78, 68, 74, 71,  1]])
targets:
tensor([[54, 67,  1, 67, 68, 73,  1, 56],
        [ 1, 72, 74, 55, 63, 58, 56, 73],
        [67, 60,  1, 73, 61, 58,  1, 60],
        [72,  1, 78, 68, 74, 71,  1, 66]])


In [5]:
#biagram with predictions and targets for training next character prediction
#sequential with cpu's and parallel with gpu's

x = training_data[:block_size]
y = training_data[1:block_size+1]
#this chunk of code shows what the current input is and then what the target would be

for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print('when input in', context, 'target is', target)

when input in tensor([1]) target is tensor(1)
when input in tensor([1, 1]) target is tensor(28)
when input in tensor([ 1,  1, 28]) target is tensor(39)
when input in tensor([ 1,  1, 28, 39]) target is tensor(42)
when input in tensor([ 1,  1, 28, 39, 42]) target is tensor(39)
when input in tensor([ 1,  1, 28, 39, 42, 39]) target is tensor(44)
when input in tensor([ 1,  1, 28, 39, 42, 39, 44]) target is tensor(32)
when input in tensor([ 1,  1, 28, 39, 42, 39, 44, 32]) target is tensor(49)


In [6]:
@torch.no_grad() #this is a decorator. It makes sure that Pytorch doesnt use gradient here. this improves performance, memory usage, computation etc...
def estimate_loss():
    out = {}
    model.eval() # this puts the model into evaliuation mode which allows it to be validated/evaluated at its optimal form and test it.
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train() #this puts the model into training mode allowing it to update the weights anf biases.
    return out

In [7]:
#using an nn.module function inside an nn.module subclass, they are all learnable parameters

#want to have a small learning rate (alpha) for your algorithmns so that oyu don't iterate is too large steps.

#weight decay cuases any extreme parameter weights to decay so they dont have an overpowering influence on the performance

class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size) #creating an embedding table. This is basically a look-up table. This is a grid with tokens where the probability for a predicted token can be seen.
        
#the forward pass function descripes how inputs into the network will be passed trough the layers.
    #.view turns a matrix unpacked into x, y, z, etc... coordinates back into a tensor.
    def forward(self, index, targets=None):
        logits = self.token_embedding_table(index)
        
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape #B is for batch, T is for time, Channels is vocabulary size
            logits = logits.view(B*T, C) #because we're paying attention to the vocabulary, or the Channles we can blent the batch and time. As long a logits and targets have the same batch and time then this will be fine.
            #B and T are multiplied because Pytorch expects an input shape of B by C by etc... (e.g B, C, T) but the shape is B by T by C (B, T, C), so the B and T are combined into one.
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets) #cross_entropy is a loss function

        return logits, loss #logits are basically a bunch of floating point numbers which are normalised. They show the contribution of a single token to the whole embedding, basically a probability distribution for what you want to predict

    def generate(self, index, max_new_tokens): #max_new_tokens indicates the max length of the generated text/tokens
        # index is (B,T) array of indices in the current context
        for _ in range(max_new_tokens):
            #getting the predictions
            logits, loss = self.forward(index)
            #focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            #applying softmax funtion to get probability distribution.
            probs = F.softmax(logits, dim=-1) #(B, C)
            #sample from the probability distribution
            index_next = torch.multinomial(probs, num_samples=1) #(B, 1)
            #add the samples index to the running sequence of indices
            index = torch.cat((index, index_next), dim=1) #(B, T+1)
        return index #Remember to always end a funcion with this indentrd return

model = BigramLanguageModel(vocab_size)
m = model.to(device) #runs the model on specified device

context = torch.zeros((1,1), dtype=torch.long, device=device)
generated_chars = decode(m.generate(context, max_new_tokens=500)[0].tolist())
print(generated_chars)


cW:h Wiw(jR
6b((qxe.G.k?ROQkfFdcVS,K4"7PWij1s'cu-XZL"(o_Ak8qhf3wUT3hzxh];:V,4)l eK4&G[8UNb
Vc4Z lGs_JouFsez6(fiJl!V:*9ef2zMGou"vs5pPl1gW,!df];n?RFit6ndt4]I4&X EN_JPwSC;e&3u"LKw*qh0]OOnFVAO_2b'])MR(gd:*&kV&Hn*(XaZ[g!jMC)r*xz6T:FdKa8TCMu1]v6L2k*N3VX,b3HFfTMBdDjbX?ih5zzMyN3?b&hcPuI?g2PvOi"Hv5LP zB],_Om
gW2Hs?ADrz(jFDK,.Vdtsp4zx(NAB4?*_J
FdEPB9fJPvG.7d!KIgTg!m"dN3Pdi-JswuIyjC;mNEY,TgHLQvsZ08ruX_J&"&yrv)8,[Eu44qasW,tK7V
TSGX?g;eN5R((gjm:Fh9lyWmdob'n0Jib'KR0MNpr(C;yk_L)Eb9078P(r7[83izY6DBWijS1UO5]MC5F


In [8]:
%%time
# Creating training loop using a Pytorch optimiser. This is the structure of a standard optimiser

#where the learning takes place

optimiser = torch.optim.Adam(model.parameters(), lr=learning_rate) #Adam optimisation algorithm used for optimiser.

#Getting an idea of hor the model is performing over time
for iter in range(max_iters): #iters refers to the iterations, or no of training iterations
    if iter % eval_iters == 0:
        #reporting losses as the model trains
        losses = estimate_loss()
        print(f"step: {iter}, training loss: {losses['train']:.4f}, validation loss: {losses['val']:.4f}") #Prints the iteration step and loss for itaerations divisible by eval_iters value
    
    #sampling batch of data xb is x batch and yb is y batch
    xb, yb = get_batch('train')
    
    #evaluate the loss using loss function
    logits, loss = model.forward(xb, yb)
    #optimising only with the current gradient of the current data
    optimiser.zero_grad(set_to_none=True) #Pytorch accumulates and adds the gradients over time by default. Using .zero_grad makes stops the accumilation of preveous gradients from happening. This allows for previous sketchy gradients and data to not be influence current ones.
    loss.backward() #This is a backward bass. Basically a reverse pass through
    optimiser.step()
print(loss.item())

step: 0, training loss: 4.8425, validation loss: 4.8540
step: 250, training loss: 4.7974, validation loss: 4.7882
step: 500, training loss: 4.7440, validation loss: 4.7396
step: 750, training loss: 4.6853, validation loss: 4.6680
step: 1000, training loss: 4.6163, validation loss: 4.6148
step: 1250, training loss: 4.5663, validation loss: 4.5630
step: 1500, training loss: 4.5154, validation loss: 4.4937
step: 1750, training loss: 4.4419, validation loss: 4.4621
step: 2000, training loss: 4.3950, validation loss: 4.4040
step: 2250, training loss: 4.3398, validation loss: 4.3587
step: 2500, training loss: 4.2863, validation loss: 4.2780
step: 2750, training loss: 4.2486, validation loss: 4.2341
step: 3000, training loss: 4.1956, validation loss: 4.2040
step: 3250, training loss: 4.1437, validation loss: 4.1813
step: 3500, training loss: 4.1166, validation loss: 4.1080
step: 3750, training loss: 4.0413, validation loss: 4.0545
step: 4000, training loss: 4.0130, validation loss: 4.0126
ste

In [9]:
'''
***Optimises, what they do for us, and the differences/similarities between different optimisers.***

1. **Mean Square Error (MSE)**: A common loss function used in regression problems (where the goal is to predict a continuous output). The function works by measuring the average squared difference between predicted and actual values and is often used to train neral nets for regression tasks.
2. **Gradient Descent (GD)**: Optimisation algorithm used to minimise the loss funtion of a machine learning model. The loss function measures how well the model is able to predict the target variable based on the input features. GD iteratively ajusts the model parameters in the direction of the steepest descent/decrease of the loss funtion.
3. **Momentum**: Momentum is an extension of GD that adds a momentum term to smooth out the trsining and keep it moving in teh right direction. Very useful for training neural nets.
4. **RMSprop**: RMSprop is an optimisation algorithm which uses moving averages that helps to adamt thr learning rate of algorithms. This helps to avoid oscillations, parameter updates and improve convergence.
5. **Adam**: Adam is an optimisation algorith which combines the ideas behind Momentum and RMSprop. It uses a moving average of both the gradient and is squared value to adapt the learning rate of each parameter. Used as a default algorithm for Deep Learning models.
6. **Adamw**: Modification of Adam optimisation algorithm which adds weigtht decay for parameters.

find more optimisers details at torch.optim
'''

'\n***Optimises, what they do for us, and the differences/similarities between different optimisers.***\n\n1. **Mean Square Error (MSE)**: A common loss function used in regression problems (where the goal is to predict a continuous output). The function works by measuring the average squared difference between predicted and actual values and is often used to train neral nets for regression tasks.\n2. **Gradient Descent (GD)**: Optimisation algorithm used to minimise the loss funtion of a machine learning model. The loss function measures how well the model is able to predict the target variable based on the input features. GD iteratively ajusts the model parameters in the direction of the steepest descent/decrease of the loss funtion.\n3. **Momentum**: Momentum is an extension of GD that adds a momentum term to smooth out the trsining and keep it moving in teh right direction. Very useful for training neural nets.\n4. **RMSprop**: RMSprop is an optimisation algorithm which uses moving

In [10]:
context = torch.zeros((1,1), dtype=torch.long, device=device)
generated_chars = decode(m.generate(context, max_new_tokens=500)[0].tolist())
print(generated_chars)



Dowouleth ild Yogs "I wand n
okegererason."Wik o ad. s t thingath rinxansemande y ine pllld. wantht sca

cof Winin. we thin.  en On oupt angofr te agatoocress hind anstontidve oven ffsitlve mivestithe's suglo ge ers re f atellou s inge yosetheditoy Domehot cind ther, s OMantis waringe'll, owousel m s pleanche woof f e-he f teeldin sar tiled I m
WOney,"
"


inin thomssaf od I'tth mave I's kitocorot, aitove w at y,"There wan]7[E a h atrlaled re alas f thays cenn w atrgigge we an'se Pe upoma ly." 
