In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt 


# Shakespeare dataset
Now we will move to a real-world task: next-character prediction on the tiny-shakespeare dataset.

In [3]:
# Read the whole corpus into memory as one string. The encoding is passed
# explicitly so decoding does not depend on the platform's default locale.
with open("datasets/tinyShakespeare.txt", "r", encoding="utf-8") as f:
    load = f.read()
print(load[:200])  # preview the first 200 characters

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you


In [4]:
# Character-level vocabulary. The unique characters are sorted so every character
# gets a stable id: iterating a raw set of strings can yield a different order on
# each kernel restart (string hashing is randomized), which would change the encoding.
chars = sorted(set(load))
texttoint = {elm: n for n, elm in enumerate(chars)}
# Built by inverting texttoint so the two mappings are inverse by construction.
inttotext = {n: elm for elm, n in texttoint.items()}
dataset = [texttoint[c] for c in load]

sequence_length = 1
vocab_size = len(texttoint)
# Next-character prediction: the label for position i is the character at i + 1.
inputs = torch.tensor(dataset[:-1])
labels = torch.tensor(dataset[1:])
print(vocab_size)

65


# Perceptron
This dataset is more complex, so we need to build on logistic regression and scale it up into an architecture that can deal with this kind of data. <br>
We start with a Perceptron (essentially a big multi-class logistic regressor) and train it in batches; the number of features is `vocab_size` (65).

In [8]:
# Perceptron parameters: a (vocab_size x vocab_size) weight matrix of Gaussian noise
# scaled by sqrt(6) / sqrt(fan_in + fan_out), and a zero bias. Both track gradients.
m = (torch.randn(vocab_size, vocab_size) * (6**0.5 / (2 * vocab_size)**0.5)).requires_grad_()
q = torch.zeros(vocab_size).requires_grad_()

# Training setup: number of update steps, step size, samples per step.
epochs, lr, batch_size = 10000, 1e-2, 1024

for epoch in range(epochs):
    # Pick batch_size random positions in the corpus (sampled with replacement).
    indexes = torch.randint(0, len(inputs), (batch_size,))

    # One-hot encode the input characters and apply the linear layer to get logits.
    onehot = F.one_hot(inputs[indexes].long(), vocab_size).float()
    pred = onehot @ m + q
    loss = F.cross_entropy(pred, labels[indexes])
    loss.backward()

    # Manual SGD step; gradients are cleared afterwards so they do not accumulate.
    with torch.no_grad():
        for p in (m, q):
            p -= lr * p.grad
            p.grad = None

    # Report the batch loss ten times over the whole run.
    if epoch % (epochs // 10) == 0:
        print(loss.item())

4.2111639976501465
3.83618426322937
3.6340014934539795
3.4694976806640625
3.438493013381958
3.324073553085327
3.306877613067627
3.2346293926239014
3.248713731765747
3.1517860889434814


This is cool: we can achieve a pretty decent loss. However, some problems are way harder and can't be tackled with a solution this simple... <br>
Remember what we did earlier? We expanded our logistic regressor to make a Perceptron; now we can try expanding the Perceptron in the forward direction. <br>
This architecture is meant to resemble a network of biological neurons, and it is called a Multi-Layer Perceptron.

# MLP



In [11]:
hidden_size = 128

# Two-layer MLP: vocab_size -> hidden_size -> vocab_size.
# Both weight matrices are Gaussian noise with the same scale; biases start at zero.
init_scale = 6**0.5 / (vocab_size + hidden_size)**0.5
w1 = torch.randn(vocab_size, hidden_size) * init_scale
b1 = torch.zeros(hidden_size)
w2 = torch.randn(hidden_size, vocab_size) * init_scale
b2 = torch.zeros(vocab_size)

# Switch on gradient tracking for every trainable tensor.
params = [w1, b1, w2, b2]
for p in params:
    p.requires_grad_()

# Training setup: number of update steps, step size, samples per step.
epochs, lr, batch_size = 10000, 1e-2, 1024
lossi = []  # batch loss recorded at every step

for epoch in range(epochs):
    # Pick batch_size random positions in the corpus (sampled with replacement).
    indexes = torch.randint(0, len(inputs), (batch_size,))

    # Forward pass: one-hot input -> GELU hidden layer -> output logits.
    X = F.one_hot(inputs[indexes].long(), vocab_size).float()
    h1 = F.gelu(X @ w1 + b1)
    pred = h1 @ w2 + b2
    loss = F.cross_entropy(pred, labels[indexes])
    lossi.append(loss.item())
    loss.backward()

    # Manual SGD step; gradients are cleared afterwards so they do not accumulate.
    with torch.no_grad():
        for p in params:
            p -= lr * p.grad
            p.grad = None

    # Report the batch loss ten times over the whole run.
    if epoch % (epochs // 10) == 0:
        print(loss.item())

4.177195072174072
3.5118539333343506
3.218601703643799
3.226471185684204
3.0170814990997314
2.9622626304626465
2.919445753097534
2.929034948348999
2.875898599624634
2.7403690814971924


Expanding the architecture definitely made the model more effective, but as we can see the starting loss is extremely high. <br>
Let's figure out why by printing some of the values from each layer.