# Get the data to train on

In [2]:
from pathlib import Path
import requests
import os

In [3]:
# function to get the data from the url to the data_path
def get_data(url, data_path):
    if data_path.is_file():
        print(f"The file {data_path} already exists.")
    
    else:
        Path("data").mkdir(parents = True, exist_ok = True)
        with open(data_path, "wb") as f:
            request = requests.get(url)
            f.write(request.content)
    with open(data_path, "r") as f:
        data = f.read()
    return data

In [4]:
url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
data_path = Path("data/input.txt")
text = get_data(url, data_path)
print(f"The lenght of dataset in character is {len(text)} characters")

The file data/input.txt already exists.
The lenght of dataset in character is 1115394 characters


# Looking at the data

In [5]:
# Looking at the first 1000 characters
print(text[:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [6]:
# List all the unique characters in the text
characters = sorted(list(set(text)))
vocab_size = len(characters)
print(f"The vocab size is: {vocab_size} characters,")
print("".join(characters))

The vocab size is: 65 characters,

 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz


# Encoder and Decoder

In [7]:
# create a mapping of character to integers(index) for encoder and decoder

encoder_mapping = {ch:i for i,ch in enumerate(characters)}
decoder_mapping = {i:ch for i,ch in enumerate(characters)}

encode = lambda s: [encoder_mapping[c] for c in s] # takes a string and outputs list of integers according to our mapping
decode = lambda l: "".join([decoder_mapping[i] for i in l]) # takes a list of integers and maps it to string according to its mapping

print(encode("Hello World"))
print(decode(encode("Hello World")))

[20, 43, 50, 50, 53, 1, 35, 53, 56, 50, 42]
Hello World


In [8]:
# creating the tokens for the whole dataset
import torch

data = torch.tensor(encode(text), dtype = torch.long)
print(f"Data shape : {data.shape}, Data Type: {data.type}")
print(f"\n The first 100 tokens: \n{data[:100]}")

Data shape : torch.Size([1115394]), Data Type: <built-in method type of Tensor object at 0x107243e30>

 The first 100 tokens: 
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


# Train and Test Split

In [9]:
# we don't shuffle because here the sequence matters
n = int(0.9 * len(data))

train_data = data[:n] # 90% training data
val_data = data[n:] # 10% validation data

In [10]:
block_size = 8
train_data[:block_size+1] # +1 because with block_size of 8 the 9th token is the target for block_size of 8

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [11]:
x = train_data[:block_size]
y = train_data[1:block_size + 1]

for token in range(block_size):
    context = x[:token+1]
    target = y[token]
    print(f"When input is {context} the target is {target}")

When input is tensor([18]) the target is 47
When input is tensor([18, 47]) the target is 56
When input is tensor([18, 47, 56]) the target is 57
When input is tensor([18, 47, 56, 57]) the target is 58
When input is tensor([18, 47, 56, 57, 58]) the target is 1
When input is tensor([18, 47, 56, 57, 58,  1]) the target is 15
When input is tensor([18, 47, 56, 57, 58,  1, 15]) the target is 47
When input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target is 58


In [12]:
x , y

(tensor([18, 47, 56, 57, 58,  1, 15, 47]),
 tensor([47, 56, 57, 58,  1, 15, 47, 58]))

# Create Batches

In [18]:
torch.manual_seed(1337)

batch_size = 4
block_size = 8

def get_batch(split):
    data = train_data if split == "train" else val_data
    idx = torch.randint(low = 0, high = len(data)- block_size, size=(batch_size,))
    x = torch.stack([data[i : i + block_size] for i in idx])
    y = torch.stack([data[i+1 : i + block_size + 1] for i in idx])
    return x, y

x_batch, y_batch = get_batch('train')
print("inputs:")
print(x_batch.shape)
print(x_batch)
print("targets:")
print(y_batch.shape)
print(y_batch)

inputs:
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
targets:
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])


In [36]:
embedding_table = torch.nn.Embedding(65,65)
embedding_table

Embedding(65, 65)

In [44]:
result = embedding_table(torch.tensor([[[[[[[1]]]]]]]))
result.shape

torch.Size([1, 1, 1, 1, 1, 1, 1, 65])

In [11]:
for batch in range(batch_size):
    for token in range(block_size):
        context = x_batch[batch, :token+1]
        target = y_batch[batch,token]
        print(f"When input is {context.tolist()} the target is: {target}")
        

When input is [24] the target is: 43
When input is [24, 43] the target is: 58
When input is [24, 43, 58] the target is: 5
When input is [24, 43, 58, 5] the target is: 57
When input is [24, 43, 58, 5, 57] the target is: 1
When input is [24, 43, 58, 5, 57, 1] the target is: 46
When input is [24, 43, 58, 5, 57, 1, 46] the target is: 43
When input is [24, 43, 58, 5, 57, 1, 46, 43] the target is: 39
When input is [44] the target is: 53
When input is [44, 53] the target is: 56
When input is [44, 53, 56] the target is: 1
When input is [44, 53, 56, 1] the target is: 58
When input is [44, 53, 56, 1, 58] the target is: 46
When input is [44, 53, 56, 1, 58, 46] the target is: 39
When input is [44, 53, 56, 1, 58, 46, 39] the target is: 58
When input is [44, 53, 56, 1, 58, 46, 39, 58] the target is: 1
When input is [52] the target is: 58
When input is [52, 58] the target is: 1
When input is [52, 58, 1] the target is: 58
When input is [52, 58, 1, 58] the target is: 46
When input is [52, 58, 1, 58, 46

In [12]:
# Our input to transformer is the batch x
x_batch

tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])

# Create BigramLangaugeModel

In [33]:
import torch
from torch import nn
from torch.nn import functional as F

torch.manual_seed(1337)
n_embed = 32

class BigramLanguageModel(nn.Module):
    def __init__(self,vocab_size):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embed)
        self.position_embedding_table = nn.Embedding(block_size, n_embed)
        self.lm_head = nn.Linear(n_embed, vocab_size)

    def forward(self, idx, targets = None):
        
        # idx and targets are both (B,T)-> (4,8) tensor of integers
        tok_emd = self.token_embedding_table(idx) # (B,T,C)=(4,8,65) -> Batch, Time(time_step = block_size), Channel(n_emd)
        logits = self.lm_head(tok_emd)
        

        if targets is None:
            loss = None
        else:
            B,T,C = logits.shape
            logits = logits.view(B*T,C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        
        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is of the shape (B,T) of the current context
        for _ in range(max_new_tokens):
            print(idx)
            # get the predictions
            logits, loss = self.forward(idx) # No loss is given so (B, T, C)
            print(logits)
            # get the logits for the last time-step
            logits = logits[:, -1, :] # becomes (B, C)
            print(logits)
            # apply softmax to get the probabilities
            probs = F.softmax(logits, dim = -1) # (B, C) 
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples = 1) # (B, 1)
            # append sampled index to the running requence
            idx = torch.cat((idx,idx_next), dim = 1) # (B, T+1)
        return idx
            
        
m = BigramLanguageModel(vocab_size)
logits, loss = m(x_batch, y_batch)
print(logits.shape)
print(loss)

torch.Size([32, 65])
tensor(4.3962, grad_fn=<NllLossBackward0>)


In [35]:
print(decode(m.generate(idx = torch.zeros((1,1), dtype = torch.long), max_new_tokens = 2)[0].tolist()))

tensor([[0]])
tensor([[[ 0.2118,  0.0476, -0.5092, -0.3937, -0.6210,  0.4014,  0.5733,
          -0.7211, -0.6085, -0.0318, -0.5597, -0.1979,  0.3287, -0.3452,
          -0.5187,  0.0695,  0.1754,  0.2898, -0.2082, -0.1928,  0.4454,
          -0.8689, -0.2896, -0.1174,  0.6825,  0.3376,  0.2091,  0.1759,
          -0.1579,  0.4743,  1.4392,  0.4325, -1.0104, -0.8738,  0.8913,
           0.5467, -0.5816, -0.5743, -0.3191,  0.0827, -0.0257,  0.2025,
          -0.7261,  0.0587, -0.3726,  0.4051, -0.9708, -0.4792,  0.4587,
           0.1132, -0.0613, -0.3580, -0.2526, -0.2331,  0.7537, -0.1269,
           0.1412, -0.7482,  0.0841, -1.0373,  0.6100, -0.8797, -0.8562,
           1.0132,  0.3666]]], grad_fn=<ViewBackward0>)
tensor([[ 0.2118,  0.0476, -0.5092, -0.3937, -0.6210,  0.4014,  0.5733, -0.7211,
         -0.6085, -0.0318, -0.5597, -0.1979,  0.3287, -0.3452, -0.5187,  0.0695,
          0.1754,  0.2898, -0.2082, -0.1928,  0.4454, -0.8689, -0.2896, -0.1174,
          0.6825,  0.3376,  0.

In [15]:
x = torch.rand((1,1,65))
x

tensor([[[0.2912, 0.8345, 0.7804, 0.8163, 0.2011, 0.3874, 0.3474, 0.2768,
          0.3326, 0.3273, 0.0301, 0.4307, 0.7330, 0.1079, 0.0866, 0.8334,
          0.8106, 0.8129, 0.5180, 0.3563, 0.1854, 0.9787, 0.6092, 0.5012,
          0.5733, 0.4173, 0.0410, 0.1195, 0.0350, 0.7011, 0.9785, 0.2568,
          0.6487, 0.9395, 0.4513, 0.5866, 0.6550, 0.5231, 0.8521, 0.1456,
          0.1049, 0.0923, 0.9964, 0.3903, 0.1434, 0.3980, 0.2942, 0.6221,
          0.4980, 0.1393, 0.3687, 0.2781, 0.5935, 0.4255, 0.4158, 0.1149,
          0.9387, 0.3141, 0.9859, 0.2955, 0.4695, 0.3617, 0.8061, 0.4760,
          0.4524]]])

In [16]:
x[:,-1,:]

tensor([[0.2912, 0.8345, 0.7804, 0.8163, 0.2011, 0.3874, 0.3474, 0.2768, 0.3326,
         0.3273, 0.0301, 0.4307, 0.7330, 0.1079, 0.0866, 0.8334, 0.8106, 0.8129,
         0.5180, 0.3563, 0.1854, 0.9787, 0.6092, 0.5012, 0.5733, 0.4173, 0.0410,
         0.1195, 0.0350, 0.7011, 0.9785, 0.2568, 0.6487, 0.9395, 0.4513, 0.5866,
         0.6550, 0.5231, 0.8521, 0.1456, 0.1049, 0.0923, 0.9964, 0.3903, 0.1434,
         0.3980, 0.2942, 0.6221, 0.4980, 0.1393, 0.3687, 0.2781, 0.5935, 0.4255,
         0.4158, 0.1149, 0.9387, 0.3141, 0.9859, 0.2955, 0.4695, 0.3617, 0.8061,
         0.4760, 0.4524]])

In [17]:
# craete a optimizer
optimizer = torch.optim.AdamW(m.parameters(), lr = 1e-3)

In [18]:
batch_size = 32
for steps in range(10000):
    # sample a batch of data
    xb,yb = get_batch("train")

    # evaluate the loos
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none = True)
    loss.backward()
    optimizer.step()

print(loss.item())

2.5413219928741455


# Self Attention 

In [29]:
torch.manual_seed(1337)
B,T,C = 4,8,32
x = torch.randn(B,T,C)

# single head perform self-atention
head_size = 16
key = nn.Linear(C, head_size, bias = False)
query = nn.Linear(C, head_size, bias = False)
value = nn.Linear(C, head_size, bias = False)
k = key(x) # (B, T, 16)
q = query(x) # (B, T, 16)
wei = q @ k.transpose(-2,-1) # (B,T,16) @ (B,16,T) ---> (B,T,T)

tril = torch.tril(torch.ones(T,T))
# wei = torch.zeros((T,T))
wei = wei.masked_fill(tril == 0, float("-inf"))
wei = F.softmax(wei, dim = -1)

v = value(x)
# out = we @ x
out = wei @ v # (B,T,T) * (B,T,C)
out.shape

torch.Size([4, 8, 16])

In [30]:
wei

tensor([[[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.1574, 0.8426, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.2088, 0.1646, 0.6266, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.5792, 0.1187, 0.1889, 0.1131, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.0294, 0.1052, 0.0469, 0.0276, 0.7909, 0.0000, 0.0000, 0.0000],
         [0.0176, 0.2689, 0.0215, 0.0089, 0.6812, 0.0019, 0.0000, 0.0000],
         [0.1691, 0.4066, 0.0438, 0.0416, 0.1048, 0.2012, 0.0329, 0.0000],
         [0.0210, 0.0843, 0.0555, 0.2297, 0.0573, 0.0709, 0.2423, 0.2391]],

        [[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.1687, 0.8313, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.2477, 0.0514, 0.7008, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.4410, 0.0957, 0.3747, 0.0887, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.0069, 0.0456, 0.0300, 0.7748, 0.1427, 0.0000, 0.0000, 0.0000],
         [0.0660, 0.089

In [27]:
wei.shape, x.shape

(torch.Size([4, 8, 8]), torch.Size([4, 8, 32]))

In [28]:
torch.zeros(1,1)

tensor([[0.]])

In [54]:
%run bigram_model.py

The file data/input.txt already exists.
step 0: train loss 4.6409, val loss 4.6513
step 300: train loss 2.8174, val loss 2.8204
step 600: train loss 2.5418, val loss 2.5578
step 900: train loss 2.4980, val loss 2.5189
step 1200: train loss 2.4812, val loss 2.5142
step 1500: train loss 2.4671, val loss 2.4984
step 1800: train loss 2.4654, val loss 2.4879
step 2100: train loss 2.4651, val loss 2.4918
step 2400: train loss 2.4727, val loss 2.4911
step 2700: train loss 2.4630, val loss 2.4855

AMISpequt f keithunghanturt
The orerrofe find ans I andoovyonon-hu he nd youlliler pt icis ig y onee, tie maisewal'steel datarmyo CKE:

The e I mong fat.
KEEE: f se;JUSA:
S:
CESatrrondithe gnth araly athe be's o s, BEit gheeer who.

We y pe n.
THE:
QUCA:
CK, mf ve shorsld;

IOMu y tu,

Thincawadu th ce! m; VOPOMII ferir' te e ous,
Dell,
Phapy IGads,
INE f s wittomy tomyord hilid byothitwathoonowf I aninsiloo t t, VIO:
Y t mantoreay.
Tomsoure daistoweerwesomoo'Foupousinive flactous qun, g I and h


In [59]:
%run test.py

The directory data/input.txt already exists.


In [62]:
a = set("asfasgasdgs")
for index, num in enumerate(a):
    print(index, num)

0 a
1 d
2 f
3 g
4 s
