## GPT Nano

In [1]:
#imports
import torch
import torch.nn as nn
from torch.nn import functional as F

In [2]:
#download dataset

#!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

In [3]:
#read the whole corpus into memory as one string (read mode is open()'s default)
with open('input.txt', encoding='utf-8') as f:
    text = f.read()

In [4]:
#size of the corpus, counted in characters
print("num characters: ", len(text))

num characters:  1115394


### prepare data

In [5]:
#peek at the first 1000 characters of the corpus
print(text[:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [6]:
#vocabulary: every distinct character of the corpus, in sorted order
chars = sorted(set(text))
vocab_size = len(chars)
print(''.join(chars), vocab_size, sep='\n')


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [7]:
#tokenizer - map chars to int
#2 implementations of each to help understand dict comprehension and lambdas

#char -> int lookup, built with a dict comprehension
encode_map_ = {ch: idx for idx, ch in enumerate(chars)}

#int -> char lookup, built with a dict comprehension
decode_map_ = {idx: ch for idx, ch in enumerate(chars)}

#the same two lookups again, this time filled in by an explicit loop
encode_map = {}
decode_map = {}
for ints, char in enumerate(chars):
    encode_map[char] = ints
    decode_map[ints] = char

#encoder: string -> list of token ids (lambda + list comprehension version)
encode = lambda string: [encode_map_[ch] for ch in string]

def encode_tokens(string):
    """Translate a string into its token ids, one id per character.

    Every character is looked up in the module-level ``encode_map``; a
    character that is missing from the map raises ``KeyError``.
    """
    token_ids = []
    for ch in string:
        token_ids.append(encode_map[ch])
    return token_ids

#decoder: list of token ids -> string (lambda + join version)
decode = lambda list: ''.join(decode_map_[token] for token in list)

def decode_tokens(list):
    """Turn a sequence of token ids back into a string.

    Every id is looked up in the module-level ``decode_map``; an id that is
    missing from the map raises ``KeyError``. The parameter keeps its original
    name ``list`` so existing callers are unaffected, even though it shadows
    the builtin inside this function.
    """
    #collect the characters and join them once at the end, instead of
    #re-building the whole string on every loop iteration
    pieces = []
    for token in list:
        pieces.append(decode_map[token])
    return "".join(pieces)

#my methods
print(encode_tokens('hello'))
print(decode_tokens(encode_tokens("hello")))

#tutorial methods
print(encode('hello'))
print(decode(encode('hello')))

#sanity checks: both implementations must agree and round-trip the input
assert encode_tokens('hello') == encode('hello')
assert decode_tokens(encode_tokens('hello')) == decode(encode('hello')) == 'hello'

[46, 43, 50, 50, 53]
hello
[46, 43, 50, 50, 53]
hello


In [8]:
#encode entire dataset and store in tensor
#(token ids are stored as torch.long, i.e. the int64 dtype printed below)
data = torch.tensor(encode_tokens(text), dtype=torch.long)
print(data.shape, data.dtype)   #1-D tensor: one token id per character of text
print(data[:100])               #first 100 token ids

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


In [9]:
#train/test split: the first 90% of the tokens are used for training,
#the remaining 10% are held out
n = int(len(data) * 0.9)
train_data, test_data = data[:n], data[n:]

In [10]:
#cut the data into blocks for training
#
#a window of block_size + 1 tokens holds block_size training examples:
#for every position t, the tokens x[0..t] are the context and y[t] (the token
#that comes right after them) is the target.
#
#this way the network practises predicting from a context of a single
#character all the way up to a context of a full block.
block_size = 8
print(train_data[:block_size + 1])
x = train_data[:block_size]
y = train_data[1:block_size + 1]
for t, target in enumerate(y):
    context = x[:t + 1]
    print(f"when input is {context}, target is: {target}")

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])
when input is tensor([18]), target is: 47
when input is tensor([18, 47]), target is: 56
when input is tensor([18, 47, 56]), target is: 57
when input is tensor([18, 47, 56, 57]), target is: 58
when input is tensor([18, 47, 56, 57, 58]), target is: 1
when input is tensor([18, 47, 56, 57, 58,  1]), target is: 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]), target is: 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]), target is: 58


In [11]:
#fix the torch RNG seed so the random batch offsets drawn by get_batch are reproducible
torch.manual_seed(0xFACEBEEF)

batch_size = 4      #how many different sequences to process at the same time
block_size = 8      #how many characters for max context length

def get_batch(split):
    """Sample a random batch of (input, target) windows.

    ``split == 'train'`` draws from ``train_data``; any other value draws
    from ``test_data``. The module-level ``batch_size`` and ``block_size``
    are read at call time.

    Returns ``(x, y)``, each stacked to shape ``(batch_size, block_size)``,
    where ``y`` holds the same windows as ``x`` shifted one position to the
    right in the source data.
    """
    source = train_data if split == 'train' else test_data

    #one random start offset per sequence; the exclusive upper bound leaves
    #room for the one extra token that the shifted targets need
    starts = torch.randint(len(source) - block_size, (batch_size,))

    #cut block_size + 1 tokens per offset, then split every window into
    #inputs (all but the last token) and targets (all but the first token)
    windows = [source[s : s + block_size + 1] for s in starts.tolist()]
    x = torch.stack([w[:-1] for w in windows])
    y = torch.stack([w[1:] for w in windows])
    return x, y

#draw one training batch and show the inputs next to their targets
xb, yb = get_batch('train')
print('inputs:', xb.shape, xb, sep='\n')
print('\ntargets:', yb.shape, yb, sep='\n')

print("-------")

#for b in range(batch_size):
#    for t in range(block_size):
#        context = xb[b, : t + 1]
#        target = yb[b , t]
#        print(f'when input is: {context.tolist()} target is: {target}')
    

inputs:
torch.Size([4, 8])
tensor([[43, 52, 42, 47, 52, 45,  1, 53],
        [24, 21, 26, 19, 14, 30, 27, 23],
        [44, 43,  8,  0,  0, 28, 13, 30],
        [56, 43, 39, 58,  1, 58, 46, 43]])

targets:
torch.Size([4, 8])
tensor([[52, 42, 47, 52, 45,  1, 53, 60],
        [21, 26, 19, 14, 30, 27, 23, 17],
        [43,  8,  0,  0, 28, 13, 30, 21],
        [43, 39, 58,  1, 58, 46, 43,  1]])
-------


In [12]:
print(xb) #input to the model: the batch of token ids sampled by get_batch above

tensor([[43, 52, 42, 47, 52, 45,  1, 53],
        [24, 21, 26, 19, 14, 30, 27, 23],
        [44, 43,  8,  0,  0, 28, 13, 30],
        [56, 43, 39, 58,  1, 58, 46, 43]])


### bigram

In [20]:
class BigramModel(nn.Module):
    """Bigram language model: next-token logits depend only on the current token.

    The whole model is one ``vocab_size x vocab_size`` embedding table; looking
    up token ``i`` returns the row of logits over the token that follows ``i``.
    """

    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: integer tensor of token ids, expected shape (B, T).
            targets: optional integer tensor of token ids holding B*T
                elements, one target per position of ``idx``.

        Returns:
            ``(logits, loss)``. Without targets, ``logits`` has shape
            (B, T, C) and ``loss`` is ``None``. With targets, ``logits`` is
            returned flattened to (B*T, C) and ``loss`` is the cross-entropy
            between those logits and the flattened targets.
        """
        logits = self.token_embedding_table(idx)
        #identity check: `==` is a comparison operator on tensors, so it is
        #the wrong tool for asking "was this argument passed at all?"
        if targets is None:
            loss = None
        else:
            #cross_entropy is given one row of C logits per target here,
            #so fold the batch and time dimensions together
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    @torch.no_grad()  #sampling needs no gradients, so skip building the autograd graph
    def generate(self, idx, max_new_tokens):
        """Extend ``idx`` by sampling ``max_new_tokens`` tokens, one at a time.

        Args:
            idx: integer tensor of token ids, shape (B, T), used as the prompt.
            max_new_tokens: number of tokens to append to every sequence.

        Returns:
            Integer tensor of shape (B, T + max_new_tokens): the prompt
            followed by the sampled tokens.
        """
        for _ in range(max_new_tokens):
            logits, loss = self(idx)
            #keep only the prediction made at the last position of each sequence
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

#untrained model: run the forward pass on one batch and look at the logits shape and loss
model01 = BigramModel(vocab_size)
out, loss = model01(xb, yb)
print(out.shape, loss)
#start from a single token with id 0, sample 100 new tokens and decode them to text
print(decode(model01.generate(torch.zeros((1, 1), dtype=torch.long), max_new_tokens=100)[0].tolist()))

torch.Size([32, 65]) tensor(4.9944, grad_fn=<NllLossBackward0>)

lHK:yO&zjcg;xRuMIBGTW&SqtPamGbDQnnz,g,a.tX!gsK:fOozKCKnu
ZsCmn?aKnzF-w??jDhhF!Nw$fnhpfVh?wKOoJAgsqHc


In [21]:
#AdamW over all of model01's parameters, learning rate 1e-3
optimizer = torch.optim.AdamW(model01.parameters(), lr=1e-3)

In [25]:
#NOTE: this rebinds the global batch_size that get_batch reads at call time,
#so every batch drawn from here on holds 32 sequences instead of 4
batch_size = 32
for steps in range(10000):

    #sample a fresh random batch from the training split
    xb, yb = get_batch('train')

    #forward pass, then drop the old gradients, backpropagate and update the weights
    logits, loss = model01(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

#loss of the final batch only, not an average over the run
print(loss.item())

2.53452467918396


In [28]:
#sample 100 tokens from model01 again (same single-token prompt with id 0) and decode them to text
print(decode(model01.generate(torch.zeros((1, 1), dtype=torch.long), max_new_tokens=100)[0].tolist()))




Pheeroutodvea strous, sis neravo instureakiet malavenige tind RCisheyease tons rvetelo ain eet, t.


### a