In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F
device = 'cuda' if torch.cuda.is_available() else 'cpu' 
block_size = 64
batch_size = 128
max_iters = 3000
learning_rate = 3e-3
eval_iters = 100
n_embd = 384
n_head = 4
n_layer = 4
dropout = 0.2

In [3]:
with open('dorian picture.txt','r', encoding='utf-8') as f:
    text = f.read()
print(text[200:400])

The Picture of Dorian Gray

by Oscar Wilde


Contents

 THE PREFACE
 CHAPTER I.
 CHAPTER II.
 CHAPTER III.
 CHAPTER IV.
 CHAPTER V.
 CHAPTER VI.
 CHAPTER VII.
 CHAPTER VIII.
 CHAPTER IX.
 CHAPTER X.



In [4]:
chars = ""
vocab_size = len(chars)

['\n', ' ', '!', ',', '-', '.', '0', '1', '2', '5', '8', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'É', 'à', 'æ', 'ç', 'è', 'é', 'ê', 'ô', '—', '‘', '’', '“', '”', '\ufeff']


# Lets create an encode and decode function 

Only for the characters that are present in the book.

In [5]:
string_to_int = { ch:i for i,ch in enumerate(chars) }
int_to_string = {i:ch for i,ch in enumerate(chars) }
encode = lambda s: [string_to_int[c] for c in s]
decode = lambda s2: [int_to_string[c] for c in s2]

data = torch.tensor(encode(text), dtype = torch.long)

In [6]:
data

tensor([80, 33, 48,  ..., 17,  0,  0])

# Get get_batch

In [7]:
n = int(0.8*len(data))
train_data = data[:n]
val_data = data[n:]

def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    #print(ix)
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size + 1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x,y
x,y = get_batch('train')
print('inputs:')
print(x)
print('targets:')
print(y)

inputs:
tensor([[65,  3,  1, 65, 55, 61, 58,  1],
        [20, 58, 41, 65,  0, 48, 41, 59],
        [58, 45,  1, 41, 54, 44,  1, 32],
        [45, 59, 59,  1, 48, 41, 62, 45]], device='cuda:0')
targets:
tensor([[ 3,  1, 65, 55, 61, 58,  1, 62],
        [58, 41, 65,  0, 48, 41, 59,  1],
        [45,  1, 41, 54, 44,  1, 32, 55],
        [59, 59,  1, 48, 41, 62, 45,  1]], device='cuda:0')


# GPT Language Model 

What are we doing here:

Adding some features to our previous Bigram model

In [8]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train','val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [9]:
class Block(nn.Module):
    def __init__(self, n_embd, n_head):
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, hed_size)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)
        
    def forward(self, x):
        y = self.sa(x)
        x = self
        
class GPT1LanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size,n_embd)
        self.position_embedding_table = nn.Embedding(block_size,n_embd) # - info ~3:30
        self.blocks = nn.Sequential(*[Block(n_embd, n_head = n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)
        
        self.apply(self._init_weights)
    # info ~3:55
    def _init_weights(self,module):
        if isinstance(module,nn.Linear):
            torch.nn.init.normal_(module.weight,mean = 0.0, std = 0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean = 0.0, std = 0.02)
            

    def forward(self, index, targets = None):
        logits = self.token_embedding_table(index)

        tok_emb = self.token_embedding_table(idx)
        pos_emb = self.position_embedding_table(torch.arrange(T, device = device))
        x = tok_emb + pos_emb # B,T,C
        x = self.blocks(x) # B,T,C
        x = self.ln_f(x) # B,T,C
        logits = self.lm_head(x)# B,T,vocab_size

        if targets is None:
            loss = None
        else:
            B,T,C = logits.shape
            logits = logits.view(B*T,C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss
    
    def generate(self,index, max_new_tokens):
        for _ in range(max_new_tokens):
            logits, loss = self.forward(index)

            logits = logits[:, -1, :]
            probs = F.softmax(logits,dim=-1)
            index_next = torch.multinomial(probs, num_samples=1)
            index = torch.cat((index,index_next),dim=1)
        return index
    
model = GPT1LanguageModel(vocab_size)
m = model.to(device)

context = torch.zeros((1,1), dtype = torch.long, device=device)
generated_chars = decode(m.generate(context, max_new_tokens = 500)[0].tolist())
print(generated_chars)

['\n', 't', '2', '”', 'A', 'O', '“', 'æ', 'd', '_', 'à', 'q', 'u', 'M', 'C', 'F', 'J', '1', 'à', 'r', 'æ', 'o', 'C', '1', '“', 's', 'k', 'h', 'z', 'U', '“', 'j', 'N', 'Z', 'G', '1', 'g', 'é', '“', '.', 's', 'E', 'c', '\ufeff', '5', 'r', 'p', '0', 'B', '2', 'M', 'v', '8', 'Z', 's', ' ', 'L', 'Q', 'c', 'N', 'j', 'ê', 'V', 'é', 'V', 'N', 'Z', 'D', 'I', 'S', 'F', 'q', '-', 'é', 'V', 'H', 'ô', 'R', 'O', '\ufeff', 'é', '\ufeff', ':', 'O', 'é', 'A', 'Q', 'f', 'y', 'b', 'V', 'b', 'V', 'I', 'ô', 'à', 'q', 'f', 'U', 'e', 'ê', 'V', 'K', '—', 'ç', '_', 'é', '\ufeff', 'x', 'o', 'ç', '-', '’', 'G', 's', 'z', 'U', '“', 's', '5', '.', 'V', 'E', 'L', '_', 'E', 'æ', 'z', '\ufeff', 'A', 'N', 'j', 'k', 'x', '1', '”', '-', 'O', 'b', '\ufeff', 'e', 'I', 'p', 'i', ' ', 'n', '0', 'r', 'd', 'f', '2', '?', 'l', 'y', 'x', 'k', 'k', 'a', 'y', 'b', 'P', 'P', '-', 'g', 'n', '0', '\ufeff', '0', 'x', 'w', 'O', '0', 'Z', 'x', 'K', 't', 'l', '“', 'Y', ':', 'L', '2', 'à', 'A', 'H', 'd', 'l', 'n', '0', 'c', ',', 'v', 'ê'

# Optimizer

In [10]:
optimizer = torch.optim.AdamW(model.parameters(), lr = learning_rate)

for iter in range(max_iters):
    if iter % eval_iters == 0:
        losses = estimate_loss()
        print(f"strp: {iter}, train loss: {losses['train']:.3f}, val loss: {losses['val']:.3f}")

    xb, yb = get_batch('train')

    logits, loss = model.forward(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
print(loss.item())

strp: 0, train loss: 4.755, val loss: 4.743
strp: 250, train loss: 4.700, val loss: 4.709
strp: 500, train loss: 4.614, val loss: 4.646
strp: 750, train loss: 4.570, val loss: 4.577
strp: 1000, train loss: 4.508, val loss: 4.533
strp: 1250, train loss: 4.454, val loss: 4.463
strp: 1500, train loss: 4.400, val loss: 4.404
strp: 1750, train loss: 4.350, val loss: 4.343
strp: 2000, train loss: 4.297, val loss: 4.275
strp: 2250, train loss: 4.226, val loss: 4.232
strp: 2500, train loss: 4.174, val loss: 4.178
strp: 2750, train loss: 4.145, val loss: 4.133
strp: 3000, train loss: 4.067, val loss: 4.075
strp: 3250, train loss: 4.034, val loss: 4.033
strp: 3500, train loss: 3.987, val loss: 3.997
strp: 3750, train loss: 3.956, val loss: 3.944
strp: 4000, train loss: 3.910, val loss: 3.904
strp: 4250, train loss: 3.875, val loss: 3.856
strp: 4500, train loss: 3.829, val loss: 3.836
strp: 4750, train loss: 3.776, val loss: 3.787
strp: 5000, train loss: 3.727, val loss: 3.743
strp: 5250, train l