In [11]:
import torch
import torch.nn as nn
from torch.nn import functional as F
device = 'cuda' if torch.cuda.is_available() else 'cpu' 
device
block_size = 8
batch_size = 4
max_iters = 10000
learning_rate = 3e-4
eval_iters = 250

In [12]:
torch.cuda.get_device_capability()

(7, 5)

In [13]:
with open('dorian picture.txt','r', encoding='utf-8') as f:
    text = f.read()
print(text[:200])

﻿The Picture of Dorian Gray

by Oscar Wilde


Contents

 THE PREFACE
 CHAPTER I.
 CHAPTER II.
 CHAPTER III.
 CHAPTER IV.
 CHAPTER V.
 CHAPTER VI.
 CHAPTER VII.
 CHAPTER VIII.
 CHAPTER IX.
 CHAPTER X.



In [14]:
chars = sorted(set(text))
print(chars)
vocab_size = len(chars)

['\n', ' ', '!', ',', '-', '.', '0', '1', '2', '5', '8', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'É', 'à', 'æ', 'ç', 'è', 'é', 'ê', 'ô', '—', '‘', '’', '“', '”', '\ufeff']


# Lets create an encode and decode function 

Only for the characters that are present in the book.

In [15]:
string_to_int = { ch:i for i,ch in enumerate(chars) }
int_to_string = {i:ch for i,ch in enumerate(chars) }
encode = lambda s: [string_to_int[c] for c in s]
decode = lambda s2: [int_to_string[c] for c in s2]

data = torch.tensor(encode(text), dtype = torch.long)

In [16]:
data

tensor([80, 33, 48,  ..., 17,  0,  0])

# Get get_batch

In [17]:
n = int(0.8*len(data))
train_data = data[:n]
val_data = data[n:]

def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    #print(ix)
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size + 1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x,y
x,y = get_batch('train')
print('inputs:')
print(x)
print('targets:')
print(y)

tensor([271516, 336937, 320557,  90827])
inputs:
tensor([[49, 56, 59,  1, 41, 54, 44,  1],
        [53, 59, 45, 52, 62, 45, 59,  1],
        [58,  1, 52, 49, 60, 60, 52, 45],
        [45,  0, 41, 42, 55, 61, 60,  1]], device='cuda:0')
targets:
tensor([[56, 59,  1, 41, 54, 44,  1, 44],
        [59, 45, 52, 62, 45, 59,  1, 55],
        [ 1, 52, 49, 60, 60, 52, 45,  1],
        [ 0, 41, 42, 55, 61, 60,  1, 65]], device='cuda:0')


# Bigram Language Model 

What are we doing here:


In [18]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train','val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [19]:
class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size,vocab_size)

    def forward(self, index, targets = None):
        logits = self.token_embedding_table(index)

        if targets is None:
            loss = None
        else:
            B,T,C = logits.shape
            logits = logits.view(B*T,C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss
    
    def generate(self,index, max_new_tokens):
        for _ in range(max_new_tokens):
            logits, loss = self.forward(index)

            logits = logits[:, -1, :]
            probs = F.softmax(logits,dim=-1)
            index_next = torch.multinomial(probs, num_samples=1)
            index = torch.cat((index,index_next),dim=1)
        return index
    
model = BigramLanguageModel(vocab_size)
m = model.to(device)

context = torch.zeros((1,1), dtype = torch.long, device=device)
generated_chars = decode(m.generate(context, max_new_tokens = 500)[0].tolist())
print(generated_chars)

['\n', 'Z', '’', 'x', 'a', 'ô', 'w', 'z', 'b', 'W', 'G', ',', 'l', '\ufeff', 'F', '-', 't', 'C', 'U', 'S', 'P', 'P', 'c', ',', 'M', 'é', 'e', 'X', 'C', 'U', 'T', ';', 'B', 't', 'K', 'l', ':', 'O', ':', 'Q', 'K', 'T', ' ', '‘', ';', 'D', 'l', ',', 'p', '_', ',', 'L', ' ', 'f', ';', 'B', 'q', 'U', 'O', ':', 'l', 'H', '1', 'z', 'J', 'w', 'N', ':', 'W', 'æ', 'v', 'à', 'ê', 'J', 'i', '\n', 'u', 'O', '“', '\n', 'S', '.', 'v', '1', 'H', 's', '.', 'Y', 'z', 'k', 'ê', 'y', 'V', 'q', 'e', '!', '‘', 'æ', '-', 'c', 'é', 'g', 'k', '-', 'e', '\ufeff', 'g', 'k', 'U', '_', 'B', 'm', 'ô', 'M', '.', '2', 'è', 'H', 'è', 'H', '\ufeff', '_', '—', 'b', 'o', 'r', 'E', 'd', '0', 'ô', 'X', ',', 'M', 'V', '\n', 'æ', 'f', 'J', 'd', 'k', '.', 'z', 'p', 'U', 'F', 'é', 'e', 'G', '’', 'P', 't', 'X', 'D', '2', 'Q', 'L', 'h', 'ê', 'n', 'É', ';', 'g', '-', 'è', 't', 'r', '5', 'k', 'L', '.', 'u', 'G', 'O', 'h', '-', '!', 'R', 'Y', 'e', '\ufeff', 'I', 'r', 'O', 'Q', 'g', 'N', '5', 'A', 'p', '_', '1', 'j', 'm', 'k', 'c', 

# Optimizer

In [None]:
optimizer = torch.optim.AdamW(model.parameters(), lr = learning_rate)

for iter in range(max_iters):
    if iter % eval_iters == 0:
        losses = estimate_loss()
        print(f"strp: {iter}, train loss: {losses['train']:.3f}, val loss: {losses['val']:.3f}")

    xb, yb = get_batch('train')

    logits, loss = model.forward(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
print(loss.item())

tensor([212264, 104359, 340503,  34748])
tensor([147026, 244966, 195608, 125972])
tensor([323794, 225273, 158059, 329857])
tensor([159718, 272219, 239288,  71791])
tensor([ 62675, 339077, 146728, 159484])
tensor([ 33155,  65462, 305143, 127683])
tensor([ 95126, 114613, 260365,  39832])
tensor([206289, 269322, 284634, 333760])
tensor([260939, 334909, 186182, 289730])
tensor([170096, 274196, 134259, 115544])
tensor([298762,   3244, 258946, 261386])
tensor([301218, 309378, 111857, 143225])
tensor([ 10159,  95651, 293402, 261990])
tensor([169836, 255806,  44119, 236136])
tensor([248383, 326672,   8946,  34669])
tensor([250856,  27866, 192105, 308380])
tensor([264141, 277749, 230491,  33838])
tensor([107021, 167881,  54500, 261138])
tensor([222020, 289885, 316895, 267072])
tensor([111596, 254702, 324637,  73109])
tensor([ 17524,  55611,  45364, 163674])
tensor([158193, 210824, 168591, 109481])
tensor([ 78347, 227078,  82065, 201220])
tensor([ 73107, 162282, 236943, 314106])
tensor([219310, 