In [65]:
import torch.nn as nn
from torch.nn import functional as F

In [116]:
learning_rate = 2e-4
max_iters = 50000

In [102]:
import torch

def check_gpu():
    if torch.cuda.is_available():
        device = 'cuda'
        print("GPUs are available.")
        print(f"Number of GPUs: {torch.cuda.device_count()}")
        print(f"Device is set to: {device}")
        for i in range(torch.cuda.device_count()):
            print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
    else:
        print("No GPUs available.")
        device = 'cpu'
        print(f"Device is set to: {device}")
    return device

device = check_gpu()


GPUs are available.
Number of GPUs: 1
Device is set to: cuda
GPU 0: NVIDIA GeForce RTX 4070


In [103]:
block_size = 800
batch_size = 400

In [104]:
with open('wizard_of_oz.txt', 'r' , encoding='utf-8') as f:
    text= f.read()
chars = sorted(set(text))
print(chars)
vocab_size = len(chars)
print(vocab_size)

['\n', ' ', '!', '"', '$', '%', '&', "'", '(', ')', '*', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '¹', '‒', '—', '―', '‘', '’', '“', '”', '•', '™', '♠', '♦', '\ufeff']
96


In [105]:
string_to_int = { ch:i for i ,ch in enumerate(chars) }
int_to_string = { i:ch for i ,ch in enumerate(chars) }
encode = lambda s: [string_to_int[c] for c in s]
decode = lambda l: ''.join([int_to_string[i] for i in l])

In [106]:
print(decode(encode('hello')))

hello


In [107]:
data = torch.tensor(encode(text), dtype = torch.long)

In [108]:
print(data[:100])

tensor([95, 31, 42, 45, 42, 47, 35, 52,  1, 28, 41, 31,  1, 47, 35, 32,  1, 50,
        36, 53, 28, 45, 31,  1, 36, 41,  1, 42, 53,  0,  0,  1,  1, 29, 52,  0,
         0,  1,  1, 39, 13,  1, 33, 45, 28, 41, 38,  1, 29, 28, 48, 40,  0,  0,
         1,  1, 28, 48, 47, 35, 42, 45,  1, 42, 33,  1, 47, 35, 32,  1, 50, 36,
        53, 28, 45, 31,  1, 42, 33,  1, 42, 53, 11,  1, 47, 35, 32,  1, 39, 28,
        41, 31,  1, 42, 33,  1, 42, 53, 11,  1])


In [109]:
n = int(0.8 * len(data))
train_data = data[:n] 
val_data = data[n:] 


def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    # print(ix)
    x = torch.stack([data[i:i+block_size]for i in ix])
    y = torch.stack([data[i+1:i+block_size+1]for i in ix])
    return x, y    

In [110]:
x,y = get_batch('train')
print(x)
print(y)

tensor([[64, 61,  1,  ..., 61, 70, 61],
        [ 1, 58, 61,  ..., 76, 64, 61],
        [58, 76,  1,  ...,  1, 39, 71],
        ...,
        [57, 59, 64,  ..., 71, 74, 60],
        [77, 72,  1,  ..., 71,  1, 64],
        [75, 64, 61,  ..., 70, 80, 65]])
tensor([[61,  1, 74,  ..., 70, 61, 69],
        [58, 61, 62,  ..., 64, 61,  1],
        [76,  1, 71,  ..., 39, 71, 74],
        ...,
        [59, 64, 65,  ..., 74, 60, 11],
        [72,  1, 76,  ...,  1, 64, 57],
        [64, 61,  1,  ..., 80, 65, 71]])


In [111]:
block_size = 8 

x = train_data[:block_size]
y = train_data[1:block_size+1]

for t in range(block_size):
    context = x[:t+1]
    target = x[t]
    print('when input is', context, 'target is', target)

when input is tensor([95]) target is tensor(95)
when input is tensor([95, 31]) target is tensor(31)
when input is tensor([95, 31, 42]) target is tensor(42)
when input is tensor([95, 31, 42, 45]) target is tensor(45)
when input is tensor([95, 31, 42, 45, 42]) target is tensor(42)
when input is tensor([95, 31, 42, 45, 42, 47]) target is tensor(47)
when input is tensor([95, 31, 42, 45, 42, 47, 35]) target is tensor(35)
when input is tensor([95, 31, 42, 45, 42, 47, 35, 52]) target is tensor(52)


In [112]:
x, y = get_batch('train')
x,y = x.to(device) , y.to(device)
print(f"The shape of x is: {x.shape}")
print(f"x is: {x}",  end='\n\n')
print(f"The shape of y is: {y.shape}")
print(f"y is: {y}")

The shape of x is: torch.Size([400, 8])
x is: tensor([[61,  0, 65,  ..., 74, 61, 68],
        [61,  1, 68,  ...,  1, 71, 62],
        [11,  1, 64,  ...,  1, 75, 71],
        ...,
        [57,  1, 63,  ..., 57, 76,  1],
        [ 1, 61, 80,  ..., 72, 68, 57],
        [75,  1, 57,  ..., 71,  1, 69]], device='cuda:0')

The shape of y is: torch.Size([400, 8])
y is: tensor([[ 0, 65, 70,  ..., 61, 68, 65],
        [ 1, 68, 57,  ..., 71, 62,  1],
        [ 1, 64, 65,  ..., 75, 71, 74],
        ...,
        [ 1, 63, 74,  ..., 76,  1, 79],
        [61, 80, 61,  ..., 68, 57, 74],
        [ 1, 57, 68,  ...,  1, 69, 81]], device='cuda:0')


In [113]:
class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size,vocab_size)

    def forward(self, index, targets=None):
        # print(f"index shape is {index.shape}")
        logits = self.token_embedding_table(index)
        # print(f"Logits shape is: {logits.shape}")
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            # print(f"Logits shape after reshape is: {logits.shape}")
            # print(f"Targets shape is: {targets.shape}")
            targets = targets.view(B*T)
            # print(f"Targets shape after reshape is: {targets.shape}")
            loss = F.cross_entropy(logits, targets)
        return logits, loss
    
    def generate(self, index, max_new_tokens):
        for _ in range(max_new_tokens):
            logits, loss = self.forward(index)
            # print(f"Logits shape is: {logits.shape}")
            logits = logits[:,-1,:]
            # print(f"Logits shape after reshape is: {logits.shape}")
            probs = F.softmax(logits, dim= -1)
            # print(f"Probs shape is: {probs.shape}")
            index_next = torch.multinomial(probs, num_samples=1)
            # print(f"Index next shape is: {index_next.shape}")
            index = torch.cat((index,index_next), dim=1)
            # print(f"Index shape is: {index.shape}")
        return index
    
model = BigramLanguageModel(vocab_size)
m = model.to(device)

context = torch.zeros((1,1), dtype=torch.long, device=device)
generated_chars = decode(m.generate(context, max_new_tokens=500)[0].tolist())
print(generated_chars)



Y——k;•w™!-3H‘)s88d_]”l8♦0•c(NifuYjR—p4tK
dm.“D56L64g;w/FD¹5YF$2TqWJ&)*"_h9?,v1f]“:F3♦dxr―/:Y¹l].w0(E?™f16ngb.RenynsQlz""Jj_59AG7Mo'♦qF,sh“?W:MHY!r•oQk6sW﻿NdYsBMP%]Yooqt:;uTe182I,jV%]Xuz!(bK/”Bl’28d-D66;h'•SSsguD5_58“J-/3-wAlHuDl’oght(%HcJ’4te1V_‘8Pa$Cw"WO1L”(6Z“*uGO3jv—pqrf’dcU‒oghx.w$1”b/o1“$ p:CW-1:5,Zqy‘Qc7$I﻿q―;w$gZjaxg8tx¹!NP-:M• ”i[gf7X]﻿7bHlrLEZ/f!Y3”-R%N3!-3XmHfy?xg?0t♦J:mEX—(E3ON♦l4H&T,eHk9p﻿.hp―$0AxDMV—7X]/t%cu]tJ—o—qkV8nPn9h'jO0iM―j-r―r7rn
LEtR“:Irn―D5%1(f/w;xEWOZs—EPh)9-“EKI﻿rf1:SbH,


In [117]:
from tqdm.auto import tqdm
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in tqdm(range(max_iters)):

    xb, yb = get_batch('train')

    logits, loss = model.forward(xb.to(device), yb.to(device))
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print(loss.item())

100%|██████████| 50000/50000 [08:12<00:00, 101.60it/s]

2.4469997882843018





In [120]:
context = torch.zeros((1,1), dtype=torch.long, device=device)
generated_chars = decode(m.generate(context, max_new_tokens=500)[0].tolist())
print(generated_chars)



cure,"Jiom
" hainy s, pleneritif w Ind my anghasoathis O widery ashomisitin ch. ttoprireedethicad thesthesh aro t thedrithaise l_Lof inonere stode keas He
ng e ware ♠  wo ulte hatan d prs heas ch thesouns s Hime hyowoos n t  Buthes mpe. oy  5. t  an, pthiby he THatugagifupicaiked _.
"  he,"I se s
tod tl ong aro  abllilencapy:
fret outh ofo  mana's me EThinotirnn aichese t Lo foave et fo htigathangd rd risse thevinsive _ owher I ss angn ad 127. ftte; bed Ofos
to
he ut'merged touanklye  fesigred,
