In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F
device= 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)
block_size = 8
batch_size = 4
max_iters = 1000
#eval_interval = 2500
learning_rate = 3e-4
eval_iters = 250

cuda


In [2]:
with open('wizardofoz.txt','r',encoding = 'utf-8') as f:
    text = f.read()
chars = sorted(set(text))
print(chars)
vocsize = len(chars)

['\n', ' ', '!', '"', '&', "'", '(', ')', '*', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '\ufeff']


In [3]:
string_to_int = {ch:i for i,ch in enumerate(chars)}
int_to_string = {i:ch for i,ch in enumerate(chars)}
encode = lambda s: [string_to_int[c] for c in s]
decode = lambda l: ''.join([int_to_string[i] for i in l])

data = torch.tensor(encode(text),dtype=torch.long)
# print(data[:100])

In [4]:
n = int(0.8*len(data))
train_data = data[:n]
val_data = data[n:]

In [5]:
def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data)-block_size,(batch_size,))
    # print(ix)
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x,y = x.to(device),y.to(device)
    return x,y
x,y = get_batch('train')
print('inputs:')
print(x)
print('targets:')
print(y)

inputs:
tensor([[73,  1, 76, 68, 71, 71, 78,  9],
        [56, 71, 54, 56, 64,  1, 54, 67],
        [61, 54, 73,  1, 73, 71, 58, 66],
        [58,  1, 71, 68, 68, 59,  1, 72]], device='cuda:0')
targets:
tensor([[ 1, 76, 68, 71, 71, 78,  9,  3],
        [71, 54, 56, 64,  1, 54, 67, 57],
        [54, 73,  1, 73, 71, 58, 66, 55],
        [ 1, 71, 68, 68, 59,  1, 72, 68]], device='cuda:0')


In [6]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train','val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X,Y = get_batch(split)
            logits,loss = model(X,Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [7]:

# x = train_data[:block_size]
# y = train_data[1:block_size+1]
# for t in range(block_size):
#     context = x[:t+1]
#     target = y[t]
#     print("when input is", context, "target is", target)

In [8]:
class BigramLanguageModel(nn.Module):
    def __init__(self, vocsize):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocsize,vocsize)
    def forward(self,index,targets = None):
        logits = self.token_embedding_table(index)

        if targets is None:
            loss = None
        else:
            B,T,C = logits.shape
            logits = logits.view(B*T,C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits,targets)
            
        return logits,loss
    def generate(self,index,max_new_tokens):
        for _ in range(max_new_tokens):
            # get the predictions
            logits,loss = self.forward(index)
            # focus on only thelast time step
            logits = logits[:,-1,:]
            # apply softmax to get the probabilities
            probs = F.softmax(logits,dim = -1)
            # sample from the distribution
            index_next = torch.multinomial(probs,num_samples = 1)
            index = torch.cat((index,index_next),dim = 1)
        return index
model = BigramLanguageModel(vocsize)
m = model.to(device)

context = torch.zeros((1,1),dtype = torch.long,device = device)
generated_chars = decode(m.generate(context, max_new_tokens = 500)[0].tolist())
print(generated_chars)

        


4&gXB:[r&5IQL?WW)]l.b?59oSln"t 8VukNuq7GCwE.Mv&Vn﻿Yli7_vDh)l4z3Z75AQ1xk5v
467V43;uH_-dw(pHeyrV2&hj0)﻿t6﻿xgiQwEhT-Nb&*:ED﻿&:X]jj6Xv)RO[**kN;q8sVn[MZgMvz7&011iZFK;ibeX AbF04rwv1xsMtK﻿z')6MlYh]q
q,:yaXg?";&hXzn25)o5L4b&9_8sMs("C)r&W0G6ykN7YL_yyUCP3 dQskW8K)Q8eIK2xmJ7ITg5"5m-fT9bgNu)(;npnk﻿QrXT_-BqhExU'?oSrbK﻿JHcwEQM﻿mMW;Ea] E_RFKvZ7YpA,"u2[n 4Ob;P)Vg-tJy&Z,LYLsNu.PDRLW.4*ykNuDl,:)2
4W_HBD"ze,?yf,AQo[C)O7v1g:zIQ1AQ1xWGzEzVOlkNZR'?,no0VcDd
49q7IU9]KNgIKQGVF]FQougMNK"9Ds[mA 4(1x?&D]C2qoWs0W.(﻿bwX58s)v


In [14]:
optimizer = torch.optim.AdamW(model.parameters(),lr=learning_rate)
for iter in range(max_iters):
    if iter % eval_iters == 0:
        losses = estimate_loss()
        print(f"step: {iter}, train loss {losses['train']:.4f}, val loss: {losses['val']:.4f}")

    #sample a batch of data
    xb,yb = get_batch('train')

    #evaluate the loss
    logits,loss= model.forward(xb,yb)
    optimizer.zero_grad(set_to_none = True)
    loss.backward()
    optimizer.step()

print(loss.item())

step: 0, train loss 4.1981, val loss: 4.2135
step: 250, train loss 4.1518, val loss: 4.1389
step: 500, train loss 4.0912, val loss: 4.0738
step: 750, train loss 4.0304, val loss: 4.0575
3.7339131832122803


In [15]:
context = torch.zeros((1,1),dtype = torch.long,device = device)
generated_chars = decode(m.generate(context,max_new_tokens= 500)[0].tolist())
print(generated_chars)


vd"AZ8T4KX_dbT5U!ZY﻿YEH7?oug1T&CQNbw.c!3pe":gi37j2[[d
HeKr0X'4D31pa"W*mI MR'wEIl4L?D'culOj)v.!B",nh_:YhXBX 4w,sY"nJN8K﻿pafEIq
TiGwirCSimC7K)Qr sO7.xKXxI['4F:&24Oj2Wm3C)t4]H:?li9uv7:NuB67(n&AMKrc77edbtu﻿ld7AbNuh[y:CQ*:Yyt_"AsiR]jeopYJRkGa]﻿drzq8lp,9p[[vJMPoSJeR&2-eybHpY--clO7
btr2;h2x""VuI&67yUQouliM--NKv&xqWj;zq:Sy)rd7q)pxo lv00BEzeap_PB_"beIGXx lD'?& coaK*,epY(1HJw_﻿0AF]2'5ej]Hwi_ch]qMttujZ
J:kGVksFKEa?5VZk I86hSWYoupnuY1.vJ]ioWX﻿0"n_rHclB?o-mHch2fem,".&ghK&ZlizqL])qwE2vO" bFKrtev 44:yC*B2
43tX


ReLU, Sigmoid, TanH
ReLU: 