In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/shakespere-works/input.txt


In [2]:
with open('/kaggle/input/shakespere-works/input.txt','r',encoding='utf-8') as f:
    text=f.read()

In [3]:
print('len in text dataset',len(text))

len in text dataset 1115394


In [4]:
chars=sorted(list(set(text)))
vocab_size=len(chars)
print(''.join(chars))
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [5]:
stoi={ch:i for i,ch in enumerate(chars)}
itos={i:ch for i,ch in enumerate(chars)}


encode=lambda s:[stoi[c] for c in s]
decode=lambda l:''.join([itos[i] for i in l])

print(encode('hii there'))
print(decode(encode('hii there')))

[46, 47, 47, 1, 58, 46, 43, 56, 43]
hii there


In [6]:
import torch
data=torch.tensor(encode(text),dtype=torch.long)


In [7]:
n=int(0.9*len(data))
train_data=data[:n]
val_data=data[n:]

In [8]:
len(train_data)


1003854

In [9]:
torch.manual_seed(1337)
batch_size=4
block_size=8

def get_batch(split):
    data=train_data if split=='train' else val_data
    ix=torch.randint(len(data)-block_size,(batch_size,))
    x=torch.stack([data[i:i+block_size] for i in ix])
    y=torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x,y

xb,yb=get_batch('train')
print('x_train')
print(xb.shape)
print(xb)
print(yb.shape)
print(yb)


print('------')
for b in range(batch_size):
    for t in range(block_size):

        context=xb[b,:t+1]
        
        
        target=yb[b,t]
        print(f'when input is {context.tolist()} the target:{target}')

x_train
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
------
when input is [24] the target:43
when input is [24, 43] the target:58
when input is [24, 43, 58] the target:5
when input is [24, 43, 58, 5] the target:57
when input is [24, 43, 58, 5, 57] the target:1
when input is [24, 43, 58, 5, 57, 1] the target:46
when input is [24, 43, 58, 5, 57, 1, 46] the target:43
when input is [24, 43, 58, 5, 57, 1, 46, 43] the target:39
when input is [44] the target:53
when input is [44, 53] the target:56
when input is [44, 53, 56] the target:1
when input is [44, 53, 56, 1] the target:58
when input is [44, 53, 56, 1, 58] the target:46
when input is [44, 53, 56, 1, 58, 46] the

In [11]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    
    def __init__(self,vocab_size):
        super().__init__()
        self.token_embedding_table=nn.Embedding(vocab_size,vocab_size)
        
    def forward(self,idx,targets=None):
        logits=self.token_embedding_table(idx)
        
        if targets is None:
            loss=None
            
        else:
        
            B,T,C=logits.shape
            logits=logits.view(B*T,C)
            targets=targets.view(B*T)
            loss=F.cross_entropy(logits,targets)
            
        return logits,loss
        
        
    def generate(self,idx,max_new_tokens):
        for _ in range(max_new_tokens):
            logits,loss=self(idx)
            logits=logits[:,-1,:]

            probs=F.softmax(logits,dim=1)
            
            idx_next=torch.multinomial(probs,num_samples=1)
            
            idx=torch.cat((idx,idx_next),dim=1)
            
        return idx


In [12]:
m=BigramLanguageModel(vocab_size)
logits,loss=m(xb,yb)
print(logits.shape)
print(loss)


print(decode(m.generate(idx=torch.zeros((1,1),dtype=torch.long)
,max_new_tokens=100)[0].tolist()))

torch.Size([32, 65])
tensor(4.8786, grad_fn=<NllLossBackward0>)

Sr?qP-QWktXoL&jLDJgOLVz'RIoDqHdhsV&vLLxatjscMpwLERSPyao.qfzs$Ys$zF-w,;eEkzxjgCKFChs!iWW.ObzDnxA Ms$3


In [13]:
optimizer=torch.optim.Adam(m.parameters(),lr=1e-3)

In [14]:
batch_size=32

for steps in range(10000):
    xb,yb=get_batch('train')
    logits,loss=m(xb,yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    
print(loss.item())

2.572469472885132


In [15]:
print(decode(m.generate(idx=torch.zeros((1,1),dtype=torch.long)
,max_new_tokens=500)[0].tolist()))


Iyoteng h hasbe pave pirance
Rie hicomyonthar's
Plinseard ith henoure wounonthioneir thondy, y heltieiengerofo'dsssit ey
KIN d pe wither vouprrouthercckehathe; d!
My hind tt hinig t ouchos tes; st yo hind wotte grotonear 'so it t jod weancotha:
h hay.JUCle n prids, r loncave w hollular s O:
HIs; ht anjx?

DUThineent.

Lavinde.
athave l.
KEONGBUCHandspo be y,-hedarwnoddy scace, tridesar, wne'shenous s ls, theresseys
PlorseelapinghiybHen yof GLUCEN t l-t E:
I hisgothers w dere! ABer wotouciullle's


# The mathematical trick in self-attention

In [41]:
torch.manual_seed(1337)
B,T,C=4,8,2
x=torch.randn(B,T,C)
print(x.shape)

torch.Size([4, 8, 2])


In [54]:
xbow=torch.zeros((B,T,C))
for b in range(B):
    for t in range(T):
        xprev=x[b,:t+1]
        xbow[b,t]=torch.mean(xprev,0)
    
        

In [69]:
tn=torch.randn(4,8,32)
si=torch.randn()

In [55]:
wei=torch.tril(torch.ones(T,T))
wei=wei/wei.sum(1,keepdim=True)

xbow2=wei@x 
print(xbow2.shape)
print(xbow.shape)
torch.allclose(xbow,xbow2)

torch.Size([4, 8, 2])
torch.Size([4, 8, 2])


False

In [62]:
tril=torch.tril(torch.ones(T,T))
wei=torch.zeros((T,T))
wei=wei.masked_fill(tril==0,float('-inf'))
wei=F.softmax(wei,dim=-1)
xbow3=wei@x
torch.allclose(xbow2,xbow3)

True

In [16]:
# Define your input tensor (logits)
inputs = torch.tensor([[0.0, 0.2, 0.3]])  # Shape (batch_size, num_classes)
inputs=torch.nn.functional.softmax(inputs)
print(inputs)
# Define your target tensor (labels)
target = torch.tensor([1])  # Can be class indices (LongTensor)

# Calculate the loss
loss = F.cross_entropy(inputs, target)

# Print the loss value
print(loss)

tensor([[0.2800, 0.3420, 0.3780]])
tensor(1.0908)


  inputs=torch.nn.functional.softmax(inputs)


In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F
#from torch.nn import FeedForward

batch_size=32
block_size=8
max_iters=5000
eval_interval=300
learning_rate=1e-3
device='cuda' if torch.cuda.is_available() else 'cpu'
eval_iters=200
n_embd=384
n_head=6
n_layer=6
dropout=0.2

torch.manual_seed(1337)

with open(r"/kaggle/input/shakespere-works/input.txt",'r',encoding='utf-8') as f:


    text=f.read()

chars=sorted(list(set(text)))
vocab_size=len(chars)


stoi={ch:i for i,ch in enumerate(chars)}
itos={i:ch for i,ch in enumerate(chars)}

encode=lambda s:[stoi[c] for c in s]
decode=lambda l:[''.join(itos[i] for i in l)]


data=torch.tensor(encode(text),dtype=torch.long)
n=int(0.9*len(data))
train_data=data[:n]
val_data=data[n:]


def get_batch(split):
    data=train_data if split == 'train' else val_data
    ix=torch.randint(len(data)-block_size,(batch_size,))
    x=torch.stack([data[i:i+block_size] for i in ix])
    y=torch.stack([data[i+1:i+block_size+1] for i in ix])
    x,y=x.to(device),y.to(device)
    return x,y

@torch.no_grad()
def estimate_loss():
    out={}
    model.eval()
    for split in ['train','val']:
        losses=torch.zeros(eval_iters)
        for k in range(eval_iters):
            X,Y=get_batch(split)
            logits,loss=model(X,Y)
            losses[k]=loss.item()
        out[split]=losses.mean()
    model.train()
    return out

class Head(nn.Module):
    # one head of self-attention

    def __init__(self,head_size):
        super(Head,self).__init__()
        self.key=nn.Linear(n_embd,head_size,bias=False)
        self.query=nn.Linear(n_embd,head_size,bias=False)
        self.value=nn.Linear(n_embd,head_size,bias=False)

        self.register_buffer('trill',torch.tril(torch.ones(block_size,block_size)))
        self.dropout=nn.Dropout(dropout)

    def forward(self,x):
        B,T,C=x.shape
        k=self.key(x)
        q=self.query(x)

        wei=q@k.transpose(-2,-1)*C**-0.5
        wei=wei.masked_fill(self.trill[:T,:T]==0,float('-inf'))
        wei=F.softmax(wei,dim=-1)
        wei=self.dropout(wei)

        v=self.value(x)
        out=wei@v
        return out



class MultiHeadAttention(nn.Module):
    """Multiple heads of self attention in parallel"""

    def __init__(self,num_heads,head_size):
        super(MultiHeadAttention,self).__init__()
        self.heads=nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj=nn.Linear(n_embd,n_embd)
        self.dropout=nn.Dropout(dropout)
    def forward(self,x):
        out= torch.cat([h(x) for h in self.heads],dim=-1)

        out=self.dropout(self.proj(out))

        return out

class FeedForward(nn.Module):

    def __init__(self,n_embd):
        super(FeedForward,self).__init__()
        self.net=nn.Sequential(
            nn.Linear(n_embd,4*n_embd),
            nn.ReLU(),
            nn.Linear(4*n_embd,n_embd),
            nn.Dropout(dropout),
        )

    def forward(self,x):
        return self.net(x)


class Block(nn.Module):
    """ Transformer block: communication followed by computation"""

    def __init__(self,n_embd,n_head):
        super(Block,self).__init__()
        head_size=n_embd//n_head
        self.sa=MultiHeadAttention(n_head,head_size)
        self.ffwd=FeedForward(n_embd)
        self.ln1=nn.LayerNorm(n_embd)
        self.ln2=nn.LayerNorm(n_embd)

    def forward(self,x):
        x=x+self.sa(self.ln1(x))

        x=x+self.ffwd(self.ln2(x))
        return x




class BigramLanguageModel(nn.Module):

    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size,n_embd)
        self.blocks=nn.Sequential(*[Block(n_embd,n_head=n_head) for _ in range(n_layer)])
        self.ln_f=nn.LayerNorm(n_embd)
        self.lm_head=nn.Linear(n_embd,vocab_size)

    def forward(self, idx, targets=None):
        B,T=idx.shape

        tok_emb = self.token_embedding_table(idx)
        pos_emb = self.position_embedding_table(torch.arange(T,device=device))
        x=tok_emb+pos_emb
        x=self.blocks(x)
        x=self.ln_f(x)
        logits=self.lm_head(x)#(B,T, vocab_size)

        if targets is None:
            loss = None

        else:

            B, T, C = logits.shape
            logits = logits.view(B * T, C)


            targets = targets.view(B * T,)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):

            idx_cond=idx[:,-block_size:]
            logits, loss = self(idx_cond)
            logits = logits[:, -1, :]

            probs = F.softmax(logits, dim=1)

            idx_next = torch.multinomial(probs, num_samples=1)

            idx = torch.cat((idx, idx_next), dim=1)

        return idx


model=BigramLanguageModel()
m=model.to(device)

optimizer=torch.optim.Adam(model.parameters(),learning_rate)

for iter in range (max_iters):
     if iter % eval_interval==0:
        losses=estimate_loss()
        print(f"step{iter}: train loss {losses['train']:.4f},val loss {losses['val']:.4f}")


     xb,yb=get_batch('train')
     logits,loss=model(xb,yb)
     optimizer.zero_grad(set_to_none=True)
     loss.backward()
     optimizer.step()


context=torch.zeros((1,1),dtype=torch.long,device=device)
print(decode(m.generate(context,max_new_tokens=500)[0].tolist()))






step0: train loss 4.3633,val loss 4.3610
step300: train loss 2.3067,val loss 2.3192
step600: train loss 2.1691,val loss 2.2275
step900: train loss 2.1296,val loss 2.1878
step1800: train loss 2.0120,val loss 2.1390
step2700: train loss 1.9678,val loss 2.0727
step3000: train loss 1.9311,val loss 2.0419
step3300: train loss 1.9328,val loss 2.0557
step3600: train loss 1.9129,val loss 2.0277
step3900: train loss 1.8953,val loss 2.0055
step4200: train loss 1.8926,val loss 2.0103
step4500: train loss 1.8675,val loss 1.9924
step4800: train loss 1.8689,val loss 1.9874
["\nBale here.\n\nNo, fice it, she pregue\nRome I mer shonoule talk. What his fortle he susblors;\nHoush to have unidound is of, how yet the descian, by have befur deyears;\nWhich perison the know true crue aloun deay,\nAnd it see fierfices: avost the heart your ress of o munt your fatest?\nCome mabe fathers will.\n\nIS!'\nAh, refore wife, was paciinna;\nChillows thy grow by-treought,\nReetomes! anorenfnoe, whis.\n\nMBRUTUS:\nA ke

In [4]:
print(decode(m.generate(idx=torch.zeros((1,1),dtype=torch.long,device=device),max_new_tokens=5000)[0].tolist()))

["\nShall Godly pear\nI tare kintruse\na thee frould, but to and thy cous helps,\nIs to come, the person;\nAnd\nThe brough sengue, wan exseak a of Mornides,\nBe to fave you do chay\nMy lighte and you kin one stay sonder's, nursest us\nfrough ass'll be\nshicheres, I nall fawn letio be fen wo ince, 'AP I somen: you?\n\nMieped,\nDo movere and in yor syounds.\n\nKh Tis rayaineves sir:\nThat it thee adverice from all ant I is a me, to his behe conse croffatess?\nChall press, ever shalt\nI strubles.\n\nAUf Yorn agaid!\n\nSey his say, on this what I voust try, it ead With'd you know fem, the holdiy resomeiu.\n\nJULIET:\nI with see-fold,\nbuther cany drecest's of I honour that of that sheetmen day, intry, there though hus acconduty.\n\nKING HIRA\nD:\nChave true r, yourhly peather\nWhat more,\nIf that Capter,\nThese orr'd do entrare for my consisodise, should; the wish'd son.\nA thy: sis, nivenre's saven'st is dead aster the\ngivins.\nBut this thou,\nWill a shath friend o'er of a maidison;\nBut