In [22]:
!pip install sentencepiece



In [63]:
import torch
import torch.nn as nn
from torch.nn import functional as F 

import sentencepiece as spm

In [64]:
# # for TPU
# import torch_xla
# import torch_xla.core.xla_model as xm

# DEVICE = xm.xla_device()
# torch.set_default_tensor_type('torch.FloatTensor')

In [90]:

BATCH_SIZE = 64
CONTEXT_SIZE:int = int(256)
MAX_EPOCHS = 2500
EVAL_EVERY = 500
LEARNING_RATE = 3e-4
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
print(DEVICE)
print(torch.cuda.memory_allocated())
EVAL_ITERS = 200
EMBEDDING_SIZE = 256
NUM_HEADS = 4
NUM_TRAN = 6
DROPOUT = 0.2

with open('/kaggle/input/shakespeare/input.txt', 'r', encoding='utf-8') as f:
    text = f.read()
text[:1000]

cuda
144952832


"First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you know Caius Marcius is chief enemy to the people.\n\nAll:\nWe know't, we know't.\n\nFirst Citizen:\nLet us kill him, and we'll have corn at our own price.\nIs't a verdict?\n\nAll:\nNo more talking on't; let it be done: away, away!\n\nSecond Citizen:\nOne word, good citizens.\n\nFirst Citizen:\nWe are accounted poor citizens, the patricians good.\nWhat authority surfeits on would relieve us: if they\nwould yield us but the superfluity, while it were\nwholesome, we might guess they relieved us humanely;\nbut they think we are too dear: the leanness that\nafflicts us, the object of our misery, is as an\ninventory to particularise their abundance; our\nsufferance is a gain to them Let us revenge this with\nour pikes, ere we become rakes: for the gods know I\nspeak this in hunger 

In [91]:
# # SentencePiece model
# sp = spm.SentencePieceProcessor()
# sp.load("/kaggle/input/movie-corpus_8000/other/first/1/movie-corpus_8000.model")  # Replace "path_to_your_model.model" with the path to your model file


# vocab_size = sp.get_piece_size()
# encode = lambda s: sp.encode_as_ids(s)
# decode = lambda l: sp.decode_ids(l)

# character by character tokenization
chars = sorted(list(set(text)))
print("Chars:", "".join(chars))
vocab_size = len(chars)
print(vocab_size)
# create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

print(encode("This is a test of the encoding"))
print(encode(decode(encode("This is a test of the encoding"))))

Chars: 
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65
[32, 46, 47, 57, 1, 47, 57, 1, 39, 1, 58, 43, 57, 58, 1, 53, 44, 1, 58, 46, 43, 1, 43, 52, 41, 53, 42, 47, 52, 45]
[32, 46, 47, 57, 1, 47, 57, 1, 39, 1, 58, 43, 57, 58, 1, 53, 44, 1, 58, 46, 43, 1, 43, 52, 41, 53, 42, 47, 52, 45]


In [92]:
data = torch.tensor(encode(text), dtype=torch.long)
print(data[0:5])
train_len = int(0.9*len(data))
train_data = data[:train_len]
print(train_data[0:5])
test_data = data[train_len:]

def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else test_data
    ix = torch.randint(len(data) - CONTEXT_SIZE, (BATCH_SIZE,))
    x = torch.stack([data[i:i+CONTEXT_SIZE] for i in ix])
    y = torch.stack([data[i+1:i+CONTEXT_SIZE+1] for i in ix])
    x, y = x.to(DEVICE), y.to(DEVICE)
    return x, y

get_batch('train')[0]

tensor([18, 47, 56, 57, 58])
tensor([18, 47, 56, 57, 58])


tensor([[56, 57, 58,  ..., 46, 39, 60],
        [60, 43,  6,  ..., 43, 56,  1],
        [ 0, 40, 56,  ..., 42, 43, 56],
        ...,
        [43,  1, 58,  ..., 52, 45, 12],
        [ 1, 51, 43,  ..., 59, 56,  1],
        [42, 11,  0,  ..., 53, 52, 43]], device='cuda:0')

In [93]:
## TODO: Estimate Loss Function

@torch.no_grad()
def estimate_losses(model):
    out = {}
    model.eval()
    for split in ["train", "val"]:
        losses = torch.zeros(EVAL_ITERS)
        for k in range(EVAL_ITERS):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out
    

In [94]:
# Single Attention Head
class SingleHeadedAttention(nn.Module):
    
    def __init__(self, emb_size, head_size, context_size:int, dropout=0.2):
        super().__init__()
        self.key = nn.Linear(emb_size, head_size, bias=False)
        self.query = nn.Linear(emb_size, head_size, bias=False)
        self.value = nn.Linear(emb_size, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(context_size, context_size)))
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        B,T, C = x.shape
        
        k = self.key(x)   # Size (B, T, head_size)
        q = self.query(x) # same thing..
        v = self.value(x) # same thing..
        
        weight = q @ k.transpose(-2, -1) * (k.shape[-1]**-0.5) # this equation is defined in the original paper and the multiplication part is normalization over each Time Serie in the batch.
        weight = weight.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T) # This will mask the upper triangle of zeros and turn it into -inf for the softmax func
        weight = F.softmax(weight, dim=-1)
        weight = self.dropout(weight)

        v = self.value(x)
        
        out = weight @ v
        
        return out
        

In [95]:
# Multi-Headed Attention

class MultiHeadedAttention(nn.Module):
    
    def __init__(self, num_heads,emb_size, head_size, context_size, dropout=0.2):
        super().__init__()
        self.heads = nn.ModuleList([SingleHeadedAttention(emb_size,head_size, context_size, dropout) for _ in range(num_heads)])
        self.linear = nn.Linear(head_size*num_heads, emb_size)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.linear(out)
        out = self.dropout(out)
        return out
    

In [96]:
# The FeedForward Block of the Transformer, consists of two 
# dense layers with ReLU in Between and a Drop out at the end

class FeedForward(nn.Module):
    
    def __init__(self, emb_size, dropout=0.2):
        super().__init__()
        self.net = nn.Sequential(
        nn.Linear(emb_size, emb_size*4),
        nn.ReLU(),
        nn.Linear(emb_size*4, emb_size),
        nn.Dropout(dropout)
        )
        
    def forward(self, x):
        return self.net(x)

In [97]:
# A single Transformer block
# Consists of a normalization layer, a Multi-Headed attention layer, a bypass addition layer,
# A feed forward linear neural network,
# another bypass layer, and a dropout layer.

class TransformerBlock(nn.Module):
    
    def __init__(self, context_size, num_heads, emb_size, dropout=0.2):
        super().__init__()
        head_size = emb_size // num_heads
        self.norm1 = nn.LayerNorm(emb_size)
        self.attention = MultiHeadedAttention(num_heads,emb_size, head_size, context_size, dropout)
        self.norm2 = nn.LayerNorm(emb_size)
        self.ff = FeedForward(emb_size, dropout)
        
    def forward(self, x):
        x_norm1 = self.norm1(x)
        x = x + self.attention(x_norm1)
        x_norm2 = self.norm2(x)
        x = x + self.ff(x_norm2)
        
        return x
        
        

In [98]:
class GPTLanguageModel(nn.Module):
    def __init__(self, vocab_size, context_size, emb_size,num_transformers, num_heads, dropout=0.2):
        super().__init__()        
        self.token_embedding = nn.Embedding(vocab_size, emb_size)
        self.pos_embedding = nn.Embedding(context_size, emb_size)
        self.transformers = nn.Sequential(*[TransformerBlock(context_size,num_heads, emb_size, dropout) for _ in range(num_transformers)] )
        self.norm_final = nn.LayerNorm(emb_size)
        self.linear_final = nn.Linear(emb_size, vocab_size)
        
        self.apply(self._init_weights)

    ## TODO: Read more about this function and the self.apply()
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            
    def forward(self, idx, targets=None):
        
        B, T = idx.shape
        
        token_emb = self.token_embedding(idx)
        pos_emb = self.pos_embedding(idx)
        x = token_emb + pos_emb
        x = self.transformers(x)
        x = self.norm_final(x)
        logits = self.linear_final(x)
        
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

            
        return logits, loss
        
        
    def generate(self, idx, gen_length, context_size:int):
        
        for _ in range(gen_length):
            
            idx_cropped = idx[:, -context_size:] if len(idx[0]) > context_size else idx
            
            logits, loss = self(idx_cropped)
            logits = logits[:, -1, :]
            
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            
            idx = torch.cat((idx, idx_next), dim=1)
        return idx
        

In [99]:
# training the model

model = GPTLanguageModel(vocab_size, CONTEXT_SIZE, EMBEDDING_SIZE, NUM_TRAN,NUM_HEADS, dropout=DROPOUT)
m = model.to(DEVICE)
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')


4.833345 M parameters


In [100]:
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

In [101]:
context = torch.zeros((1, 1), dtype=torch.long, device=DEVICE)
print(decode(m.generate(context, 500, CONTEXT_SIZE)[0].tolist()))




'-3
QDlCSr?MJ'qVuNmNj-jLHZIVEwhG 3-CagBfOjdejxxk$dZ;ZyGs vaFBI!
$taeIh emwu;MOtncBEM,OpCeCh;EQ.xPm:'Cg:!tvK;N'QSHRGshQeVa
we&wvlge!a'uOZvgRTfO,Q;Ea!hUmhHxB$Xj3EKdb!:BvjxQsCMeapFJCAdm:FKb-uhrE?VNKW;j!
Bso:ti'$-ix$xEl EUptZmajHjMB,e,rhRx3;3 FOiwc.B3LCYqgIycprlZPvMaws.jmPKQMy.qU
c,S SO-H,ZmtgfRWt;VFwYcaF3S?suN;FR:yzzUxjUs PaYU$&
ujIoi:zSlCnVM&A&C,!Yw:HJPPLVVUROwzDL
K$?CtQUuaf;;?&3ZicVUziZDGeWoJS,cVvUYXgga!PqhCGRA
$?DsIOA!fjXsuiN3aaNK:y.bUYHxDuuYjs YPQR3Uct3:APIPUhFvb3PE$JDrWNRJF wvmH$uLJB!-LSvnUntn


In [102]:
for epoch in range(MAX_EPOCHS):
    
    if epoch % 50 == 0 or epoch == MAX_EPOCHS -1:
        losses = estimate_losses(model)
        print(f"step {epoch}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
        
    xb, yb = get_batch('train')
    
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    


step 0: train loss 4.2096, val loss 4.2088
step 50: train loss 2.5525, val loss 2.5563
step 100: train loss 2.4701, val loss 2.4834
step 150: train loss 2.4353, val loss 2.4598
step 200: train loss 2.4056, val loss 2.4327
step 250: train loss 2.3799, val loss 2.4159
step 300: train loss 2.3537, val loss 2.4122
step 350: train loss 2.3428, val loss 2.3981
step 400: train loss 2.3062, val loss 2.3743
step 450: train loss 2.2704, val loss 2.3437
step 500: train loss 2.2218, val loss 2.3036
step 550: train loss 2.1624, val loss 2.2508
step 600: train loss 2.1061, val loss 2.2068
step 650: train loss 2.0590, val loss 2.1564
step 700: train loss 1.9825, val loss 2.1016
step 750: train loss 1.9415, val loss 2.0546
step 800: train loss 1.8882, val loss 2.0144
step 850: train loss 1.8479, val loss 1.9835
step 900: train loss 1.8122, val loss 1.9512
step 950: train loss 1.8169, val loss 1.9683
step 1000: train loss 1.7825, val loss 1.9390
step 1050: train loss 1.7481, val loss 1.8998
step 1100: 

In [103]:
context = torch.zeros((1, 1), dtype=torch.long, device=DEVICE)
print(decode(m.generate(context, 500, CONTEXT_SIZE)[0].tolist()))



Her ofere their wret corns own to reall.

PRISOMPERD:
Thy beaars fratheres back; when, he should dreadson,
And,
Man long-how I headful to; if retlesive,
All boty thou to the dive him to are touch the eed.
How is for here is mine feal seend feellf,
To him offor doud shall, the seempls wo do musp; be hind;
But mother will stick my self.

GRET:

No, my lob, as forwell on myselver:
Well, I'll deerelly so sending we would fext:
Thou art it Marcined, evemberlause I
award you tellfs op this shand at vo
