In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the read-only Kaggle input directory, then print each one.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, files in os.walk('/kaggle/input')
    for name in files
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/seinfeld-text-corpus/corpus.txt


In [2]:
#Import pytorch
from torch.torch_version import Version
import torch
import torch.nn as nn
from torch.nn import functional as F

#Run on the GPU when CUDA is available, otherwise fall back to the CPU
processor = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(processor)

In [3]:
#Create vocabulary
# Read the corpus inside a context manager so the file handle is closed as soon as the text is loaded,
# and decode it explicitly as UTF-8 so the result does not depend on the platform's default encoding.
with open("/kaggle/input/seinfeld-text-corpus/corpus.txt", encoding="utf-8") as corpus_file:
    # Normalise the text: lower-case everything and strip square brackets.
    scripts = corpus_file.read().lower().replace('[','').replace(']','')

# The vocabulary is every distinct character of the normalised text; sorting fixes the id order.
chars = sorted(set(scripts))
vocab_size = len(chars)
print(vocab_size)
print(chars)
print(scripts[:100])

77
['\n', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '>', '?', '\\', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '}', '~', '\xa0', '¿', 'à', 'è', 'é', 'í', 'ï', 'ñ', 'ó', 'ÿ', '–', '—', '…']
int. comedy club – night
(jerry is on stage, performing.)
jerry: do you know what this is all about?


In [4]:
#Encoding & Decoding strings into numbers
# Lookup tables over the vocabulary: character -> integer id (stoi) and integer id -> character (itos).
stoi = dict(zip(chars, range(len(chars))))
itos = dict(enumerate(chars))

def encode(st):
    """Return the list of integer ids for the characters of `st`, looked up in `stoi`.

    Raises KeyError if `st` contains a character that is not a key of `stoi`.
    """
    return [stoi[ch] for ch in st]

def decode(lis):
    """Return the string obtained by mapping every id in `lis` through `itos` and concatenating.

    Raises KeyError if an id is not a key of `itos`.
    """
    return "".join(itos[num] for num in lis)
# Round-trip check: encode a short string, show the ids, then decode them back to text.
leo_ids = encode("leo")
print(leo_ids)
print(decode(leo_ids))

[46, 39, 49]
leo


In [5]:
#Hyperparams
block_size = 64 #Maximum context length in characters; also the size of the positional embedding table and the causal mask. seq <= block_size always
N = 10 #Number of (self-attention block, feed-forward block) pairs stacked in the decoder
d_model = 80 #Dimension of the embedding vectors used throughout the model
h = 8 #Number of heads in each multi-head self-attention layer; d_model must be divisible by h
d_hidden = d_model * 4 #Number of hidden units in each feed-forward network

In [6]:
#I am using variable names defined in "Attention Is All You Need" for simplicity.

#Define a single Scaled Dot-Product Attention head. I am simplifying it by enforcing d_k = d_v
class Attention(nn.Module):
  """One scaled dot-product attention head with d_k = d_v and an optional causal mask.

  The module defines no trainable parameters; it only registers a block_size x block_size
  lower-triangular buffer ('lowerTriangle') that is used when masking is enabled.
  """

  def __init__(self, d_k, mask):
    """Remember the head width `d_k` and the `mask` flag, and register the lower-triangular buffer."""
    super().__init__()
    self.d_k = d_k
    self.mask = mask
    causal = torch.tril(torch.ones(block_size, block_size))
    self.register_buffer('lowerTriangle', causal)

  def forward(self, Q, K, V):
    """Return softmax(Q K^T / sqrt(d_k)) V for 2-D inputs with d_k columns each.

    With masking enabled, a query row only receives weight from key rows at the same or an
    earlier index. Raises Exception if Q, K or V does not have exactly d_k columns.
    """
    widths_ok = all(t.shape[1] == self.d_k for t in (Q, K, V))
    if not widths_ok:
      raise Exception('Invalid Query, Key or Value Dimensions')
    seq = Q.shape[0]
    # Dot product of every query with every key (seq x seq), scaled by 1/sqrt(d_k).
    scores = (Q @ K.transpose(0, 1)) * (self.d_k**-0.5)
    if self.mask:
      # Entries above the diagonal are set to -inf so softmax gives them zero weight.
      hidden = self.lowerTriangle[:seq, :seq] == 0
      scores = scores.masked_fill(hidden, float('-inf'))
    # Softmax along each row, then mix the values: [seq, seq] x [seq, d_k] = [seq, d_k].
    return scores.softmax(1) @ V

#Define a Multi-Head Attention layer. As in the paper, I am setting d_k = d_model/h
class MultiHeadAttention(nn.Module):
  """Multi-head attention over 2-D inputs with d_model columns, using d_k = d_v = d_model / h.

  Each head has its own bias-free Query/Key/Value projection (d_model -> d_k). The head outputs
  are concatenated and passed through the bias-free output projection W_O (h*d_k -> d_model).
  A single Attention module is shared by all heads: it holds no trainable parameters (only a
  constant mask buffer), so sharing it does not tie any weights between heads, while gradients
  still flow through it to the per-head projections.
  """

  def __init__(self, d_model, h, mask):
    """Create the projections.

    d_model: width of the input and output vectors.
    h: number of heads; must divide d_model exactly.
    mask: passed on to Attention; a truthy value enables the causal mask.

    Raises Exception if d_model is not divisible by h.
    """
    super().__init__()
    self.mask = mask
    self.d_model = d_model
    self.h = h
    # Exact integer arithmetic instead of comparing int(d_model/h) against a float quotient.
    if d_model % h != 0:
      raise Exception('Invalid Dimensions Provided') #Ensure valid dimensions
    self.d_k = d_model // h
    self.W_O = nn.Linear(h*self.d_k, d_model, bias=False) #Output linear layer
    self.W_Q = nn.ModuleList() #The h different Query linear layers
    self.W_K = nn.ModuleList() #The h different Key linear layers
    self.W_V = nn.ModuleList() #The h different Value linear layers
    self.Att = Attention(self.d_k, mask) #Parameter-free, so one instance serves every head
    for _ in range(h):
      #Initialize all h linear layers for Q, K, V (created head by head, in Q, K, V order)
      self.W_Q.append(nn.Linear(d_model, self.d_k, bias=False))
      self.W_K.append(nn.Linear(d_model, self.d_k, bias=False))
      self.W_V.append(nn.Linear(d_model, self.d_k, bias=False)) #TODO: Instead of Linear layers, just create Matrices

  def forward(self, Q, K, V):
    """Apply every head to (Q, K, V) and merge the results.

    Q, K, V: 2-D tensors with d_model columns each.
    Returns a tensor with one row per query row and d_model columns.
    Raises Exception if Q, K or V does not have exactly d_model columns.
    """
    if any(t.shape[1] != self.d_model for t in (Q, K, V)):
      raise Exception('Invalid Query, Key or Value Dimensions')
    heads = []
    for proj_q, proj_k, proj_v in zip(self.W_Q, self.W_K, self.W_V):
      #After projection, queries keys & values have d_k columns.
      #Call the module itself (not .forward) so that any registered hooks are executed.
      heads.append(self.Att(proj_q(Q), proj_k(K), proj_v(V)))
    out = torch.cat(heads, 1) #Concatenated heads have h*d_k = d_model columns
    return self.W_O(out)

#Define a Feed Forward Network
class FeedForward(nn.Module):
  """Two-layer feed-forward network: Linear(d_model -> d_hidden), ReLU, Linear(d_hidden -> d_model)."""

  def __init__(self, d_model, d_hidden):
    """Build the three-stage network; both linear layers carry a bias term."""
    super().__init__()
    expand = nn.Linear(d_model, d_hidden, bias=True)
    contract = nn.Linear(d_hidden, d_model, bias=True)
    self.network = nn.Sequential(expand, nn.ReLU(), contract)

  def forward(self, x):
    """Run `x` through the network; the last dimension of the result has size d_model."""
    return self.network(x)

#Define Layer Normalization:
class LayerNorm(nn.Module):
  """Layer normalization over the last (feature) dimension with a learnable scale and shift.

  Each feature vector is normalized to zero mean and unit variance and then mapped to
  gamma * norm + beta. The variance is the biased (divide-by-n) estimate, which is what the
  layer-normalization definition and torch.nn.LayerNorm use.
  """

  def __init__(self, d_model):
    """Create gamma (initialized to ones) and beta (initialized to zeros), both of size d_model."""
    super().__init__()
    self.epsilon = 1e-5 #Added to the variance so the division below never hits zero
    self.gamma = nn.Parameter(torch.ones(d_model))
    self.beta = nn.Parameter(torch.zeros(d_model))

  # Implemented as forward() rather than __call__ so nn.Module.__call__ keeps running hooks;
  # instances are still invoked as norm(x).
  def forward(self, x):
    """Normalize `x` along its last dimension, which must have size d_model.

    Works for (seq, d_model) inputs as well as inputs with extra leading dimensions.
    Returns a tensor of the same shape as `x`.
    """
    mean = x.mean(-1, keepdim=True) #Mean across the features of each position
    variance = x.var(-1, unbiased=False, keepdim=True) #Biased variance across the features of each position
    norm = (x - mean) / torch.sqrt(variance + self.epsilon) #Normalize
    return self.gamma * norm + self.beta #Scale by gamma and shift by beta


#Define a block of Multi-Head Self-Attention with a Residual Connection & Layer Normalization
class NormalizedSelfAttention(nn.Module):
  """Self-attention sub-layer with a residual connection followed by layer normalization."""

  def __init__(self, d_model, h, mask):
    """Create the multi-head attention layer (d_model wide, h heads, optional mask) and its LayerNorm."""
    super().__init__()
    self.MHA = MultiHeadAttention(d_model, h, mask)
    self.norm = LayerNorm(d_model)

  def forward(self, x):
    """Return norm(x + MHA(x, x, x)): `x` serves as queries, keys and values at once."""
    attended = self.MHA(x, x, x)
    return self.norm(x + attended)

#Define a block of a Feed Forward Network with a Residual Connection & Layer Normalization
class NormalizedFeedForward(nn.Module):
  """Feed-forward sub-layer with a residual connection followed by layer normalization."""

  def __init__(self, d_model, d_hidden):
    """Create the feed-forward network (d_model -> d_hidden -> d_model) and its LayerNorm."""
    super().__init__()
    self.FF = FeedForward(d_model, d_hidden)
    self.norm = LayerNorm(d_model)

  def forward(self, x):
    """Return norm(x + FF(x))."""
    transformed = self.FF(x)
    return self.norm(x + transformed)

#Define a stand-alone Decoder block with N self-attention and feed forward blocks (without embeddings or softmax)
#Note that the self attention is Masked because this is a Decoder
class StandAloneDecoder(nn.Module):
  """Stack of N (masked self-attention block, feed-forward block) pairs, without embeddings or softmax."""

  def __init__(self, N, d_model, h, d_hidden):
    """Build the 2*N sub-layers in order; every self-attention block is created with mask=True."""
    super().__init__()
    layers = []
    for _ in range(N):
      layers.append(NormalizedSelfAttention(d_model, h, True))
      layers.append(NormalizedFeedForward(d_model, d_hidden))
    self.network = nn.Sequential(*layers)

  def forward(self, x):
    """Pass `x` through all sub-layers in sequence and return the result."""
    return self.network(x)

In [7]:
#Split up training & test data
# Encode the whole corpus once and keep it on `device`; the first 90% of the characters are used
# for training and the remaining 10% are held out.
data = torch.tensor(encode(scripts), dtype=torch.long).to(device)
cutoff = int(0.9 * len(scripts))
train_data, test_data = data[:cutoff], data[cutoff:]

In [8]:
#Load training data
#torch.manual_seed(2131)

def get_sequence(split):
    """Return one random (x, y) pair of 1-D tensors, each holding block_size token ids.

    `split == "train"` draws from train_data; any other value draws from test_data.
    y is x shifted by one position, so y[i] is the token that follows x[i] in the data.
    """
    source = train_data if split == "train" else test_data
    # Random start offset, bounded so the shifted target window still ends inside `source`.
    start = torch.randint(len(source)-block_size, (1,))
    stop = start + block_size
    inputs = source[start:stop].to(device)
    targets = source[start+1:stop+1].to(device)
    return inputs, targets
    

# Draw one example and show the shape and contents of the input, then of the target.
x_train, y_train = get_sequence('train')
for example in (x_train, y_train):
    print(example.shape)
    print(example)

torch.Size([64])
tensor([35, 46, 38, 53, 54, 39, 43, 48, 11,  2,  0, 41, 39, 49, 52, 41, 39, 27,
         1, 57, 39,  1, 38, 43, 38,  1, 35,  1, 37, 52, 49, 53, 53, 57, 49, 52,
        38,  1, 50, 55, 60, 60, 46, 39,  1, 54, 49, 41, 39, 54, 42, 39, 52, 13,
         1, 11, 43, 48,  1, 36, 39, 38, 11, 15], device='cuda:0')
torch.Size([64])
tensor([46, 38, 53, 54, 39, 43, 48, 11,  2,  0, 41, 39, 49, 52, 41, 39, 27,  1,
        57, 39,  1, 38, 43, 38,  1, 35,  1, 37, 52, 49, 53, 53, 57, 49, 52, 38,
         1, 50, 55, 60, 60, 46, 39,  1, 54, 49, 41, 39, 54, 42, 39, 52, 13,  1,
        11, 43, 48,  1, 36, 39, 38, 11, 15,  1], device='cuda:0')


In [9]:
#Define the Decoder model w/ input & positional embeddings
class DecoderWithEmbeddings(nn.Module):
    """Character-level decoder-only Transformer.

    Token embeddings plus learned positional embeddings feed a StandAloneDecoder stack, whose
    output is projected to one logit per vocabulary entry. All sizes are read from the
    module-level hyperparameters vocab_size, block_size, N, d_model, h and d_hidden.
    """

    def __init__(self):
        #TODO: change all these to parameters for the DecoderWithEmbedding object?
        super().__init__()
        self.token_embed = nn.Embedding(vocab_size, d_model)
        self.pos_embed = nn.Embedding(block_size, d_model) #TODO: change this to as seen in paper
        self.decoders = StandAloneDecoder(N, d_model, h, d_hidden)
        self.unembed = nn.Linear(d_model, vocab_size)

    def forward(self, chars, target=None):
        """Compute next-character logits and, when targets are given, the training loss.

        chars: 1-D tensor of token ids with at most block_size entries (the positional
            embedding table has block_size rows).
        target: optional 1-D tensor of token ids, one per position of `chars`.

        Returns (logits, loss). logits has one row per input position and vocab_size columns.
        loss is None when `target` is None, otherwise the cross-entropy of the logits
        against `target`.
        """
        tok = self.token_embed(chars)
        # Build the position ids on the same device as the input instead of a global device.
        positions = torch.arange(chars.shape[0], device=chars.device)
        pos = self.pos_embed(positions)
        x = self.decoders(tok + pos)
        logits = self.unembed(x)

        # logits is (seq, vocab_size) and target is (seq,), which is the (N, C) / (N,) layout
        # F.cross_entropy expects.
        loss = None if target is None else F.cross_entropy(logits, target)

        return logits, loss

    @torch.no_grad()
    def sample(self, chars, output_len):
        """Extend `chars` by `output_len` sampled tokens and return the full sequence.

        chars: non-empty 1-D long tensor of seed token ids.
        output_len: number of tokens to generate.

        Runs without gradient tracking. Each step conditions on the last block_size tokens,
        the longest context the positional embedding supports, and samples the next token
        from the softmax over the final position's logits.
        """
        for _ in range(output_len):
            logits, _loss = self(chars[-block_size:])
            last_logits = logits[-1,:]
            probs = F.softmax(last_logits, dim=0)
            next_char = torch.multinomial(probs, num_samples=1)
            chars = torch.cat((chars, next_char), dim=0)
        return chars

In [10]:
# Instantiate the model, move it to the selected device, and report its number of trainable-parameter elements.
m = DecoderWithEmbeddings()
model = m.to(device)
param_count = sum(p.numel() for p in model.parameters())
print("Number of params:")
print(param_count)

Number of params:
792717


In [11]:
@torch.no_grad()
def estimate_loss(eval_iters):
    """Return {'train': mean_loss, 'val': mean_loss}, each averaged over `eval_iters` random sequences.

    Runs without gradient tracking. The global `model` is put in eval mode for the duration
    of the measurement and switched back to train mode before returning.
    """
    model.eval()
    averages = {}
    for split_name in ('train', 'val'):
        per_sequence = torch.zeros(eval_iters)
        for idx in range(eval_iters):
            _, seq_loss = model(*get_sequence(split_name))
            per_sequence[idx] = seq_loss.item()
        averages[split_name] = per_sequence.mean()
    model.train()
    return averages

In [12]:
def train(training_iters, eval_interval, eval_iters, optimizer):
    """Run `training_iters` optimizer steps on the global `model`, one random sequence per step.

    Before the step at every multiple of `eval_interval`, and before the final step, the
    train/val losses are estimated over `eval_iters` sequences each and printed.
    """
    last_step = training_iters - 1
    for step in range(training_iters):
        # Periodic progress report (also issued on the last iteration).
        if step % eval_interval == 0 or step == last_step:
            losses = estimate_loss(eval_iters)
            print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

        # One training example: forward pass, clear stale gradients, backward pass, update.
        inputs, targets = get_sequence('train')
        _, loss = model(inputs, targets)
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

In [13]:
#Setup optimizer, train 100000 steps.
# AdamW over all model parameters; the losses are reported every 10000 steps using 1000 sequences per split.
learning_rate = 1e-3
optimizer = torch.optim.AdamW(params=model.parameters(), lr=learning_rate)
train(training_iters=100000, eval_interval=10000, eval_iters=1000, optimizer=optimizer)

step 0: train loss 4.5437, val loss 4.5424
step 10000: train loss 1.7573, val loss 1.7923
step 20000: train loss 1.6154, val loss 1.6716
step 30000: train loss 1.5550, val loss 1.6219
step 40000: train loss 1.5329, val loss 1.5727
step 50000: train loss 1.4876, val loss 1.5509
step 60000: train loss 1.4679, val loss 1.5239
step 70000: train loss 1.4516, val loss 1.5031
step 80000: train loss 1.4489, val loss 1.5107
step 90000: train loss 1.4312, val loss 1.4773
step 99999: train loss 1.4310, val loss 1.4934


In [14]:
#Decay training rate, train another 10000 steps.
# The existing optimizer object is reused; only the learning rate of its first parameter group is lowered.
learning_rate = 1e-4
optimizer.param_groups[0].update(lr=learning_rate)
train(training_iters=10000, eval_interval=1000, eval_iters=500, optimizer=optimizer)

step 0: train loss 1.4576, val loss 1.5204
step 1000: train loss 1.3725, val loss 1.4404
step 2000: train loss 1.3531, val loss 1.4258
step 3000: train loss 1.3443, val loss 1.4262
step 4000: train loss 1.3232, val loss 1.4023
step 5000: train loss 1.3381, val loss 1.4236
step 6000: train loss 1.3361, val loss 1.4172
step 7000: train loss 1.3213, val loss 1.3831
step 8000: train loss 1.3216, val loss 1.3807
step 9000: train loss 1.3008, val loss 1.4067
step 9999: train loss 1.3192, val loss 1.3886


In [15]:
#Estimate loss to a high accuracy
# Average the loss over 5000 random sequences per split. estimate_loss is called directly rather
# than through train(1, ...), because train() also applies one optimizer step and would
# therefore modify the model while it is being measured.
losses = estimate_loss(5000)
print(f"train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

step 0: train loss 1.3207, val loss 1.3982


In [16]:
#Generate a script from the model
# Seed the model with a single newline, sample 500 further characters, and decode the ids back to text.
start = torch.tensor(encode('\n')).to(device)
generated = model.sample(start, 500)
print(decode(generated.tolist()))


(kramer at stries hold to jerry) you know she's breakunts) give her, she's, he takes a greater.
(kramer walks up. i thought is understand, anda: good, i know what you got out with the bost and manute omen?!
madevitys
(morty smokfardayl undecturioil) oh, i had a lofer party is harness.)
george: ..ah...... churtely.)
kramer: hello.
jerry: you reached very ranies off,  pricked fashing sneak guss
mary: okay. asself a): lad jap cleart.) now, you gotta go and he talks-bootheried with the side out peer


# Highscore:
step 0: train loss 1.2801, val loss 1.3542
# Hyperparams:
block_size = 64 
N = 8 
d_model = 80 
h = 8 
d_hidden = d_model * 4 

In [17]:
#Code to save/restore model

# Only the state dict (parameters and registered buffers) is written to disk, not the module object.
filepath = 'trained_parameters'
trained_state = model.state_dict()
torch.save(trained_state, filepath)

#Later to restore:
#model.load_state_dict(torch.load(filepath))
#model.eval()