## Components
Token Embedding

Positional Encoding

Causal Masking

Multi-Head Self Attention

Feed Forward Network

Layer Normalization

Dropout

Training & Evaluation Pipeline

In [1]:
text_url='https://www.gutenberg.org/files/2701/2701-0.txt'


In [2]:
import os, urllib
target_dir=os.path.join(os.getcwd(),'text')
print(target_dir)
file_path=os.path.join(target_dir, 'text.txt')


/kaggle/working/text


In [3]:
if not os.path.exists(target_dir):
    os.makedirs(target_dir)
try:
    urllib.request.urlretrieve(text_url,file_path)
    print(f"File downloaded tp {target_dir}")
except Exception as e:
    print(e)

File downloaded tp /kaggle/working/text


In [4]:
with open (file_path, 'r',encoding="utf-8") as f:
    lines=f.readlines()

In [5]:
new_file_path=os.path.join(target_dir, 'text_cleaned.txt')
with open (new_file_path, 'w+') as wr:
    for i, line in enumerate(lines):     
        
        if i>=817:
            wr.write(line)
                     
          
        

In [6]:
# read the contents of the input file
def read_input_text(file_path):    
    with open (file_path, 'r',encoding="utf-8") as f:
        text=f.read()

    return text

In [7]:
text=read_input_text(new_file_path)

In [8]:
text[:500]

'CHAPTER 1. Loomings.\n\nCall me Ishmael. Some years ago—never mind how long precisely—having\nlittle or no money in my purse, and nothing particular to interest me\non shore, I thought I would sail about a little and see the watery part\nof the world. It is a way I have of driving off the spleen and\nregulating the circulation. Whenever I find myself growing grim about\nthe mouth; whenever it is a damp, drizzly November in my soul; whenever\nI find myself involuntarily pausing before coffin warehouses, '

In [9]:
from collections import Counter
import re
class Novel_Tokenizer:
    def __init__(self, vocabulary_size=10000):
        self.vocabulary_size=vocabulary_size
        self.w2i={'<pad>':0, '<unk>':1,'<nl>':2}
        self.i2w={0:'<pad>',1:'<unk>',2: '<nl>'}
        self.counter=Counter()
        self.vocab=[]
        
    def build_vocabulary(self, texts, min_freq=2):
        tokens=self.tokenize(texts)
        self.counter.update(tokens)
        most_common_tokens=self.counter.most_common(self.vocabulary_size-len(self.w2i))
        idx=3
        for word, freq in most_common_tokens:
            if freq > min_freq:
                self.w2i[word]=idx
                self.i2w[idx]=word
                idx+=1
        
    def tokenize(self, text):
        #text_lower=text.lower()
        text = text.replace('\n\n', '\n').replace('\n','<nl>').strip()    
        return re.findall(r"\w+(?:'\w+)?|<nl>|[^\w\s]", text)
                
    
        
        

In [10]:
tokenizer=Novel_Tokenizer()
tokenizer.build_vocabulary(text)

In [11]:
def create_training_sequences(text, tokenizer, sequence_length=30):
    inputs=[]
    targets=[]
    
    tokens=tokenizer.tokenize(text)#list(tokenizer.w2i.keys())
    print(len(tokens))
    for i in range(len(tokens)-sequence_length):
        inp=tokens[i: i+sequence_length]
        tar=tokens[i+1:i+sequence_length+1]
        # print(inp)
        # print()
        # print(tar)
        
        num_inp=[tokenizer.w2i.get(token,tokenizer.w2i['<unk>']) for token in inp]
        num_tar=[tokenizer.w2i.get(token,tokenizer.w2i['<unk>']) for token in tar]
        # print(num_inp)
        # print()
        # print(num_tar)
        # if i==5:
        #     break
        
        inputs.append(num_inp)
        targets.append(num_tar)
    return inputs, targets
    
        
    
    

In [12]:
inputs, targets=create_training_sequences(text,tokenizer, 30)

277245


In [13]:
len(tokenizer.w2i)

6976

In [14]:
import torch as torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split

In [15]:
class Novel_Dataset(nn.Module):
    def __init__(self, inputs, targets):
        self.inputs=torch.tensor(inputs, dtype=torch.long)
        self.targets=torch.tensor(targets, dtype=torch.long)
        
    def __len__(self):
        return len(self.inputs)
    def __getitem__(self, idx):
        return self.inputs[idx], self.targets[idx]
        

In [16]:
# split into train and val datasets
tr_size=.8
ds=Novel_Dataset(inputs, targets)
tr_size=int(tr_size * len(ds))
val_size=len(ds)-tr_size
tr_ds, val_ds=random_split(ds, [tr_size, val_size])
print(f"Train Dataset size: {len(tr_ds)}")
print(f"Val Dataset size: {len(val_ds)}")

Train Dataset size: 221772
Val Dataset size: 55443


In [17]:
batch_size=32
tr_dl=DataLoader(tr_ds,batch_size=batch_size,shuffle=True )
val_dl=DataLoader(tr_ds,batch_size=batch_size,shuffle=False )

In [18]:
batch=next(iter(tr_dl))
inp, tar=batch
print("Inputs=>"," ".join(tokenizer.i2w.get(item.item(),tokenizer.w2i['<unk>']) for item in inp[1][:10]))
print("Targets=>"," ".join(tokenizer.i2w.get(item.item(), tokenizer.w2i['<unk>']) for item in tar[1][:10]))
print(inp.shape)

Inputs=> to me , Starbuck , <nl> about those <unk> owners
Targets=> me , Starbuck , <nl> about those <unk> owners ,
torch.Size([32, 30])


In [19]:
class PositionalEncoding(nn.Module):
    def __init__(self, max_seq_len, embedding_dimension):
        super().__init__()
        self.max_seq_len=max_seq_len
        self.embedding_dimension=embedding_dimension
        pem=torch.zeros(max_seq_len,embedding_dimension)
        positions=torch.arange(0,max_seq_len).unsqueeze(1).float()
        even_positions=torch.arange(0,embedding_dimension,2 ).float()
        exponential_term=torch.log(torch.tensor(10000.0))/embedding_dimension
        div_term=torch.exp(even_positions * -(exponential_term))
        # Sine to even indices
        pem[:,0::2]=torch.sin(positions * div_term)
        pem[:,1::2]=torch.cos(positions * div_term)
        self.register_buffer('pem', pem.unsqueeze(0))
    def forward(self, x):
        # shape of x is [batch_size, length_of_seq, embedding_dimension]
        length_of_seq=x.size(1)
        return self.pem[:,:length_of_seq, :]

In [20]:
def set_padding_mask(sequence,padding_index):
    return sequence==padding_index
    

In [21]:
def set_causal_mask(size):
    # create a matrix with values as -ve infinity
    mask=torch.full((size, size), float('-inf'))
    # set the lower half of the matrix zero
    mask=torch.triu(mask, diagonal=1)
    return mask

In [22]:
# Decoder using pytorch's Decoder layers
class Decoder(nn.Module):
    def __init__(self, vocab_size, embedding_dimen, max_seq_length=20, dropout=.01, n_heads=4,ffn_hidden_layers=80, n_decoder_layers=2):
        super().__init__()
        self.embedding_dimension=embedding_dimen
        self.embedding=nn.Embedding(vocab_size,embedding_dimen, padding_idx=0 )
        self.pos_enc=PositionalEncoding(max_seq_length, embedding_dimen)
        self.dropout=nn.Dropout(dropout)
        decoder_layer=nn.TransformerDecoderLayer(embedding_dimen,n_heads,ffn_hidden_layers, dropout, batch_first=True, norm_first=True)
        self.stacked_decoders=nn.TransformerDecoder(decoder_layer,n_decoder_layers)
        self.ln=nn.LayerNorm(embedding_dimen)
        self.out_probs=nn.Linear(embedding_dimen, vocab_size)
    def forward(self,x):
        padding_mask = set_padding_mask(x, padding_index=0)
        causal_mask = set_causal_mask(x.size(1)).to(x.device)

        x=self.embedding(x)
        #print(x.shape)
        x=x+self.pos_enc(x)
        x=self.dropout(x)

        x = self.stacked_decoders(
            tgt=x,                              
            memory=x,                           
            tgt_mask=causal_mask,               
            memory_mask=causal_mask,              
            tgt_key_padding_mask=padding_mask,  
            memory_key_padding_mask=padding_mask 
            
        )
        output=self.out_probs(x)
        return output

In [23]:
inp=torch.randint(1, 200, (2, 30))
print(inp.shape)
dec=Decoder(200,28,30,.01,4,50,2)
print(dec)
dec.eval()
with torch.no_grad():
    output = dec(inp)

print(output.shape)


torch.Size([2, 30])
Decoder(
  (embedding): Embedding(200, 28, padding_idx=0)
  (pos_enc): PositionalEncoding()
  (dropout): Dropout(p=0.01, inplace=False)
  (stacked_decoders): TransformerDecoder(
    (layers): ModuleList(
      (0-1): 2 x TransformerDecoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=28, out_features=28, bias=True)
        )
        (multihead_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=28, out_features=28, bias=True)
        )
        (linear1): Linear(in_features=28, out_features=50, bias=True)
        (dropout): Dropout(p=0.01, inplace=False)
        (linear2): Linear(in_features=50, out_features=28, bias=True)
        (norm1): LayerNorm((28,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((28,), eps=1e-05, elementwise_affine=True)
        (norm3): LayerNorm((28,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0



torch.Size([2, 30, 200])




In [24]:
class NovelGenerator(nn.Module):
    def __init__(self,vocabulary_size, embedding_dimension=128, n_heads=4,
                dec_layers=2, ffn_hidden=1024, max_len=300, dropout=.1, pad_index=0):
        super().__init__()
        self.decoder=Decoder(
            vocabulary_size,embedding_dimension, max_len, dropout, n_heads, ffn_hidden,dec_layers 
        )
        self.pad_idx=pad_index
        self.vocab_size=vocabulary_size
    def forward(self,x):
        return self.decoder(x)
    

In [25]:
# test 


In [26]:
tokenizer

<__main__.Novel_Tokenizer at 0x794471d2d610>

In [27]:
vocab_size=max(tokenizer.w2i.values())+1
pad_idx=tokenizer.w2i['<pad>']
model=NovelGenerator(vocab_size, 512,4,2,1024,300,.1,pad_idx)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


NovelGenerator(
  (decoder): Decoder(
    (embedding): Embedding(6977, 512, padding_idx=0)
    (pos_enc): PositionalEncoding()
    (dropout): Dropout(p=0.1, inplace=False)
    (stacked_decoders): TransformerDecoder(
      (layers): ModuleList(
        (0-1): 2 x TransformerDecoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
          )
          (multihead_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
          )
          (linear1): Linear(in_features=512, out_features=1024, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=1024, out_features=512, bias=True)
          (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (norm3): LayerNorm((512,), eps=1e-05

In [28]:
# # inspect model & a single batch
# emb = model.decoder.embedding
# print("embedding.num_embeddings:", emb.num_embeddings)
# print("embedding.padding_idx:", emb.padding_idx if hasattr(emb,'padding_idx') else None)
# print("model.vocab_size (if set):", getattr(model, "vocab_size", None))
# print("batch min/max:", inputs.min().item(), inputs.max().item())

In [29]:
loss_fn=nn.CrossEntropyLoss(ignore_index=tokenizer.w2i['<pad>'])
optimizer=torch.optim.Adam(model.parameters(), lr=.001)

In [30]:
def train_model(model,train_dl, test_dl,loss_fn,optimizer,epochs=10,device=None):
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model=model.to(device)
    history={"train_acc":[], "test_acc":[],"train_loss":[],"test_loss":[]}
    
    for epoch in range(epochs):
        print(f"epoch: {epoch+1}")
        model.train()
        tr_loss=0
        total_train_size=0
        tr_correct=0
        for inputs, labels in train_dl:
            optimizer.zero_grad()
            if not torch.is_tensor(inputs):
                inputs = torch.tensor(inputs, dtype=torch.long)
            if not torch.is_tensor(labels):
                labels = torch.tensor(labels, dtype=torch.long)

            inputs=inputs.to(device)
            labels=labels.to(device)
            #print(inputs.max())
            outputs=model(inputs)
            #print(outputs.shape)
           
            if outputs.dim() == 3:
                B, T, V = outputs.shape                
                if labels.dim() == 2 and labels.shape[1] == T:
                    logits = outputs.reshape(-1, V)          
                    targets = labels.reshape(-1).long()    
                
                elif labels.dim() == 2 and labels.shape[1] == V:
                    outputs = outputs.permute(0, 2, 1)       
                    logits = outputs.reshape(-1, outputs.size(-1))
                    targets = labels.reshape(-1).long()
                else:
                    raise RuntimeError(f'unexpected shapes: outputs={outputs.shape}, labels={labels.shape}')
            elif outputs.dim() == 2:                
                logits = outputs
                targets = labels.reshape(-1).long()
            else:
                raise RuntimeError(f'unexpected outputs.dim()={outputs.dim()}')

            loss = loss_fn(logits, targets)
            #loss=loss_fn(outputs, labels)
            # print("loss.requires_grad:", loss.requires_grad)
            # print("loss.grad_fn:", getattr(loss, "grad_fn", None))
            # print("any param requires_grad?:", any(p.requires_grad for p in model.parameters()))
            # print("model training mode:", model.training)
            
            loss.backward()
            optimizer.step()
    
            predicted=logits.argmax(dim=1)
            total_train_size+=targets.size(0)
            tr_correct+=(predicted==targets).sum().item()
            tr_loss+=loss.item()* targets.size(0)
    
            train_acc=100* tr_correct/total_train_size
            tr_avg_loss=tr_loss/total_train_size
        print(f"train acc: {train_acc} train loss: {tr_avg_loss}")
            
        model.eval()
        test_loss=0
        total_test_size=0
        test_correct=0
        with torch.no_grad():
            for inputs, labels in test_dl:
                if not torch.is_tensor(inputs):
                    inputs = torch.tensor(inputs, dtype=torch.long)
                if not torch.is_tensor(labels):
                    labels = torch.tensor(labels, dtype=torch.long)
                inputs=inputs.to(device)
                labels=labels.to(device)
    
                outputs=model(inputs)
                if outputs.dim() == 3:
                    B, T, V = outputs.shape                
                    if labels.dim() == 2 and labels.shape[1] == T:
                        logits = outputs.reshape(-1, V)          
                        targets = labels.reshape(-1).long()    
                
                    elif labels.dim() == 2 and labels.shape[1] == V:
                        outputs = outputs.permute(0, 2, 1)       
                        logits = outputs.reshape(-1, outputs.size(-1))
                        targets = labels.reshape(-1).long()
                    else:
                        raise RuntimeError(f'unexpected shapes: outputs={outputs.shape}, labels={labels.shape}')
                elif outputs.dim() == 2:                
                    logits = outputs
                    targets = labels.reshape(-1).long()
                else:
                    raise RuntimeError(f'unexpected outputs.dim()={outputs.dim()}')
                loss=loss_fn(logits, targets)          

                #loss=loss_fn(outputs, labels)
                predicted=logits.argmax(dim=1)
                total_test_size+=targets.size(0)
                test_correct+=(predicted==targets).sum().item()
                test_loss+=loss.item()* targets.size(0)
        
                test_acc=100* test_correct/total_test_size
                test_avg_loss=test_loss/total_test_size
            print(f"test acc: {test_acc} test loss: {test_avg_loss}")
            
        history['train_acc'].append(train_acc)
        history['test_acc'].append(test_acc)
        history['train_loss'].append(tr_avg_loss)
        history['test_loss'].append(test_avg_loss)
    return history
        

        

In [31]:
train_model(model,tr_dl, val_dl,loss_fn,optimizer,epochs=10,device=None)

epoch: 1
train acc: 33.95678143919581 train loss: 3.159954682673314
test acc: 61.757330351291714 test loss: 1.5596431598448661
epoch: 2
train acc: 53.98990855473188 train loss: 1.9351130065090945
test acc: 77.77601921492945 test loss: 0.961295738832656
epoch: 3
train acc: 63.34722147069964 train loss: 1.552745838597716
test acc: 84.06821720806353 test loss: 0.7481020543941796
epoch: 4
train acc: 68.77383078116263 train loss: 1.3484201376704283
test acc: 86.897925196448 test loss: 0.6464778803223223
epoch: 5
train acc: 72.20208141695075 train loss: 1.2234972960900394
test acc: 88.56588448196045 test loss: 0.5926040038357604
epoch: 6
train acc: 74.5641169008411 train loss: 1.1371958927014834
test acc: 89.5054079565199 test loss: 0.5538992807068867
epoch: 7
train acc: 76.27779882041015 train loss: 1.0749806798648505
test acc: 89.9734111309513 test loss: 0.5350808461154541
epoch: 8
train acc: 77.60797876497784 train loss: 1.0258590707051243
test acc: 90.23214532643135 test loss: 0.52308136

{'train_acc': [33.95678143919581,
  53.98990855473188,
  63.34722147069964,
  68.77383078116263,
  72.20208141695075,
  74.5641169008411,
  76.27779882041015,
  77.60797876497784,
  78.61988889490107,
  79.41576634261013],
 'test_acc': [61.757330351291714,
  77.77601921492945,
  84.06821720806353,
  86.897925196448,
  88.56588448196045,
  89.5054079565199,
  89.9734111309513,
  90.23214532643135,
  90.81926483054669,
  90.93532997853652],
 'train_loss': [3.159954682673314,
  1.9351130065090945,
  1.552745838597716,
  1.3484201376704283,
  1.2234972960900394,
  1.1371958927014834,
  1.0749806798648505,
  1.0258590707051243,
  0.9875304320113857,
  0.9560395882758924],
 'test_loss': [1.5596431598448661,
  0.961295738832656,
  0.7481020543941796,
  0.6464778803223223,
  0.5926040038357604,
  0.5538992807068867,
  0.5350808461154541,
  0.5230813679667952,
  0.49944642878988355,
  0.4916005716002753]}

In [32]:
torch.save(model.state_dict(), 'decoder_model.pth')

## Pending Improvements 
### Text Generation using autoregressive decoding