# Preparation

In [None]:
# Install the extra packages for this notebook. torchdata is imported in the
# next cell; torchtext is not imported directly in the visible cells
# (presumably needed by the local imdb_voc module - confirm).
# NOTE(review): versions are not pinned, so results may differ between runs.
!pip install torchtext
!pip install torchdata

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import math

from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data.dataset import random_split


from tqdm import tqdm

import importlib

from datetime import datetime as dt
import time

import imdb_voc
import torchdata

In [None]:
# Base directory; not referenced again in the cells visible here.
root = './'

# Re-import the local imdb_voc module so edits made to it are picked up
# without restarting the kernel.
importlib.reload(imdb_voc)

# set device: use CUDA when it is available, otherwise fall back to the CPU
dev = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Functions

In [None]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention with key-padding masking.

    Queries, keys and values are linearly projected, split into ``numhead``
    heads, attended over the *time* axis, concatenated, and projected back to
    ``d_model``.

    Args:
        d_model: width of the input and output representations.
        d_Q: per-head query width (must equal ``d_K`` for the dot product).
        d_K: per-head key width.
        d_V: per-head value width.
        numhead: number of attention heads.
        dropout: dropout probability, applied to the attention weights and to
            the output projection.

    Raises:
        ValueError: if ``d_Q != d_K``.
    """

    def __init__(self, d_model, d_Q, d_K, d_V, numhead, dropout):
        super().__init__()

        # Q1. Implement
        if d_Q != d_K:
            raise ValueError(
                f"d_Q ({d_Q}) and d_K ({d_K}) must match for dot-product attention")

        self.numhead = numhead
        self.d_V = d_V
        self.d_K = d_K
        self.d_Q = d_Q

        # input linear layers for V, Q, K
        # d_Q, d_K, d_V are typically set to d_model/numhead
        self.V_Linear = nn.Linear(in_features=d_model, out_features=d_V * numhead)
        self.Q_Linear = nn.Linear(in_features=d_model, out_features=d_Q * numhead)
        self.K_Linear = nn.Linear(in_features=d_model, out_features=d_K * numhead)

        # output linear layer
        self.MHA_Linear = nn.Linear(in_features=d_V * numhead, out_features=d_model)

        # dropout
        self.dropout = nn.Dropout(dropout)

    def forward(self, x_Q, x_K, x_V, src_batch_lens=None):
        """Attend from ``x_Q`` over ``x_K`` / ``x_V``.

        Args:
            x_Q: query input, shape (batch, len_q, d_model).
            x_K: key input, shape (batch, len_k, d_model).
            x_V: value input, shape (batch, len_k, d_model).
            src_batch_lens: optional (batch,) number of valid key positions per
                sequence. Positions at index >= length are treated as padding
                (i.e. padding is assumed to be trailing) and receive zero
                attention weight. ``None`` disables masking.

        Returns:
            Tensor of shape (batch, len_q, d_model).
        """
        # Q2 Implementation
        batch_size, len_q = x_Q.size(0), x_Q.size(1)
        len_k = x_K.size(1)

        # Project, then split the feature axis into heads:
        # (batch, len, numhead * d) -> (batch, numhead, len, d)
        q = self.Q_Linear(x_Q).view(batch_size, len_q, self.numhead, self.d_Q).transpose(1, 2)
        k = self.K_Linear(x_K).view(batch_size, len_k, self.numhead, self.d_K).transpose(1, 2)
        v = self.V_Linear(x_V).view(batch_size, x_V.size(1), self.numhead, self.d_V).transpose(1, 2)

        # Scaled dot-product scores over time steps: (batch, numhead, len_q, len_k)
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_K)

        if src_batch_lens is not None:
            lens = torch.as_tensor(src_batch_lens, device=scores.device)
            key_positions = torch.arange(len_k, device=scores.device)
            key_is_pad = key_positions.unsqueeze(0) >= lens.unsqueeze(1)  # (batch, len_k)
            # A large finite negative (rather than -inf) keeps a fully padded
            # sequence from producing NaN in the softmax.
            scores = scores.masked_fill(key_is_pad[:, None, None, :],
                                        torch.finfo(scores.dtype).min)

        att = self.dropout(F.softmax(scores, dim=-1))

        # Weighted sum of values, then merge the heads back:
        # (batch, numhead, len_q, d_V) -> (batch, len_q, numhead * d_V)
        ctx = torch.matmul(att, v)
        ctx = ctx.transpose(1, 2).reshape(batch_size, len_q, self.numhead * self.d_V)

        out = self.MHA_Linear(ctx)
        out = self.dropout(out)

        return out

In [None]:
class TF_Encoder_Block(nn.Module):
    """One Transformer encoder layer.

    Self-attention followed by a position-wise feed-forward network; each
    sub-layer is wrapped in a residual connection and followed by LayerNorm.

    Args:
        d_model: width of the representations entering and leaving the block.
        d_ff: hidden width of the feed-forward sub-layer.
        numhead: number of attention heads.
        dropout: dropout probability used in attention and the feed-forward path.
    """

    def __init__(self, d_model, d_ff, numhead, dropout):
        super().__init__()

        # Q3
        # NOTE: each head is given d_model-wide Q/K/V projections here, not
        # d_model / numhead; kept as-is so parameter shapes do not change.
        self.MHA = MultiHeadAttention(d_model, d_model, d_model, d_model, numhead, dropout)
        self.FF1 = nn.Linear(d_model, d_ff)
        # Second feed-forward layer maps d_ff back to d_model. It was declared
        # as Linear(d_model, d_ff), which only works when d_ff == d_model.
        self.FF2 = nn.Linear(d_ff, d_model)
        self.Dropout = nn.Dropout(dropout)
        self.ReLU = nn.ReLU()
        self.LayerNorm1 = nn.LayerNorm(d_model)
        self.LayerNorm2 = nn.LayerNorm(d_model)

    def forward(self, x, src_batch_lens):
        """Run the block.

        Args:
            x: input tensor whose last dimension is d_model.
            src_batch_lens: per-sequence lengths, forwarded unchanged to the
                attention layer.

        Returns:
            Tensor with the same shape as ``x``.
        """
        # Q4
        # Self-attention sub-layer: residual add, then LayerNorm.
        att = self.MHA(x, x, x, src_batch_lens)
        x = x + att
        x = self.LayerNorm1(x)

        # Feed-forward sub-layer: d_model -> d_ff -> d_model, with dropout after
        # the activation and after the second projection.
        ff = self.FF1(x)
        ff = self.ReLU(ff)
        ff = self.Dropout(ff)
        ff = self.FF2(ff)
        ff = self.Dropout(ff)

        x = x + ff
        out = self.LayerNorm2(x)
        return out

In [None]:
"""
Positional encoding
PE(pos,2i) = sin(pos/10000**(2i/dmodel))
PE(pos,2i+1) = cos(pos/10000**(2i/dmodel))
"""

def PosEncoding(t_len, d_model, device=None):
    """Build the sinusoidal positional-encoding table.

    PE(pos, 2i)   = sin(pos / 10000**(2i / d_model))
    PE(pos, 2i+1) = cos(pos / 10000**(2i / d_model))

    Args:
        t_len: number of positions (rows).
        d_model: encoding width (columns); may be odd.
        device: optional device on which to build the table (default: CPU).

    Returns:
        Float tensor of shape (t_len, d_model).
    """
    pos = torch.arange(t_len, device=device).unsqueeze(1)   # (t_len, 1)
    idx = torch.arange(d_model, device=device)              # (d_model,)
    is_odd = idx % 2

    # Columns 2i and 2i+1 share the exponent 2i / d_model;
    # 10000**x is written as 10**(4x).
    angle = pos / 10 ** (4 * (idx - is_odd) / d_model)      # (t_len, d_model)

    # Broadcasting replaces the former torch.meshgrid call (whose implicit
    # indexing mode is deprecated) and evaluates one formula per column.
    return torch.where(is_odd == 0, torch.sin(angle), torch.cos(angle))

class TF_Encoder(nn.Module):
    """Token embedding + positional encoding + a stack of encoder blocks.

    Args:
        vocab_size: number of rows in the embedding table.
        d_model: embedding / model width.
        d_ff: hidden width of each block's feed-forward sub-layer.
        numlayer: number of stacked encoder blocks.
        numhead: number of attention heads per block.
        dropout: dropout probability.
    """

    def __init__(self, vocab_size, d_model,
                 d_ff, numlayer, numhead, dropout):
        super().__init__()

        self.numlayer = numlayer
        self.src_embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)
        self.dropout = nn.Dropout(dropout)

        # Q5. Implement a sequence of numlayer encoder blocks
        self.encoder_block = nn.ModuleList(
            [TF_Encoder_Block(d_model, d_ff, numhead, dropout) for _ in range(numlayer)])

    def forward(self, x, src_batch_lens):
        """Encode a batch of token-index sequences.

        Args:
            x: integer tensor of token indices, shape (batch, seq_len).
            src_batch_lens: per-sequence lengths, passed to every block.

        Returns:
            Tensor of shape (batch, seq_len, d_model).
        """
        x_embed = self.src_embed(x)
        x = self.dropout(x_embed)

        # Put the positional table on the input's own device/dtype rather than
        # the notebook-global `dev`, so the module follows wherever it is moved.
        p_enc = PosEncoding(x.shape[1], x.shape[2]).to(device=x.device, dtype=x.dtype)
        x = x + p_enc

        # Q6. Implement: forward over numlayer encoder blocks
        for layer in self.encoder_block:
            x = layer(x, src_batch_lens)

        # Returning x directly also covers numlayer == 0, where the former
        # loop-local `out` was never assigned.
        return x

In [None]:
"""

main model

"""

class sentiment_classifier(nn.Module):
    """Transformer-encoder sentiment classifier producing one logit per review.

    The encoder output is mean-pooled over the valid (non-padding) time steps
    and passed through a small MLP head.

    Args:
        enc_input_size: vocabulary size for the encoder embedding.
        enc_d_model: encoder model width.
        enc_d_ff: encoder feed-forward hidden width.
        enc_num_layer: number of encoder blocks.
        enc_num_head: number of attention heads.
        dropout: dropout probability used in the encoder and the head.
    """

    def __init__(self, enc_input_size,
                 enc_d_model,
                 enc_d_ff,
                 enc_num_layer,
                 enc_num_head,
                 dropout,
                ):
        super().__init__()

        self.encoder = TF_Encoder(vocab_size = enc_input_size,
                                  d_model = enc_d_model, d_ff=enc_d_ff,
                                  numlayer=enc_num_layer, numhead=enc_num_head,
                                  dropout=dropout)

        # The layout (including the leading pooling layer) is kept unchanged so
        # that existing checkpoints keep the same state_dict keys.
        self.classifier = nn.Sequential(
            nn.AdaptiveAvgPool2d((1,None)),
            nn.Dropout(dropout),
            nn.Linear(in_features = enc_d_model, out_features=enc_d_model),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(in_features = enc_d_model, out_features = 1),
        )


    def forward(self, x, x_lens):
        """Compute one logit per sequence.

        Args:
            x: token-index tensor, shape (batch, seq_len).
            x_lens: (batch,) number of valid tokens per sequence; positions at
                index >= length are treated as padding (trailing padding is
                assumed). ``None`` pools over every position.

        Returns:
            1-D tensor of logits, one per sequence.
        """
        src_ctx = self.encoder(x, src_batch_lens = x_lens)   # (batch, seq_len, d_model)

        if x_lens is None:
            pooled = src_ctx.mean(dim=1, keepdim=True)
        else:
            # Average only over real tokens, so <PAD> positions do not dilute
            # the pooled representation.
            lens = torch.as_tensor(x_lens, device=src_ctx.device)
            positions = torch.arange(src_ctx.size(1), device=src_ctx.device)
            valid = (positions.unsqueeze(0) < lens.unsqueeze(1)).unsqueeze(-1).to(src_ctx.dtype)
            # clamp guards against division by zero for an all-padding row
            denom = lens.clamp(min=1).to(src_ctx.dtype).view(-1, 1, 1)
            pooled = (src_ctx * valid).sum(dim=1, keepdim=True) / denom

        # pooled is (batch, 1, d_model); the Sequential's leading average pool
        # is an identity on a length-1 time axis.
        out_logits = self.classifier(pooled).flatten()
        return out_logits

In [None]:
"""

datasets

"""

# Load IMDB dataset
# once you build the dataset, you can load it from file to save time
# to load from file, set this flag True
# NOTE: with the flag set to True, 'imdb_dataset.pt' must already exist; it is
# only written by the else-branch below (run once with the flag set to False).
load_imdb_dataset = True

if load_imdb_dataset:
    # SECURITY NOTE(review): the file holds a whole serialized dataset object
    # and torch.load unpickles it, which can execute arbitrary code. Only load
    # a file that this notebook itself produced.
    imdb_dataset = torch.load('imdb_dataset.pt')
else:
    # Build the dataset through the local imdb_voc module and cache it on disk.
    imdb_dataset = imdb_voc.IMDB_tensor_dataset()
    torch.save(imdb_dataset, 'imdb_dataset.pt')

In [None]:
train_dataset, test_dataset = imdb_dataset.get_dataset()

# Hold out part of the training data for validation.
split_ratio = 0.85
num_train = int(len(train_dataset) * split_ratio)

# Seed the split so the same reviews land in train/validation on every run.
# Otherwise validation losses (and the checkpoint selected from them) are not
# comparable between runs.
SPLIT_SEED = 0
split_train, split_valid = random_split(
    train_dataset,
    [num_train, len(train_dataset) - num_train],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

batch_size_trn = 64 # Hyperparam
batch_size_val = 64
batch_size_tst = 256

# Only the training loader is shuffled; evaluation order is kept fixed so the
# per-batch averaged metrics are deterministic.
train_dataloader = DataLoader(split_train, batch_size=batch_size_trn, shuffle=True)
val_dataloader = DataLoader(split_valid, batch_size=batch_size_val, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size_tst, shuffle=False)

# token <-> index lookups exposed by the dataset object
src_word_dict = imdb_dataset.src_stoi
src_idx_dict = imdb_dataset.src_itos

# index of the padding token; used to compute sequence lengths
SRC_PAD_IDX = src_word_dict['<PAD>']

In [None]:
# show sample reviews with pos/neg sentiments

show_sample_reviews = True

if show_sample_reviews:

    # Pull one batch from the training loader: (token-index tensor, label tensor).
    sample_text, sample_lab = next(iter(train_dataloader))
    slist=[]

    # Map the token indices of the first four reviews back to token strings.
    for stxt in sample_text[:4]:
        slist.append([src_idx_dict[j] for j in stxt])

    # Print each review's label, then its text with the '<PAD>' tokens dropped.
    for j, s in enumerate(slist):
        print('positive' if sample_lab[j]==1 else 'negative')
        print(' '.join([i for i in s if i != '<PAD>'])+'\n')

negative
a major disappointment . this was one of the best uk crime drama / detective shows from the 90 ' s which developed the fascinating title character played by scotland ' s robbie coltrane . however this <UNK> has little to add and perhaps suffers from an inevitable let down due to raised expectations when a favored show returns after a long hiatus . coltrane isn ' t really given much to do , much more attention is spent on the uninteresting killer , and in what he has to act in , he seems <UNK> , almost bored . the <UNK> ' s story is written by the books and the attempt to update us on coltrane ' s family life seems lightweight . perhaps if the writers had a whole series in front of them instead of just this one two-hour show they would have written this with much more depth . as is , skip this and watch the old cracker from the 90 ' s which is far far superior .

negative
<UNK> ( laurel <UNK> ) becomes a live in babysitter for young <UNK> <UNK> ( <UNK> cole ) who has recently l

In [None]:
"""

model

"""

# Vocabulary size = number of entries in the token -> index dictionary; it is
# passed to the model as enc_input_size (the encoder's embedding-table size).
enc_vocab_size = len(src_word_dict)

# Set hyperparam (model size)
# examples: model & ff dim - 8, 16, 32, 64, 128, numhead, numlayer 1~4

enc_d_model = 8 # Hyperparam
enc_d_ff = 8 # Hyperparam
enc_num_head = 4 # Hyperparam
enc_num_layer= 4 # Hyperparam

# dropout probability shared by the encoder and the classifier head
DROPOUT=0.1

model = sentiment_classifier(enc_input_size=enc_vocab_size,
                         enc_d_model = enc_d_model,
                         enc_d_ff = enc_d_ff,
                         enc_num_head = enc_num_head,
                         enc_num_layer = enc_num_layer,
                         dropout=DROPOUT)

# move the parameters to the selected device
model = model.to(dev)

In [None]:
"""

optimizer

"""

# Set hyperparam (learning rate)
# examples: 1e-3 ~ 1e-5

lr = 0.001 # Hyperparam

# Adam over every model parameter
optimizer = torch.optim.Adam(model.parameters(), lr = lr)

# Binary cross-entropy computed directly on raw logits (the sigmoid is part of
# the loss), matching the single-logit output of the classifier head.
criterion = nn.BCEWithLogitsLoss()

"""

auxiliary functions

"""


# get length of reviews in batch
def get_lens_from_tensor(x, pad_idx=None):
    """Count the non-padding tokens along the last dimension.

    Args:
        x: integer tensor of token indices, e.g. shape (batch, t).
        pad_idx: index of the padding token. Defaults to the notebook-level
            ``SRC_PAD_IDX`` when omitted, so existing callers are unaffected.

    Returns:
        int64 tensor with the last dimension reduced, holding the number of
        entries that differ from ``pad_idx``.
    """
    if pad_idx is None:
        pad_idx = SRC_PAD_IDX
    # Summing a boolean mask yields int64 counts.
    return (x != pad_idx).sum(dim=-1)

def get_binary_metrics(y_pred, y):
    """Compute accuracy, recall, precision and F1 for binary 0/1 labels.

    Args:
        y_pred: tensor of predicted labels (0 or 1).
        y: tensor of ground-truth labels (0 or 1), same shape as ``y_pred``.

    Returns:
        Tuple ``(accy, recall, prec, f1)`` of 0-dim float tensors. A ratio
        whose denominator is zero (including empty input) is reported as 0.
    """
    pred_pos, pred_neg = (y_pred == 1), (y_pred == 0)
    true_pos, true_neg = (y == 1), (y == 0)

    # find number of TP, TN, FP, FN
    # Tensor.sum() reduces on-device in one call; Python's builtin sum would
    # iterate the tensor one element at a time.
    TP = (pred_pos & true_pos).sum()
    FP = (pred_pos & true_neg).sum()
    TN = (pred_neg & true_neg).sum()
    FN = (pred_neg & true_pos).sum()

    def _ratio(num, den):
        # Safe division that always returns a 0-dim float tensor.
        if den != 0:
            return num / den
        return torch.zeros((), dtype=torch.float32, device=y_pred.device)

    accy = _ratio(TP + TN, TP + FP + TN + FN)
    recall = _ratio(TP, TP + FN)
    prec = _ratio(TP, TP + FP)
    f1 = _ratio(2 * recall * prec, recall + prec)
    return accy, recall, prec, f1

# Train

In [None]:
"""

train/validation

"""


def train(model, dataloader, optimizer, criterion, clip):
    """Run one training epoch and return the mean per-batch loss.

    Each batch is indexed as ``batch[0]`` (token indices) and ``batch[1]``
    (labels); both are moved to the notebook-level device ``dev``.

    Args:
        model: module called as ``model(x=..., x_lens=...)``.
        dataloader: iterable of batches.
        optimizer: optimizer stepped once per batch.
        criterion: loss applied to the flattened outputs and targets.
        clip: maximum gradient norm passed to ``clip_grad_norm_``.

    Returns:
        Sum of the per-batch losses divided by the number of batches.
    """
    model.train()

    running_loss = 0

    for batch in dataloader:
        tokens = batch[0].to(dev)
        labels = batch[1].float().to(dev)

        optimizer.zero_grad()

        # Sequence lengths are derived from the padding pattern of the batch.
        lengths = get_lens_from_tensor(tokens).to(dev)
        logits = model(x=tokens, x_lens=lengths)

        # Flatten predictions and targets to 1-D before the loss.
        flat_logits = logits.contiguous().view(-1)
        flat_labels = labels.contiguous().view(-1)

        batch_loss = criterion(flat_logits, flat_labels)
        batch_loss.backward()

        # Clip the global gradient norm before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        running_loss += batch_loss.item()

    return running_loss / len(dataloader)

In [None]:
def evaluate(model, dataloader, criterion):
    """Evaluate the model without gradients; print accuracy and return the loss.

    A logit >= 0 is counted as a positive prediction. Accuracy and loss are
    unweighted means over batches (every batch counts equally regardless of
    its size). Recall, precision and F1 are accumulated but not reported.

    Args:
        model: module called as ``model(x=..., x_lens=...)``.
        dataloader: iterable of batches indexed as ``batch[0]`` / ``batch[1]``.
        criterion: loss applied to the flattened outputs and targets.

    Returns:
        Sum of the per-batch losses divided by the number of batches.
    """
    model.eval()

    loss_total = 0
    # running sums, in order: accuracy, recall, precision, f1
    metric_totals = [0, 0, 0, 0]

    with torch.no_grad():
        for batch in dataloader:
            tokens = batch[0].to(dev)
            labels = batch[1].float().to(dev)

            lengths = get_lens_from_tensor(tokens).to(dev)
            logits = model(x=tokens, x_lens=lengths)

            flat_logits = logits.contiguous().view(-1)
            flat_labels = labels.contiguous().view(-1)

            batch_loss = criterion(flat_logits, flat_labels)

            # Threshold the logits at 0 to obtain hard 0/1 predictions.
            batch_metrics = get_binary_metrics((flat_logits >= 0).long(), flat_labels.long())
            metric_totals = [total + value for total, value in zip(metric_totals, batch_metrics)]

            loss_total += batch_loss.item()

    num_batches = len(dataloader)

    # show accuracy
    print(f'\tAccuracy: {metric_totals[0] / num_batches:.3f}')

    return loss_total / num_batches

def epoch_time(start_time, end_time):
    """Split the interval between two timestamps (in seconds) into whole
    minutes and the remaining whole seconds, both truncated toward zero.

    Returns:
        Tuple ``(minutes, seconds)`` of ints.
    """
    span = end_time - start_time
    minutes = int(span / 60)
    seconds = int(span - 60 * minutes)
    return minutes, seconds

In [None]:
"""

Training loop

"""

# number of passes over the training loader
N_EPOCHS = 30
# maximum gradient norm handed to train() for gradient clipping
CLIP = 1

# lowest validation loss seen so far; decides when a checkpoint is written
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    start_time = time.time()
    # one epoch of optimisation, then a validation pass
    # (evaluate() also prints the validation accuracy)
    train_loss = train(model, train_dataloader, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, val_dataloader, criterion)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # keep only the weights of the best-validating epoch on disk
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'model.pt')

    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Val. Loss: {valid_loss:.3f}')

	Accuracy: 0.586
Epoch: 01 | Time: 0m 56s
	Train Loss: 0.689 | Val. Loss: 0.670
	Accuracy: 0.706
Epoch: 02 | Time: 0m 55s
	Train Loss: 0.632 | Val. Loss: 0.574
	Accuracy: 0.750
Epoch: 03 | Time: 0m 58s
	Train Loss: 0.567 | Val. Loss: 0.509
	Accuracy: 0.785
Epoch: 04 | Time: 0m 58s
	Train Loss: 0.512 | Val. Loss: 0.464
	Accuracy: 0.812
Epoch: 05 | Time: 0m 56s
	Train Loss: 0.465 | Val. Loss: 0.424
	Accuracy: 0.821
Epoch: 06 | Time: 0m 56s
	Train Loss: 0.430 | Val. Loss: 0.406
	Accuracy: 0.833
Epoch: 07 | Time: 0m 57s
	Train Loss: 0.400 | Val. Loss: 0.396
	Accuracy: 0.846
Epoch: 08 | Time: 0m 55s
	Train Loss: 0.375 | Val. Loss: 0.361
	Accuracy: 0.849
Epoch: 09 | Time: 0m 54s
	Train Loss: 0.352 | Val. Loss: 0.373
	Accuracy: 0.857
Epoch: 10 | Time: 0m 56s
	Train Loss: 0.337 | Val. Loss: 0.349
	Accuracy: 0.862
Epoch: 11 | Time: 0m 54s
	Train Loss: 0.323 | Val. Loss: 0.348
	Accuracy: 0.865
Epoch: 12 | Time: 0m 55s
	Train Loss: 0.304 | Val. Loss: 0.348
	Accuracy: 0.862
Epoch: 13 | Time: 0m 54

# Test

In [None]:
"""

Test loop

"""
print('*** Now test phase begins! ***')
# Restore the checkpoint saved at the lowest validation loss. map_location lets
# a checkpoint written on a GPU machine load on a CPU-only one as well.
model.load_state_dict(torch.load('model.pt', map_location=dev))

# evaluate() prints the test accuracy and returns the mean per-batch loss
test_loss = evaluate(model, test_dataloader, criterion)

print(f'| Test Loss: {test_loss:.3f}')

*** Now test phase begins! ***
	Accuracy: 0.859
| Test Loss: 0.357
