In [33]:
%load_ext autoreload
%autoreload 2
import torch
import torch.nn as nn
import torch.optim as optim

from Transformer import Transformer

import torchtext
from torchtext import data
from torchtext.data import Field, BucketIterator

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

import spacy
import numpy as np
import pandas as pd

import random
import math
import time

SEED = 42

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
spacy_en = spacy.load('en_core_web_sm')
def tokenizer(s): 
    return [tok.text for tok in spacy_en.tokenizer(s)]

path = 'Tolokers'

TEXT = data.Field(sequential=True, 
                       tokenize=tokenizer, 
                       include_lengths=True, 
                       use_vocab=True)
LABEL = data.Field(sequential=False, 
                         use_vocab=False, 
                         pad_token=None, 
                         unk_token=None)

fields = [('label', LABEL), ('text', TEXT)]
train, val, test = data.TabularDataset.splits(path=path, 
                                            format='csv', 
                                            train='train.csv', 
                                            validation='val.csv',
                                            test='test.csv',
                                            fields=fields, 
                                            skip_header=True)

In [3]:
TEXT.build_vocab(train, min_freq = 2)

In [4]:
for i in range(3):
    print(vars(train[i]))

{'label': '0', 'text': ['that', "'s", 'cool', ',', 'i', 'am', 'in', 'the', 'olympics', ',', 'and', 'i', 'am', 'a', 'pilot', ',', 'so', 'i', 'am', 'not', 'sure', '.']}
{'label': '1', 'text': ['yes', ',', 'i', 'have', 'a', 'garden', '.']}
{'label': '2', 'text': ['i', 'wo', "n't", 'say', '`', '`', 'yes', "'", "'", 'or', '`', '`', 'no', "'", "'", 'right', 'now', '.']}


In [5]:
BATCH_SIZE = 32

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
                                                    (train, val, test), 
                                                     batch_size = BATCH_SIZE,
                                                     device = device)

In [6]:
len(train_iterator)

666

In [14]:
INPUT_DIM = len(TEXT.vocab)
HID_DIM = 512
N_LAYERS = 3
N_HEADS = 2
PF_DIM = 1024
DROPOUT_RATE = 0.1
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = Transformer(input_dim = INPUT_DIM, 
                    hid_dim = HID_DIM, 
                    n_heads = N_HEADS,
                    n_layers = N_LAYERS,
                    pf_dim = PF_DIM, 
                    dropout_rate = DROPOUT_RATE,
                    device = device,
                    pad_idx = PAD_IDX)

In [15]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 4,963,840 trainable parameters


In [16]:
def initialize_weights(m):
    if hasattr(m, 'weight') and m.weight.dim() > 1:
        nn.init.xavier_uniform_(m.weight.data)
        
model.apply(initialize_weights)

Transformer(
  (token_embedding): Embedding(3533, 512)
  (layers): ModuleList(
    (0): TransformerLayer(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (self_attention): MultiHeadAttentionLayer(
        (fc_q): Linear(in_features=512, out_features=512, bias=True)
        (fc_k): Linear(in_features=512, out_features=512, bias=True)
        (fc_v): Linear(in_features=512, out_features=512, bias=True)
        (fc_o): Linear(in_features=512, out_features=512, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerLayer(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (self_attention): MultiHeadAttentionLayer(
        (fc_q): Linear(in_features=512, out_features=512, bias=True)
        (fc_k): Linear(in_features=512, out_features=512, bias=True)
        (fc_v): Linear(in_features=512, out_features=512, bias=True)
        (fc_o): Linear(in_fe

In [17]:
LEARNING_RATE = 0.0001

optimizer = torch.optim.Adam(model.parameters(), lr = LEARNING_RATE)

In [18]:
criterion = nn.CrossEntropyLoss(ignore_index = PAD_IDX)

In [37]:
def train(model, iterator, optimizer, criterion, clip):
    
    model.train()
    
    epoch_loss = 0
    
    for i, batch in enumerate(iterator):

        keywords = batch.text
        trg = batch.label
        print(batch.text)
        optimizer.zero_grad()
        output = model(keywords)
                
        #output = [batch size, trg len - 1, output dim]
            
        loss = criterion(output, trg)
        
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [21]:
def evaluate(model, iterator, criterion):
    
    model.eval()
    
    epoch_loss = 0
    
    with torch.no_grad():
    
        for i, batch in enumerate(iterator):

            keywords = batch.text
            trg = batch.label

            output = model(keywords)
            
            
            loss = criterion(output, trg)

            epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [22]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [38]:
N_EPOCHS = 10
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut6-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

(tensor([[   4,  252,    4,   95,   40,   16,    2,    4,   48,   68,    2,    2,
           91,   48,    2,  551,   27,    2,    4,  664,  362,    2,   81,  264,
           81,    4,    4,   91,   80,   23,   86,  316],
        [  67,  142,   14,    7,    7,  143,   20,   12,   11,  437,   20,   69,
            6,   11,   20,  766,    4,   14,   20,   19, 2283,   28,    6,  142,
            6,   12,   11,  280,   30,  122,    9,   44],
        [  23,    7,    5,  181,   14,   13,  186,    9,    7,  128,  228,   37,
            2,    7, 2216,   11,  216,  348,    9,   16,    3,    5,  168,   16,
            2,    9,   39,  388, 1026,   54,  273,   62],
        [ 450,  116,   57,    8,    5, 2898,    6,   42,   11,  496,    1,   12,
           10,  120,   25,    7,    9,  524,  104,   50,    1,    0, 1432,  206,
           53,   51,   77,   39,   65,  430,    7,   19],
        [ 194, 2318,   22,    1,    0,  999,    2,   60,   21,    1,    1,   23,
          296,    8,    1,   12,   42,

AttributeError: 'bool' object has no attribute 'unsqueeze'