In [1]:
!pip install torch torchtext==0.7

Collecting torchtext==0.7
[?25l  Downloading https://files.pythonhosted.org/packages/b9/f9/224b3893ab11d83d47fde357a7dcc75f00ba219f34f3d15e06fe4cb62e05/torchtext-0.7.0-cp36-cp36m-manylinux1_x86_64.whl (4.5MB)
[K     |████████████████████████████████| 4.5MB 5.4MB/s 
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 57.3MB/s 
Installing collected packages: sentencepiece, torchtext
  Found existing installation: torchtext 0.3.1
    Uninstalling torchtext-0.3.1:
      Successfully uninstalled torchtext-0.3.1
Successfully installed sentencepiece-0.1.91 torchtext-0.7.0


In [2]:
import torch
import torch.nn as nn
import torch.optim as optim

import torchtext
import torchtext.experimental
import torchtext.experimental.vectors
from torchtext.experimental.datasets.raw.text_classification import RawTextIterableDataset
from torchtext.experimental.datasets.text_classification import TextClassificationDataset
from torchtext.experimental.functional import sequential_transforms, vocab_func, totensor

import collections
import random
import time

In [3]:
seed = 1234

torch.manual_seed(seed)
random.seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [4]:
raw_train_data, raw_test_data = torchtext.experimental.datasets.raw.IMDB()

aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:01<00:00, 52.7MB/s]


In [5]:
def get_train_valid_split(raw_train_data, split_ratio = 0.7):

    raw_train_data = list(raw_train_data)
        
    random.shuffle(raw_train_data)
        
    n_train_examples = int(len(raw_train_data) * split_ratio)
        
    train_data = raw_train_data[:n_train_examples]
    valid_data = raw_train_data[n_train_examples:]
    
    train_data = RawTextIterableDataset(train_data)
    valid_data = RawTextIterableDataset(valid_data)
    
    return train_data, valid_data

In [6]:
raw_train_data, raw_valid_data = get_train_valid_split(raw_train_data)

In [7]:
class Tokenizer:
    def __init__(self, tokenize_fn = 'basic_english', lower = True, max_length = None, sos_token = None):
        
        self.tokenize_fn = torchtext.data.utils.get_tokenizer(tokenize_fn)
        self.lower = lower
        self.max_length = max_length
        self.sos_token = sos_token
        
    def tokenize(self, s):
        
        tokens = self.tokenize_fn(s)
        
        if self.lower:
            tokens = [token.lower() for token in tokens]
            
        if self.sos_token is not None:
            tokens = [self.sos_token] + tokens

        if self.max_length is not None:
            tokens = tokens[:self.max_length]
            
        return tokens

In [8]:
max_length = 250
sos_token = '<sos>'

tokenizer = Tokenizer(max_length = max_length, sos_token = sos_token)

In [9]:
s = "hello world, how are you?"

tokenizer.tokenize(s)

['<sos>', 'hello', 'world', ',', 'how', 'are', 'you', '?']

In [10]:
def build_vocab_from_data(raw_data, tokenizer, **vocab_kwargs):
    
    token_freqs = collections.Counter()
    
    for label, text in raw_data:
        tokens = tokenizer.tokenize(text)
        token_freqs.update(tokens)
                
    vocab = torchtext.vocab.Vocab(token_freqs, **vocab_kwargs)
    
    return vocab

In [11]:
max_size = 25000

vocab = build_vocab_from_data(raw_train_data, tokenizer, max_size = max_size)

In [12]:
def process_raw_data(raw_data, tokenizer, vocab):
    
    raw_data = [(label, text) for (label, text) in raw_data]

    text_transform = sequential_transforms(tokenizer.tokenize,
                                           vocab_func(vocab),
                                           totensor(dtype=torch.long))
    
    label_transform = sequential_transforms(totensor(dtype=torch.long))

    transforms = (label_transform, text_transform)

    dataset = TextClassificationDataset(raw_data,
                                        vocab,
                                        transforms)
    
    return dataset

In [13]:
train_data = process_raw_data(raw_train_data, tokenizer, vocab)
valid_data = process_raw_data(raw_valid_data, tokenizer, vocab)
test_data = process_raw_data(raw_test_data, tokenizer, vocab)

In [14]:
class Collator:
    def __init__(self, pad_idx, batch_first):
        
        self.pad_idx = pad_idx
        self.batch_first = batch_first
        
    def collate(self, batch):
        
        labels, text = zip(*batch)
        
        labels = torch.LongTensor(labels)

        text = nn.utils.rnn.pad_sequence(text, 
                                         padding_value = self.pad_idx,
                                         batch_first = self.batch_first)
        
        return labels, text

In [15]:
pad_token = '<pad>'
pad_idx = vocab[pad_token]
batch_first = False

collator = Collator(pad_idx, batch_first)

In [16]:
batch_size = 256

train_iterator = torch.utils.data.DataLoader(train_data, 
                                             batch_size, 
                                             shuffle = True, 
                                             collate_fn = collator.collate)

valid_iterator = torch.utils.data.DataLoader(valid_data, 
                                             batch_size, 
                                             shuffle = False, 
                                             collate_fn = collator.collate)

test_iterator = torch.utils.data.DataLoader(test_data, 
                                            batch_size, 
                                            shuffle = False, 
                                            collate_fn = collator.collate)

In [17]:
class Transformer(nn.Module):
    def __init__(self, input_dim, emb_dim, n_heads, hid_dim, n_layers, output_dim, dropout, max_length, pad_idx):
        super().__init__()

        self.tok_embedding = nn.Embedding(input_dim, emb_dim, padding_idx = pad_idx)
        self.pos_embedding = nn.Embedding(max_length, emb_dim)
        transformer_layer = nn.TransformerEncoderLayer(emb_dim, n_heads, hid_dim, activation = 'gelu')
        norm = nn.LayerNorm(emb_dim)
        self.transformer = nn.TransformerEncoder(transformer_layer, n_layers, norm)
        self.fc = nn.Linear(emb_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):

        # text = [seq len, batch size]

        seq_len, batch_size = text.shape
        
        pos = torch.arange(0, seq_len).unsqueeze(-1).repeat(1, batch_size).to(text.device)
        
        # pos = [seq len, batch size]
        
        embedded_pos = self.pos_embedding(pos)
        embedded_tok = self.tok_embedding(text)
        
        embedded = self.dropout(embedded_pos + embedded_tok)

        # embedded = [seq len, batch size, emb dim]

        transformed = self.transformer(embedded)

        # transformed = [seq len, batch size, emb dim]

        sos_transformed = transformed[0]

        # sos_transformed = [batch size, emb dim]

        prediction = self.fc(self.dropout(sos_transformed))

        # prediction = [batch size, output dim]

        return prediction

In [18]:
input_dim = len(vocab)
emb_dim = 100
n_heads = 10
hid_dim = 1024
n_layers = 3
output_dim = 2
dropout = 0.5
pad_idx = pad_idx

model = Transformer(input_dim, emb_dim, n_heads, hid_dim, n_layers, output_dim, dropout, max_length, pad_idx)

In [19]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

In [20]:
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 3,265,774 trainable parameters


In [21]:
for n, p in model.named_parameters():
    print(f'name: {n}, shape: {p.shape}')

name: tok_embedding.weight, shape: torch.Size([25002, 100])
name: pos_embedding.weight, shape: torch.Size([250, 100])
name: transformer.layers.0.self_attn.in_proj_weight, shape: torch.Size([300, 100])
name: transformer.layers.0.self_attn.in_proj_bias, shape: torch.Size([300])
name: transformer.layers.0.self_attn.out_proj.weight, shape: torch.Size([100, 100])
name: transformer.layers.0.self_attn.out_proj.bias, shape: torch.Size([100])
name: transformer.layers.0.linear1.weight, shape: torch.Size([1024, 100])
name: transformer.layers.0.linear1.bias, shape: torch.Size([1024])
name: transformer.layers.0.linear2.weight, shape: torch.Size([100, 1024])
name: transformer.layers.0.linear2.bias, shape: torch.Size([100])
name: transformer.layers.0.norm1.weight, shape: torch.Size([100])
name: transformer.layers.0.norm1.bias, shape: torch.Size([100])
name: transformer.layers.0.norm2.weight, shape: torch.Size([100])
name: transformer.layers.0.norm2.bias, shape: torch.Size([100])
name: transformer.lay

In [22]:
def initialize_parameters(m):
    if isinstance(m, nn.Embedding):
        nn.init.normal_(m.weight, std = 0.02)
    elif isinstance(m, nn.Linear):
        nn.init.normal_(m.weight, std = 0.02)
        nn.init.zeros_(m.bias)
    elif isinstance(m, nn.LayerNorm):
        nn.init.ones_(m.weight)
        nn.init.zeros_(m.bias)

In [23]:
model.apply(initialize_parameters)

Transformer(
  (tok_embedding): Embedding(25002, 100, padding_idx=1)
  (pos_embedding): Embedding(250, 100)
  (transformer): TransformerEncoder(
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): _LinearWithBias(in_features=100, out_features=100, bias=True)
        )
        (linear1): Linear(in_features=100, out_features=1024, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=1024, out_features=100, bias=True)
        (norm1): LayerNorm((100,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((100,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
      (1): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): _LinearWithBias(in_features=100, out_features=100, bias=True)
        )
        (linear1): Linear(in_features=100,

In [24]:
glove = torchtext.experimental.vectors.GloVe(name = '6B',
                                             dim = emb_dim)

glove.6B.zip: 100%|██████████| 862M/862M [06:27<00:00, 2.23MB/s]
100%|██████████| 400000/400000 [00:17<00:00, 23261.37lines/s]


In [25]:
def get_pretrained_embedding(initial_embedding, pretrained_vectors, vocab, unk_token):
    
    pretrained_embedding = torch.FloatTensor(initial_embedding.weight.clone()).detach()    
    #pretrained_vocab = pretrained_vectors.vectors.get_stoi()
    
    unk_tokens = []
    
    for idx, token in enumerate(vocab.itos):
        #if token in pretrained_vocab:
        pretrained_vector = pretrained_vectors[token]
        pretrained_embedding[idx] = pretrained_vector
        #else:
        #    unk_tokens.append(token)
        
    return pretrained_embedding, unk_tokens

In [26]:
unk_token = '<unk>'

pretrained_embedding, unk_tokens = get_pretrained_embedding(model.tok_embedding, glove, vocab, unk_token)

In [27]:
model.tok_embedding.weight.data.copy_(pretrained_embedding)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-0.0288, -0.0316,  0.4083,  ...,  0.6288, -0.5348, -0.8080],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.2612,  0.6821, -0.2295,  ..., -0.5306,  0.0863,  0.4852]])

In [28]:
model.tok_embedding.weight.data[pad_idx] = torch.zeros(emb_dim)

In [29]:
optimizer = optim.Adam(model.parameters())

In [30]:
criterion = nn.CrossEntropyLoss()

In [31]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [32]:
model = model.to(device)
criterion = criterion.to(device)

In [33]:
def calculate_accuracy(predictions, labels):
    top_predictions = predictions.argmax(1, keepdim = True)
    correct = top_predictions.eq(labels.view_as(top_predictions)).sum()
    accuracy = correct.float() / labels.shape[0]
    return accuracy

In [34]:
def train(model, iterator, optimizer, criterion, device):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for labels, text in iterator:

        labels = labels.to(device)
        text = text.to(device)

        optimizer.zero_grad()
        
        predictions = model(text)
        
        loss = criterion(predictions, labels)
        
        acc = calculate_accuracy(predictions, labels)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [35]:
def evaluate(model, iterator, criterion, device):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for labels, text in iterator:

            labels = labels.to(device)
            text = text.to(device)
            
            predictions = model(text)
            
            loss = criterion(predictions, labels)
            
            acc = calculate_accuracy(predictions, labels)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [36]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [37]:
n_epochs = 10

best_valid_loss = float('inf')

for epoch in range(n_epochs):

    start_time = time.monotonic()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion, device)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion, device)
    
    end_time = time.monotonic()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'transformer-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 36s
	Train Loss: 0.650 | Train Acc: 59.26%
	 Val. Loss: 0.499 |  Val. Acc: 76.54%
Epoch: 02 | Epoch Time: 0m 37s
	Train Loss: 0.430 | Train Acc: 80.54%
	 Val. Loss: 0.366 |  Val. Acc: 84.11%
Epoch: 03 | Epoch Time: 0m 37s
	Train Loss: 0.348 | Train Acc: 85.13%
	 Val. Loss: 0.341 |  Val. Acc: 85.47%
Epoch: 04 | Epoch Time: 0m 37s
	Train Loss: 0.303 | Train Acc: 87.55%
	 Val. Loss: 0.343 |  Val. Acc: 86.34%
Epoch: 05 | Epoch Time: 0m 37s
	Train Loss: 0.265 | Train Acc: 89.23%
	 Val. Loss: 0.337 |  Val. Acc: 86.67%
Epoch: 06 | Epoch Time: 0m 38s
	Train Loss: 0.228 | Train Acc: 91.05%
	 Val. Loss: 0.367 |  Val. Acc: 86.26%
Epoch: 07 | Epoch Time: 0m 38s
	Train Loss: 0.207 | Train Acc: 92.01%
	 Val. Loss: 0.393 |  Val. Acc: 84.96%
Epoch: 08 | Epoch Time: 0m 38s
	Train Loss: 0.187 | Train Acc: 92.87%
	 Val. Loss: 0.382 |  Val. Acc: 86.13%
Epoch: 09 | Epoch Time: 0m 38s
	Train Loss: 0.163 | Train Acc: 93.84%
	 Val. Loss: 0.379 |  Val. Acc: 87.26%
Epoch: 10 | Epoch T

In [38]:
model.load_state_dict(torch.load('transformer-model.pt'))

test_loss, test_acc = evaluate(model, test_iterator, criterion, device)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.348 | Test Acc: 85.67%


In [39]:
def predict_sentiment(tokenizer, vocab, model, device, sentence):
    model.eval()
    tokens = tokenizer.tokenize(sentence)
    indexes = [vocab.stoi[token] for token in tokens]
    tensor = torch.LongTensor(indexes).unsqueeze(-1).to(device)
    prediction = model(tensor)
    probabilities = nn.functional.softmax(prediction, dim = -1)
    pos_probability = probabilities.squeeze()[-1].item()
    return pos_probability

In [40]:
sentence = 'the absolute worst movie of all time.'

predict_sentiment(tokenizer, vocab, model, device, sentence)

0.009050699882209301

In [41]:
sentence = 'one of the greatest films i have ever seen in my life.'

predict_sentiment(tokenizer, vocab, model, device, sentence)

0.9908411502838135

In [42]:
sentence = "i thought it was going to be one of the greatest films i have ever seen in my life, \
but it was actually the absolute worst movie of all time."

predict_sentiment(tokenizer, vocab, model, device, sentence)

0.027489816769957542

In [43]:
sentence = "i thought it was going to be the absolute worst movie of all time, \
but it was actually one of the greatest films i have ever seen in my life."

predict_sentiment(tokenizer, vocab, model, device, sentence)

0.026614150032401085