In [None]:
! pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/9c/34/fb092588df61bf33f113ade030d1cbe74fb73a0353648f8dd938a223dce7/transformers-3.5.0-py3-none-any.whl (1.3MB)
[K     |████████████████████████████████| 1.3MB 6.0MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 33.3MB/s 
Collecting sentencepiece==0.1.91
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 23.5MB/s 
[?25hCollecting tokenizers==0.9.3
[?25l  Downloading https://files.pythonhosted.org/packages/4c/34/b39eb9994bc3c999270b69c9eea40ecc6f0e97991dba28282b9fd32d44ee/tokenizers-0.9.3-cp36-cp36m-manylinux1_x86_64.whl (2.9MB)
[K  

In [None]:
# NN library
import torch
import torch.nn as nn
import torch.optim as optim
# Bert model and its tokenizer
from transformers import BertTokenizer, BertModel
# Text data
from torchtext import data, datasets
# Numerical computation
import numpy as np
# standard library
import random
import time
# Configuration
#from config import *

In [None]:
# --- Run mode ---------------------------------------------------------------
TRAIN = True             # True: train and test; False: load saved weights and score TEXT
TEXT = "You are good!"   # sentence scored by the inference path

# --- Reproducibility and training loop --------------------------------------
SEED = 1234              # shared seed for every random number generator
BATCH_SIZE = 128
N_EPOCHS = 5

# --- Architecture of the head placed on top of BERT -------------------------
HIDDEN_DIM = 256         # GRU hidden size
N_LAYERS = 2             # stacked GRU layers
BIDIRECTIONAL = True
DROPOUT = 0.25
OUTPUT_DIM = 1           # a single logit per example

In [None]:
# Reproducibility: feed the same seed to the Python, NumPy and PyTorch random
# number generators, then ask cuDNN to use deterministic algorithms.
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed):
  _seed_rng(SEED)
del _seed_rng  # keep the helper name out of the notebook namespace
torch.backends.cudnn.deterministic = True

In [None]:
# Get tokens for BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Vocabulary ids of the special tokens: sequence start ([CLS]), sequence
# end ([SEP]), padding and unknown-word.
init_token_id, eos_token_id, pad_token_id, unk_token_id = (
  tokenizer.cls_token_id,
  tokenizer.sep_token_id,
  tokenizer.pad_token_id,
  tokenizer.unk_token_id,
)

# Longest input (in tokens) that this checkpoint accepts
max_input_len = tokenizer.max_model_input_sizes['bert-base-uncased']

# Tokenize and crop sentence to 510 (room for 1st and last token) instead of 512 (i.e. `max_input_len`)
def tokenize_and_crop(sentence):
  """Split `sentence` into BERT tokens and keep at most `max_input_len - 2` of them.

  The two spare positions are left for the special first and last tokens.
  Returns the (possibly shortened) list of tokens.
  """
  keep = max_input_len - 2
  return tokenizer.tokenize(sentence)[:keep]

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [None]:
# Load the IMDB dataset and
# return (train_iter, valid_iter, test_iter) tuple
def load_data():
  """Build batched train/validation/test iterators over the IMDB reviews.

  Each review is tokenized and cropped by `tokenize_and_crop`, converted to
  BERT vocabulary ids and wrapped as `[CLS] ... [SEP]`, the same layout that
  `predict_sentiment` builds at inference time. The official training split is
  further divided into training and validation parts.

  Returns:
    (train_iter, valid_iter, test_iter): iterators created by
    `BucketIterator.splits` with batches of `BATCH_SIZE` examples placed on
    `device`.
  """
  text = data.Field(
    batch_first=True,
    use_vocab=False,
    tokenize=tokenize_and_crop,
    preprocessing=tokenizer.convert_tokens_to_ids,
    init_token=init_token_id,
    # Close every sequence with [SEP]: the crop already reserves two slots for
    # the special tokens, and inference appends this token as well.
    eos_token=eos_token_id,
    pad_token=pad_token_id,
    unk_token=unk_token_id
  )

  label = data.LabelField(dtype=torch.float)

  train_data, test_data  = datasets.IMDB.splits(text, label)
  # `random.seed` returns None, so seed first and pass the resulting generator
  # state explicitly instead of relying on the seeding side effect.
  random.seed(SEED)
  train_data, valid_data = train_data.split(random_state=random.getstate())

  print(f"training examples count: {len(train_data)}")
  print(f"test examples count: {len(test_data)}")
  print(f"validation examples count: {len(valid_data)}")

  # Labels are strings in the dataset; build their vocabulary from the training part
  label.build_vocab(train_data)

  train_iter, valid_iter, test_iter = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    device=device
  )

  return train_iter, valid_iter, test_iter

In [None]:
# Get the device
# Default to the CPU and switch to the GPU only when CUDA is usable.
device = 'cpu'
if torch.cuda.is_available():
  device = 'cuda'

In [None]:
# Build model
# Load the pretrained `bert-base-uncased` weights; this module is passed to
# `SentimentModel` below as its backbone.

bert_model = BertModel.from_pretrained('bert-base-uncased')

# Sentiment model with a pretrained BERT as backbone,
# a two-layer GRU that analyzes the BERT hidden representations,
# and a linear layer for classification (the sigmoid is applied by the criterion during training).
import torch.nn as nn

class SentimentModel(nn.Module):
  """Sentiment classifier: BERT encoder -> GRU -> dropout -> linear head.

  Args:
    bert: encoder module; `bert.config.to_dict()['hidden_size']` gives the GRU
      input size, and calling it on token ids must return a sequence whose
      first element holds the per-token representations.
    hidden_dim: GRU hidden size.
    output_dim: number of values produced per example.
    n_layers: number of stacked GRU layers.
    bidirectional: whether the GRU reads the sequence in both directions.
    dropout: dropout probability, used between GRU layers (only when there are
      at least two) and before the linear head.
  """

  def __init__(
    self,
    bert,
    hidden_dim,
    output_dim,
    n_layers,
    bidirectional,
    dropout
  ):
    super().__init__()

    self.bert = bert
    # Width of the encoder's token representations, i.e. the GRU input size
    bert_width = bert.config.to_dict()['hidden_size']
    # Dropout between stacked GRU layers is only meaningful with 2+ layers
    inter_layer_dropout = dropout if n_layers >= 2 else 0
    self.rnn = nn.GRU(
      input_size=bert_width,
      hidden_size=hidden_dim,
      num_layers=n_layers,
      bidirectional=bidirectional,
      batch_first=True,
      dropout=inter_layer_dropout
    )
    # A bidirectional GRU contributes one final state per direction
    n_directions = 2 if bidirectional else 1
    self.out = nn.Linear(hidden_dim * n_directions, output_dim)
    self.dropout = nn.Dropout(dropout)

  def forward(self, text):
    """Map a batch of token ids (batch first) to one row of `output_dim` values per example."""
    # The encoder is used as a fixed feature extractor: no gradients flow into it
    with torch.no_grad():
      token_states = self.bert(text)[0]

    # Only the final hidden states of the GRU are needed, not its per-step outputs
    final_states = self.rnn(token_states)[1]

    if self.rnn.bidirectional:
      # Join the last two entries (the two directions of the top layer) per example
      summary = torch.cat((final_states[-2], final_states[-1]), dim=1)
    else:
      summary = final_states[-1]

    return self.out(self.dropout(summary))

# Instantiate the classifier from the configuration constants and show its layers
model = SentimentModel(
  bert=bert_model,
  hidden_dim=HIDDEN_DIM,
  output_dim=OUTPUT_DIM,
  n_layers=N_LAYERS,
  bidirectional=BIDIRECTIONAL,
  dropout=DROPOUT,
)
print(model)


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…


SentimentModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [None]:
# time taken for single epoch
def epoch_time(start_time, end_time):
  """Split the span between two timestamps (in seconds) into whole minutes and seconds.

  Returns:
    (minutes, seconds) as integers; any fractional second is truncated.
  """
  duration = end_time - start_time
  whole_minutes = int(duration / 60)
  leftover_seconds = int(duration - 60 * whole_minutes)
  return whole_minutes, leftover_seconds

# computes accuracy
def binary_accuracy(preds, y):
  """Fraction of examples whose rounded sigmoid(prediction) equals the label.

  Args:
    preds: tensor of raw model outputs (logits).
    y: tensor of labels, compared element-wise with the rounded predictions.

  Returns:
    Tensor holding the number of matches divided by the length of the
    comparison result.
  """
  hard_preds = torch.sigmoid(preds).round()
  hits = hard_preds.eq(y).float()
  return hits.sum() / len(hits)

# training step
def train(model, iterator, optimizer, criterion):
  """Run one optimisation pass over `iterator`.

  Every batch must expose `.text` (model input) and `.label` (targets). The
  model is switched to training mode and its parameters are updated after
  each batch.

  Returns:
    (mean loss per batch, mean accuracy per batch) over the whole pass.
  """
  model.train()

  loss_total, acc_total = 0, 0
  for batch in iterator:
    # Start from clean gradients for this batch
    optimizer.zero_grad()
    # Drop the trailing singleton dimension so the outputs line up with the labels
    logits = model(batch.text).squeeze(1)
    batch_loss = criterion(logits, batch.label)
    batch_acc = binary_accuracy(logits, batch.label)
    # Back-propagate and update the parameters
    batch_loss.backward()
    optimizer.step()
    # Accumulate plain Python numbers for the running statistics
    loss_total += batch_loss.item()
    acc_total += batch_acc.item()

  n_batches = len(iterator)
  return loss_total / n_batches, acc_total / n_batches


In [None]:
# evaluates the model on given iterator (either
# train_iter, valid_iter, or test_iter)
def evaluate(model, iterator, criterion):
  """Measure loss and accuracy over `iterator` without updating the model.

  The model is switched to evaluation mode and gradient tracking is disabled
  for the whole pass. Every batch must expose `.text` and `.label`.

  Returns:
    (mean loss per batch, mean accuracy per batch).
  """
  model.eval()

  loss_total, acc_total = 0, 0
  with torch.no_grad():
    for batch in iterator:
      logits = model(batch.text).squeeze(1)
      loss_total += criterion(logits, batch.label).item()
      acc_total += binary_accuracy(logits, batch.label).item()

  n_batches = len(iterator)
  return loss_total / n_batches, acc_total / n_batches

# function to make sentiment prediction during inference
def predict_sentiment(model, tokenizer, sentence):
  """Score a single sentence with the trained model.

  The sentence is tokenized, cropped to `max_input_len - 2` tokens, converted
  to ids and wrapped with the start and end special tokens before being fed
  to the model as a batch of one.

  Args:
    model: the sentiment model; it is switched to evaluation mode.
    tokenizer: object providing `tokenize` and `convert_tokens_to_ids`.
    sentence: raw text to score.

  Returns:
    The sigmoid of the model output as a Python float.
  """
  model.eval()
  tokens = tokenizer.tokenize(sentence)
  tokens = tokens[:max_input_len - 2]
  indexed = [init_token_id] + tokenizer.convert_tokens_to_ids(tokens) + [eos_token_id]
  # Place the input on the device that holds the model's weights, so the call
  # also works when the model has not been moved to the global `device`
  # (fallback to `device` for a model without parameters).
  first_param = next(model.parameters(), None)
  target_device = device if first_param is None else first_param.device
  tensor = torch.LongTensor(indexed).to(target_device)
  # Add the batch dimension expected by the model
  tensor = tensor.unsqueeze(0)
  # Inference only: skip gradient tracking, as `evaluate` does
  with torch.no_grad():
    prediction = torch.sigmoid(model(tensor))
  return prediction.item()

In [None]:
if __name__ == "__main__":

  # Checkpoint location shared by the training and the inference paths
  model_path = "/content/model.pt"

  # Train BERT
  if TRAIN:
    # load data
    train_iter, valid_iter, test_iter = load_data()

    # Move the model to its device first, then build the optimizer on the
    # parameters it will actually update
    model = model.to(device)
    optimizer = optim.Adam(model.parameters())
    criterion = nn.BCEWithLogitsLoss().to(device)

    best_val_loss = float('inf')

    for epoch in range(N_EPOCHS):
      # start time
      start_time = time.time()
      # train for an epoch, then measure on the validation split
      train_loss, train_acc = train(model, train_iter, optimizer, criterion)
      valid_loss, valid_acc = evaluate(model, valid_iter, criterion)
      # end time
      end_time = time.time()
      # stats
      epoch_mins, epoch_secs = epoch_time(start_time, end_time)
      print(valid_loss)
      # checkpoint the model whenever the validation loss
      # improves on the best value seen so far
      if valid_loss < best_val_loss:
        best_val_loss = valid_loss
        print(best_val_loss)

        torch.save(model.state_dict(), model_path)
      # stats
      print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
      print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
      print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')
    # Test: restore the best checkpoint (not the last epoch) before scoring
    model.load_state_dict(torch.load(model_path, map_location=device))
    test_loss, test_acc = evaluate(model, test_iter, criterion)
    print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

  # Infer from BERT
  else:
    model.load_state_dict(torch.load(model_path, map_location=device))
    # Only the training branch moved the model before; without this the model
    # would stay on the CPU while `device` may be the GPU
    model = model.to(device)
    sentiment = predict_sentiment(model, tokenizer, TEXT)
    print(sentiment)

training examples count: 17500
test examples count: 25000
validation examples count: 7500
0.2865973812038616
0.2865973812038616
Epoch: 01 | Epoch Time: 7m 7s
	Train Loss: 0.375 | Train Acc: 83.69%
	 Val. Loss: 0.287 |  Val. Acc: 88.32%
0.2585557989649854
0.2585557989649854
Epoch: 02 | Epoch Time: 7m 7s
	Train Loss: 0.331 | Train Acc: 86.15%
	 Val. Loss: 0.259 |  Val. Acc: 89.27%
0.26818145508483304
Epoch: 03 | Epoch Time: 7m 7s
	Train Loss: 0.297 | Train Acc: 87.56%
	 Val. Loss: 0.268 |  Val. Acc: 88.99%
0.2788176655264224
Epoch: 04 | Epoch Time: 7m 8s
	Train Loss: 0.263 | Train Acc: 89.34%
	 Val. Loss: 0.279 |  Val. Acc: 89.00%
0.23543061076079386
0.23543061076079386
Epoch: 05 | Epoch Time: 7m 8s
	Train Loss: 0.247 | Train Acc: 89.85%
	 Val. Loss: 0.235 |  Val. Acc: 90.60%
Test Loss: 0.227 | Test Acc: 90.86%
