This was made by following along with the webinar posted here: https://www.youtube.com/watch?v=NoixdExFb7Y&t=7349s

In [0]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/50/10/aeefced99c8a59d828a92cc11d213e2743212d3641c87c82d61b035a7d5c/transformers-2.3.0-py3-none-any.whl (447kB)
[K     |▊                               | 10kB 21.4MB/s eta 0:00:01[K     |█▌                              | 20kB 1.9MB/s eta 0:00:01[K     |██▏                             | 30kB 2.8MB/s eta 0:00:01[K     |███                             | 40kB 1.9MB/s eta 0:00:01[K     |███▋                            | 51kB 2.3MB/s eta 0:00:01[K     |████▍                           | 61kB 2.8MB/s eta 0:00:01[K     |█████▏                          | 71kB 3.2MB/s eta 0:00:01[K     |█████▉                          | 81kB 2.5MB/s eta 0:00:01[K     |██████▋                         | 92kB 2.8MB/s eta 0:00:01[K     |███████▎                        | 102kB 3.1MB/s eta 0:00:01[K     |████████                        | 112kB 3.1MB/s eta 0:00:01[K     |████████▉                       | 122kB 3.1M

In [0]:
!pip install wget

Collecting wget
  Downloading https://files.pythonhosted.org/packages/47/6a/62e288da7bcda82b935ff0c6cfe542970f04e29c756b0e147251b2fb251f/wget-3.2.zip
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-cp36-none-any.whl size=9681 sha256=cc40609501b38ba497b5945ad2f77dd65ed7b267a7cdfba9b3e493049991fded
  Stored in directory: /root/.cache/pip/wheels/40/15/30/7d8f7cea2902b4db79e3fea550d7d7b85ecb27ef992b618f3f
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2


Basic Preprocessing Example

In [0]:
import wget
import os
import torch
import numpy as np
import pandas as pd
from transformers import BertModel, BertTokenizer

In [0]:
#download the dataset from the github repository of the webinar
url = 'https://github.com/theneuralbeing/bert-finetuning-webinar/blob/master/data.zip?raw=true'
if not os.path.exists('./data.zip'):
  wget.download(url, './data.zip')
  !unzip data.zip

Archive:  data.zip
   creating: data/
  inflating: data/train.csv          
  inflating: data/validation.csv     


In [0]:
#load the pretrained 'bert-base-uncased' encoder and the tokenizer published under the same name
bert_model = BertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

**PREPROCESSING TEXT FOR BERT**
1. tokenization
2. adding special tokens
3. padding
4. attention mask
5. segment ids (for sequence pairs)
6. convert sequence to integers (token ids)

In [0]:
#preprocessing of input text
sentence = 'he likes playing football'
tokens = tokenizer.tokenize(sentence)
#add special tokens
tokens = ['[CLS]'] + tokens + ['[SEP]']
#bring the sequence to a fixed length of maxlen tokens
maxlen = 12
if len(tokens) > maxlen:
  #too long: truncate, but keep a closing [SEP] in the last position
  #(the same rule the sentence-pair example and the dataset class use)
  tokens = tokens[:maxlen-1] + ['[SEP]']
else:
  #too short (or exact): fill the remainder with padding tokens
  tokens = tokens + ['[PAD]'] * (maxlen - len(tokens))
tokens

['[CLS]',
 'he',
 'likes',
 'playing',
 'football',
 '[SEP]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]']

In [0]:
#attention mask: 1 marks a real token, 0 marks a [PAD] position that should be ignored
attn_mask = [int(token != '[PAD]') for token in tokens]
attn_mask


[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]

In [0]:
#a 2-sentence task joins both sentences into one sequence: [CLS] first [SEP] second [SEP]
#(segment ids, built in the next cell, then label which sentence each position belongs to)
sentence2 = 'he plays regularly at the playground with his friends'
tokens1 = ['[CLS]', *tokenizer.tokenize(sentence), '[SEP]']
tokens2 = [*tokenizer.tokenize(sentence2), '[SEP]']
tokensP = tokens1 + tokens2
pad_count = maxlen - len(tokensP)
if pad_count < 0:
  #too long: cut to maxlen and make sure the sequence still ends with [SEP]
  tokensP = tokensP[:maxlen-1] + ['[SEP]']
else:
  tokensP = tokensP + ['[PAD]'] * pad_count
tokensP

['[CLS]',
 'he',
 'likes',
 'playing',
 'football',
 '[SEP]',
 'he',
 'plays',
 'regularly',
 'at',
 'the',
 '[SEP]']

In [0]:
#segment ids for the sentence pair only: 0 for the first sentence, 1 for the second, 0 for any padding
first_len = min(len(tokens1), maxlen)                      #the first sentence may itself have been truncated
real_len = sum(token != '[PAD]' for token in tokensP)      #positions holding actual tokens
segment_ids = [0] * first_len + [1] * (real_len - first_len) + [0] * (maxlen - real_len)
segment_ids

[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]

In [0]:
#taking the single-sentence example, we map every token (special and padding tokens included) to its integer id
token_ids = tokenizer.convert_tokens_to_ids(tokens)
token_ids

[101, 2002, 7777, 2652, 2374, 102, 0, 0, 0, 0, 0, 0]

In [0]:
#turn the token ids and the attention mask into torch tensors that can be received by the BERT model
#torch.unsqueeze(..., dim=0) adds a leading (batch) dimension; squeeze would remove such a dimension again
token_ids = torch.unsqueeze(torch.tensor(token_ids), dim=0)
attn_mask = torch.unsqueeze(torch.tensor(attn_mask), dim=0)
token_ids.shape, attn_mask.shape

(torch.Size([1, 12]), torch.Size([1, 12]))

In [0]:
#index the outputs instead of unpacking them, so this works whether the model returns exactly two values or more
outputs = bert_model(token_ids, attention_mask=attn_mask)
hidden_repr, cls_head = outputs[0], outputs[1]
#cls_head is not used as the sentence representation: it is designed for next-sentence prediction,
#not sentiment analysis. Instead we take the hidden state of the first ([CLS]) token of every sequence.
cls_repr = hidden_repr[:, 0] #[:, 0] = all sequences in the batch, position 0; shape [batch, hidden]
hidden_repr.shape, cls_head.shape

(torch.Size([1, 12, 768]), torch.Size([1, 768]))

**LOAD DATA FOR SENTIMENT ANALYSIS**

In [0]:
from torch.utils.data import Dataset, DataLoader

In [0]:
class LoadDataset(Dataset):
  """
  Dataset over the first n rows of a CSV file that has a 'review' and a 'sentiment' column.
  Each item is a review converted to a fixed-length sequence of token ids, its attention mask and its label.
  """

  def __init__(self, n, filename, maxlen=64):
    """
    Inputs:
      n : number of rows to read from the file
      filename : path of the CSV file
      maxlen : length every token sequence is padded or truncated to
    """
    self.n = n
    self.df = pd.read_csv(filename, delimiter=',', nrows=n)
    self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    self.maxlen = maxlen

  def __len__(self):
    """Number of examples (rows) in the dataset."""
    return len(self.df)

  def __getitem__(self, index):
    """Return (token_ids, attn_mask, label) for the row at `index`."""
    review = self.df.loc[index, 'review']
    label = self.df.loc[index, 'sentiment']
    #tokenize and wrap in the special tokens
    pieces = ['[CLS]', *self.tokenizer.tokenize(review), '[SEP]']
    shortfall = self.maxlen - len(pieces)
    if shortfall > 0:
      #shorter than maxlen: pad on the right
      pieces = pieces + ['[PAD]'] * shortfall
    else:
      #maxlen or longer: truncate and keep a closing [SEP]
      pieces = pieces[:self.maxlen-1] + ['[SEP]']
    token_ids = torch.tensor(self.tokenizer.convert_tokens_to_ids(pieces))
    #1 wherever the token id is not 0, 0 elsewhere
    attn_mask = token_ids.ne(0).long()

    return token_ids, attn_mask, label #a sentence-pair model would also return segment ids here

In [0]:
#first 4500 training rows and first 500 validation rows, every review padded/truncated to 64 tokens
train_set = LoadDataset(n=4500, filename='data/train.csv', maxlen=64)
val_set = LoadDataset(n=500, filename='data/validation.csv', maxlen=64)
#sanity check: shapes of the token ids and attention mask of the first training example
sample_ids, sample_mask, sample_label = train_set[0]
print(sample_ids.shape, sample_mask.shape)

torch.Size([64]) torch.Size([64])


In [0]:
#shuffle the training examples every epoch so the batches do not follow the row order of the CSV file;
#the validation set is only evaluated, so its order is left as is
train_loader = DataLoader(train_set, batch_size=32, shuffle=True, num_workers=5)
val_loader = DataLoader(val_set, batch_size=32, num_workers=5)

**TRAIN THE BERT MODEL FOR SENTIMENT ANALYSIS**
1. Set the model to train mode
2. Start the epoch
3. For every batch in the data loader:
  a) zero out gradients
  b) get output of the model
  c) compute loss
  d) backpropagate gradients
  e) optimizer step
  f) at the end of epoch, validate data
4. Finally, save the model

In [0]:
from torch import nn

In [0]:
class SentimentClassifier(nn.Module): #custom models will always inherit from the nn.Module class of torch
  """
  Pretrained BERT encoder followed by dropout and a single linear unit that turns the
  [CLS] representation of each sequence into one sentiment logit.
  """

  def __init__(self):
    super(SentimentClassifier, self).__init__()
    self.bert_layer = BertModel.from_pretrained('bert-base-uncased')
    self.dropout = nn.Dropout(0.1)
    self.classifier = nn.Linear(768, 1) #N.B. that 768 is the size of the last dimension of the hidden representations
  
  def forward(self, seq, attn_masks):
    """
    Inputs:
      seq : Tensor of shape [B, T] containing token ids of sequences
      attn_masks : Tensor of shape [B, T] containing attention masks (1 = real token, 0 = padding)
    Returns:
      Tensor of shape [B, 1] containing one raw (pre-sigmoid) logit per sequence
    """
    #take the first output by index instead of unpacking, so the number of values the encoder returns does not matter
    outputs = self.bert_layer(seq, attention_mask=attn_masks)
    seq_repr = outputs[0]
    cls_repr = seq_repr[:,0] #the first token is always the classification token
    cls_repr = self.dropout(cls_repr) #regularize the head input; dropout is inactive in eval mode
    logits = self.classifier(cls_repr)
    return logits

In [0]:
#build the classifier: a pretrained 'bert-base-uncased' encoder plus a newly initialised linear head
model = SentimentClassifier()

In [0]:
from torch.optim import Adam
from torch.nn import BCEWithLogitsLoss

#Adam over every parameter of the model (encoder and head) with a small fine-tuning learning rate
optimizer = Adam(model.parameters(), lr=2e-5)
#binary classification loss that takes the raw logits produced by the model
criterion = BCEWithLogitsLoss()

In [0]:
#pick the device once, then report which one is in use
device = 'cuda' if torch.cuda.is_available() else 'cpu'
if device == 'cpu':
  print("No GPU available. Using CPU instead.")
else:
  print("Using the GPU: " + torch.cuda.get_device_name(0))

Using the GPU: Tesla T4


Training Methods

In [0]:
from time import time

In [0]:
def logits_accuracy(logits, labels):
  """
  Fraction of examples whose thresholded prediction matches its label.

  Inputs:
    logits : raw model outputs, one per example
    labels : integer labels (0 or 1), one per example
  Returns:
    0-dim float tensor holding the accuracy
  """
  #sigmoid -> probability; above 0.5 counts as class 1; squeeze drops every size-1 dimension
  predicted = (torch.sigmoid(logits) > 0.5).long().squeeze()
  hits = predicted.eq(labels).float()
  return hits.mean()

In [0]:
def evaluate(model, criterion, val_loader, device):
  """
  Compute the mean loss and mean accuracy of `model` over `val_loader`.

  Inputs:
    model : module called as model(seq, attn_masks), returning one logit per sequence
    criterion : loss called as criterion(logits, float_labels)
    val_loader : iterable yielding (seq, attn_masks, labels) batches
    device : device every batch is moved to
  Returns:
    (mean loss, mean accuracy), both averaged over batches (each batch counts equally).
  The model is left in evaluation mode.
  """
  total_loss, total_accuracy = 0, 0
  model.eval() #set the model to evaluation mode
  count = 0
  #no gradients are needed for validation; skipping them saves memory and time
  with torch.no_grad():
    for (seq, attn_masks, labels) in val_loader:
      count += 1
      seq, attn_masks, labels = seq.to(device), attn_masks.to(device), labels.to(device)
      #get logit predictions
      val_logits = model(seq, attn_masks)
      #calculate loss; reshape(-1) flattens to one logit per example for any batch size, including 1
      val_loss = criterion(val_logits.reshape(-1), labels.float())
      total_loss += val_loss.item()
      #calculate validation accuracy
      total_accuracy += logits_accuracy(val_logits, labels)
  return total_loss / count, total_accuracy / count

In [0]:
def train(model, criterion, optimizer, train_loader, val_loader, device, epochs=4, print_every=100):
  """
  Fine-tune `model` for `epochs` passes over `train_loader`, validating after every epoch.

  Inputs:
    model : module called as model(seq, attn_masks), returning one logit per sequence
    criterion : loss called as criterion(logits, float_labels)
    optimizer : optimizer over the parameters of the model
    train_loader, val_loader : iterables yielding (seq, attn_masks, labels) batches
    device : device the model and every batch are moved to
    epochs : number of passes over train_loader
    print_every : the loss of the current batch is printed every `print_every` iterations
  """
  #move the model to the chosen device (the GPU when one is available)
  model.to(device)
  print("Training started...")

  for epoch in range(epochs): #4-5 epochs should be optimal
    print("Epoch {}".format(epoch))
    #evaluate() switches the model to evaluation mode at the end of every epoch,
    #so training mode (dropout active) has to be re-entered at the start of each one
    model.train()
    #load the batches from the data loader
    for i, (seq, attn_masks, labels) in enumerate(train_loader):
      optimizer.zero_grad()                                                               #zero out the gradients
      seq, attn_masks, labels = seq.to(device), attn_masks.to(device), labels.to(device)
      logits = model(seq, attn_masks)                                                     #get the output of the model
      #reshape(-1) flattens to one logit per example for any batch size, including 1
      loss = criterion(logits.reshape(-1), labels.float())                                #calculate the loss
      loss.backward()                                                                     #backpropagation
      nn.utils.clip_grad_norm_(model.parameters(), 1) #clipping gradients to tackle exploding gradients
      optimizer.step()                                                                    #optimizer step
      if (i + 1) % print_every == 0:
        print("Iteration {} ==== Loss {}".format(i+1, loss.item()))
    print("====Validating Data====")
    mean_val_loss, mean_val_acc = evaluate(model, criterion, val_loader, device)                            #validate data
    print("Validation Loss: {}\nValidation Accuracy: {}".format(mean_val_loss, mean_val_acc))

In [0]:
#confirm that the GPU memory is available (this must be done or else I get a "RuntimeError: CUDA out of memory." message and the model doesn't train)
!pip install gputil
import GPUtil
GPUtil.showUtilization()

Collecting gputil
  Downloading https://files.pythonhosted.org/packages/ed/0e/5c61eedde9f6c87713e89d794f01e378cfd9565847d4576fa627d758c554/GPUtil-1.4.0.tar.gz
Building wheels for collected packages: gputil
  Building wheel for gputil (setup.py) ... [?25l[?25hdone
  Created wheel for gputil: filename=GPUtil-1.4.0-cp36-none-any.whl size=7410 sha256=ad9229da469d7ebf8d9d7274311ffd210ed45b6ee787872fbded2132924fbf97
  Stored in directory: /root/.cache/pip/wheels/3d/77/07/80562de4bb0786e5ea186911a2c831fdd0018bda69beab71fd
Successfully built gputil
Installing collected packages: gputil
Successfully installed gputil-1.4.0
| ID | GPU | MEM |
------------------
|  0 |  0% |  0% |


In [0]:
#fine-tune for 5 epochs, printing the batch loss every 20 iterations and validating after each epoch
train(model, criterion, optimizer, train_loader, val_loader, device, epochs=5, print_every=20)

Training started...
Epoch 0
Iteration 20 ==== Loss 0.607633113861084
Iteration 40 ==== Loss 0.5047159194946289
Iteration 60 ==== Loss 0.5543547868728638
Iteration 80 ==== Loss 0.5726668834686279
Iteration 100 ==== Loss 0.37867027521133423
Iteration 120 ==== Loss 0.46347641944885254
Iteration 140 ==== Loss 0.38285166025161743
====Validating Data====
Validation Loss: 0.4667821926996112
Validation Accuracy: 0.7890625
Epoch 1
Iteration 20 ==== Loss 0.2442961484193802
Iteration 40 ==== Loss 0.4197276532649994
Iteration 60 ==== Loss 0.677882194519043
Iteration 80 ==== Loss 0.31491440534591675
Iteration 100 ==== Loss 0.07899954169988632
Iteration 120 ==== Loss 0.21755670011043549
Iteration 140 ==== Loss 0.24157455563545227
====Validating Data====
Validation Loss: 0.5969902351498604
Validation Accuracy: 0.7777343988418579
Epoch 2
Iteration 20 ==== Loss 0.1003086268901825
Iteration 40 ==== Loss 0.2278057336807251
Iteration 60 ==== Loss 0.11038534343242645
Iteration 80 ==== Loss 0.01108945254236

In [0]:
#saving our model: weights plus optimizer state in one checkpoint file
save_path = 'checkpoints'
checkpoint = {
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
}
#create the target folder on first use
if not os.path.isdir(save_path):
  os.mkdir(save_path)
torch.save(checkpoint, os.path.join(save_path, 'model.pth'))

In [0]:
#for inference only the model weights are saved (no optimizer state)
torch.save(model.state_dict(), os.path.join(save_path, 'inference.pth'))

In [0]:
ls checkpoints/

inference.pth  model.pth


**PREDICTIONS WITH THE MODEL**

In [0]:
#convert a new message into WordPiece tokens, and then into the input vectors
def preprocess(message, maxlen=64):
  """
  Turn one raw text message into the inputs the classifier takes.

  Inputs:
    message : the text to classify
    maxlen : length the token sequence is padded or truncated to
  Returns:
    (token_ids, attn_mask) : two tensors of shape [maxlen]; the mask is 1 wherever the token id is not 0
  """
  tokens = tokenizer.tokenize(message)                            #tokenization
  tokens = ['[CLS]'] + tokens + ['[SEP]']                         #special tokens
  if len(tokens) < maxlen:                                        #padding
    tokens = tokens + ['[PAD]'] * (maxlen - len(tokens))
  else:
    #truncate long messages but keep a closing [SEP]; uses the maxlen argument
    #(a plain function has no self)
    tokens = tokens[:maxlen-1] + ['[SEP]']
  token_ids = tokenizer.convert_tokens_to_ids(tokens)             #token ids
  token_ids = torch.tensor(token_ids)
  attn_mask = (token_ids != 0).long()                             #attention mask
  return token_ids, attn_mask

In [0]:
def predict(model, seq, attn_masks, device):
  """
  Classify a single preprocessed message.

  Inputs:
    model : module called as model(seq, attn_masks), returning a [1, 1] logit for a batch of one
    seq : Tensor of shape [T] containing the token ids of the message
    attn_masks : Tensor of shape [T] containing its attention mask
    device : device the inputs are moved to (the model is expected to be there already)
  Returns:
    list holding one int: 1 when the sigmoid of the logit is above 0.5, else 0
  """
  model.eval()
  #add the batch dimension and move the inputs to the device
  seq, attn_masks = seq.unsqueeze(0).to(device), attn_masks.unsqueeze(0).to(device)
  #inference only: no gradients need to be tracked
  with torch.no_grad():
    #get logit predictions
    logits = model(seq, attn_masks)
  probs = torch.sigmoid(logits)
  preds = (probs > 0.5).long()
  #row 0 is the only message in the batch
  preds = preds[0].tolist()
  return preds

In [0]:
#other reviews to try: swap the active `example` line for one of these
#example = "The new Star Wars movie was terrible! I did not like the acting. The writers should be fired. The characters were very poorly developed. I wish that I could have my money back!"
#example = "Citizen Caine was the best movie I ever saw. You will become smarter if you watch it."
#example = "The movie was mediocre. It had some good parts, but I wish the director was Michael Bay and not Steven Spielberg. That would have been more action-packed."
#example = "I have no opinion on this movie."
example = "I enjoyed the movie a lot. It was very fun. My kids loved it!"
example_token_ids, example_attn_masks = preprocess(example)
print(example_token_ids.shape, example_attn_masks.shape)

#predict returns a one-element list: 0 means negative, anything else positive
prediction = predict(model, example_token_ids, example_attn_masks, device)
sentiment = 'positive' if prediction[0] != 0 else 'negative'
print("The review has {} sentiment.".format(sentiment))

torch.Size([64]) torch.Size([64])
The review has positive sentiment.
