In [1]:
import torch
from torch.utils.data import TensorDataset, random_split, DataLoader
from torch.optim import AdamW, lr_scheduler
from torch.nn.utils import clip_grad_norm_
from torch import nn

from transformers import BertTokenizer, BertForSequenceClassification, BertModel
from transformers import get_linear_schedule_with_warmup

import numpy as np
from sklearn.metrics import matthews_corrcoef
import random
from tqdm import tqdm
import requests
import os
import pandas as pd

**BERT Tokenizer**

In [2]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
tokenizer

PreTrainedTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=False, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [3]:
# Dump into file
with open("bert_vocab.txt", 'w') as f:
    for token in tokenizer.vocab:
        f.write(token+'\n')

# Indices
tokenizer.vocab["[PAD]"], tokenizer.vocab["[UNK]"], tokenizer.vocab["[CLS]"], tokenizer.vocab["[SEP]"], tokenizer.vocab["[MASK]"]

(0, 100, 101, 102, 103)

In [4]:
# 1-char and subword tokens 
one_chars = []
hashes = []
for token in tokenizer.vocab:
    if len(token) == 1:
        one_chars.append(token)
    elif len(token) >= 2 and token[:2] == "##":
        hashes.append(token)

print(len(one_chars), len(hashes), '\n')
for token in one_chars[:25]:
    print(token, end=' ')
    
print()
for token in hashes[:25]:
    print(token, end=' ')
    

997 5828 

! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 
##s ##a ##e ##i ##ing ##n ##o ##d ##ed ##r ##y ##t ##er ##ly ##l ##m ##u ##es ##h ##on ##k ##us ##c ##g ##an 

In [5]:
example_sentence = "Hello. BERT provides contextualized word embeddings."

print(example_sentence, '\n')
print(tokenizer.tokenize(example_sentence))
print(tokenizer.convert_tokens_to_ids(tokenizer.tokenize(example_sentence)))

Hello. BERT provides contextualized word embeddings. 

['hello', '.', 'bert', 'provides', 'context', '##ual', '##ized', 'word', 'em', '##bed', '##ding', '##s', '.']
[7592, 1012, 14324, 3640, 6123, 8787, 3550, 2773, 7861, 8270, 4667, 2015, 1012]


In [6]:
# Does "[CLS] tokenized_content [SEP] [PAD] [PAD]..." and returns a list of vocab indices 
ids = tokenizer.encode(example_sentence,
                       add_special_tokens=True,
                       padding="max_length",
                       truncation=True,
                       max_length=18)
print(ids, '\n')

sentence_dict = tokenizer.encode_plus(example_sentence,
                                 add_special_tokens=True,
                                 padding="max_length",
                                 truncation=True,
                                 max_length=18,
                                 return_attention_mask=True)
for item in sentence_dict.items():
    print(item)

[101, 7592, 1012, 14324, 3640, 6123, 8787, 3550, 2773, 7861, 8270, 4667, 2015, 1012, 102, 0, 0, 0] 

('input_ids', [101, 7592, 1012, 14324, 3640, 6123, 8787, 3550, 2773, 7861, 8270, 4667, 2015, 1012, 102, 0, 0, 0])
('token_type_ids', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
('attention_mask', [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0])


**Sentence Classification** (Grammatical/ Ungrammatical)

In [7]:
if not os.path.exists("./cola_public_1.1.zip"):
    r = requests.get("https://nyu-mll.github.io/CoLA/cola_public_1.1.zip")
    with open("./cola_public_1.1.zip", "wb") as f:
        f.write(r.content)
        os.system("unzip ./cola_public_1.1.zip")

In [8]:
df_train = pd.read_csv("./cola_public/raw/in_domain_train.tsv", sep="\t", header=None)
df_train

Unnamed: 0,0,1,2,3
0,gj04,1,,"Our friends won't buy this analysis, let alone..."
1,gj04,1,,One more pseudo generalization and I'm giving up.
2,gj04,1,,One more pseudo generalization or I'm giving up.
3,gj04,1,,"The more we study verbs, the crazier they get."
4,gj04,1,,Day by day the facts are getting murkier.
...,...,...,...,...
8546,ad03,0,*,Poseidon appears to own a dragon
8547,ad03,0,*,Digitize is my happiest memory
8548,ad03,1,,It is easy to slay the Gorgon.
8549,ad03,1,,I had the strangest feeling that I knew you.


In [9]:
train_sentences = df_train[3].values
train_labels = df_train[1].values

In [10]:
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
    
device

device(type='cuda', index=0)

In [11]:
required_max = max([len(tokenizer.encode(sentence)) for sentence in train_sentences])
required_max

47

In [12]:
train_sentences_dict = tokenizer.batch_encode_plus(train_sentences,
                                                   add_special_tokens=True,
                                                   padding="max_length",
                                                   truncation=True,
                                                   max_length=64,
                                                   return_attention_mask=True,
                                                   return_tensors="pt")
train_labels = torch.tensor(train_labels)

In [13]:
print(train_sentences[0], '\n')
print(train_sentences_dict["input_ids"][0])

Our friends won't buy this analysis, let alone the next one we propose. 

tensor([  101,  2256,  2814,  2180,  1005,  1056,  4965,  2023,  4106,  1010,
         2292,  2894,  1996,  2279,  2028,  2057, 16599,  1012,   102,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0])


In [14]:
train_dataset = TensorDataset(train_sentences_dict["input_ids"],
                              train_sentences_dict["attention_mask"],
                              train_labels)

train_size = int(0.9*len(train_dataset))
val_size = len(train_dataset) - train_size
train_dataset, val_dataset = random_split(train_dataset, [train_size, val_size])

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=6, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=32, num_workers=6, pin_memory=True)

Provided Model

In [15]:
# This adds includes the pretrained "pooler" layer (FC, tanh) on top of the [CLS] embedding,
# and then an untrained "classifier" layer (FC,linear) on top of that
model1 = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=2,
                                                      output_attentions=False,
                                                      output_hidden_states=False)
model1.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

Custom Model

In [16]:
# Add an untrained "pooler" layer (FC, tanh) on top of the [CLS] embedding (remove theirs),
# and then an untrained "classifier" layer (FC,linear) on top of that
class SeqClassifier(nn.Module):
    def __init__(self):
        super().__init__()
        self.bare_bert = BertModel.from_pretrained("bert-base-uncased",
                                              add_pooling_layer=False,  # Remove the pretrained pooler
                                              output_attentions=False,
                                              output_hidden_states=False)
        
        self.classifier = nn.Sequential(nn.Linear(in_features=768, out_features=768),
                                        nn.Tanh(),  # Our pooler layer
                                        nn.Dropout(p=0.1),
                                        nn.Linear(in_features=768, out_features=2))
    
    def forward(self, input_ids, input_masks, labels, return_dict=False):
        
        embeddings, pooler_outs = self.bare_bert(input_ids, input_masks, return_dict=return_dict)    
        
        if pooler_outs is None:  # Since we removed it
            z = self.classifier(embeddings[:, 0, :])  # Pass in the CLS embedding (output at index 0)
        else:
            z = self.classifier(pooler_outs)
            
        loss = nn.CrossEntropyLoss()(z, labels)
        return loss, z

model2 = SeqClassifier()
model2.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


SeqClassifier(
  (bare_bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine

In [17]:
def count_trainable_params(model, print_shapes=False):
    total_params = 0
    for p in model.named_parameters():
        if print_shapes:
            print(p[0], p[1].shape)
        total_params += torch.numel(p[1])
    return total_params

count_trainable_params(model1), count_trainable_params(model2)

(109483778, 109483778)

In [18]:
def fit(model, train_loader, val_loader, alpha, num_epochs, grad_clip=None):
    
    optimizer = AdamW(model.parameters(), lr=alpha)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, 
                                                num_training_steps=num_epochs*len(train_loader))
    
    for e in range(num_epochs):
        model.train()
        
        batch_losses = []
        for batch_input_ids, batch_input_masks, batch_labels in tqdm(train_loader):
            batch_input_ids, batch_input_masks, batch_labels = batch_input_ids.to(device), batch_input_masks.to(device), batch_labels.to(device)
            
            loss, logits = model(batch_input_ids, batch_input_masks, labels=batch_labels, return_dict=False)
            loss.backward()
            if grad_clip:
                clip_grad_norm_(model.parameters(), grad_clip)
            optimizer.step()
            optimizer.zero_grad()
            scheduler.step()
            
            batch_losses.append(loss.item())
            
        epoch_loss = np.mean(batch_losses)
        val_loss, val_acc = evaluate(model, val_loader)
        print(f"Epoch: {e+1}, Train Loss: {epoch_loss}, Val Loss: {val_loss}, Val Acc: {val_acc}") 

def evaluate(model, val_loader, test=False):
    model.eval()
    
    with torch.no_grad():
        
        batch_losses = []
        batch_accs = []
        batch_predictions = []
        batch_ys = []
        batch_logits = []
        
        for batch_input_ids, batch_input_masks, batch_labels in val_loader:
            batch_input_ids, batch_input_masks, batch_labels = batch_input_ids.to(device), batch_input_masks.to(device), batch_labels.to(device)
            
            loss, logits = model(batch_input_ids, batch_input_masks, labels=batch_labels, return_dict=False)
            batch_losses.append(loss.item())
            batch_accs.append(accuracy(logits, batch_labels))
            
            if test:
                batch_predictions.append(torch.max(logits, dim=1)[1])
                batch_ys.append(batch_labels)
                batch_logits.append(logits)
    
        if test:
            predictions = torch.cat(batch_predictions, dim=0)
            logits = torch.cat(batch_logits, dim=0)
            ys = torch.cat(batch_ys, dim=0)
            
    
    if test:
        return mcc(predictions, ys), accuracy(logits, ys)
                   
    else:
        return np.mean(batch_losses), np.mean(batch_accs)
    
def accuracy(zb, yb):
    _, predicted = torch.max(zb, dim=1)
    acc = torch.sum(predicted==yb) / yb.shape[0]
    return acc.item()

def mcc(y_pred, y):
    mcc = matthews_corrcoef(y.to("cpu"), y_pred.to("cpu"))
    return mcc

**Train**

In [19]:
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed_all(42)

alpha = 2e-5
num_epochs = 4
grad_clip = 1.0

In [20]:
%%time
fit(model1, train_loader, val_loader, alpha, num_epochs, grad_clip=grad_clip)

100%|██████████| 241/241 [01:16<00:00,  3.16it/s]
  0%|          | 0/241 [00:00<?, ?it/s]

Epoch: 1, Train Loss: 0.49027984060687146, Val Loss: 0.4167956726418601, Val Acc: 0.8263888888888888


100%|██████████| 241/241 [01:15<00:00,  3.20it/s]
  0%|          | 0/241 [00:00<?, ?it/s]

Epoch: 2, Train Loss: 0.29354344076511774, Val Loss: 0.4306241157982085, Val Acc: 0.8341049397433246


100%|██████████| 241/241 [01:15<00:00,  3.20it/s]
  0%|          | 0/241 [00:00<?, ?it/s]

Epoch: 3, Train Loss: 0.1872861801696887, Val Loss: 0.46056826136730333, Val Acc: 0.8514660508544357


100%|██████████| 241/241 [01:15<00:00,  3.20it/s]


Epoch: 4, Train Loss: 0.13424759134031924, Val Loss: 0.5413661279060222, Val Acc: 0.8530092592592593
CPU times: user 4min 52s, sys: 21.6 s, total: 5min 14s
Wall time: 5min 13s


In [21]:
%%time
fit(model2, train_loader, val_loader, alpha, num_epochs, grad_clip=grad_clip)

100%|██████████| 241/241 [01:16<00:00,  3.17it/s]
  0%|          | 0/241 [00:00<?, ?it/s]

Epoch: 1, Train Loss: 0.47876590978802486, Val Loss: 0.43310107566692213, Val Acc: 0.8240740740740741


100%|██████████| 241/241 [01:16<00:00,  3.15it/s]
  0%|          | 0/241 [00:00<?, ?it/s]

Epoch: 2, Train Loss: 0.2796057124479183, Val Loss: 0.42525640074853543, Val Acc: 0.8445216064099912


100%|██████████| 241/241 [01:15<00:00,  3.18it/s]
  0%|          | 0/241 [00:00<?, ?it/s]

Epoch: 3, Train Loss: 0.16664320225846718, Val Loss: 0.502888983046567, Val Acc: 0.845293210612403


100%|██████████| 241/241 [01:15<00:00,  3.19it/s]


Epoch: 4, Train Loss: 0.11433429133796222, Val Loss: 0.612985889116923, Val Acc: 0.8441358032049956
CPU times: user 4min 54s, sys: 22.3 s, total: 5min 16s
Wall time: 5min 15s


**Test**

In [22]:
df_test = pd.read_csv("./cola_public/raw/out_of_domain_dev.tsv", sep="\t", header=None)
test_sentences = df_test[3].values
test_labels = df_test[1].values

test_sentences_dict = tokenizer.batch_encode_plus(test_sentences,
                                                   add_special_tokens=True,
                                                   padding="max_length",
                                                   truncation=True,
                                                   max_length=64,
                                                   return_attention_mask=True,
                                                   return_tensors="pt")
test_labels = torch.tensor(test_labels)

test_dataset = TensorDataset(test_sentences_dict["input_ids"],
                              test_sentences_dict["attention_mask"],
                              test_labels)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=True, num_workers=6, pin_memory=True)

In [23]:
evaluate(model1, test_loader, test=True)

(0.5450036097740508, 0.8139534592628479)

In [24]:
evaluate(model2, test_loader, test=True)

(0.5500916018079557, 0.815891444683075)

**Save parameters**

In [25]:
model1.save_pretrained("./model1")
tokenizer.save_pretrained("./model1")

('./model1/tokenizer_config.json',
 './model1/special_tokens_map.json',
 './model1/vocab.txt',
 './model1/added_tokens.json')

In [27]:
model1 = BertForSequenceClassification.from_pretrained("./model1/")
model1.to(device)
evaluate(model1, test_loader, test=True)

(0.5450036097740508, 0.8139534592628479)

In [28]:
torch.save(model2.state_dict(), "./model2.pth")

In [30]:
model2 = SeqClassifier().to(device)
model2.load_state_dict(torch.load("./model2.pth"))
evaluate(model2, test_loader, test=True)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


(0.5500916018079557, 0.815891444683075)