In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import torch
from torch.utils.data import DataLoader, TensorDataset, Dataset
import numpy as np
import pandas as pd
import metal
import os
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [3]:
from dataset import BERTDataset

In [4]:
# Pretrained BERT variant; also try bert-base-multilingual-cased (recommended).
# NOTE(review): `model` is not referenced again in the visible cells -- confirm whether
# it is meant to be passed to the dataset / encoder.
model = 'bert-base-uncased'

# Path template for the CoLA split files under $GLUEDATA; `{}` receives the split name.
src_path = os.path.join(os.environ['GLUEDATA'], 'CoLA/{}.tsv')

# One dataloader per split, keyed by split name.
dataloaders = {}
for split in ('train', 'dev'):
    dataset = BERTDataset(
        tsv_path=src_path.format(split),
        sent1_idx=3,
        label_idx=1,
        skip_rows=0,
        # Shift labels by one: labels are scores [1, 2] (multiclass with cardinality k).
        label_fn=lambda raw_label: int(raw_label) + 1,
    )
    dataloaders[split] = dataset.get_dataloader(batch_size=32)

100%|██████████| 8551/8551 [00:02<00:00, 3990.70it/s]
100%|██████████| 1043/1043 [00:00<00:00, 4127.75it/s]


In [5]:
import torch.nn as nn
from metal.end_model import EndModel

class BertEncoder(nn.Module):
    """Wrap a pretrained BERT model so it can be used as an input module.

    Args:
        bert_model_name: identifier handed to ``BertModel.from_pretrained``.
            Defaults to ``'bert-base-uncased'``, the value that used to be
            hard-coded, so ``BertEncoder()`` behaves as before.
    """

    def __init__(self, bert_model_name='bert-base-uncased'):
        super(BertEncoder, self).__init__()
        self.bert_model = BertModel.from_pretrained(bert_model_name)

    def forward(self, data):
        """Run BERT on one batch and return its second output.

        Args:
            data: a 3-tuple ``(tokens, segments, mask)`` forwarded to the BERT
                model as input ids, token type ids and attention mask.

        Returns:
            The second element returned by the BERT model. Per the
            pytorch_pretrained_bert documentation this is the pooled output
            (one vector per sequence), not the per-token hidden states.
        """
        tokens, segments, mask = data
        # Only the second return value is used, so ask BERT not to build the
        # list of every encoder layer's output; the pooled output is unaffected.
        _, pooled_output = self.bert_model(
            tokens,
            token_type_ids=segments,
            attention_mask=mask,
            output_all_encoded_layers=False,
        )
        return pooled_output

class SSTHead(EndModel):
    """EndModel subclass used as the classification head in this notebook.

    It adds no behaviour of its own: every constructor argument is handed
    straight to ``EndModel``.

    NOTE(review): the name says SST, but the data loaded in this notebook is CoLA.
    """

    def __init__(self, output_dims, **kwargs):
        # Pure pass-through to the parent constructor.
        super(SSTHead, self).__init__(output_dims, **kwargs)

In [6]:
# Build the BERT encoder; it is handed to the end model as its `input_module`.
encoder_module = BertEncoder()

In [7]:
# Classification head stacked on the BERT encoder.
# The positional list is `output_dims`; presumably 768 is the width of the encoder
# output (BERT-base) and 2 the number of classes -- TODO confirm against EndModel.
# NOTE(review): the device is fixed to CUDA, so this cell needs a GPU.
end_model = SSTHead(
    [768, 2],
    input_module=encoder_module,
    seed=123,
    skip_head=False,
    input_relu=False,
    input_batchnorm=False,
    verbose=False,
    device=torch.device("cuda")
)

In [8]:
from metal.utils import place_on_gpu
from sklearn.metrics import matthews_corrcoef

def cola_metrics(model, dataloader):
    """Compute the Matthews correlation of `model` over every batch of `dataloader`.

    The model is run in evaluation mode with gradient tracking disabled, and
    its previous train/eval mode is restored before returning, so calling this
    in the middle of training does not change the training state.

    Args:
        model: a torch module whose call on a batch's data returns one score
            per class for each example.
        dataloader: iterable of ``(data, Y)`` batches.

    Returns:
        dict with a single key, ``"matthews_corr"``.
    """
    predictions = []
    targets = []

    was_training = model.training
    model.eval()
    try:
        # No gradients are needed for scoring; skipping them saves memory.
        with torch.no_grad():
            for batch in dataloader:
                # HACK -- we won't need to do this moving forward
                data, Y = place_on_gpu(batch)

                # Labels are stored shifted up by one; move them back to 0-based
                # so they line up with the argmax indices of the predictions.
                targets.append(Y.detach().cpu().numpy() - 1)
                predictions.append(model(data).detach().cpu().numpy())
    finally:
        # Put the model back in whichever mode the caller had it in.
        model.train(was_training)

    targets = np.concatenate(targets)
    predictions = np.concatenate(predictions)
    return {"matthews_corr": matthews_corr(targets, predictions)}

def matthews_corr(targets, predictions):
    """Matthews correlation between `targets` and the argmax of `predictions`.

    `predictions` is reduced to one label per row by taking the index of the
    largest value along axis 1 before it is compared with `targets`.
    """
    predicted_labels = np.argmax(predictions, axis=1)
    return matthews_corrcoef(targets, predicted_labels)

In [12]:
%%time
# Fine-tune on the train split with the dev split as validation data.
# Logging is counted in batches: train metrics every 50 batches, dev metrics
# every 100, both reporting the Matthews correlation from `cola_metrics`.
end_model.train_model(dataloaders['train'], valid_data=dataloaders['dev'],
                      lr=0.0001, l2=0,
                      n_epochs=5,
                      log_train_metrics_func=[cola_metrics],
                      log_train_metrics=["matthews_corr"],
                      log_train_every=50,
                      log_valid_metrics_func=[cola_metrics],
                      log_valid_metrics=["matthews_corr"],
                      log_valid_every=100,
                      # Checkpoint on the training loss (mode 'min' below); the trailing
                      # comment keeps 'matthews_corr' as the alternative metric.
                      checkpoint_metric="model/train/loss",#'matthews_corr',
                      log_unit="batches",
                      checkpoint_metric_mode='min',
                      verbose=True, progress_bar=True
                    )

Using GPU...
Saving model at iteration 1 with best score 0.273
[50 bat (0.02 epo)]: TRAIN:[loss=0.456, matthews_corr=0.496]
[100 bat (0.04 epo)]: TRAIN:[loss=0.374, matthews_corr=0.528] VALID:[matthews_corr=0.339]
[150 bat (0.05 epo)]: TRAIN:[loss=0.559, matthews_corr=0.530]
[200 bat (0.07 epo)]: TRAIN:[loss=0.489, matthews_corr=0.356] VALID:[matthews_corr=0.191]
[250 bat (0.09 epo)]: TRAIN:[loss=0.508, matthews_corr=0.602]
Saving model at iteration 269 with best score 0.257
[300 bat (0.11 epo)]: TRAIN:[loss=0.367, matthews_corr=0.659] VALID:[matthews_corr=0.458]
[350 bat (0.12 epo)]: TRAIN:[loss=0.354, matthews_corr=0.643]
Saving model at iteration 351 with best score 0.234
Saving model at iteration 353 with best score 0.209
Saving model at iteration 354 with best score 0.208
[400 bat (0.14 epo)]: TRAIN:[loss=0.351, matthews_corr=0.684] VALID:[matthews_corr=0.405]
[450 bat (0.16 epo)]: TRAIN:[loss=0.361, matthews_corr=0.316]
[500 bat (0.18 epo)]: TRAIN:[loss=0.372, matthews_corr=0.708

In [10]:
# Matthews correlation of the end model on the dev split.
# NOTE(review): the saved output of this cell comes from execution count 10, which
# precedes the training cell (count 12) -- re-run after training for the tuned score.
cola_metrics(end_model, dataloaders['dev'])

{'matthews_corr': 0.0}