In [2]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
import torch
from torch.utils.data import DataLoader, TensorDataset, Dataset
import numpy as np
import pandas as pd
import metal
import os
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [4]:
from dataset import BERTDataset

In [5]:
model = 'bert-base-uncased' # also try bert-base-multilingual-cased (recommended)
src_path = os.path.join(os.environ['GLUEDATA'], 'CoLA/{}.tsv')
dataloaders = {}
for split in ['train','dev']:
    dataset = BERTDataset(
        dataset_name = 'CoLA',
        dataset_split = split,
        sent1_idx=3,
        label_idx=1,
        skip_rows=0,
        label_fn=lambda x: int(x)+1 # labels are scores [1, 2] (multiclass with cardinality k)
    )

    dataloaders[split] = dataset.get_dataloader(batch_size=32)

100%|██████████| 8551/8551 [00:02<00:00, 4033.24it/s]
100%|██████████| 1043/1043 [00:00<00:00, 4146.40it/s]


In [6]:
import torch.nn as nn
from metal.end_model import EndModel

class BertEncoder(nn.Module):
    def __init__(self):
        super(BertEncoder, self).__init__()
        self.bert_model = BertModel.from_pretrained('bert-base-uncased')

    def forward(self, data):
        tokens, segments, mask = data
        # TODO: check if we should return all layers or just last hidden representation 
        _, hidden_layer = self.bert_model(tokens, segments, mask)
        return hidden_layer

class SSTHead(EndModel):     
    def __init__(self, output_dims, **kwargs):
        super(SSTHead, self).__init__(output_dims, **kwargs)

In [7]:
encoder_module = BertEncoder()

In [8]:
end_model = SSTHead(
    [768, 2],
    input_module=encoder_module,
    seed=123,
    skip_head=False,
    input_relu=False,
    input_batchnorm=False,
    verbose=False,
    device=torch.device("cuda")
)

In [11]:
from metal.utils import place_on_gpu
from sklearn.metrics import matthews_corrcoef

def cola_metrics(model, dataloader):
    predictions = []
    targets = []
    for batch in dataloader:
        
        # HACK -- we won't need to do this moving forward
        batch = place_on_gpu(batch)
        
        data, Y = batch
        
        Y = Y.detach().cpu().numpy() - 1

        output = model.forward(data)
        prediction = output.detach().cpu().numpy()
#         import pdb; pdb.set_trace()

        targets.append(Y)
        predictions.append(prediction)
    
    targets = np.concatenate(targets)    
    predictions = np.concatenate(predictions)
    metrics = {
        "matthews_corr": matthews_corr(targets, predictions)
    }
    return metrics

def matthews_corr(targets, predictions):
    predictions = np.argmax(predictions,1)
    return matthews_corrcoef(targets, predictions)

In [12]:
%%time
end_model.train_model(dataloaders['train'], valid_data=dataloaders['dev'],
                      lr=0.0001, l2=0,
                      n_epochs=1,
                      log_train_metrics_func=[cola_metrics],
                      log_train_metrics=["matthews_corr"],
                      log_train_every=50,
                      log_valid_metrics_func=[cola_metrics],
                      log_valid_metrics=["matthews_corr"],
                      log_valid_every=100,
                      checkpoint_metric="train/loss",#'spearman_corr',
                      log_unit="batches",
                      checkpoint_metric_mode='min',
                      verbose=True, progress_bar=True
                    )

Using GPU...
Saving model at iteration 1 with best score 0.581
Saving model at iteration 3 with best score 0.572
[50 bat (0.02 epo)]: TRAIN:[loss=0.627, matthews_corr=0.232]
[100 bat (0.04 epo)]: TRAIN:[loss=0.604, matthews_corr=0.412] VALID:[matthews_corr=0.313]
Saving model at iteration 105 with best score 0.551
Saving model at iteration 106 with best score 0.534
Saving model at iteration 121 with best score 0.534
[150 bat (0.05 epo)]: TRAIN:[loss=0.556, matthews_corr=0.432]
Saving model at iteration 163 with best score 0.530
Saving model at iteration 183 with best score 0.528
Saving model at iteration 184 with best score 0.528
Saving model at iteration 185 with best score 0.521
Saving model at iteration 187 with best score 0.518
Saving model at iteration 191 with best score 0.516
Saving model at iteration 192 with best score 0.511
Saving model at iteration 193 with best score 0.508
Saving model at iteration 195 with best score 0.504
Saving model at iteration 196 with best score 0.50

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


[250 bat (0.09 epo)]: TRAIN:[loss=0.597, matthews_corr=0.000]
Saving model at iteration 251 with best score 0.459
Restoring best model from iteration 251 with score 0.459
Finished Training
Accuracy: 0.691
        y=1    y=2   
 l=1     0     322   
 l=2     0     721   
CPU times: user 1min 57s, sys: 50.5 s, total: 2min 47s
Wall time: 4min 31s


In [None]:
cola_metrics(end_model, dataloaders['dev'])