In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import torch
from torch.utils.data import DataLoader, TensorDataset, Dataset
import numpy as np
import pandas as pd
import metal
import os
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [8]:
from dataset import BERTDataset

In [9]:
model = 'bert-base-uncased' # also try bert-base-multilingual-cased (recommended)
src_path = os.path.join(os.environ['GLUEDATA'], 'CoLA/{}.tsv')
dataloaders = {}
for split in ['train','dev']:
    dataset = BERTDataset(
        dataset_name = 'CoLA',
        dataset_split = split,
        sent1_idx=3,
        label_idx=1,
        skip_rows=0,
        label_fn=lambda x: int(x)+1 # labels are scores [1, 2] (multiclass with cardinality k)
    )

    dataloaders[split] = dataset.get_dataloader(batch_size=32)

100%|██████████| 8551/8551 [00:02<00:00, 4200.72it/s]
100%|██████████| 1043/1043 [00:00<00:00, 2226.56it/s]


In [10]:
import torch.nn as nn
from metal.end_model import EndModel

class BertEncoder(nn.Module):
    def __init__(self):
        super(BertEncoder, self).__init__()
        self.bert_model = BertModel.from_pretrained('bert-base-uncased')

    def forward(self, data):
        tokens, segments, mask = data
        # TODO: check if we should return all layers or just last hidden representation 
        _, hidden_layer = self.bert_model(tokens, segments, mask)
        return hidden_layer

class SSTHead(EndModel):     
    def __init__(self, output_dims, **kwargs):
        super(SSTHead, self).__init__(output_dims, **kwargs)

In [None]:
encoder_module = BertEncoder()

In [12]:
end_model = SSTHead(
    [768, 2],
    input_module=encoder_module,
    seed=123,
    skip_head=False,
    input_relu=False,
    input_batchnorm=False,
    verbose=False,
    device=torch.device("cuda")
)

In [15]:
from metal.utils import place_on_gpu
from sklearn.metrics import matthews_corrcoef

def cola_metrics(model, dataloader):
    predictions = []
    targets = []
    for batch in dataloader:
        
        # HACK -- we won't need to do this moving forward
        batch = place_on_gpu(batch)
        
        data, Y = batch
        
        Y = Y.detach().cpu().numpy()

        output = model.forward(data)
        prediction = torch.sigmoid(output)
        prediction = prediction.detach().cpu().numpy()
#         import pdb; pdb.set_trace()

        targets.append(Y)
        predictions.append(prediction)
    
    targets = np.concatenate(targets)    
    predictions = np.concatenate(predictions)
    metrics = {
        "matthews_corr": matthews_corr(targets, predictions)
    }
    return metrics

def matthews_corr(targets, predictions):
    import pdb;
    pdb.set_trace()
    return matthews_corrcoef(targets, predictions)

In [None]:
%%time
end_model.train_model(dataloaders['train'], valid_data=dataloaders['dev'],
                      lr=0.0001, l2=0,
                      n_epochs=1,
                      log_train_metrics_func=[cola_metrics],
                      log_train_metrics=["matthews_corr"],
                      log_train_every=50,
                      log_valid_metrics_func=[cola_metrics],
                      log_valid_metrics=["matthews_corr"],
                      log_valid_every=100,
                      checkpoint_metric="train/loss",#'spearman_corr',
                      log_unit="batches",
                      checkpoint_metric_mode='min',
                      verbose=True, progress_bar=True
                    )

Using GPU...
Saving model at iteration 1 with best score 0.581
Saving model at iteration 3 with best score 0.572
> <ipython-input-15-959cee3a4edd>(34)matthews_corr()
-> return matthews_corrcoef(targets, predictions)


In [None]:
stsb_metrics(end_model, dataloaders['dev'])