In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import torch
from torch.utils.data import DataLoader, TensorDataset, Dataset
import numpy as np
import pandas as pd
import metal
import os
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [3]:
from dataset import BERTDataset

In [4]:
model = 'bert-base-uncased' # also try bert-base-multilingual-cased (recommended)
src_path = os.path.join(os.environ['GLUEDATA'], 'STS-B/{}.tsv')
dataloaders = {}
for split in ['train', 'dev']:
    dataset = BERTDataset(
        src_path.format(split),
        sent1_idx=7,
        sent2_idx=8,
        label_idx=9,
        skip_rows=1,
        label_fn=lambda x: float(x)/5, # labels are scores [1, 2, 3, 4, 5]
        max_len=512,
        label_type=float
    )

    shuffle = True if split == "train" else False
    dataloaders[split] = dataset.get_dataloader(batch_size=32, shuffle=shuffle)

100%|██████████| 5749/5749 [00:02<00:00, 2099.17it/s]
100%|██████████| 1500/1500 [00:00<00:00, 1598.94it/s]


In [None]:
import torch.nn as nn
from metal.end_model import EndModel

class BertEncoder(nn.Module):
    def __init__(self):
        super(BertEncoder, self).__init__()
        self.bert_model = BertModel.from_pretrained('bert-base-uncased')

    def forward(self, data):
        tokens, segments, mask = data
        # TODO: check if we should return all layers or just last hidden representation 
        _, hidden_layer = self.bert_model(tokens, segments, mask)
        return hidden_layer

class STSBHead(EndModel):     
    def __init__(self, output_dims, **kwargs):
        super(STSBHead, self).__init__(output_dims, **kwargs)
        self.criteria = nn.modules.loss.MSELoss()
        
    @torch.no_grad()
    def predict_proba(self, data):
        return torch.sigmoid(self.forward(data))

    def _loss(self, data, Y):
        output = self.forward(data)
        prediction = torch.sigmoid(output)
        criteria = self.criteria.to(self.config["device"])
        loss = criteria(prediction, Y)

        return loss

    def _get_loss_fn(self):
        # This self.preprocess_Y allows us to not handle preprocessing
        # in a custom dataloader, but decreases speed a bit
        return self._loss

In [None]:
encoder_module = BertEncoder()
end_model = STSBHead(
    [768, 1],
    input_module=encoder_module,
    seed=123,
    skip_head=False,
    input_relu=False,
    input_batchnorm=False,
    verbose=False,
    device=torch.device("cuda")
)

In [None]:
from metal.utils import place_on_gpu
from scipy.stats import spearmanr, pearsonr

def stsb_metrics(model, dataloader):
    predictions = []
    targets = []
    for batch in dataloader:
        
        # HACK -- we won't need to do this moving forward
        batch = place_on_gpu(batch)
        
        data, Y = batch
        
        Y = Y.detach().cpu().numpy()

        output = model.forward(data)
        prediction = torch.sigmoid(output)
        prediction = prediction.detach().cpu().numpy()
#         import pdb; pdb.set_trace()

        targets.append(Y)
        predictions.append(prediction)
    
    targets = np.concatenate(targets)    
    predictions = np.concatenate(predictions)
    metrics = {
        "spearman_corr": spearman_corr(targets, predictions),
        "pearson_corr": pearson_corr(targets, predictions)
    }
    return metrics

        
def spearman_corr(gold, outputs):
    corr, p_value = spearmanr(gold, outputs)
    return corr

def pearson_corr(gold, outputs):
    corr, p_value = pearsonr(gold, outputs)
    return corr

In [None]:
end_model.train_model(dataloaders['train'], valid_data=dataloaders['dev'],
                      lr=0.0001, l2=0,
                      n_epochs=1,
                      log_train_metrics_func=[stsb_metrics],
                      log_train_metrics=["spearman_corr", "pearson_corr"],
                      log_train_every=50,
                      log_valid_metrics_func=[stsb_metrics],
                      log_valid_metrics=["spearman_corr", "pearson_corr"],
                      log_valid_every=50,
                      checkpoint_metric="train/loss",#'spearman_corr',
                      log_unit="batches",
                      checkpoint_metric_mode='min',
                      verbose=True, progress_bar=True
                    )

Using GPU...
[1 bat (0.00 epo)]: TRAIN:[loss=0.089, spearman_corr=0.177, pearson_corr=[0.18246996]]
Saving model at iteration 1 with best score 0.089
[2 bat (0.00 epo)]: TRAIN:[loss=0.106, spearman_corr=0.300, pearson_corr=[0.36197945]]
[3 bat (0.00 epo)]: TRAIN:[loss=0.079, spearman_corr=0.274, pearson_corr=[0.30163735]]
Saving model at iteration 3 with best score 0.079
[4 bat (0.00 epo)]: TRAIN:[loss=0.086, spearman_corr=0.386, pearson_corr=[0.46158716]]
[5 bat (0.00 epo)]: TRAIN:[loss=0.075, spearman_corr=0.478, pearson_corr=[0.5733305]]
Saving model at iteration 5 with best score 0.075
[6 bat (0.00 epo)]: TRAIN:[loss=0.066, spearman_corr=0.541, pearson_corr=[0.6170989]]
Saving model at iteration 6 with best score 0.066
[7 bat (0.00 epo)]: TRAIN:[loss=0.079, spearman_corr=0.564, pearson_corr=[0.6373296]]


In [None]:
stsb_metrics(end_model, dataloaders['dev'])