In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import torch
from torch.utils.data import DataLoader, TensorDataset, Dataset
import numpy as np
import pandas as pd
import metal
import os
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [3]:
from dataset import BERTDataset
from dataset import STSB as STSBDataset

In [4]:
model = 'bert-base-uncased' # also try bert-base-multilingual-cased (recommended)
src_path = os.path.join(os.environ['GLUEDATA'], 'STS-B/{}.tsv')
dataloaders = {}
for split in ['train', 'dev']:
    dataset = STSBDataset(split=split, bert_model='bert-base-uncased', max_len=512)

    shuffle = True if split == "train" else False
    dataloaders[split] = dataset.get_dataloader(batch_size=32, shuffle=shuffle)

100%|██████████| 5749/5749 [00:02<00:00, 2385.60it/s]
100%|██████████| 1500/1500 [00:00<00:00, 2274.82it/s]


In [5]:
import torch.nn as nn
from metal.end_model import EndModel

class BertEncoder(nn.Module):
    def __init__(self):
        super(BertEncoder, self).__init__()
        self.bert_model = BertModel.from_pretrained('bert-base-uncased')

    def forward(self, data):
        tokens, segments, mask = data
        # TODO: check if we should return all layers or just last hidden representation 
        _, hidden_layer = self.bert_model(tokens, segments, mask)
        return hidden_layer

class STSBHead(EndModel):     
    def __init__(self, output_dims, **kwargs):
        super(STSBHead, self).__init__(output_dims, **kwargs)
        self.criteria = nn.modules.loss.MSELoss()
        
    @torch.no_grad()
    def predict_proba(self, data):
        return torch.sigmoid(self.forward(data))

    def _loss(self, data, Y):
        output = self.forward(data)
        prediction = torch.sigmoid(output)
        criteria = self.criteria.to(self.config["device"])
        loss = criteria(prediction, Y)

        return loss

    def _get_loss_fn(self):
        # This self.preprocess_Y allows us to not handle preprocessing
        # in a custom dataloader, but decreases speed a bit
        return self._loss

In [6]:
encoder_module = BertEncoder()
end_model = STSBHead(
    [768, 1],
    input_module=encoder_module,
    seed=123,
    skip_head=False,
    input_relu=False,
    input_batchnorm=False,
    verbose=False,
    device=torch.device("cuda")
)

In [7]:
from metal.utils import place_on_gpu
from scipy.stats import spearmanr, pearsonr

def stsb_metrics(model, dataloader):
    predictions = []
    targets = []
    for batch in dataloader:
        
        # HACK -- we won't need to do this moving forward
        batch = place_on_gpu(batch)
        
        data, Y = batch
        
        Y = Y.detach().cpu().numpy()

        output = model.forward(data)
        prediction = torch.sigmoid(output)
        prediction = prediction.detach().cpu().numpy()
#         import pdb; pdb.set_trace()

        targets.append(Y)
        predictions.append(prediction)
    
    targets = np.concatenate(targets)    
    predictions = np.concatenate(predictions)
    metrics = {
        "spearman_corr": spearman_corr(targets, predictions),
        "pearson_corr": pearson_corr(targets, predictions)
    }
    return metrics

        
def spearman_corr(gold, outputs):
    corr, p_value = spearmanr(gold, outputs)
    return corr

def pearson_corr(gold, outputs):
    corr, p_value = pearsonr(gold, outputs)
    return corr

In [8]:
%%time
end_model.train_model(dataloaders['train'], valid_data=dataloaders['dev'],
                      lr=0.0001, l2=0,
                      n_epochs=1,
                      log_train_metrics_func=[stsb_metrics],
                      log_train_metrics=["spearman_corr", "pearson_corr"],
                      log_train_every=50,
                      log_valid_metrics_func=[stsb_metrics],
                      log_valid_metrics=["spearman_corr", "pearson_corr"],
                      log_valid_every=100,
                      checkpoint_metric="train/loss",#'spearman_corr',
                      log_unit="batches",
                      checkpoint_metric_mode='min',
                      verbose=True, progress_bar=True
                    )

Using GPU...
Saving model at iteration 1 with best score 0.090
Saving model at iteration 5 with best score 0.089
Saving model at iteration 6 with best score 0.086
Saving model at iteration 8 with best score 0.085
Saving model at iteration 10 with best score 0.085


Exception ignored in: <generator object tqdm_notebook.__iter__ at 0x7f64198a19e8>
Traceback (most recent call last):
  File "/dfs/scratch0/vschen/venv-mmtl/lib/python3.6/site-packages/tqdm/_tqdm_notebook.py", line 226, in __iter__
    self.sp(bar_style='danger')
AttributeError: 'tqdm_notebook' object has no attribute 'sp'


KeyboardInterrupt: 

In [9]:
stsb_metrics(end_model, dataloaders['dev'])

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()