In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import torch
from torch.utils.data import DataLoader, TensorDataset, Dataset
import numpy as np
import pandas as pd
import metal
import os
from pytorch_pretrained_bert import BertTokenizer, BertModel

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [3]:
from dataset import BERTDataset

In [4]:
model = 'bert-base-uncased' # also try bert-base-multilingual-cased (recommended)
src_path = os.path.join(os.environ['GLUEDATA'], 'QQP/{}.tsv')
dataloaders = {}
for split in ['train', 'dev']: #, 'train', 'test']:
    label_idx = 5 if split in ['train', 'dev'] else -1
    dataset = BERTDataset(
        src_path.format(split),
        sent1_idx=3,
        sent2_idx=4,
        label_idx=label_idx,
        skip_rows=1,
        label_fn=lambda label: 1 if label=='0' else 2,
        max_len=128,
    )
    dataloaders[split] = dataset.get_dataloader(batch_size=32)

100%|██████████| 635/635 [00:00<00:00, 1426.30it/s]
100%|██████████| 71/71 [00:00<00:00, 906.87it/s]


In [5]:
import torch.nn as nn
from metal.end_model import EndModel

hidden_dropout_prob = 0.1

class BertEncoder(nn.Module):
    def __init__(self):
        super(BertEncoder, self).__init__()
        self.bert_model = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(hidden_dropout_prob)
#         for param in self.bert_model.parameters():
#             param.requires_grad = False

    def forward(self, data):
        tokens, segments, mask = data
        # TODO: check if we should return all layers or just last hidden representation 
        _, hidden_layer = self.bert_model(tokens, segments, mask, output_all_encoded_layers=False)
        hidden_layer = self.dropout(hidden_layer)

        return hidden_layer

In [6]:
encoder_module = BertEncoder()
end_model = EndModel(
    [768, 2],
    input_module=encoder_module,
    seed=123,
    skip_head=False,
    input_relu=False,
    input_batchnorm=False,
    verbose=False,
    device=torch.device("cuda")
)

In [7]:
end_model.train_model(
    dataloaders["train"],
    valid_data=dataloaders["dev"],
    lr=5e-5,
    l2=0,
    n_epochs=5,
#     checkpoint_metric="model/train/loss",
    checkpoint_metric="valid/accuracy",
    log_unit="batches",
    checkpoint_metric_mode="max",
    verbose=True,
    progress_bar=True,
)

Using GPU...
[1 bat (0.00 epo)]: TRAIN:[loss=0.741] VALID:[accuracy=0.986]
Saving model at iteration 1 with best score 0.986
[2 bat (0.01 epo)]: TRAIN:[loss=0.629] VALID:[accuracy=0.986]
[3 bat (0.01 epo)]: TRAIN:[loss=0.500] VALID:[accuracy=0.986]
[4 bat (0.02 epo)]: TRAIN:[loss=0.356] VALID:[accuracy=0.986]
[5 bat (0.02 epo)]: TRAIN:[loss=0.251] VALID:[accuracy=0.986]
[6 bat (0.03 epo)]: TRAIN:[loss=0.166] VALID:[accuracy=0.986]
[7 bat (0.03 epo)]: TRAIN:[loss=0.108] VALID:[accuracy=0.986]
[8 bat (0.04 epo)]: TRAIN:[loss=0.063] VALID:[accuracy=0.986]
[9 bat (0.04 epo)]: TRAIN:[loss=0.047] VALID:[accuracy=0.986]
[10 bat (0.05 epo)]: TRAIN:[loss=0.034] VALID:[accuracy=0.986]
[11 bat (0.05 epo)]: TRAIN:[loss=0.028] VALID:[accuracy=0.986]
[12 bat (0.06 epo)]: TRAIN:[loss=0.019] VALID:[accuracy=0.986]
[13 bat (0.06 epo)]: TRAIN:[loss=0.013] VALID:[accuracy=0.986]
[14 bat (0.07 epo)]: TRAIN:[loss=0.010] VALID:[accuracy=0.986]
[15 bat (0.07 epo)]: TRAIN:[loss=0.009] VALID:[accuracy=0.986]
[

In [8]:
# Test end model
end_model.score(dataloaders["dev"], metric=["accuracy", "precision", "recall", "f1"])

Accuracy: 0.986
Precision: 0.000
Recall: 0.000
F1: 0.000
        y=1    y=2   
 l=1     0      1    
 l=2     0     70    


[0.9859154929577465, 0, 0.0, 0]