In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. the local `dataset` module) are picked up without restarting
# the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import torch
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import metal
import os
from pytorch_pretrained_bert import BertTokenizer, BertModel
from dataset import QQPDataset, RTEDataset, WNLIDataset, MNLIDataset, MRPCDataset

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [3]:
# RTE 'train' split prepared for bert-base-uncased with max_len=128.
train_ds = RTEDataset(split='train', bert_model='bert-base-uncased', max_len=128)
# With split_prop set, get_dataloader returns two loaders, used below as the
# training and validation sets (presumably an 80/20 split — confirm in `dataset`).
train_dl, dev_dl = train_ds.get_dataloader(split_prop=0.8, batch_size=16)

# The RTE 'dev' split is kept apart and serves as this notebook's test set.
test_ds = RTEDataset(split='dev', bert_model='bert-base-uncased', max_len=128)
test_dl = test_ds.get_dataloader(batch_size=16)

HBox(children=(IntProgress(value=0, max=2490), HTML(value='')))




HBox(children=(IntProgress(value=0, max=277), HTML(value='')))




In [4]:
import torch.nn as nn

# NOTE(review): no visible cell reads this module-level name — the dataset and
# encoder calls each spell 'bert-base-uncased' out themselves, and the
# `bert_model` parameters of BertEncoder/SAN shadow it. Confirm it is still needed.
bert_model = 'bert-base-uncased'

In [5]:
class BertEncoder(nn.Module):
    """Pretrained BERT followed by dropout on each of its two outputs.

    Args:
        bert_model: Identifier handed to ``BertModel.from_pretrained``.
        dropout: Drop probability of the single ``nn.Dropout`` layer that is
            applied to both outputs.
        cache_dir: Forwarded unchanged as ``cache_dir`` when loading weights.
    """

    def __init__(self, bert_model='bert-base-uncased', dropout=0.1, cache_dir="."):
        super().__init__()
        self.bert_model = BertModel.from_pretrained(bert_model, cache_dir=cache_dir)
        self.dropout = nn.Dropout(dropout)

    def forward(self, data):
        """Encode one batch.

        Args:
            data: A 3-item sequence ``(tokens, segments, mask)`` passed
                positionally to the wrapped BERT model.

        Returns:
            A 2-tuple holding the two BERT outputs, each after dropout
            (presumably the final-layer token encodings and the pooled
            output — confirm against the BertModel documentation).
        """
        token_ids, segment_ids, attention_mask = data
        # Ask only for the last encoder layer instead of the full list of layers.
        encoded, pooled = self.bert_model(
            token_ids,
            segment_ids,
            attention_mask,
            output_all_encoded_layers=False,
        )
        return self.dropout(encoded), self.dropout(pooled)

In [6]:
class AverageLayer(nn.Module):
    """Parameter-free layer that divides its input by a fixed constant ``k``.

    Args:
        k: The divisor applied in ``forward`` (defaults to 5).
    """

    def __init__(self, k=5):
        super().__init__()
        self.k = k

    def forward(self, x):
        """Return ``x`` divided by ``self.k``."""
        divisor = self.k
        return x / divisor

In [7]:
class LinearSelfAttn(nn.Module):
    """Pool a sequence into one vector with learned per-position weights.

    A single linear unit scores every position of ``x``; masked positions are
    excluded, the scores are softmax-normalised over the sequence axis, and
    the weighted sum of ``x`` is returned.

    Args:
        input_size: Size of the last dimension of ``x``.
    """

    def __init__(self, input_size):
        super(LinearSelfAttn, self).__init__()
        self.linear = nn.Linear(input_size, 1)

    def forward(self, x, x_mask):
        """Compute the attention-weighted sum of ``x``.

        Args:
            x: Tensor of shape (batch, seq, input_size).
            x_mask: Tensor of shape (batch, seq); non-zero / True entries mark
                positions to exclude. Bool, uint8 and integer masks are accepted.

        Returns:
            Tensor of shape (batch, input_size). A row whose positions are all
            masked has every score at -inf, for which softmax yields NaN.
        """
        # (batch, seq, 1) -> (batch, seq)
        scores = self.linear(x).squeeze(-1)
        # Comparing against zero yields the mask dtype masked_fill expects,
        # whatever integer/byte/bool dtype the caller passed in.
        excluded = x_mask != 0
        # Out-of-place fill: editing `scores.data` in place would alter the
        # tensor behind autograd's back.
        scores = scores.masked_fill(excluded, float('-inf'))
        alpha = torch.softmax(scores, 1)
        # (batch, 1, seq) @ (batch, seq, input_size) -> (batch, input_size)
        return alpha.unsqueeze(1).bmm(x).squeeze(1)

In [8]:
class BilinearSelfAttn(nn.Module):
    """Pool a sequence ``x`` with weights from a bilinear match against ``y``.

    ``y`` is projected to the width of ``x``; the dot product between that
    projection and every position of ``x`` gives the score. Masked positions
    are excluded, the scores are softmax-normalised over the sequence axis,
    and the weighted sum of ``x`` is returned.

    Args:
        x_size: Size of the last dimension of ``x``.
        y_size: Size of the last dimension of ``y``.
    """

    def __init__(self, x_size, y_size):
        super(BilinearSelfAttn, self).__init__()
        self.linear = nn.Linear(y_size, x_size)

    def forward(self, x, y, x_mask):
        """Compute the ``y``-conditioned attention-weighted sum of ``x``.

        Args:
            x: Tensor of shape (batch, seq, x_size).
            y: Tensor of shape (batch, y_size).
            x_mask: Tensor of shape (batch, seq); non-zero / True entries mark
                positions to exclude. Bool, uint8 and integer masks are accepted.

        Returns:
            Tensor of shape (batch, x_size). A row whose positions are all
            masked has every score at -inf, for which softmax yields NaN.
        """
        Wy = self.linear(y)
        # (batch, seq, x_size) @ (batch, x_size, 1) -> (batch, seq)
        xWy = x.bmm(Wy.unsqueeze(2)).squeeze(2)
        # Comparing against zero yields the mask dtype masked_fill expects,
        # whatever integer/byte/bool dtype the caller passed in.
        excluded = x_mask != 0
        # Out-of-place fill: editing `xWy.data` in place would alter the
        # tensor behind autograd's back.
        xWy = xWy.masked_fill(excluded, float('-inf'))
        beta = torch.softmax(xWy, 1)
        # (batch, 1, seq) @ (batch, seq, x_size) -> (batch, x_size)
        return beta.unsqueeze(1).bmm(x).squeeze(1)

In [9]:
class SAN(nn.Module):
    """Multi-step classifier over an encoded sentence pair.

    The encoder output is pooled twice: once over the positions left visible
    by the first mask (initial state ``sk``) and, at each of ``k`` steps, over
    the positions left visible by the second mask conditioned on ``sk``
    (``xk``). A GRU updates ``sk`` from ``xk``, a linear layer classifies
    ``[sk, xk, |sk - xk|, sk * xk]``, and the per-step softmax outputs are
    summed.

    Args:
        bert_model: Encoder module called as ``bert_model(X)`` and expected to
            return a 2-tuple whose first item is (batch, seq, emb_size). When
            omitted, a fresh ``BertEncoder()`` is created for this instance.
        emb_size: Width of the encoder output. NOTE: the default (100) does
            not match a default ``BertEncoder`` built on bert-base-uncased
            (768 wide per the architecture printout), so pass it explicitly.
        hidden_size: GRU hidden size. ``sk`` is used both as the GRU state and
            element-wise against ``xk``, so this must equal ``emb_size``.
        num_classes: Number of output classes.
        k: Number of reasoning steps whose predictions are summed.
    """

    def __init__(self, bert_model=None, emb_size=100, hidden_size=100, num_classes=2, k=5):
        super(SAN, self).__init__()
        # Create the default encoder here rather than in the signature: a
        # default of `BertEncoder()` is evaluated once when the class is
        # defined, which loads BERT immediately and makes every
        # default-constructed SAN share (and jointly train) one encoder.
        if bert_model is None:
            bert_model = BertEncoder()
        self.bert_model = bert_model
        self.sent1_attn = LinearSelfAttn(input_size=emb_size)
        self.sent2_attn = BilinearSelfAttn(emb_size, emb_size)
        self.final_linear = nn.Linear(emb_size * 4, num_classes)
        self.rnn = nn.GRU(emb_size, hidden_size, 1, batch_first=True)
        self.k = k
        self.num_classes = num_classes

    def forward(self, X):
        """Return the sum of the ``k`` per-step class distributions.

        Args:
            X: A 3-item sequence ``(idx_matrix, seg_matrix, mask_matrix)`` of
                (batch, seq) tensors; it is also passed whole to the encoder.
                Assumes ``mask_matrix`` is 1 on real tokens and ``seg_matrix``
                is 1 on second-sentence tokens — TODO confirm against the
                dataset code.

        Returns:
            Tensor of shape (batch, num_classes) holding the *sum* (not the
            mean) of ``k`` softmax outputs; divide by ``k`` to average.
        """
        idx_matrix, seg_matrix, mask_matrix = X
        batch_size = idx_matrix.size(0)

        # Call the module itself (not `.forward`) so registered hooks run.
        output_layer, _ = self.bert_model(X)

        # Non-zero entries are the positions each attention layer must ignore.
        # Both masks are loop-invariant, so they are built once; `!= 0` gives
        # the comparison-result dtype instead of a deprecated uint8 mask.
        sent1_excluded = (1 - mask_matrix + seg_matrix) != 0
        sent2_excluded = (1 - seg_matrix) != 0

        sk = self.sent1_attn(output_layer, sent1_excluded)

        res = output_layer.new_zeros((batch_size, self.num_classes))
        for _step in range(self.k):
            xk = self.sent2_attn(output_layer, sk, sent2_excluded)
            # One GRU step: xk is a length-1 sequence, sk the previous state.
            _, sk = self.rnn(xk.unsqueeze(1), sk.unsqueeze(0))
            sk = sk.squeeze(0)

            features = torch.cat((sk, xk, torch.abs(sk - xk), sk * xk), 1)
            res = res + torch.softmax(self.final_linear(features), 1)

        return res

In [10]:
# Both sizes are 768, the embedding width shown for bert-base-uncased in the
# architecture printout further down (Embedding(30522, 768)).
san = SAN(
    emb_size=768,
    hidden_size=768,
)

In [11]:
from metal.mmtl.task import Task

# Single RTE task: `san` is passed as the third argument and an AverageLayer as
# the fourth (presumably the input module and the task head — confirm against
# metal.mmtl.task.Task).
# SAN.forward returns the *sum* of `san.k` softmax outputs, so the divisor is
# taken from `san.k` instead of relying on AverageLayer's default k happening
# to equal SAN's default k.
task = Task(
    "RTE",
    {'train': train_dl, 'valid': dev_dl, "test": test_dl},
    san,
    AverageLayer(k=san.k),
)
tasks = [task]

In [12]:
from metal.end_model import EndModel
from metal.mmtl.metal_model import MetalModel
from metal.mmtl.trainer import MultitaskTrainer

# Wrap the task list in a MetalModel and train it with the multitask trainer.
model = MetalModel(tasks, verbose=False)
trainer = MultitaskTrainer()
# NOTE(review): with checkpoint_metric left commented out, the log below reports
# "best (min) score" values equal to the TRAIN loss (e.g. 0.702), so
# checkpoint_best is tracking training loss rather than RTE validation
# accuracy — confirm that this is intended.
trainer.train_model(
    model, 
    tasks, 
    n_epochs=3, 
    lr=5e-5,
    progress_bar=False,
    # The log below prints roughly every quarter epoch, so these two values
    # appear to be expressed in epochs.
    log_every=0.25,
    score_every=0.25,
    checkpoint_best=True,
    #checkpoint_metric=task.name + "/valid/accuracy",
    #checkpoint_metric_mode="max",
    verbose=True,
#     device="cuda",
)

# trainer.train_model(
#     model,
#     tasks,
#     checkpoint_metric="model/train/loss",
#     n_epochs=1,
#     progress_bar=True
# )

Beginning train loop.
Expecting a total of _approximately_ 2000 examples and 125 batches per epoch from 1 tasks.
[0.26 epo]: TRAIN:[loss=0.702] VALID:[RTE/accuracy=0.560]
Saving model at iteration 0.256 with best (min) score 0.702
[0.51 epo]: TRAIN:[loss=0.683] VALID:[RTE/accuracy=0.546]
Saving model at iteration 0.512 with best (min) score 0.683
[0.77 epo]: TRAIN:[loss=0.675] VALID:[RTE/accuracy=0.610]
Saving model at iteration 0.768 with best (min) score 0.675
[1.02 epo]: TRAIN:[loss=0.650] VALID:[RTE/accuracy=0.649]
Saving model at iteration 1.024 with best (min) score 0.650
[1.28 epo]: TRAIN:[loss=0.555] VALID:[RTE/accuracy=0.639]
Saving model at iteration 1.28 with best (min) score 0.555
[1.54 epo]: TRAIN:[loss=0.575] VALID:[RTE/accuracy=0.610]
[1.79 epo]: TRAIN:[loss=0.584] VALID:[RTE/accuracy=0.612]
[2.05 epo]: TRAIN:[loss=0.566] VALID:[RTE/accuracy=0.624]
[2.30 epo]: TRAIN:[loss=0.456] VALID:[RTE/accuracy=0.629]
Saving model at iteration 2.304 with best (min) score 0.456
[2.56 

In [13]:
from metal.end_model import EndModel
# NOTE(review): `san` is the same module instance that was handed to the Task
# trained in the MetalModel cell above, so when the notebook runs top to bottom
# this EndModel presumably starts from already fine-tuned weights (unless
# MetalModel copies its modules) — the first logged batch below already shows
# loss=0.008 / accuracy=0.633. Build a fresh SAN here for an independent run.
# NOTE(review): device is hard-coded to "cuda", while the multitask cell leaves
# its device argument commented out.
end_model = EndModel(
    [2], input_module=san, seed=123, device="cuda", skip_head=True, input_relu=False
)


Network architecture:
SAN(
  (bert_model): BertEncoder(
    (bert_model): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 768)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): BertLayerNorm()
        (dropout): Dropout(p=0.1)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=True)
                (LayerNorm): BertLayerNorm()
                (dropout): Dr

In [14]:
# Train end_model on the RTE train loader, validating on dev_dl and
# checkpointing on the maximum "valid/accuracy".
# NOTE(review): with log_unit="batches" the output below contains a TRAIN/VALID
# line for every single batch, which floods the notebook; consider a coarser
# logging/scoring interval.
end_model.train_model(
    train_dl,
#     dataset["train"].get_dataloader(batch_size=32),
    valid_data=dev_dl,
#     valid_data=dataset["dev"].get_dataloader(batch_size=32),
#     dataloaders["train"],
#     valid_data=dataloaders["dev"],
    lr=5e-5,
    l2=0,
    n_epochs=3,
#     checkpoint_metric="model/train/loss",
    checkpoint_metric="valid/accuracy",
    log_unit="batches",
    checkpoint_metric_mode="max",
    verbose=True,
    progress_bar=True,
)

Using GPU...
[1 bat (0.00 epo)]: TRAIN:[loss=0.008] VALID:[accuracy=0.633]
Saving model at iteration 1 with best (max) score 0.633
[2 bat (0.00 epo)]: TRAIN:[loss=0.858] VALID:[accuracy=0.592]
[3 bat (0.00 epo)]: TRAIN:[loss=0.990] VALID:[accuracy=0.590]
[4 bat (0.00 epo)]: TRAIN:[loss=1.456] VALID:[accuracy=0.600]
[5 bat (0.01 epo)]: TRAIN:[loss=0.209] VALID:[accuracy=0.637]
Saving model at iteration 5 with best (max) score 0.637
[6 bat (0.01 epo)]: TRAIN:[loss=0.515] VALID:[accuracy=0.641]
Saving model at iteration 6 with best (max) score 0.641
[7 bat (0.01 epo)]: TRAIN:[loss=0.008] VALID:[accuracy=0.637]
[8 bat (0.01 epo)]: TRAIN:[loss=0.320] VALID:[accuracy=0.612]
[9 bat (0.01 epo)]: TRAIN:[loss=1.517] VALID:[accuracy=0.582]
[10 bat (0.01 epo)]: TRAIN:[loss=0.398] VALID:[accuracy=0.586]
[11 bat (0.01 epo)]: TRAIN:[loss=1.125] VALID:[accuracy=0.580]
[12 bat (0.01 epo)]: TRAIN:[loss=0.334] VALID:[accuracy=0.598]
[13 bat (0.02 epo)]: TRAIN:[loss=1.017] VALID:[accuracy=0.620]
[14 bat (

[127 bat (0.15 epo)]: TRAIN:[loss=0.165] VALID:[accuracy=0.635]
[128 bat (0.15 epo)]: TRAIN:[loss=0.135] VALID:[accuracy=0.627]
[129 bat (0.16 epo)]: TRAIN:[loss=0.222] VALID:[accuracy=0.631]
[130 bat (0.16 epo)]: TRAIN:[loss=0.183] VALID:[accuracy=0.637]
[131 bat (0.16 epo)]: TRAIN:[loss=0.146] VALID:[accuracy=0.635]
[132 bat (0.16 epo)]: TRAIN:[loss=0.103] VALID:[accuracy=0.635]
[133 bat (0.16 epo)]: TRAIN:[loss=0.195] VALID:[accuracy=0.635]
[134 bat (0.16 epo)]: TRAIN:[loss=0.154] VALID:[accuracy=0.637]
[135 bat (0.16 epo)]: TRAIN:[loss=0.098] VALID:[accuracy=0.639]
[136 bat (0.16 epo)]: TRAIN:[loss=0.221] VALID:[accuracy=0.639]
[137 bat (0.17 epo)]: TRAIN:[loss=0.298] VALID:[accuracy=0.641]
[138 bat (0.17 epo)]: TRAIN:[loss=0.395] VALID:[accuracy=0.643]
[139 bat (0.17 epo)]: TRAIN:[loss=0.077] VALID:[accuracy=0.641]
[140 bat (0.17 epo)]: TRAIN:[loss=0.084] VALID:[accuracy=0.645]
[141 bat (0.17 epo)]: TRAIN:[loss=0.076] VALID:[accuracy=0.645]
[142 bat (0.17 epo)]: TRAIN:[loss=0.251]

[253 bat (0.30 epo)]: TRAIN:[loss=0.045] VALID:[accuracy=0.637]
[254 bat (0.31 epo)]: TRAIN:[loss=0.045] VALID:[accuracy=0.645]
[255 bat (0.31 epo)]: TRAIN:[loss=0.077] VALID:[accuracy=0.645]
[256 bat (0.31 epo)]: TRAIN:[loss=0.064] VALID:[accuracy=0.645]
[257 bat (0.31 epo)]: TRAIN:[loss=0.226] VALID:[accuracy=0.647]
[258 bat (0.31 epo)]: TRAIN:[loss=0.261] VALID:[accuracy=0.643]
[259 bat (0.31 epo)]: TRAIN:[loss=0.267] VALID:[accuracy=0.639]
[260 bat (0.31 epo)]: TRAIN:[loss=0.102] VALID:[accuracy=0.637]
[261 bat (0.31 epo)]: TRAIN:[loss=0.054] VALID:[accuracy=0.631]
[262 bat (0.32 epo)]: TRAIN:[loss=0.068] VALID:[accuracy=0.635]
[263 bat (0.32 epo)]: TRAIN:[loss=0.115] VALID:[accuracy=0.633]
[264 bat (0.32 epo)]: TRAIN:[loss=0.068] VALID:[accuracy=0.637]
[265 bat (0.32 epo)]: TRAIN:[loss=0.070] VALID:[accuracy=0.637]
[266 bat (0.32 epo)]: TRAIN:[loss=0.017] VALID:[accuracy=0.641]
[267 bat (0.32 epo)]: TRAIN:[loss=0.259] VALID:[accuracy=0.635]
[268 bat (0.32 epo)]: TRAIN:[loss=0.264]

Exception ignored in: <generator object tqdm_notebook.__iter__ at 0x7f9f213e8ba0>
Traceback (most recent call last):
  File "/dfs/scratch0/vschen/venv-mmtl/lib/python3.6/site-packages/tqdm/_tqdm_notebook.py", line 226, in __iter__
    self.sp(bar_style='danger')
AttributeError: 'tqdm_notebook' object has no attribute 'sp'


KeyboardInterrupt: 