In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits to local source files are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import torch
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import metal
import os
from pytorch_pretrained_bert import BertTokenizer, BertModel
from dataset import QQPDataset, RTEDataset, WNLIDataset, MNLIDataset, MRPCDataset

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [3]:
# Build the RTE training split for the 'bert-base-uncased' model with
# max_len=128 (presumably the padded/truncated sequence length — confirm in
# dataset.py).
train_ds = RTEDataset(split='train', bert_model='bert-base-uncased', max_len=128)
# Split it into train and dev loaders with batches of 16. With split_prop=0.8
# the dev loader ends up with 498 of the 2490 examples (see the confusion
# matrix printed after training), i.e. an 80/20 split.
train_dl, dev_dl = train_ds.get_dataloader(split_prop=0.8, batch_size=16)

HBox(children=(IntProgress(value=0, max=2490), HTML(value='')))




In [4]:
# Sanity-check the first batch only: x is a tuple of three tensors (unpacked
# downstream as tokens, segments, mask) and y is the tensor of labels, which
# take the values 1 and 2 in the printed output.
for x, y in train_dl:
    print(x)
    print(y)
    break

(tensor([[  101,  5170,  6384,  ...,     0,     0,     0],
        [  101,  1996,  2343,  ...,     0,     0,     0],
        [  101,  2048, 28171,  ...,     0,     0,     0],
        ...,
        [  101,  6972,  1048,  ...,     0,     0,     0],
        [  101,  3174,  1011,  ...,     0,     0,     0],
        [  101,  1996, 10863,  ...,     0,     0,     0]]), tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]))
tensor([1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 2, 1, 2, 1, 2, 1])


In [5]:
# import torch.nn as nn

# bert_model = 'bert-base-uncased'

# class BertEncoder(nn.Module):
#     def __init__(self, dropout=0.1):
#         super(BertEncoder, self).__init__()
#         self.bert_model = BertModel.from_pretrained('bert-base-uncased')
#         self.dropout = nn.Dropout(dropout)

#     def forward(self, data):
#         tokens, segments, mask = data
#         _, hidden_layer = self.bert_model(tokens, segments, mask, output_all_encoded_layers=False)
#         hidden_layer = self.dropout(hidden_layer)

#         return hidden_layer

In [6]:
import torch.nn as nn

# Name of the pretrained BERT checkpoint. NOTE(review): the visible cells pass
# the same string literally instead of reading this variable — confirm it is
# still needed.
bert_model = 'bert-base-uncased'

In [7]:
class BertEncoder(nn.Module):
    """Pretrained BERT encoder whose two outputs are both passed through dropout."""

    def __init__(self, bert_model='bert-base-uncased', dropout=0.1, cache_dir="."):
        """Load the pretrained weights and create the dropout layer.

        Args:
            bert_model: Identifier handed to ``BertModel.from_pretrained``.
            dropout: Drop probability applied to both encoder outputs.
            cache_dir: Forwarded to ``BertModel.from_pretrained`` as ``cache_dir``.
        """
        super(BertEncoder, self).__init__()
        pretrained = BertModel.from_pretrained(bert_model, cache_dir=cache_dir)
        self.bert_model = pretrained
        self.dropout = nn.Dropout(dropout)

    def forward(self, data):
        """Encode one batch.

        Args:
            data: A 3-tuple ``(tokens, segments, mask)`` passed positionally to
                the wrapped BERT model.

        Returns:
            A pair ``(output_layer, hidden_layer)``: the two values returned by
            the BERT model, each after dropout. Presumably these are the
            final-layer token representations and the pooled output — confirm
            against the installed pytorch_pretrained_bert version.
        """
        token_ids, segment_ids, attn_mask = data
        first_out, second_out = self.bert_model(
            token_ids, segment_ids, attn_mask, output_all_encoded_layers=False
        )
        # The tuple is evaluated left to right, so dropout is applied to the
        # first output before the second.
        return self.dropout(first_out), self.dropout(second_out)

In [8]:
class LinearLayer(nn.Module):
    """Thin module wrapper around a single ``nn.Linear`` projection."""

    def __init__(self, input_size, output_size):
        """Create the affine map from ``input_size`` to ``output_size`` features."""
        super(LinearLayer, self).__init__()
        projection = nn.Linear(in_features=input_size, out_features=output_size)
        self.linear = projection

    def forward(self, x):
        """Return ``x`` passed through the wrapped linear layer."""
        projected = self.linear(x)
        return projected

In [9]:
class LinearSelfAttn(nn.Module):
    """Attention pooling over a sequence using a learned linear scorer.

    Every position of ``x`` is scored by a single linear unit, masked positions
    are excluded, and the softmax-weighted sum of ``x`` along the sequence axis
    is returned.
    """

    def __init__(self, input_size):
        """
        Args:
            input_size: Size of the last dimension of the tensors to be pooled.
        """
        super(LinearSelfAttn, self).__init__()
        self.linear = nn.Linear(input_size, 1)

    def forward(self, x, x_mask):
        """Pool ``x`` over its sequence dimension.

        Args:
            x: Tensor of shape (batch, seq_len, input_size).
            x_mask: Tensor of shape (batch, seq_len). Non-zero (or True)
                entries mark positions to ignore. Bool and integer dtypes are
                both accepted.

        Returns:
            Tensor of shape (batch, input_size).

        Note:
            A row whose positions are all masked has only ``-inf`` scores, so
            its softmax (and therefore its output) is NaN.
        """
        scores = self.linear(x).squeeze(-1)
        # Out-of-place fill keeps the operation visible to autograd (the old
        # ``.data.masked_fill_`` bypassed it). ``!= 0`` yields whichever mask
        # dtype the installed torch expects, so callers may pass byte, long or
        # bool masks.
        scores = scores.masked_fill(x_mask != 0, -float('inf'))
        alpha = torch.softmax(scores, 1)
        # (batch, 1, seq_len) @ (batch, seq_len, input_size) -> (batch, 1, input_size)
        return alpha.unsqueeze(1).bmm(x).squeeze(1)

In [10]:
class BilinearSelfAttn(nn.Module):
    """Attention pooling over ``x`` with bilinear scores against a query ``y``.

    The score of position ``t`` is ``x[:, t] . W y`` where ``W`` is a learned
    linear map; masked positions are excluded before the softmax.
    """

    def __init__(self, x_size, y_size):
        """
        Args:
            x_size: Size of the last dimension of the sequence tensor ``x``.
            y_size: Size of the last dimension of the query tensor ``y``.
        """
        super(BilinearSelfAttn, self).__init__()
        # Projects the query into the feature space of x.
        self.linear = nn.Linear(y_size, x_size)

    def forward(self, x, y, x_mask):
        """Pool ``x`` over its sequence dimension, guided by ``y``.

        Args:
            x: Tensor of shape (batch, seq_len, x_size).
            y: Tensor of shape (batch, y_size).
            x_mask: Tensor of shape (batch, seq_len). Non-zero (or True)
                entries mark positions to ignore. Bool and integer dtypes are
                both accepted.

        Returns:
            Tensor of shape (batch, x_size).

        Note:
            A row whose positions are all masked has only ``-inf`` scores, so
            its softmax (and therefore its output) is NaN.
        """
        Wy = self.linear(y)
        # (batch, seq_len, x_size) @ (batch, x_size, 1) -> (batch, seq_len)
        xWy = x.bmm(Wy.unsqueeze(2)).squeeze(2)
        # Out-of-place fill keeps the operation visible to autograd (the old
        # ``.data.masked_fill_`` bypassed it). ``!= 0`` yields whichever mask
        # dtype the installed torch expects, so callers may pass byte, long or
        # bool masks.
        xWy = xWy.masked_fill(x_mask != 0, -float('inf'))
        beta = torch.softmax(xWy, 1)
        return beta.unsqueeze(1).bmm(x).squeeze(1)

In [11]:
# class SAN(nn.Module):
#     def __init__(self, emb_size=100, hidden_size=100, num_classes=2, k=5):
#         super(SAN, self).__init__()
#         self.bert_model = BertModel.from_pretrained("bert-base-uncased")
#         self.sent1_attn = LinearSelfAttn(input_size=emb_size)
#         self.sent2_attn = BilinearSelfAttn(emb_size, emb_size)
#         self.final_linear = nn.Linear(emb_size * 4, num_classes)
#         self.rnn = rnn = nn.GRU(emb_size, hidden_size, 1, batch_first=True)
#         self.k = k
#         self.num_classes = num_classes
#         self.softmax = nn.Softmax(1)
        
#         for param in self.bert_model.parameters():
#             param.requires_grad = False

#     def forward(self, X):
# #         print("!")
#         sent1, sent1_mask, sent2, sent2_mask = X
#         batch_size = sent1.size(0)
#         sent1, _ = self.bert_model(
#             sent1, sent1_mask, 1 - sent1_mask, output_all_encoded_layers=False
#         )
#         sent2, _ = self.bert_model(
#             sent2, sent2_mask, 1 - sent2_mask, output_all_encoded_layers=False
#         )
#         res = sent1.new_zeros((batch_size, self.num_classes))
# #         res = torch.zeros((batch_size, self.num_classes))
#         sk = self.sent1_attn(sent1, sent1_mask.byte())

#         for i in range(self.k):
#             xk = self.sent2_attn(sent2, sk, sent2_mask.byte())
#             _, sk = self.rnn(xk.unsqueeze(1), sk.unsqueeze(0))
#             sk = sk.squeeze(0)

#             f = self.softmax(
#                 self.final_linear(torch.cat((sk, xk, torch.abs(sk - xk), sk * xk), 1))
#             )
#             res += f

#         return res / self.k

In [12]:
# idx_matrix, seg_matrix, mask_matrix

class SAN(nn.Module):
    """Multi-step attention classifier on top of a BERT encoder.

    The encoder output is pooled once with ``LinearSelfAttn`` to form an
    initial state. For ``k`` steps the state then attends over the encoder
    output with ``BilinearSelfAttn``, is updated by a GRU, and produces a
    softmax prediction. The ``k`` predictions are averaged.
    (Presumably modelled on the Stochastic Answer Network — confirm.)

    Note:
        ``emb_size`` and ``hidden_size`` must be equal: the pooled vector of
        size ``emb_size`` is used as the GRU's initial hidden state, and the GRU
        state is combined element-wise with vectors of size ``emb_size``.
    """

    def __init__(self, bert_model=None, emb_size=100, hidden_size=100, num_classes=2, k=5):
        """
        Args:
            bert_model: Encoder module called with the input triple and
                returning ``(sequence_output, _)``. When ``None`` a new
                ``BertEncoder()`` is created for this instance.
            emb_size: Feature size of the encoder's sequence output.
            hidden_size: Hidden size of the GRU (see the class note).
            num_classes: Number of output classes.
            k: Number of reasoning steps to average over.
        """
        super(SAN, self).__init__()
        # Build the default encoder here, not in the signature: a default of
        # ``BertEncoder()`` is evaluated once at class-definition time, which
        # loads BERT on definition and shares one encoder (and its weights)
        # between every default-constructed SAN.
        if bert_model is None:
            bert_model = BertEncoder()
        self.bert_model = bert_model
        self.sent1_attn = LinearSelfAttn(input_size=emb_size)
        self.sent2_attn = BilinearSelfAttn(emb_size, emb_size)
        self.final_linear = nn.Linear(emb_size * 4, num_classes)
        self.rnn = nn.GRU(emb_size, hidden_size, 1, batch_first=True)
        self.k = k
        self.num_classes = num_classes

    def forward(self, X):
        """Classify a batch.

        Args:
            X: Triple ``(idx_matrix, seg_matrix, mask_matrix)``, each presumably
                of shape (batch, seq_len) — token ids, segment ids (0 for the
                first sentence, 1 for the second) and attention mask (1 for
                real tokens, 0 for padding). TODO confirm against the dataset.

        Returns:
            Tensor of shape (batch, num_classes) holding the softmax
            distributions averaged over the ``k`` steps.
            NOTE(review): these are probabilities, not logits. If the consumer
            applies softmax or cross-entropy again, confirm that is intended.
        """
        idx_matrix, seg_matrix, mask_matrix = X
        batch_size = idx_matrix.size(0)

        # Call the module itself (not ``.forward``) so registered hooks run.
        output_layer, _ = self.bert_model(X)

        res = output_layer.new_zeros((batch_size, self.num_classes))

        # Non-zero mask entries are positions to ignore. ``!= 0`` produces the
        # mask dtype the installed torch expects (byte on old versions, bool on
        # new ones).
        # First-sentence attention ignores padding and second-segment tokens.
        sent1_mask = (1 - mask_matrix + seg_matrix) != 0
        # Second-sentence attention ignores every position with segment id 0.
        # The mask does not change between steps, so it is built once.
        sent2_mask = (1 - seg_matrix) != 0

        sk = self.sent1_attn(output_layer, sent1_mask)

        for _step in range(self.k):
            xk = self.sent2_attn(output_layer, sk, sent2_mask)
            # One GRU step: xk is a length-1 sequence, sk the initial hidden state.
            _, sk = self.rnn(xk.unsqueeze(1), sk.unsqueeze(0))
            sk = sk.squeeze(0)

            features = torch.cat((sk, xk, torch.abs(sk - xk), sk * xk), 1)
            res += torch.softmax(self.final_linear(features), 1)

        return res / self.k

In [13]:
# 768 matches the embedding width of bert-base-uncased, as shown in the
# printed architecture: Embedding(30522, 768). emb_size and hidden_size are
# kept equal because SAN feeds its emb_size-wide pooled vector to the GRU as
# the initial hidden state.
san = SAN(emb_size = 768, hidden_size = 768)

In [14]:
# task = Task(task_name, dataloaders, BertEncoder(), task_head)

In [15]:
from metal.end_model import EndModel
# Wrap the SAN network in a MeTaL EndModel, which provides the train_model()
# and score() calls used below.
# NOTE(review): [2] is presumably the layer output dimension (two classes), and
# skip_head=True / input_relu=False presumably make EndModel use SAN's output
# directly — confirm against the snorkel-metal EndModel documentation.
end_model = EndModel(
    [2], input_module=san, seed=123, device="cuda", skip_head=True, input_relu=False
)


Network architecture:
SAN(
  (bert_model): BertEncoder(
    (bert_model): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 768)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): BertLayerNorm()
        (dropout): Dropout(p=0.1)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=True)
                (LayerNorm): BertLayerNorm()
                (dropout): Dr

In [16]:
# Train for 3 epochs with lr=5e-5 and l2=0, using dev_dl as validation data.
# The checkpoint with the highest "valid/accuracy" is kept; the log below shows
# it being restored when training finishes. With log_unit="batches" the log
# prints one line per batch.
end_model.train_model(
    train_dl,
#     dataset["train"].get_dataloader(batch_size=32),
    valid_data=dev_dl,
#     valid_data=dataset["dev"].get_dataloader(batch_size=32),
#     dataloaders["train"],
#     valid_data=dataloaders["dev"],
    lr=5e-5,
    l2=0,
    n_epochs=3,
#     checkpoint_metric="model/train/loss",
    checkpoint_metric="valid/accuracy",
    log_unit="batches",
    checkpoint_metric_mode="max",
    verbose=True,
    progress_bar=True,
)

Using GPU...
[1 bat (0.00 epo)]: TRAIN:[loss=0.703] VALID:[accuracy=0.518]
Saving model at iteration 1 with best (max) score 0.518
[2 bat (0.00 epo)]: TRAIN:[loss=0.666] VALID:[accuracy=0.520]
Saving model at iteration 2 with best (max) score 0.520
[3 bat (0.00 epo)]: TRAIN:[loss=0.711] VALID:[accuracy=0.510]
[4 bat (0.00 epo)]: TRAIN:[loss=0.723] VALID:[accuracy=0.490]
[5 bat (0.01 epo)]: TRAIN:[loss=0.682] VALID:[accuracy=0.474]
[6 bat (0.01 epo)]: TRAIN:[loss=0.683] VALID:[accuracy=0.472]
[7 bat (0.01 epo)]: TRAIN:[loss=0.659] VALID:[accuracy=0.480]
[8 bat (0.01 epo)]: TRAIN:[loss=0.707] VALID:[accuracy=0.470]
[9 bat (0.01 epo)]: TRAIN:[loss=0.707] VALID:[accuracy=0.472]
[10 bat (0.01 epo)]: TRAIN:[loss=0.646] VALID:[accuracy=0.466]
[11 bat (0.01 epo)]: TRAIN:[loss=0.701] VALID:[accuracy=0.466]
[12 bat (0.01 epo)]: TRAIN:[loss=0.633] VALID:[accuracy=0.468]
[13 bat (0.02 epo)]: TRAIN:[loss=0.711] VALID:[accuracy=0.472]
[14 bat (0.02 epo)]: TRAIN:[loss=0.744] VALID:[accuracy=0.474]
[1

[115 bat (0.14 epo)]: TRAIN:[loss=0.570] VALID:[accuracy=0.633]
[116 bat (0.14 epo)]: TRAIN:[loss=0.648] VALID:[accuracy=0.635]
[117 bat (0.14 epo)]: TRAIN:[loss=0.538] VALID:[accuracy=0.673]
Saving model at iteration 117 with best (max) score 0.673
[118 bat (0.14 epo)]: TRAIN:[loss=0.708] VALID:[accuracy=0.643]
[119 bat (0.14 epo)]: TRAIN:[loss=0.507] VALID:[accuracy=0.645]
[120 bat (0.14 epo)]: TRAIN:[loss=0.570] VALID:[accuracy=0.653]
[121 bat (0.15 epo)]: TRAIN:[loss=0.784] VALID:[accuracy=0.653]
[122 bat (0.15 epo)]: TRAIN:[loss=0.645] VALID:[accuracy=0.647]
[123 bat (0.15 epo)]: TRAIN:[loss=0.644] VALID:[accuracy=0.649]
[124 bat (0.15 epo)]: TRAIN:[loss=0.626] VALID:[accuracy=0.639]
[125 bat (0.15 epo)]: TRAIN:[loss=0.661] VALID:[accuracy=0.629]
[126 bat (0.15 epo)]: TRAIN:[loss=0.552] VALID:[accuracy=0.629]
[127 bat (0.15 epo)]: TRAIN:[loss=0.543] VALID:[accuracy=0.629]
[128 bat (0.15 epo)]: TRAIN:[loss=0.466] VALID:[accuracy=0.629]
[129 bat (0.16 epo)]: TRAIN:[loss=0.529] VALID

[239 bat (0.29 epo)]: TRAIN:[loss=0.705] VALID:[accuracy=0.657]
[240 bat (0.29 epo)]: TRAIN:[loss=0.613] VALID:[accuracy=0.645]
[241 bat (0.29 epo)]: TRAIN:[loss=0.420] VALID:[accuracy=0.645]
[242 bat (0.29 epo)]: TRAIN:[loss=0.540] VALID:[accuracy=0.645]
[243 bat (0.29 epo)]: TRAIN:[loss=0.596] VALID:[accuracy=0.651]
[244 bat (0.29 epo)]: TRAIN:[loss=0.578] VALID:[accuracy=0.659]
[245 bat (0.30 epo)]: TRAIN:[loss=0.579] VALID:[accuracy=0.657]
[246 bat (0.30 epo)]: TRAIN:[loss=0.545] VALID:[accuracy=0.673]
[247 bat (0.30 epo)]: TRAIN:[loss=0.576] VALID:[accuracy=0.667]
[248 bat (0.30 epo)]: TRAIN:[loss=0.629] VALID:[accuracy=0.673]
[249 bat (0.30 epo)]: TRAIN:[loss=0.665] VALID:[accuracy=0.675]
[250 bat (0.30 epo)]: TRAIN:[loss=0.530] VALID:[accuracy=0.661]
[251 bat (0.30 epo)]: TRAIN:[loss=0.470] VALID:[accuracy=0.667]
[252 bat (0.30 epo)]: TRAIN:[loss=0.453] VALID:[accuracy=0.653]
[253 bat (0.30 epo)]: TRAIN:[loss=0.563] VALID:[accuracy=0.639]
[254 bat (0.31 epo)]: TRAIN:[loss=0.406]

[366 bat (0.44 epo)]: TRAIN:[loss=0.449] VALID:[accuracy=0.590]
[367 bat (0.44 epo)]: TRAIN:[loss=0.688] VALID:[accuracy=0.582]
[368 bat (0.44 epo)]: TRAIN:[loss=0.582] VALID:[accuracy=0.582]
[369 bat (0.44 epo)]: TRAIN:[loss=0.616] VALID:[accuracy=0.586]
[370 bat (0.45 epo)]: TRAIN:[loss=0.571] VALID:[accuracy=0.584]
[371 bat (0.45 epo)]: TRAIN:[loss=0.611] VALID:[accuracy=0.588]
[372 bat (0.45 epo)]: TRAIN:[loss=0.538] VALID:[accuracy=0.608]
[373 bat (0.45 epo)]: TRAIN:[loss=0.417] VALID:[accuracy=0.618]
[374 bat (0.45 epo)]: TRAIN:[loss=0.692] VALID:[accuracy=0.641]
[375 bat (0.45 epo)]: TRAIN:[loss=0.317] VALID:[accuracy=0.651]
Restoring best model from iteration 321 with score 0.691
Finished Training
Accuracy: 0.691
        y=1    y=2   
 l=1    194    112   
 l=2    42     150   


In [20]:
# Use the RTE 'dev' split as the held-out evaluation set, with the same model
# name and max_len as the training set.
test_ds = RTEDataset(split='dev', bert_model='bert-base-uncased', max_len=128)
# No split_prop here, so a single loader is returned rather than a pair.
test_dl = test_ds.get_dataloader(batch_size=16)

HBox(children=(IntProgress(value=0, max=277), HTML(value='')))




In [21]:
# Evaluate the trained end model on the held-out loader. score() prints each
# metric and a confusion matrix, and returns the metric values as a list in
# the order requested.
end_model.score(test_dl, metric=["accuracy", "precision", "recall", "f1"])

Accuracy: 0.679
Precision: 0.669
Recall: 0.774
F1: 0.717
        y=1    y=2   
 l=1    113    56    
 l=2    33     75    


[0.6787003610108303, 0.6686390532544378, 0.773972602739726, 0.7174603174603176]