In [43]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [69]:
import os

import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

from metal.end_model import EndModel
from metal.mmtl.utils.dataset_utils import get_all_dataloaders
from pytorch_pretrained_bert import BertTokenizer
from metal.mmtl.modules import BertEncoder
from metal.mmtl.metal_model import MetalModel
from metal.mmtl.scorer import Scorer
from metal.mmtl.task import Task
from metal.mmtl.trainer import MultitaskTrainer

### Config

In [45]:
bert_model = 'bert-base-uncased'
bert_model_output_shape = 768
max_len = 512
batch_size = 16
split_prop = 0.8

In [46]:
dataloaders = get_all_dataloaders(
    "QNLIR", bert_model,
    split_prop=split_prop,
    max_len=max_len,
    max_datapoints=200,
    dl_kwargs={'batch_size': batch_size}
)

Loading QNLIR Dataset








In [47]:
tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=True)

In [64]:
for x, y in dataloaders['valid']:
    break

In [68]:
k = np.random.randint(int(x[0].shape[0]/2))
print(' '.join(tokenizer.convert_ids_to_tokens(x[0][2*k].numpy())).replace('[PAD]', ''))
print()
print(' '.join(tokenizer.convert_ids_to_tokens(x[0][2*k+1].numpy())).replace('[PAD]', ''))

[CLS] a movement broken up further is called what ? [SEP] longer works are often divided into self - contained pieces , called movements , often with contrasting characters or moods . [SEP]                                                         

[CLS] a movement broken up further is called what ? [SEP] these movements can then be further broken down into a hierarchy of smaller units : first sections , then periods , and finally phrases . [SEP]                                                     


In [70]:
trainer_config = {
    "verbose": True,
    "device": "cuda",
    "loss_fn_reduction": "mean",
    "progress_bar": True,
    #"data_loader_config": {"batch_size": 32, "num_workers": 1, "shuffle": True}, ## TODO? 
    "n_epochs": 1,
    # 'grad_clip': 1.0,  ## TODO? 
    "l2": 0.01,
    "optimizer_config": {
        "optimizer": "adam",
        "optimizer_common": {"lr": 1e-5},
        "adam_config": {"betas": (0.9, 0.999)},
    },
    "lr_scheduler": "exponential", # reduce_on_plateau  ## TODO? Warmup
    "lr_scheduler_config": {
        "lr_freeze": 0,
        # Scheduler - exponential
        "exponential_config": {"gamma": 0.9},  # decay rate
        # Scheduler - reduce_on_plateau
        "plateau_config": {
            "factor": 0.5,
            "patience": 10,
            "threshold": 0.0001,
            "min_lr": 1e-4,
        },
    },
    # Logger (see metal/logging/logger.py for descriptions)
    "logger": True,
    "logger_config": {
        "log_unit": "epochs",  # ['seconds', 'examples', 'batches', 'epochs']
        "log_every": 0.05,
        "score_every": 0.1,
    },# Checkpointer (see metal/logging/checkpointer.py for descriptions)
    "checkpoint": True,  # If True, checkpoint models when certain conditions are met
    "checkpoint_config": {
        "checkpoint_every": 0,  # Save a model checkpoint every this many log_units
        "checkpoint_best": True,
        # "checkpoint_final": False,  # Save a model checkpoint at the end of training
        "checkpoint_metric": "ranking/valid/accuracy",
        "checkpoint_metric_mode": "max",
        "checkpoint_dir": f"{os.environ['METALHOME']}/checkpoints/qnli_single",
        "checkpoint_runway": 0,
    },
}

In [71]:
bert_encoder = BertEncoder(bert_model)

In [6]:
from functools import partial
from typing import Callable, List
import torch.nn.functional as F

ranking_head = nn.Linear(in_features=bert_model_output_shape, out_features=2, bias=False)
ranking_task = Task(
    name="ranking",
    data_loaders=dataloaders, 
    input_module=bert_encoder,
    head_module=ranking_head,
    scorer=Scorer(standard_metrics=["accuracy"]),
    loss_hat_func= lambda X, Y: F.cross_entropy(X, Y - 1),
    output_hat_func=partial(F.softmax, dim=1)
)

In [9]:
tasks = [ranking_task]
model = MetalModel(tasks, verbose=False)
trainer = MultitaskTrainer()
trainer.train_model(
    model,
    tasks,
    **trainer_config)

Using GPU...


[0.050521691378363535 epo]: TRAIN:[loss=0.641]
{'ranking/loss': 0.6414394093596417, 'train/loss': 0.6414394093596417}
ranking/valid/accuracy
[0.10104338275672707 epo]: TRAIN:[loss=0.523] VALID:[ranking/accuracy=0.813]
{'ranking/loss': 0.5226242652405864, 'train/loss': 0.5226242652405864, 'ranking/valid/accuracy': 0.8125572030020135}
ranking/valid/accuracy
hello
Saving model at iteration 0.10104338275672707 with best (max) score 0.813
[0.1515650741350906 epo]: TRAIN:[loss=0.466]
{'ranking/loss': 0.46633189868019975, 'train/loss': 0.46633189868019975}
ranking/valid/accuracy

Restoring best model from iteration 0.10104338275672707 with score 0.813
Finished Training
{'ranking/valid/accuracy': 0.8125572030020135}


In [17]:
for (X, Y) in dataloaders['dev']:
    X = [x.cuda() for x in X]
    print(model(X, ['ranking']))
    print(model.calculate_loss(X, Y.cuda(), ['ranking']))    
    print(model.calculate_output(X, ['ranking']))    
    break

{'ranking': tensor([[ 0.3899, -0.5400],
        [ 0.4240, -0.5404],
        [ 0.3580, -0.6234],
        [ 0.4644, -0.6587],
        [-0.8041,  1.6927],
        [-0.4173,  0.9388],
        [-0.3887,  0.4791],
        [ 0.4736, -0.7310],
        [-0.5424,  0.5344],
        [-0.2840,  1.0988],
        [-0.3985,  0.9711],
        [-0.5806,  1.2527],
        [ 0.7220, -0.9857],
        [ 0.2798, -0.3332],
        [ 0.3106, -0.4223],
        [-0.4811,  1.6733]], device='cuda:0', grad_fn=<MmBackward>)}
{'ranking': tensor(0.3953, device='cuda:0', grad_fn=<NllLossBackward>)}
{'ranking': tensor([[0.7171, 0.2829],
        [0.7240, 0.2760],
        [0.7274, 0.2726],
        [0.7546, 0.2454],
        [0.0761, 0.9239],
        [0.2049, 0.7951],
        [0.2957, 0.7043],
        [0.7693, 0.2307],
        [0.2541, 0.7459],
        [0.2006, 0.7994],
        [0.2027, 0.7973],
        [0.1378, 0.8622],
        [0.8465, 0.1535],
        [0.6486, 0.3514],
        [0.6754, 0.3246],
        [0.1039, 0.8961]]

In [12]:
import pandas as pd

In [32]:
df = pd.read_csv('/dfs/scratch0/bradenjh/glue/QNLIR/old_train.tsv', sep='\t', error_bad_lines=False)

In [33]:
df.head()

Unnamed: 0.1,Unnamed: 0,index,question,sentence,label
0,58116,59052,'Melting pot' was first used to describe neigh...,"The term ""melting pot"" was first coined to des...",entailment
1,16326,16673,'Melting pot' was first used to describe neigh...,"Throughout its history, the city has been a ma...",not_entailment
2,10592,10779,"*Slověninъ, plural *Slověne, is the Slavic aut...","In the 6th century AD Procopius, writing in By...",not_entailment
3,36607,37249,"*Slověninъ, plural *Slověne, is the Slavic aut...",The Slavic autonym is reconstructed in Proto-S...,entailment
4,87597,89033,.What was the title of Silius Italicus' epic i...,The Augustan poet Ovid parodies the opening li...,not_entailment


In [34]:
df.columns

Index(['Unnamed: 0', 'index', 'question', 'sentence', 'label'], dtype='object')

In [35]:
df = df.drop(columns=df.columns[0])

In [36]:
df.head()

Unnamed: 0,index,question,sentence,label
0,59052,'Melting pot' was first used to describe neigh...,"The term ""melting pot"" was first coined to des...",entailment
1,16673,'Melting pot' was first used to describe neigh...,"Throughout its history, the city has been a ma...",not_entailment
2,10779,"*Slověninъ, plural *Slověne, is the Slavic aut...","In the 6th century AD Procopius, writing in By...",not_entailment
3,37249,"*Slověninъ, plural *Slověne, is the Slavic aut...",The Slavic autonym is reconstructed in Proto-S...,entailment
4,89033,.What was the title of Silius Italicus' epic i...,The Augustan poet Ovid parodies the opening li...,not_entailment


In [37]:
df.to_csv('/dfs/scratch0/bradenjh/glue/QNLIR/train.tsv', sep='\t', index=False)

In [38]:
import codecs
data_file = '/dfs/scratch0/bradenjh/glue/QNLI/train.tsv'
with codecs.open(data_file, "r", "utf-8") as data_fh:
    print(data_fh.readline())
    print(data_fh.readline())
    s1=data_fh.readline()

index	question	sentence	label

0	When did the third Digimon series begin?	Unlike the two seasons before it and most of the seasons that followed, Digimon Tamers takes a darker and more realistic approach to its story featuring Digimon who do not reincarnate after their deaths and more complex character development in the original Japanese.	not_entailment



In [40]:
import codecs
data_file = '/dfs/scratch0/bradenjh/glue/QNLIR/train.tsv'
with codecs.open(data_file, "r", "utf-8") as data_fh:
    print(data_fh.readline())
    print(data_fh.readline())
    s2=data_fh.readline()

index	question	sentence	label

59052	'Melting pot' was first used to describe neighborhoods in what area of the city?	"The term ""melting pot"" was first coined to describe densely populated immigrant neighborhoods on the Lower East Side."	entailment



In [41]:
s1

'1\tWhich missile batteries often have individual launchers several kilometres from one another?\tWhen MANPADS is operated by specialists, batteries may have several dozen teams deploying separately in small sections; self-propelled air defence guns may deploy in pairs.\tnot_entailment\n'

In [42]:
s2

"16673\t'Melting pot' was first used to describe neighborhoods in what area of the city?\tThroughout its history, the city has been a major port of entry for immigrants into the United States; more than 12 million European immigrants were received at Ellis Island between 1892 and 1924.\tnot_entailment\n"