In [1]:
# Enable IPython's autoreload extension in mode 2, so imported project modules
# are reloaded before each cell executes and source edits are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Tasks to load. Only COLA is enabled for this notebook; the other candidate
# names that were left disabled are: SST2, MNLI, RTE, WNLI, QQP, MRPC, STSB,
# QNLI, SPACY_POS, SPACY_NER, THIRD, BLEU.
task_names = ["COLA"]

# Base name for artifacts of this experiment (not referenced by the cells
# visible in this notebook section).
FILENAME = "COLA_tasks_and_payloads"

In [3]:
# Random seed handed to MetalModel(...) and MultitaskTrainer(...) further down.
SEED = 1

## Load previously trained model
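Hint: make sure the `bert_model` passed when creating the tasks matches the architecture of the checkpoint being loaded (here `bert-large-cased` for a `bert_large` checkpoint); otherwise the saved weights will not line up with the model.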

In [4]:
%%time
from metal.mmtl.glue.glue_tasks import create_tasks_and_payloads

# Create tasks and payloads
tasks, payloads = create_tasks_and_payloads(
    task_names,
    dl_kwargs={"batch_size": 16},
#     freeze_bert=True,
    bert_model='bert-large-cased'
)

Using random seed: 590343
Loading COLA Dataset


HBox(children=(IntProgress(value=0, max=8551), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1043), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1063), HTML(value='')))


CPU times: user 26.7 s, sys: 7.35 s, total: 34 s
Wall time: 31 s


In [5]:
# Display the created task list and payload list to confirm what was built.
tasks, payloads

([ClassificationTask(name=COLA, loss_multiplier=1.0)],
 [Payload(COLA_train: tasks=[COLA], split=train),
  Payload(COLA_valid: tasks=[COLA], split=valid),
  Payload(COLA_test: tasks=[COLA], split=test)])

In [6]:
from metal.mmtl.metal_model import MetalModel

# Build the multi-task model from the task list created above, seeded with
# SEED and with verbose output turned off. The weights at this point are not
# the fine-tuned ones; those are loaded from a checkpoint in the next cell.
model = MetalModel(tasks, seed=SEED, verbose=False)

In [7]:
import os
import torch

# Directory of the previously trained multi-task run whose best checkpoint is
# restored into `model`.
# NOTE(review): this is an absolute, machine-specific path; point it at your
# own checkpoint directory (or make it configurable) before re-running.
model_dir = '/dfs/scratch0/chami/metal/metal/mmtl/aws/output/2019_03_14_01_58_14/0/logdir/bert_large/QNLI.STSB.MRPC.QQP.RTE.MNLI.SST2.COLA.WNLI_09_15_09'
model_path = os.path.join(model_dir, 'best_model.pth')

# Map the checkpoint onto the first GPU when one is available, otherwise onto
# the CPU. Hard-coding "cuda:0" makes torch.load fail on CPU-only machines.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# NOTE: torch.load unpickles the file, so only load checkpoints you trust.
# strict=False tolerates key mismatches between the checkpoint and `model`;
# any parameter without a matching key silently keeps its current value, which
# is why the next cell sanity-checks the validation score.
model.load_state_dict(torch.load(model_path, map_location=device)["model"], strict=False)

#### Sanity check that task head is trained!

In [8]:
# Score the restored model on payloads[1], the COLA validation payload.
model.score(payloads[1])

{'COLA/COLA_valid/accuracy': 0.8465963566634708,
 'COLA/COLA_valid/matthews_corr': 0.6313245338647664}

## Define slices for evaluation

In [9]:
%%time
from metal.mmtl.glue.glue_tasks import create_tasks_and_payloads

# define slices
slice_dict = {  # A map of the slices that apply to each task
   "COLA": ["ends_with_question_mark"]
}

# Create tasks and payloads
_, payloads_slice = create_tasks_and_payloads(
    task_names,
    dl_kwargs={"batch_size": 16},
    slice_dict=slice_dict,
#     freeze_bert=True,
    bert_model='bert-large-cased'
)

Using random seed: 128037
Loading COLA Dataset


HBox(children=(IntProgress(value=0, max=8551), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1043), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1063), HTML(value='')))


Added label_set with 8551 labels for task COLA:ends_with_question_mark to payload COLA_train.
Added label_set with 1043 labels for task COLA:ends_with_question_mark to payload COLA_valid.
Added label_set with 1063 labels for task COLA:ends_with_question_mark to payload COLA_test.
CPU times: user 21.5 s, sys: 7.13 s, total: 28.6 s
Wall time: 29.6 s


In [10]:
# Display the slice-aware payloads; each should list the slice task next to COLA.
payloads_slice

[Payload(COLA_train: tasks=[COLA,COLA:ends_with_question_mark], split=train),
 Payload(COLA_valid: tasks=[COLA,COLA:ends_with_question_mark], split=valid),
 Payload(COLA_test: tasks=[COLA,COLA:ends_with_question_mark], split=test)]

#### Sanity check the number of examples in the train set

In [11]:
import numpy as np
def count_num_labels(labels):
    """Return the number of nonzero entries in ``labels``.

    Args:
        labels: Array-like of label values (anything ``np.array`` accepts).

    Returns:
        The count of entries that are not equal to 0, as a NumPy integer.

    NOTE(review): presumably a label of 0 marks an inactive/abstained example,
    so this counts the examples that actually carry a label; confirm against
    the dataset's label convention.
    """
    label_array = np.array(labels)
    is_nonzero = label_array != 0
    return is_nonzero.sum()

In [12]:
# payloads_slice[0] is the training payload; pull the dataset behind its loader.
dataset = payloads_slice[0].data_loader.dataset

# Report, for every label set on the training data, how many labels are nonzero.
for labelset_name in dataset.labels:
    labels = dataset.labels[labelset_name]
    print(labelset_name, count_num_labels(labels))

COLA 8551
COLA:ends_with_question_mark 615


### Evaluate baseline model on the slice of interest

In [13]:
# Baseline before fine-tuning: score the restored model on the slice-aware
# validation payload, which reports metrics for COLA and for the slice.
model.score(payloads_slice[1])

Evaluating 57 / 1043 active labels


{'COLA/COLA_valid/accuracy': 0.8465963566634708,
 'COLA/COLA_valid/matthews_corr': 0.6313245338647664,
 'COLA:ends_with_question_mark/COLA_valid/accuracy': 0.7543859649122807,
 'COLA:ends_with_question_mark/COLA_valid/matthews_corr': 0.4818181818181818}

## Finetune model on slice of interest

In [14]:
from metal.mmtl.metal_model import MetalModel

from metal.mmtl.trainer import MultitaskTrainer
# Trainer used for the fine-tuning run below, seeded with the notebook-wide SEED.
# NOTE(review): MetalModel is already imported in an earlier cell; the
# re-import above is redundant but harmless.
trainer = MultitaskTrainer(seed=SEED)

In [15]:
# Optional variant, left disabled: fine-tune on the slice labels only by
# dropping the original 'COLA' task from every payload. The run below keeps
# this disabled, so the payloads still carry both COLA and the slice task.
# NOTE: if enabled, this mutates `payloads_slice` in place and is not
# idempotent (a second run raises because 'COLA' is already removed).
# for p in payloads_slice:
#     p.task_names.remove('COLA')
# payloads_slice

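NOTE: We are training on a different set of payloads than the model was initialized with: the model was built from the tasks created alongside `payloads` (COLA only), whereas `payloads_slice` additionally carries the `COLA:ends_with_question_mark` label set.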

In [16]:
# Fine-tune the restored model on the slice-aware payloads for one epoch.
trainer.train_model(
    model,
    payloads_slice,
    # Checkpoint on the slice's validation Matthews correlation (higher is better).
    checkpoint_metric="COLA:ends_with_question_mark/COLA_valid/matthews_corr",
    checkpoint_metric_mode="max",
    # Fixed typo: this keyword was spelled `checkoint_best`, so the intended
    # "keep the best checkpoint" setting was never passed under its real name.
    checkpoint_best=True,
    writer="tensorboard",
    optimizer="adamax",
    lr=3e-5,
    l2=1e-3,
    # Both values are fractions of an epoch, as the "[0.05 epo]" /
    # "[0.10 epo]" log lines show: log every 0.05 epoch, score every 0.1 epoch.
    log_every=0.05,
    score_every=0.1,
    n_epochs=1,
    progress_bar=True,
    # Also keep a best checkpoint per task (the log shows separate
    # "Saving model ..." lines for COLA and for the slice task).
    checkpoint_tasks=True,
)

Beginning train loop.
Expecting a total of approximately 8560 examples and 535 batches per epoch from 1 payload(s) in the train split.
Writing config to /dfs/scratch0/vschen/metal-mmtl/logs/2019_03_17/20_39_08/config.json




HBox(children=(IntProgress(value=0, max=535), HTML(value='')))

[0.05 epo]: COLA:[train/loss=2.20e-01] COLA:ends_with_question_mark:[train/loss=2.75e-01] model:[train/loss=2.24e-01, train/lr=3.00e-05]
Evaluating 57 / 1043 active labels
[0.10 epo]: COLA:[train/loss=3.46e-01, COLA_valid/accuracy=8.35e-01, COLA_valid/matthews_corr=5.96e-01] COLA:ends_with_question_mark:[train/loss=4.73e-01, COLA_valid/accuracy=6.84e-01, COLA_valid/matthews_corr=3.07e-01] model:[train/loss=3.55e-01, train/lr=3.00e-05, valid/glue=5.96e-01]
Saving model at iteration 0.10 with best (max) score COLA/COLA_valid/matthews_corr=0.596




Saving model at iteration 0.10 with best (max) score COLA:ends_with_question_mark/COLA_valid/matthews_corr=0.307
[0.15 epo]: COLA:[train/loss=2.53e-01] COLA:ends_with_question_mark:[train/loss=4.18e-01] model:[train/loss=2.69e-01, train/lr=3.00e-05]
Evaluating 57 / 1043 active labels
[0.20 epo]: COLA:[train/loss=3.48e-01, COLA_valid/accuracy=8.43e-01, COLA_valid/matthews_corr=6.22e-01] COLA:ends_with_question_mark:[train/loss=3.26e-01, COLA_valid/accuracy=7.37e-01, COLA_valid/matthews_corr=4.40e-01] model:[train/loss=3.47e-01, train/lr=3.00e-05, valid/glue=6.22e-01]
Saving model at iteration 0.20 with best (max) score COLA/COLA_valid/matthews_corr=0.622
Saving model at iteration 0.20 with best (max) score COLA:ends_with_question_mark/COLA_valid/matthews_corr=0.440
[0.25 epo]: COLA:[train/loss=3.61e-01] COLA:ends_with_question_mark:[train/loss=6.54e-01] model:[train/loss=3.81e-01, train/lr=3.00e-05]
Evaluating 57 / 1043 active labels
[0.30 epo]: COLA:[train/loss=2.12e-01, COLA_valid/acc

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


Evaluating 53 / 1063 active labels
{'COLA/COLA_test/accuracy': 0.0,
 'COLA/COLA_test/matthews_corr': 0.0,
 'COLA/COLA_train/accuracy': 0.9401239621096947,
 'COLA/COLA_train/matthews_corr': 0.854402919502345,
 'COLA/COLA_valid/accuracy': 0.8312559923298178,
 'COLA/COLA_valid/matthews_corr': 0.5856531698367675,
 'COLA:ends_with_question_mark/COLA_test/accuracy': 0.0,
 'COLA:ends_with_question_mark/COLA_test/matthews_corr': 0.0,
 'COLA:ends_with_question_mark/COLA_train/accuracy': 0.9252032520325203,
 'COLA:ends_with_question_mark/COLA_train/matthews_corr': 0.8260685528009882,
 'COLA:ends_with_question_mark/COLA_valid/accuracy': 0.7543859649122807,
 'COLA:ends_with_question_mark/COLA_valid/matthews_corr': 0.4742657956547625,
 'model/None/glue': 0.5856531698367675}
Restoring best model from iteration 0.20 with score 0.622
Restoring best model from iteration 0.71 with score 0.474
Final scores using task-specific checkpoints:
{'COLA/COLA_test/accuracy': 0.0,
 'COLA/COLA_test/matthews_corr': 

#### Did we improve?

In [17]:
from metal.mmtl.metal_model import MetalModel
# Re-score the fine-tuned model on the slice-aware validation payload, for a
# direct comparison with the pre-fine-tuning scores on the same payload.
model.score(payloads_slice[1])

Evaluating 57 / 1043 active labels


{'COLA/COLA_valid/accuracy': 0.8370086289549377,
 'COLA/COLA_valid/matthews_corr': 0.6007221525351318,
 'COLA:ends_with_question_mark/COLA_valid/accuracy': 0.7543859649122807,
 'COLA:ends_with_question_mark/COLA_valid/matthews_corr': 0.4742657956547625}