# Setup

In [1]:
import os
import torch
from torch import cuda
from torch.utils.data import TensorDataset, DataLoader
import evaluate
from transformers import Seq2SeqTrainer, DataCollatorForSeq2Seq, AutoModelForSeq2SeqLM, AutoTokenizer, Seq2SeqTrainingArguments, pipeline, AdamW, get_scheduler
from argparse import ArgumentParser
import numpy as np
from datasets import load_dataset, DownloadMode
from tqdm.auto import tqdm
from script.rec_adam import RecAdam

## Model + tokenizer

In [2]:
# Checkpoint that is fine-tuned throughout the notebook.
model_name = 'facebook/bart-base'
# Use the GPU when one is present; otherwise fall back to CPU so the notebook still runs.
device = 'cuda' if cuda.is_available() else 'cpu'
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Only install a padding token when the checkpoint ships without one.
# Unconditionally overriding it with "[PAD]" replaces the checkpoint's own pad token
# with a string that is presumably not in its vocabulary, so tokenizer.pad_token_id
# would no longer agree with the model's configured padding id.
# NOTE(review): confirm no downstream code relied on the "[PAD]" override.
if tokenizer.pad_token is None:
    tokenizer.pad_token = "[PAD]"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model.to(device)
print("Model + tokenizer")

TADA!
TADA!
TADA!
TADA!
TADA!
TADA!


Some weights of BartForConditionalGeneration were not initialized from the model checkpoint at facebook/bart-base and are newly initialized: ['decoder.layers.4.adapters.4.sequential.2.bias', 'decoder.layers.1.adapters.3.sequential.0.bias', 'decoder.layers.4.adapters.1.sequential.2.weight', 'decoder.layers.4.ln_3.weight', 'decoder.layers.4.adapters.1.sequential.0.weight', 'decoder.layers.0.ln_3.bias', 'decoder.layers.2.adapters.3.sequential.2.bias', 'decoder.layers.5.adapters.3.sequential.2.bias', 'decoder.layers.5.adapters.4.sequential.0.weight', 'decoder.layers.2.adapters.4.sequential.2.bias', 'decoder.layers.3.adapters.4.sequential.0.bias', 'decoder.layers.1.adapters.4.sequential.2.weight', 'decoder.layers.5.adapters.2.sequential.2.weight', 'decoder.layers.2.adapters.3.sequential.0.weight', 'decoder.layers.5.adapters.0.sequential.0.bias', 'decoder.layers.0.adapters.0.sequential.2.weight', 'decoder.layers.2.adapters.4.sequential.0.weight', 'decoder.layers.5.adapters.3.sequential.2.wei

Model + tokenizer


In [56]:
# Freeze the whole backbone, then re-enable training for the LM head and for the
# adapter-related sub-modules (adapters, ln_3, linear, softmax) of every decoder layer.
for param in model.model.parameters():
    param.requires_grad = False
# NOTE(review): if lm_head shares its weight with the input embeddings (weight tying),
# re-enabling it here also makes that shared embedding trainable - confirm this is intended.
for param in model.lm_head.parameters():
    param.requires_grad = True
# Walk the actual decoder stack instead of a hard-coded range(6), so the cell keeps
# working for checkpoints with a different number of decoder layers.
for layer in model.model.decoder.layers:
    for part in (layer.adapters, layer.ln_3, layer.linear, layer.softmax):
        for param in part.parameters():
            param.requires_grad = True


## Train dataset

In [3]:
# One JSON training file per relation, loaded into train_dict keyed by relation name.
relations = ['Physical', 'Event', 'Intent', 'Reaction']
dataset_dir = 'modified_dataset/'
train_dict = {}
for relation in relations:
    # Each load yields a dataset dict whose only split is called 'train'.
    train_dict[relation] = load_dataset(
        'json',
        data_files={'train': f'{dataset_dir}{relation} train.json'},
        download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS,
    )

Using custom data configuration default-a9ea61d2f4e7a15c
Found cached dataset json (/root/.cache/huggingface/datasets/json/default-a9ea61d2f4e7a15c/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration default-bfe3ce7955f4ec3f
Found cached dataset json (/root/.cache/huggingface/datasets/json/default-bfe3ce7955f4ec3f/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration default-6c62c6b01d7a74cd
Found cached dataset json (/root/.cache/huggingface/datasets/json/default-6c62c6b01d7a74cd/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration default-ac5c9909ebd64193
Found cached dataset json (/root/.cache/huggingface/datasets/json/default-ac5c9909ebd64193/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)


  0%|          | 0/1 [00:00<?, ?it/s]

In [4]:
max_seq_length = 64


def preprocess_function(examples):
    """Tokenize one batch of training examples.

    The 'head' column is passed as the input text and the 'tail' column as
    ``text_target``; the tokenizer is called with ``truncation=True`` and
    ``max_length=max_seq_length``. The tokenizer output is returned as-is.
    """
    return tokenizer(
        examples['head'],
        text_target=examples['tail'],
        truncation=True,
        max_length=max_seq_length,
    )


# Tokenize every relation's training set; the raw text columns are dropped afterwards.
train_tok_dict = {}
for relation in relations:
    raw_train = train_dict[relation]
    train_tok_dict[relation] = raw_train.map(
        preprocess_function,
        batched=True,
        remove_columns=['head', 'tail'],
        load_from_cache_file=True,
    )

Loading cached processed dataset at /root/.cache/huggingface/datasets/json/default-a9ea61d2f4e7a15c/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-fc4caa6007dbf9e5.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/json/default-bfe3ce7955f4ec3f/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-24e56ffa2c841521.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/json/default-6c62c6b01d7a74cd/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-cf01bf2f244f700f.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/json/default-ac5c9909ebd64193/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-3b1388a7e63f7c6a.arrow


## Test dataset

In [5]:
# Load the test split of every relation from its own JSON file.
dataset_dir = 'modified_dataset/'
test_dict = {}
relations = ['Physical', 'Event', 'Intent', 'Reaction']
for relation in relations:
    test_dict[relation] = load_dataset(
        'json',
        data_files={'test': f'{dataset_dir}{relation} test.json'},
        download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS,
    )


def join_tail_references(examples):
    """Collapse each example's list of reference tails into one tab-separated string.

    The evaluation cells undo this with ``s.split('\\t')`` to recover the list of
    references. Presumably the join exists so that examples with different numbers
    of references can be batched by a plain DataLoader - TODO confirm.

    Named differently from the training cell's ``preprocess_function`` so that this
    cell no longer silently shadows the training tokenization function.
    """
    examples['tail'] = ['\t'.join(tails) for tails in examples['tail']]
    return examples


for relation in relations:
    test_dict[relation] = test_dict[relation].map(
        join_tail_references,
        batched=True,
        load_from_cache_file=True,
    )

Using custom data configuration default-58f9268bc00d6442
Found cached dataset json (/root/.cache/huggingface/datasets/json/default-58f9268bc00d6442/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration default-bb7b8c53aeb03b18
Found cached dataset json (/root/.cache/huggingface/datasets/json/default-bb7b8c53aeb03b18/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration default-499d547d83530161
Found cached dataset json (/root/.cache/huggingface/datasets/json/default-499d547d83530161/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration default-79c5d3620c090446
Found cached dataset json (/root/.cache/huggingface/datasets/json/default-79c5d3620c090446/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached processed dataset at /root/.cache/huggingface/datasets/json/default-58f9268bc00d6442/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-b8466aeb5d9a977b.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/json/default-bb7b8c53aeb03b18/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-6caa6a9eabea699e.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/json/default-499d547d83530161/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-06827029f485b6b6.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/json/default-79c5d3620c090446/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-6716832c84c52af8.arrow


## Generation example

In [8]:
# Qualitative check: for every relation, generate from the first test example only
# and print the prompt, the model output and the reference tails.
for relation in relations:
    test_dataloader = DataLoader(test_dict[relation]['test'], batch_size=1)
    for batch in test_dataloader:
        heads = batch['head']
        encoded = tokenizer(heads, return_tensors="pt", padding=True).to(device)
        generated = model.generate(**encoded)
        decoded = tokenizer.batch_decode(generated, skip_special_tokens=True)
        # Reference tails are stored tab-joined; split them back into a list per example.
        references = [tails.split('\t') for tails in batch['tail']]
        print(relation)
        print(heads)
        print(decoded)
        print('labels')
        print(references)
        break  # only the first batch is shown

Physical
['a teabag can be used to']
['useful']
labels
[['hold the loose leaves', 'increase the strength of the drink', 'put into mug for steeping', 'ease eye pain', 'make plants grow better', 'wipe stains off of steel', 'fertilize plants', 'deororize garbage', 'feed slugs', 'get rid of bags under eyes', 'make a nice aroma in the home', 'get herbal nutrients', 'make sweet tea with', 'sniff']]
Event
["PersonX asks PersonY's dad. Before that,"]
['curious']
labels
[['PersonX wants to ask an important question']]
Intent
['PersonX tells PersonY a little. After, others will want to']
['cries']
labels
[[' know more about the matter', ' entice him into letting out the whole secret', ' ask questions', 'PersonX to tell everything', ' know the whole story', ' help PersonX']]
Reaction
["PersonX waters PersonX's garden. PersonX will be"]
['satisfied']
labels
[['dutiful', 'satisfied', 'proud', 'expectant', 'hopeful', 'nurturing']]


# Incremental Training + Evaluation

In [None]:
# Incremental (sequential) fine-tuning: the same `model` is trained on one relation
# after another; after each relation it is evaluated on the test split of every
# relation, and a checkpoint is saved under output_dir/<relation>.
metric = evaluate.load('bleu')
USE_REC_ADAM = False
output_dir = 'rec_adam/' if USE_REC_ADAM else 'UMA/'

# The collator depends only on the tokenizer and the model, so it is built once
# instead of once per relation.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model
)

for train_relation in relations:
    os.makedirs(f'{output_dir}{train_relation}', exist_ok=True)

    train_dataloader = DataLoader(
        train_tok_dict[train_relation]['train'],
        shuffle=True,
        collate_fn=data_collator,
        batch_size=64,
    )

    if USE_REC_ADAM and train_relation != relations[0]:
        # Hand RecAdam a frozen *copy* of the weights reached on the previous relations.
        # Passing the live parameters would make the reference weights move together
        # with the trained ones, so they could never differ from them.
        # NOTE(review): RecAdam lives in script.rec_adam, which is not visible here -
        # confirm it expects one reference tensor per parameter, in the same order.
        pretrain_params = [param.detach().clone() for param in model.parameters()]
        optimizer = RecAdam(model.parameters(), lr=1e-3, pretrain_params=pretrain_params)
    else:
        optimizer = AdamW(model.parameters(), lr=2e-5)

    num_train_epochs = 3
    num_update_steps_per_epoch = len(train_dataloader)
    num_training_steps = num_train_epochs * num_update_steps_per_epoch

    # Linear decay of the learning rate over all steps of this relation, no warm-up.
    lr_scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps,
    )

    progress_bar = tqdm(range(num_training_steps))

    model.train()
    for epoch in range(num_train_epochs):
        for batch in train_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)
    progress_bar.close()

    model.eval()  # put in testing mode (dropout modules are deactivated)
    for test_relation in relations:
        test_dataloader = DataLoader(
            test_dict[test_relation]['test'],
            batch_size=64,
        )
        for batch in test_dataloader:
            input_ids = tokenizer(batch['head'], padding=True, return_tensors="pt").to(device)
            with torch.no_grad():
                generations = model.generate(**input_ids)
            decoded_gens = tokenizer.batch_decode(generations, skip_special_tokens=True)
            # Reference tails are stored tab-joined; split them back into lists.
            labels = [s.split('\t') for s in batch['tail']]
            metric.add_batch(predictions=decoded_gens, references=labels)
        results = metric.compute(max_order=2)
        # Brevity penalty times unigram precision. The key spelling 'blue-1' is kept
        # so new rows stay consistent with rows already appended to results.txt.
        results['blue-1'] = results['brevity_penalty'] * results['precisions'][0]
        # Context manager guarantees the file is closed even if the write fails.
        with open(f'{output_dir}results.txt', "a") as results_file:
            results_file.write(f'{train_relation} test on {test_relation} \n {results} \n')

    model.save_pretrained(f'{output_dir}{train_relation}')
    



  0%|          | 0/8379 [00:00<?, ?it/s]

You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


# Elastic Weight Consolidation

In [None]:
# Per-parameter state kept between tasks for the EWC penalty:
#   optpar_dict[name] - copy of the parameter values at the end of the last task
#   fisher_dict[name] - squared accumulated gradient used as the importance weight
optpar_dict = {}
fisher_dict = {}


def on_task_update(train_dataloader):
    """Refresh ``optpar_dict`` and ``fisher_dict`` after training on a task.

    Gradients of the model loss are accumulated (summed, not averaged) over every
    batch of ``train_dataloader``; the element-wise square of that accumulated
    gradient is stored as the importance of each parameter, alongside a copy of the
    parameter's current value. Both dictionaries are cleared first, so they only
    ever describe the most recent task.

    Parameters that received no gradient (for example frozen ones, whose ``grad``
    is ``None``) get an all-zero importance instead of raising ``AttributeError``.

    Uses the module-level ``model`` and ``device``. Gradients are reset through
    ``model.zero_grad()`` so the function does not depend on a global optimizer.
    The model is left in training mode with its gradients cleared.
    """
    model.train()
    model.zero_grad()
    # Accumulate gradients over the whole dataloader (no optimizer step is taken).
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
    optpar_dict.clear()
    fisher_dict.clear()
    # The accumulated gradients are turned into the importance estimate.
    for name, param in model.named_parameters():
        optpar_dict[name] = param.detach().clone()
        if param.grad is None:
            # No gradient reached this parameter: give it zero importance.
            fisher_dict[name] = torch.zeros_like(param)
        else:
            fisher_dict[name] = param.grad.detach().clone().pow(2)
    model.zero_grad()

In [None]:

# Sequential fine-tuning with an Elastic Weight Consolidation penalty: from the
# second relation on, the loss is augmented with
#   ewc_lambda * sum(fisher * (previous_value - current_value)^2)
# over the parameters, using the statistics stored by on_task_update().
# NOTE(review): this cell continues from whatever state `model` is in. Under
# "Run All" that is the model already trained by the incremental-training section -
# reload the checkpoint first if the two experiments are meant to be independent.
metric = evaluate.load('bleu')
ewc_lambda = 1000
output_dir = f'ewc_l={ewc_lambda}/'

# The collator depends only on the tokenizer and the model, so it is built once.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model
)

for train_relation in relations:
    # exist_ok=False is kept from the original: the cell refuses to run when the
    # output directory of a previous run is still present.
    os.makedirs(f'{output_dir}{train_relation}', exist_ok=False)

    train_dataloader = DataLoader(
        train_tok_dict[train_relation]['train'],
        shuffle=True,
        collate_fn=data_collator,
        batch_size=64,
    )

    optimizer = AdamW(model.parameters(), lr=2e-5)

    num_train_epochs = 3
    num_update_steps_per_epoch = len(train_dataloader)
    num_training_steps = num_train_epochs * num_update_steps_per_epoch

    # Linear decay of the learning rate over all steps of this relation, no warm-up.
    lr_scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps,
    )

    progress_bar = tqdm(range(num_training_steps))

    model.train()
    for epoch in range(num_train_epochs):
        for batch in train_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss

            if train_relation != relations[0]:
                # EWC penalty. Frozen parameters cannot move away from their stored
                # value, so their term is identically zero and they are skipped.
                for name, param in model.named_parameters():
                    if not param.requires_grad:
                        continue
                    fisher = fisher_dict[name]
                    optpar = optpar_dict[name]
                    # Out-of-place add: avoids mutating the loss tensor returned by the model.
                    loss = loss + (fisher * (optpar - param).pow(2)).sum() * ewc_lambda

            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)
    progress_bar.close()

    # Store the parameter values and importance weights used as the anchor for the next relation.
    on_task_update(train_dataloader)

    model.eval()  # put in testing mode (dropout modules are deactivated)
    for test_relation in relations:
        test_dataloader = DataLoader(
            test_dict[test_relation]['test'],
            batch_size=64,
        )
        for batch in test_dataloader:
            input_ids = tokenizer(batch['head'], padding=True, return_tensors="pt").to(device)
            with torch.no_grad():
                generations = model.generate(**input_ids)
            decoded_gens = tokenizer.batch_decode(generations, skip_special_tokens=True)
            # Reference tails are stored tab-joined; split them back into lists.
            labels = [s.split('\t') for s in batch['tail']]
            metric.add_batch(predictions=decoded_gens, references=labels)
        results = metric.compute(max_order=2)
        # Brevity penalty times unigram precision. The key spelling 'blue-1' is kept
        # so new rows stay consistent with rows already appended to results.txt.
        results['blue-1'] = results['brevity_penalty'] * results['precisions'][0]
        # Context manager guarantees the file is closed even if the write fails.
        with open(f'{output_dir}results.txt', "a") as results_file:
            results_file.write(f'{train_relation} test on {test_relation} \n {results} \n')

    model.save_pretrained(f'{output_dir}{train_relation}')
    