# Setup

In [1]:
import os
import torch
from torch import cuda
from torch.utils.data import TensorDataset, DataLoader
import evaluate
from transformers import Seq2SeqTrainer, DataCollatorForSeq2Seq, AutoModelForSeq2SeqLM, AutoTokenizer, Seq2SeqTrainingArguments, pipeline, AdamW, get_scheduler
from argparse import ArgumentParser
import numpy as np
from datasets import load_dataset, DownloadMode
from tqdm.auto import tqdm
from script.rec_adam import RecAdam

## Model + tokenizer

In [2]:
# Checkpoint and tokenizer to load (alternative checkpoints kept for reference).
model_name = 'facebook/bart-base'
#model_name='gpt2'
#model_name= 'UMA_strong_weights/Physical'
tokenizer_name = 'facebook/bart-base'

# Use the GPU when one is visible, otherwise fall back to CPU so the notebook
# still runs (slowly) on machines without CUDA.
device = 'cuda' if cuda.is_available() else 'cpu'

tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
# Only add a pad token when the tokenizer does not define one (e.g. the GPT-2
# alternative above). The BART tokenizer already ships its own pad token;
# overwriting it with "[PAD]" would make the padding id disagree with the
# model's padding_idx.
if tokenizer.pad_token is None:
    tokenizer.pad_token = "[PAD]"

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model.to(device)
print("Model + tokenizer")
# Total number of parameters (trainable or not), reported in millions.
model_size = sum(t.numel() for t in model.parameters())
print(f"Model size: {model_size/1000**2:.1f}M parameters")

TADA!
TADA!
TADA!
TADA!
TADA!
TADA!
TADA!
TADA!
TADA!
TADA!
TADA!
TADA!


Some weights of BartForConditionalGeneration were not initialized from the model checkpoint at facebook/bart-base and are newly initialized: ['decoder.layers.3.adapters.2.down_linear.weight', 'decoder.layers.5.adapters.2.layer_norm.weight', 'decoder.layers.4.adapters.3.layer_norm.bias', 'decoder.layers.0.adapters.3.up_linear.bias', 'encoder.layers.5.adapters.2.down_linear.bias', 'decoder.layers.2.adapters.0.up_linear.weight', 'decoder.layers.3.adapters.0.layer_norm.bias', 'encoder.layers.5.adapters.0.up_linear.weight', 'decoder.layers.4.adapters.2.up_linear.weight', 'decoder.layers.3.adapters.4.down_linear.bias', 'encoder.layers.0.adapters.2.down_linear.weight', 'decoder.layers.3.adapters.1.layer_norm.bias', 'encoder.layers.3.adapters.0.layer_norm.weight', 'decoder.layers.5.adapters.4.up_linear.bias', 'decoder.layers.3.adapters.0.up_linear.bias', 'encoder.layers.5.adapters.2.layer_norm.bias', 'decoder.layers.1.adapters.0.layer_norm.bias', 'decoder.layers.1.adapters.1.down_linear.bias',

Model + tokenizer
Model size: 144.0M parameters


In [3]:
from torch import nn 
def init_bert_weights(module):
    """Re-initialise one sub-module in place; intended for ``Module.apply``.

    * ``nn.Linear`` / ``nn.Embedding``: weight drawn from a normal
      distribution with mean 0 and std 0.001.
    * ``nn.LayerNorm``: weight set to 1 and bias set to 0, for whichever of the
      two the layer actually has.
    * ``nn.Linear`` with a bias: bias set to 0.

    Modules of any other type are left untouched. Returns ``None``.
    """
    if isinstance(module, (nn.Linear, nn.Embedding)):
        # std is 0.001 here; the original note records earlier values
        # ("was 0.02", "0.002 with softmax").
        module.weight.data.normal_(mean=0.0, std=0.001)
    elif isinstance(module, nn.LayerNorm):
        # A LayerNorm built without affine parameters has weight/bias == None;
        # there is nothing to reset in that case.
        if module.bias is not None:
            module.bias.data.zero_()
        if module.weight is not None:
            module.weight.data.fill_(1.0)
    if isinstance(module, nn.Linear) and module.bias is not None:
        module.bias.data.zero_()

# Freeze the whole network (BART body and LM head), then make every adapter
# stack trainable again and re-initialise it with init_bert_weights.
model.model.requires_grad_(False)
model.lm_head.requires_grad_(False)
for depth in range(6):
    # Visit the encoder layer first, then the decoder layer, at each depth.
    for stack in (model.model.encoder, model.model.decoder):
        adapter_bank = stack.layers[depth].adapters
        adapter_bank.requires_grad_(True)
        adapter_bank.apply(init_bert_weights)
        
                
                
                


## Train dataset

In [4]:
# One JSON file per relation ("<relation> train.json" under dataset_dir); each
# is loaded under a 'train' split and an existing cached copy is reused.
dataset_dir = 'modified_dataset/'
relations = ['Physical', 'Event', 'Intent', 'Reaction']
train_dict = {}
for relation in relations:
    train_files = {'train': f'{dataset_dir}{relation} train.json'}
    train_dict[relation] = load_dataset(
        'json',
        data_files=train_files,
        download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS,
    )

Using custom data configuration default-f728267a53beffc2
Found cached dataset json (/root/.cache/huggingface/datasets/json/default-f728267a53beffc2/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration default-70c0d86cfc6bffa9
Found cached dataset json (/root/.cache/huggingface/datasets/json/default-70c0d86cfc6bffa9/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration default-b455e0cad2a443ca
Found cached dataset json (/root/.cache/huggingface/datasets/json/default-b455e0cad2a443ca/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration default-afbb1536dcae3f27
Found cached dataset json (/root/.cache/huggingface/datasets/json/default-afbb1536dcae3f27/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)


  0%|          | 0/1 [00:00<?, ?it/s]

In [5]:
max_seq_length = 64  # passed to the tokenizer as max_length
def preprocess_function(examples):
    """Tokenize a batch of training examples for seq2seq training.

    ``examples['head']`` is encoded as the source text and ``examples['tail']``
    is passed as ``text_target``; truncation is enabled with
    ``max_length=max_seq_length``. The tokenizer's output is returned as is.
    """
    return tokenizer(
        examples['head'],
        text_target=examples['tail'],
        truncation=True,
        max_length=max_seq_length,
    )

# Tokenize each relation's training data with preprocess_function. The raw
# 'head'/'tail' text columns are dropped, and cached results are reused.
map_options = dict(
    batched=True,  # num_proc=num_proc,
    remove_columns=['head', 'tail'],
    load_from_cache_file=True,
)
train_tok_dict = {}
for relation in relations:
    raw_splits = train_dict[relation]
    train_tok_dict[relation] = raw_splits.map(preprocess_function, **map_options)

Loading cached processed dataset at /root/.cache/huggingface/datasets/json/default-f728267a53beffc2/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-ef26b901db75d16a.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/json/default-70c0d86cfc6bffa9/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-2e6a3fb87ecd2eed.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/json/default-b455e0cad2a443ca/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-e7c9f649fffb48db.arrow


  0%|          | 0/332 [00:00<?, ?ba/s]

## Test dataset

In [6]:
# One JSON file per relation ("<relation> test.json" under dataset_dir); each
# is loaded under a 'test' split and an existing cached copy is reused.
dataset_dir = 'modified_dataset/'
relations = ['Physical', 'Event', 'Intent', 'Reaction']
test_dict = {}
for relation in relations:
    test_files = {'test': f'{dataset_dir}{relation} test.json'}
    test_dict[relation] = load_dataset(
        'json',
        data_files=test_files,
        download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS,
    )

max_seq_length = 64
def preprocess_function(examples):
    """Collapse each example's list of reference tails into one string.

    Every entry of ``examples['tail']`` (an iterable of strings) is joined with
    a tab character. The batch is modified in place and the same object is
    returned. Note that this re-binds the name used by the training
    tokenization function defined earlier in the notebook.
    """
    joined_tails = []
    for tails in examples['tail']:
        joined_tails.append('\t'.join(tails))
    examples['tail'] = joined_tails
    return examples

# Run the tab-joining preprocess_function over each relation's test data
# (batched, cached results reused) and store the result back in test_dict.
join_options = dict(batched=True, load_from_cache_file=True)
for relation in relations:
    loaded_splits = test_dict[relation]
    test_dict[relation] = loaded_splits.map(preprocess_function, **join_options)

Using custom data configuration default-e79445650a26d083
Found cached dataset json (/root/.cache/huggingface/datasets/json/default-e79445650a26d083/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration default-230ba91174d30e5b
Found cached dataset json (/root/.cache/huggingface/datasets/json/default-230ba91174d30e5b/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration default-4795e9088563da5a
Found cached dataset json (/root/.cache/huggingface/datasets/json/default-4795e9088563da5a/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration default-87ba0d1eb2ed645f
Found cached dataset json (/root/.cache/huggingface/datasets/json/default-87ba0d1eb2ed645f/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached processed dataset at /root/.cache/huggingface/datasets/json/default-e79445650a26d083/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-3ce57d0c2a44c556.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/json/default-230ba91174d30e5b/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-50a6acecfaf6ea84.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/json/default-4795e9088563da5a/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-03e1ee95dfae4b59.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/json/default-87ba0d1eb2ed645f/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-bad382ca95ad3a12.arrow


## Generation example

In [7]:
# Sanity check: for every relation, generate from the first test example only
# and print the relation, the prompt, the decoded generation, and the
# references (the tab-joined tails split back into a list).
for relation in relations:
    test_dataloader = DataLoader(test_dict[relation]['test'], batch_size=1)
    for batch in test_dataloader:
        input_ids = tokenizer(batch['head'], padding=True, return_tensors="pt").to(device)
        generations = model.generate(**input_ids)
        decoded = tokenizer.batch_decode(generations, skip_special_tokens=True)
        references = [joined.split('\t') for joined in batch['tail']]
        print(relation, batch['head'], decoded, 'labels', references, sep='\n')
        break  # only the first batch is shown



Physical
['a spyglass can be used to']
['a spyglass can be used to']
labels
[['view images far way', 'see the stars', 'spy on neighbors', 'catch things on fire', 'look at a map more closely', 'see the planets', 'look at the stars', 'conduct a science experiment', 'hang on the wall as a decoration in a spy themed room', 'give as a gift to a child who wants to be a spy', 'put in a glass box to display on a shelf', 'spin as a spinner for a game.']]
Event
['PersonX makes a spectacle of oneself. This would not happen if']
['PersonX makes a spectacle of oneself. This would not happen if']
labels
[['PersonX is very modest', 'he is too shy.', 'PersonX has terrible stage fright.', 'PersonX is very retiring', 'no one is there.', "Other people steal PersonX's limelight.", 'everyone ignores him.', 'person x passes out.']]
Intent
['PersonX puts together a ___. After, others will want to']
['PersonX puts together a ___. After, others will want to']
labels
[['glad', 'impression']]
Reaction
["PersonX 

In [8]:
# Display the module tree of the wrapped BartModel (bare expression -> cell output).
model.model

BartModel(
  (shared): Embedding(50265, 768, padding_idx=1)
  (encoder): BartEncoder(
    (embed_tokens): Embedding(50265, 768, padding_idx=1)
    (embed_positions): BartLearnedPositionalEmbedding(1026, 768)
    (layers): ModuleList(
      (0): BartEncoderLayer(
        (self_attn): BartAttention(
          (k_proj): Linear(in_features=768, out_features=768, bias=True)
          (v_proj): Linear(in_features=768, out_features=768, bias=True)
          (q_proj): Linear(in_features=768, out_features=768, bias=True)
          (out_proj): Linear(in_features=768, out_features=768, bias=True)
        )
        (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (activation_fn): GELUActivation()
        (fc1): Linear(in_features=768, out_features=3072, bias=True)
        (fc2): Linear(in_features=3072, out_features=768, bias=True)
        (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (adapters): ModuleList(
          (0): Ad

In [None]:
# Scratch cell: show the up-projection weight of adapter 0 in encoder layer 1.
# The commented-out line would re-draw that weight from a normal distribution
# (mean 0, std 0.001) before it is displayed.
#model.model.encoder.layers[1].adapters[0].up_linear.weight.data.normal_(mean=0.0, std=0.001)
model.model.encoder.layers[1].adapters[0].up_linear.weight

In [None]:
# Scratch cell: display the module structure of encoder layer 1.
model.model.encoder.layers[1]

In [None]:
# Scratch cell: print the weight of the `linear` sub-module of encoder layer 2.
# NOTE(review): every other use of `layer.linear` in this notebook is commented
# out -- confirm the attribute still exists on the layer before relying on this.
print(model.model.encoder.layers[2].linear.weight)

In [None]:
# Scratch optimizer over every model parameter (lr 2e-5); the debugging cell
# below calls optimizer.zero_grad() on it. The training loops further down
# build their own optimizer and re-bind this name.
optimizer = AdamW(model.parameters(), lr=2e-5)

In [None]:
# Debug pass: clear gradients, push ONE shuffled 'Reaction' training example
# through the model and back-propagate. No optimizer step is taken and the
# gradients are left in place afterwards.
optimizer.zero_grad()
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
train_dataloader = DataLoader(
    train_tok_dict['Reaction']['train'],
    batch_size=1,
    shuffle=True,
    collate_fn=data_collator,
)

model.train()
for batch in train_dataloader:
    print(batch)  # collated batch, before it is moved to the device
    batch = {key: tensor.to(device) for key, tensor in batch.items()}
    outputs = model(**batch)
    loss = outputs.loss
    loss.backward()
    break  # a single batch is enough here

#optimizer.zero_grad()

In [None]:
from entmax import sparsemax
from torch import nn
# For the first test example of each relation: run generation, then print the
# tensor stored under activation['Linear'].
# NOTE(review): `activation` and the forward hook that fills it are defined in
# the NEXT cell; on a fresh kernel that cell must be run before this one,
# otherwise `activation` is undefined here.
for relation in relations:
    print(relation)
    item =test_dict[relation]['test'][0]
    input_ids = tokenizer(item['head'], padding=True, return_tensors="pt").to(device)
    # The generated ids themselves are not used below; the call is presumably
    # made to trigger the hooked module.
    test= model.generate(**input_ids)
    #print(test.size())
    #test= model(input_ids['input_ids'])
    # The hook overwrites this entry on every call, so this reads the latest capture.
    renormed=activation['Linear']
    print(renormed)
    #print(sparsemax(renormed,-1))
    #print(nn.functional.softmax(renormed,-1))

In [None]:
# Latest captured module outputs, keyed by the label given to get_activation.
activation = {}
def get_activation(name):
    """Build a forward hook that records a module's output under ``name``.

    The returned callable has the ``(module, input, output)`` forward-hook
    signature and stores ``output.detach()`` in the module-level ``activation``
    dict, replacing whatever was stored under ``name`` before.
    """
    def _store_output(module, inputs, output):
        activation.update({name: output.detach()})
    return _store_output

# Record the output of adapter 0 in decoder layer 5 under activation['Linear'].
# The handle returned by register_forward_hook is not kept.
model.model.decoder.layers[5].adapters[0].register_forward_hook(get_activation('Linear'))

In [None]:
# Remove the probe registered in the cell above. That hook lives on adapter 0
# of decoder layer 5, so its hook table is the one that has to be cleared; the
# previous target (encoder.layers[0].linear) left the probe installed.
model.model.decoder.layers[5].adapters[0]._forward_hooks.clear()

# Incremental Training + Evaluation

In [10]:
# Incremental training: relations are learned one after another. For the k-th
# relation only adapter k of every encoder/decoder layer is trainable. After
# each relation the model is scored on ALL relations' test sets, and the result
# is appended to <output_dir>results.txt.
metric = evaluate.load('bleu')
USE_REC_ADAM = False
output_dir = 'rec_adam/' if USE_REC_ADAM else 'UMA_no_softmax_freeze/'
num_train_epochs = 3

for relation_index, train_relation in enumerate(relations):
    os.makedirs(f'{output_dir}{train_relation}', exist_ok=True)

    data_collator = DataCollatorForSeq2Seq(
        tokenizer,
        model=model
    )

    train_dataloader = DataLoader(
        train_tok_dict[train_relation]['train'],
        shuffle=True,
        collate_fn=data_collator,
        batch_size=64,
    )

    # Freeze everything, then unfreeze only the adapter belonging to this
    # relation. This is done BEFORE the optimizer is built so the optimizer can
    # be restricted to the parameters that are actually being trained.
    model.model.requires_grad_(False)
    model.lm_head.requires_grad_(False)
    for coder in (model.model.encoder, model.model.decoder):
        for layer in coder.layers:
            layer.adapters[relation_index].requires_grad_(True)
    trainable_params = [p for p in model.parameters() if p.requires_grad]

    if USE_REC_ADAM and train_relation != relations[0]:
        # Snapshot the starting values. Passing the live parameters themselves
        # would make "current minus pretrained" identically zero.
        # NOTE(review): assumes RecAdam pairs `params` with `pretrain_params`
        # positionally -- confirm against script/rec_adam.py.
        pretrain_snapshot = [p.detach().clone() for p in trainable_params]
        optimizer = RecAdam(trainable_params, lr=1e-3, pretrain_params=pretrain_snapshot)
    else:
        optimizer = AdamW(trainable_params, lr=2e-5)

    num_update_steps_per_epoch = len(train_dataloader)
    num_training_steps = num_train_epochs * num_update_steps_per_epoch

    lr_scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps,
    )

    progress_bar = tqdm(range(num_training_steps))

    model.train()
    # Drop gradients left over from earlier cells so the first update only
    # reflects the first batch of this relation.
    optimizer.zero_grad()
    for epoch in range(num_train_epochs):
        for batch in train_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)
    progress_bar.close()

    model.eval() # put in testing mode (dropout modules are deactivated)
    for test_relation in relations:
        test_dataloader = DataLoader(
            test_dict[test_relation]['test'],
            batch_size=64,
        )
        for batch in test_dataloader:
            input_ids = tokenizer(batch['head'], padding=True, return_tensors="pt").to(device)
            with torch.no_grad():
                generations = model.generate(**input_ids)
            decoded_gens = tokenizer.batch_decode(generations, skip_special_tokens=True)
            # Test tails were tab-joined during preprocessing; split them back
            # into the list of references for each prediction.
            labels = [s.split('\t') for s in batch['tail']]
            metric.add_batch(predictions=decoded_gens, references=labels)
        results = metric.compute(max_order=2)
        # Brevity penalty times unigram precision. The key spelling 'blue-1' is
        # kept so new lines match what has already been appended to results.txt.
        results['blue-1'] = results['brevity_penalty'] * results['precisions'][0]
        # Context manager: the file is closed even if the write fails.
        with open(f'{output_dir}results.txt', "a") as f:
            f.write(f'{train_relation} test on {test_relation} \n {results} \n')

    model.save_pretrained(f'{output_dir}{train_relation}')
    

  0%|          | 0/8379 [00:00<?, ?it/s]

You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


# Elastic Weight Consolidation

In [None]:
# Per-parameter state for the EWC penalty, keyed by parameter name:
#   optpar_dict -- copy of each parameter's value when the last task finished
#   fisher_dict -- squared accumulated gradient for that parameter
optpar_dict = {}
fisher_dict = {}
def on_task_update(train_dataloader):
    """Refresh ``optpar_dict`` / ``fisher_dict`` after training on a task.

    Gradients of the loss are accumulated over every batch of
    ``train_dataloader`` (model in train mode, no optimizer step). Both dicts
    are then emptied and refilled: each named parameter gets a clone of its
    current value, and the element-wise square of its accumulated gradient.
    Parameters that received no gradient (for example frozen ones) get an
    all-zero Fisher entry, so they add nothing to the penalty and every
    parameter name stays present in both dicts.

    Relies on the notebook-level ``model``, ``optimizer`` and ``device``.
    Gradients are cleared again before returning.
    """
    model.train()
    optimizer.zero_grad()
    # accumulating gradients
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
    optpar_dict.clear()
    fisher_dict.clear()
    # gradients accumulated is used to compute fisher
    for name, param in model.named_parameters():
        optpar_dict[name] = param.data.clone()
        if param.grad is None:
            # No gradient reached this parameter; `param.grad.data` would raise.
            fisher_dict[name] = torch.zeros_like(param.data)
        else:
            fisher_dict[name] = param.grad.data.clone().pow(2)
    optimizer.zero_grad()

In [None]:

# Sequential training with an EWC penalty. From the second relation onwards,
# each batch loss is increased by
#   ewc_lambda * sum(fisher * (stored_value - current_value)^2)
# using the dictionaries filled by on_task_update() after the previous
# relation. After each relation the model is scored on ALL relations' test
# sets, and the result is appended to <output_dir>results.txt.
metric = evaluate.load('bleu')
ewc_lambda = 1000
output_dir = f'ewc_l={ewc_lambda}/'
num_train_epochs = 3

for train_relation in relations:
    # exist_ok=False: raises if this relation's output directory already exists.
    os.makedirs(f'{output_dir}{train_relation}', exist_ok=False)

    data_collator = DataCollatorForSeq2Seq(
        tokenizer,
        model=model
    )

    train_dataloader = DataLoader(
        train_tok_dict[train_relation]['train'],
        shuffle=True,
        collate_fn=data_collator,
        batch_size=64,
    )

    optimizer = AdamW(model.parameters(), lr=2e-5)

    num_update_steps_per_epoch = len(train_dataloader)
    num_training_steps = num_train_epochs * num_update_steps_per_epoch

    lr_scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps,
    )

    progress_bar = tqdm(range(num_training_steps))

    model.train()
    # Drop gradients left over from earlier cells so the first update only
    # reflects the first batch of this relation.
    optimizer.zero_grad()
    for epoch in range(num_train_epochs):
        for batch in train_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss

            if train_relation != relations[0]:
                # EWC penalty. Frozen parameters cannot move away from their
                # stored value, so their term is always zero and is skipped.
                for name, param in model.named_parameters():
                    if not param.requires_grad:
                        continue
                    fisher = fisher_dict[name]
                    optpar = optpar_dict[name]
                    loss = loss + (fisher * (optpar - param).pow(2)).sum() * ewc_lambda

            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)
    progress_bar.close()

    # Store this task's parameter values and Fisher estimate for the next task.
    on_task_update(train_dataloader)

    model.eval() # put in testing mode (dropout modules are deactivated)
    for test_relation in relations:
        test_dataloader = DataLoader(
            test_dict[test_relation]['test'],
            batch_size=64,
        )
        for batch in test_dataloader:
            input_ids = tokenizer(batch['head'], padding=True, return_tensors="pt").to(device)
            with torch.no_grad():
                generations = model.generate(**input_ids)
            decoded_gens = tokenizer.batch_decode(generations, skip_special_tokens=True)
            # Test tails were tab-joined during preprocessing; split them back
            # into the list of references for each prediction.
            labels = [s.split('\t') for s in batch['tail']]
            metric.add_batch(predictions=decoded_gens, references=labels)
        results = metric.compute(max_order=2)
        # Brevity penalty times unigram precision. The key spelling 'blue-1' is
        # kept so new lines match the format used by earlier result files.
        results['blue-1'] = results['brevity_penalty'] * results['precisions'][0]
        # Context manager: the file is closed even if the write fails.
        with open(f'{output_dir}results.txt', "a") as f:
            f.write(f'{train_relation} test on {test_relation} \n {results} \n')

    model.save_pretrained(f'{output_dir}{train_relation}')
    