# Setup

In [1]:
import os
import torch
from torch import cuda
from torch.utils.data import TensorDataset, DataLoader
import evaluate
from transformers import Seq2SeqTrainer, DataCollatorForSeq2Seq, AutoModelForSeq2SeqLM, AutoTokenizer, Seq2SeqTrainingArguments, pipeline, AdamW, get_scheduler
from argparse import ArgumentParser
import numpy as np
from datasets import load_dataset, DownloadMode
from tqdm.auto import tqdm
from script.rec_adam import RecAdam

## Model + tokenizer

In [2]:
# Checkpoint to load; previously tried alternatives are kept for reference.
model_name = 'facebook/bart-base'
#model_name='gpt2'
#model_name= 'UMA_softmax_full_freeze/Physical'
tokenizer_name = 'facebook/bart-base'

# Fall back to CPU so the notebook still runs on machines without a GPU.
device = 'cuda' if cuda.is_available() else 'cpu'
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
# Only add a pad token when the tokenizer has none (e.g. GPT-2). BART already
# defines its own pad token; replacing it with a string that is not in the
# vocabulary would make the tokenizer's pad id disagree with the model config.
if tokenizer.pad_token is None:
    tokenizer.pad_token = "[PAD]"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model.to(device)
print("Model + tokenizer")
# Total number of elements over all model parameters, reported in millions.
model_size = sum(t.numel() for t in model.parameters())
print(f"Model size: {model_size/1000**2:.1f}M parameters")

Model + tokenizer
Model size: 139.4M parameters


In [None]:
from torch import nn 
def init_bert_weights(module, std=0.002):
    """Re-initialise a single module in place.

    Meant to be passed to ``nn.Module.apply``, which calls it once for every
    sub-module.

    * ``nn.Linear`` / ``nn.Embedding``: weight drawn from a normal distribution
      with mean 0 and standard deviation ``std``.
    * ``nn.LayerNorm``: weight set to 1 and bias set to 0; a layer built
      without affine parameters is left untouched.
    * ``nn.Linear`` with a bias: bias set to 0.

    Any other module type is ignored.

    Args:
        module: the module to re-initialise.
        std: standard deviation for Linear/Embedding weights. The default of
            0.002 is the value this notebook used (it was 0.02 earlier).
    """
    if isinstance(module, (nn.Linear, nn.Embedding)):
        nn.init.normal_(module.weight, mean=0.0, std=std)
    elif isinstance(module, nn.LayerNorm):
        # With elementwise_affine=False both attributes are None.
        if module.bias is not None:
            nn.init.zeros_(module.bias)
        if module.weight is not None:
            nn.init.ones_(module.weight)
    if isinstance(module, nn.Linear) and module.bias is not None:
        nn.init.zeros_(module.bias)

# Freeze `model.model` and `model.lm_head`, then make the `adapters` and
# `linears` sub-modules of each of the 6 encoder/decoder layers trainable again
# and re-initialise them with init_bert_weights.
model.model.requires_grad_(False)
model.lm_head.requires_grad_(False)
for layer_index in range(6):
    for coder in (model.model.encoder, model.model.decoder):
        layer = coder.layers[layer_index]
        for part in (layer.adapters, layer.linears):
            part.requires_grad_(True)
            part.apply(init_bert_weights)
        #layer.linear.weight.data.normal_(mean=0.0, std= 0.05)
        
                
                
                


## Train dataset

In [3]:
dataset_dir = 'modified_dataset/'
relations = ['Physical', 'Event', 'Intent', 'Reaction']
# One dataset per relation with a single 'train' split, read from
# '<dataset_dir><relation> train.json' (the file name contains a space).
train_dict = {
    relation: load_dataset(
        'json',
        data_files={'train': f'{dataset_dir}{relation} train.json'},
        download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS,
    )
    for relation in relations
}

Using custom data configuration default-f728267a53beffc2
Found cached dataset json (/root/.cache/huggingface/datasets/json/default-f728267a53beffc2/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration default-70c0d86cfc6bffa9
Found cached dataset json (/root/.cache/huggingface/datasets/json/default-70c0d86cfc6bffa9/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration default-b455e0cad2a443ca
Found cached dataset json (/root/.cache/huggingface/datasets/json/default-b455e0cad2a443ca/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration default-afbb1536dcae3f27
Found cached dataset json (/root/.cache/huggingface/datasets/json/default-afbb1536dcae3f27/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)


  0%|          | 0/1 [00:00<?, ?it/s]

In [4]:
max_seq_length = 64


def preprocess_function(examples):
    """Tokenize a batch: 'head' is the model input, 'tail' the target text.

    Both sides are truncated to ``max_seq_length`` tokens; padding is not
    requested here.
    """
    return tokenizer(
        examples['head'],
        text_target=examples['tail'],
        max_length=max_seq_length,
        truncation=True,
    )


# Tokenized training data per relation; the raw text columns are dropped.
train_tok_dict = {
    relation: train_dict[relation].map(
        preprocess_function,
        batched=True,  # num_proc=num_proc,
        remove_columns=['head', 'tail'],
        load_from_cache_file=True,
    )
    for relation in relations
}

Loading cached processed dataset at /root/.cache/huggingface/datasets/json/default-f728267a53beffc2/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-ef26b901db75d16a.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/json/default-70c0d86cfc6bffa9/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-2e6a3fb87ecd2eed.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/json/default-b455e0cad2a443ca/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-e7c9f649fffb48db.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/json/default-afbb1536dcae3f27/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-e850e3a6af5af9c9.arrow


## Test dataset

In [None]:
dataset_dir = 'modified_dataset/'
relations = ['Physical', 'Event', 'Intent', 'Reaction']
# One dataset per relation with a single 'test' split, read from
# '<dataset_dir><relation> test.json'.
test_dict = {
    relation: load_dataset(
        'json',
        data_files={'test': f'{dataset_dir}{relation} test.json'},
        download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS,
    )
    for relation in relations
}

max_seq_length = 64


def preprocess_function(examples):
    """Join each example's list of reference tails into one tab-separated string.

    The batch is modified in place and returned; consumers split on '\t' to
    recover the individual references.
    """
    examples['tail'] = list(map('\t'.join, examples['tail']))
    return examples


for relation in relations:
    test_dict[relation] = test_dict[relation].map(
        preprocess_function,
        batched=True,
        load_from_cache_file=True,
    )

## Dev dataset

In [5]:
dataset_dir = 'modified_dataset/'
relations = ['Physical', 'Event', 'Intent', 'Reaction']
# One dataset per relation with a single 'dev' split, read from
# '<dataset_dir><relation> dev.json'.
dev_dict = {
    relation: load_dataset(
        'json',
        data_files={'dev': f'{dataset_dir}{relation} dev.json'},
        download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS,
    )
    for relation in relations
}

max_seq_length = 64


def preprocess_function(examples):
    """Join each example's list of reference tails into one tab-separated string.

    The batch is modified in place and returned; consumers split on '\t' to
    recover the individual references.
    """
    examples['tail'] = list(map('\t'.join, examples['tail']))
    return examples


for relation in relations:
    dev_dict[relation] = dev_dict[relation].map(
        preprocess_function,
        batched=True,
        load_from_cache_file=True,
    )

Using custom data configuration default-ebc21b7d7f5bf37e
Found cached dataset json (/root/.cache/huggingface/datasets/json/default-ebc21b7d7f5bf37e/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration default-b188c89b504a4a69
Found cached dataset json (/root/.cache/huggingface/datasets/json/default-b188c89b504a4a69/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration default-959376889ff22667
Found cached dataset json (/root/.cache/huggingface/datasets/json/default-959376889ff22667/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration default-eecd38dab30452cf
Found cached dataset json (/root/.cache/huggingface/datasets/json/default-eecd38dab30452cf/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached processed dataset at /root/.cache/huggingface/datasets/json/default-ebc21b7d7f5bf37e/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-026f41029d51b7d0.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/json/default-b188c89b504a4a69/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-ef2dd232c8e4124a.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/json/default-959376889ff22667/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-ec3ccea9f1abffb0.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/json/default-eecd38dab30452cf/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-7b41803dcf05c203.arrow


## Generation example

In [6]:
# For every relation, generate from the first test example and print the
# prompt, the model output and the reference tails.
for relation in relations:
    test_dataloader = DataLoader(test_dict[relation]['test'], batch_size=1)
    for batch in test_dataloader:
        input_ids = tokenizer(batch['head'], padding=True, return_tensors="pt").to(device)
        generations = model.generate(**input_ids)
        decoded = tokenizer.batch_decode(generations, skip_special_tokens=True)
        print(f"{relation}")
        print(batch['head'])
        print(decoded)
        print('labels')
        # Tails were stored tab-joined; split them back into a list.
        print([s.split('\t') for s in batch['tail']])
        # Only the first example of each relation is shown.
        break



Physical
['a gambling money can be used to']
['a gambling money can be used to']
labels
[['bet on race', 'make the rent', 'lose on a bet', 'bet on horse races', 'bet on cock fights', 'bet on a Horse', 'pay a bookie']]
Event
['PersonX plays dumb. This would not happen if']
['PersonX plays dumb. This would not happen if']
labels
[["PersonY sees through PersonX's lies too easily.", 'PersonY has hooked PersonX up to a polygraph machine.', 'PersonX has too much integrity.', 'PersonX is far too intelligent to play dumb.', "PersonX deson't want to be embarrassed.", 'PersonX is the boss.']]
Intent
["PersonX puts it on PersonY's wrist. After, PersonX will want to"]
["PersonX puts it on PersonY's wrist. After, PersonX will want to"]
labels
[['clasp the bracelet on', 'clasp the watch on', ' check the time', ' finish getting ready']]
Reaction
["PersonX feels PersonY's hands. The effect on PersonX will be that PersonX"]
["PersonX feels PersonY's hands. The effect on PersonX will be that Person"]
la

In [None]:
# Display the inner `model.model` module (it holds the encoder and decoder) to inspect its structure.
model.model

In [None]:
# Display the `up_linear` weight of adapter 0 in encoder layer 1.
# The commented line below re-draws that weight from N(0, 0.001**2) when enabled.
#model.model.encoder.layers[1].adapters[0].up_linear.weight.data.normal_(mean=0.0, std=0.001)
model.model.encoder.layers[1].adapters[0].up_linear.weight

In [None]:
# Display encoder layer 1 to inspect its sub-modules.
model.model.encoder.layers[1]

In [None]:
# Print the weight of the `linear` attribute of encoder layer 2.
# NOTE(review): the other cells access `linears` (plural, indexable) — confirm that a singular `linear` attribute still exists.
print(model.model.encoder.layers[2].linear.weight)

In [None]:
# Optimizer over all model parameters, used by the single-step debug cell that follows.
optimizer = AdamW(model.parameters(), lr=2e-5)

In [None]:
# Debug cell: one forward/backward pass on a single shuffled 'Reaction'
# training example. No optimizer step is taken.
optimizer.zero_grad()
data_collator = DataCollatorForSeq2Seq(
        tokenizer,
        model=model
    )

train_dataloader = DataLoader(
    train_tok_dict['Reaction']['train'],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=1,
)

model.train()
for batch in train_dataloader:
    print(batch)
    batch = {k: v.to(device) for k, v in batch.items()}
    outputs = model(**batch)
    loss = outputs.loss
    loss.backward()
    # Stop after the first batch.
    break

# The gradients stay on the parameters because the call below is disabled.
# NOTE(review): clear them before starting a real training run.
#optimizer.zero_grad()

In [None]:
# For each relation, run generation on the first test example and print the
# tensor captured under activation['Linear'].
# NOTE: `activation` is filled by the forward hook defined and registered in a
# later cell of this notebook; that cell has to be run first.
from entmax import sparsemax
from torch import nn
for relation in relations:
    print(relation)
    item =test_dict[relation]['test'][0]
    input_ids = tokenizer(item['head'], padding=True, return_tensors="pt").to(device)
    test= model.generate(**input_ids)
    #print(test.size())
    #test= model(input_ids['input_ids'])
    # The hook overwrites its entry on every forward call, so this is the output
    # of the most recent call (presumably the last decoding step — TODO confirm).
    renormed=activation['Linear']
    print(renormed)
    #print(sparsemax(renormed,-1))
    #print(nn.functional.softmax(renormed,-1))

In [None]:
# Same dump as the preceding cell: for each relation, generate from the first
# test example and print the tensor captured under activation['Linear'].
# NOTE: `activation` is filled by the forward hook defined and registered in a
# later cell of this notebook; that cell has to be run first.
from entmax import sparsemax
from torch import nn
for relation in relations:
    print(relation)
    item =test_dict[relation]['test'][0]
    input_ids = tokenizer(item['head'], padding=True, return_tensors="pt").to(device)
    test= model.generate(**input_ids)
    #print(test.size())
    #test= model(input_ids['input_ids'])
    # The hook overwrites its entry on every forward call, so this is the output
    # of the most recent call (presumably the last decoding step — TODO confirm).
    renormed=activation['Linear']
    print(renormed)
    #print(sparsemax(renormed,-1))
    #print(nn.functional.softmax(renormed,-1))

In [None]:
# Most recently captured module outputs, keyed by the name given to get_activation.
activation = {}


def get_activation(name):
    """Build a forward hook that stores the module's detached output in ``activation[name]``.

    Each forward call of the hooked module overwrites the previous entry.
    """
    def hook(module, inputs, output):
        activation[name] = output.detach()
    return hook

# Capture the output of the `softmax` sub-module of decoder layer 0 under the key 'Linear'.
model.model.decoder.layers[0].softmax.register_forward_hook(get_activation('Linear'))

In [None]:
# Drop every forward hook on the `softmax` sub-module of decoder layer 5 (via the private `_forward_hooks` dict).
# NOTE(review): the capture hook in this notebook is registered on decoder layer 0, not 5 — confirm the index.
model.model.decoder.layers[5].softmax._forward_hooks.clear()

In [None]:
# Freeze `model.model` and `model.lm_head`, then rescale the weight of each of
# the five `linears` entries in every encoder/decoder layer to a fixed norm:
# 1 for index 2 and 0.01 for the others.
model.model.requires_grad_(False)
model.lm_head.requires_grad_(False)
for layer_index in range(6):
    for coder in (model.model.encoder, model.model.decoder):
        layer = coder.layers[layer_index]
        for i in range(5):
            lin = layer.linears[i]
            target_norm = 1 if i == 2 else 0.01
            lin.weight.data = target_norm * lin.weight.data / torch.norm(lin.weight.data)

In [None]:
# Print the gradient currently stored on `linears[4]` of decoder layer 5.
print(model.model.decoder.layers[5].linears[4].weight.grad)

In [None]:
# Print the (name, parameter) pairs of the `linears` container of decoder layer 5.
print([x for x in model.model.decoder.layers[5].linears.named_parameters()])

# Incremental Training + Evaluation

In [None]:
# Incremental training: the relations are learned one after another. While
# training relation k, only `adapters[k]` and `linears[k]` of every
# encoder/decoder layer are trainable. After each relation the model is scored
# with BLEU on the test split of every relation, the scores are appended to
# '<output_dir>results.txt', and a checkpoint is saved.
metric = evaluate.load('bleu')
USE_REC_ADAM = False
output_dir = 'rec_adam/' if USE_REC_ADAM else 'UMA_normed_softmax_full_freeze/'
for relation_index, train_relation in enumerate(relations):
    os.makedirs(f'{output_dir}{train_relation}', exist_ok=True)

    data_collator = DataCollatorForSeq2Seq(
        tokenizer,
        model=model
    )

    train_dataloader = DataLoader(
        train_tok_dict[train_relation]['train'],
        shuffle=True,
        collate_fn=data_collator,
        batch_size=64,
    )
    if USE_REC_ADAM and train_relation != relations[0]:
        # NOTE(review): pretrain_params holds the live parameter objects, not a
        # snapshot of their values — confirm this is what RecAdam expects.
        optimizer = RecAdam(model.parameters(), lr=1e-3, pretrain_params= list(model.parameters()))
    else:
        optimizer = AdamW(model.parameters(), lr=2e-5)

    num_train_epochs = 3
    num_update_steps_per_epoch = len(train_dataloader)
    num_training_steps = num_train_epochs * num_update_steps_per_epoch

    # Linear decay of the learning rate to zero, without warm-up.
    lr_scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps,
    )

    progress_bar = tqdm(range(num_training_steps))

    # Freeze model except concerned adapters parameters
    for param in model.model.parameters():
        param.requires_grad = False
    for param in model.lm_head.parameters():
        param.requires_grad = False
    for layer_index in range(6):
        for coder in model.model.encoder, model.model.decoder:
            layer = coder.layers[layer_index]
            for part in [layer.adapters[relation_index], layer.linears[relation_index]]:
                for param in part.parameters():
                    param.requires_grad = True

    # Rescale the weight of all five `linears` entries of every layer to norm 0.5.
    for layer_index in range(6):
        for coder in model.model.encoder, model.model.decoder:
            layer = coder.layers[layer_index]
            for i in range(5):
                lin = layer.linears[i]
                lin.weight.data = 0.5*lin.weight.data/torch.norm(lin.weight.data)

    # The optimizer was built over *all* parameters and updates any parameter
    # whose .grad is set, whether or not it is frozen. Gradients left behind by
    # an earlier cell or by the previous relation must therefore be dropped
    # before the first step, otherwise frozen weights could be modified.
    model.zero_grad(set_to_none=True)

    model.train()
    for epoch in range(num_train_epochs):
        for batch in train_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)
    progress_bar.close()

    model.eval() # put in testing mode (dropout modules are deactivated)
    for test_relation in relations:
        test_dataloader = DataLoader(
            test_dict[test_relation]['test'],
            batch_size=64,
        )
        for batch in test_dataloader:
            input_ids = tokenizer(batch['head'], padding=True, return_tensors="pt").to(device)
            with torch.no_grad():
                generations = model.generate(**input_ids)
            decoded_gens = tokenizer.batch_decode(generations, skip_special_tokens=True)
            # Tails were stored tab-joined; split them back into reference lists.
            labels = [s.split('\t') for s in batch['tail']]
            metric.add_batch(predictions=decoded_gens, references=labels)
        results = metric.compute(max_order=2)
        # Unigram precision scaled by the brevity penalty. The key is spelled
        # 'blue-1' (sic); it is left unchanged so result files stay consistent.
        results['blue-1']=results['brevity_penalty']*results['precisions'][0]
        # Context manager guarantees the file is closed even if the write fails.
        with open(f'{output_dir}results.txt', "a") as f:
            f.write(f'{train_relation} test on {test_relation} \n {results} \n')

    model.save_pretrained(f'{output_dir}{train_relation}')
    

# Elastic Weight Consolidation

In [8]:
# EWC anchor taken after the most recently finished task, keyed by parameter
# name: parameter values (optpar_dict) and squared accumulated gradients
# (fisher_dict).
optpar_dict = {}
fisher_dict = {}
def on_task_update(train_dataloader):
    """Record the EWC anchor for the task that has just been trained.

    Reads the notebook-level ``model``, ``optimizer`` and ``device``. Both
    ``optpar_dict`` and ``fisher_dict`` are cleared and refilled with one entry
    per named parameter of ``model``, and the gradients are zeroed again
    before returning.

    Args:
        train_dataloader: iterable of batches (dicts of tensors) of the task
            that has just been trained.
    """
    model.train()
    optimizer.zero_grad()
    # Accumulate the loss gradient over every batch of the dataloader.
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
    optpar_dict.clear()
    fisher_dict.clear()
    # NOTE(review): this squares the gradient summed over all batches; it is
    # not the mean of per-batch squared gradients. Kept as is so that existing
    # lambda values remain comparable.
    for name, param in model.named_parameters():
        optpar_dict[name] = param.data.clone()
        if param.grad is None:
            # Frozen or unused parameter: no gradient was produced. Store a
            # zero importance so the penalty loop can still look it up.
            fisher_dict[name] = torch.zeros_like(param.data)
        else:
            fisher_dict[name] = param.grad.data.clone().pow(2)
    optimizer.zero_grad()

In [None]:
# Elastic Weight Consolidation sweep. For each lambda in 10**0 .. 10**7 a fresh
# 'facebook/bart-base' is trained on the relations in order; from the second
# relation on, the EWC penalty built from fisher_dict / optpar_dict is added to
# the loss. After each relation BLEU is computed on the dev split of every
# relation and appended to '<output_dir>results.txt'.

# Fall back to CPU so the cell still runs on machines without a GPU.
device = 'cuda' if cuda.is_available() else 'cpu'
# The tokenizer does not depend on lambda, so it is loaded once.
tokenizer = AutoTokenizer.from_pretrained('facebook/bart-base')
# Only add a pad token when the tokenizer has none. BART already defines its
# own pad token; replacing it with a string that is not in the vocabulary would
# make the tokenizer's pad id disagree with the model config.
if tokenizer.pad_token is None:
    tokenizer.pad_token = "[PAD]"
for lambda_power in range(8):
    model = AutoModelForSeq2SeqLM.from_pretrained('facebook/bart-base')
    model.to(device)

    metric=evaluate.load('bleu')
    ewc_lambda=10**lambda_power
    output_dir= f'ewc_l={ewc_lambda}/'
    for train_relation in relations:
        # exist_ok=False: raises FileExistsError if this lambda was already run.
        os.makedirs(f'{output_dir}{train_relation}', exist_ok=False)

        data_collator = DataCollatorForSeq2Seq(
            tokenizer,
            model=model
        )

        train_dataloader = DataLoader(
            train_tok_dict[train_relation]['train'],
            shuffle=True,
            collate_fn=data_collator,
            batch_size=64,
        )

        optimizer = AdamW(model.parameters(), lr=2e-5)

        num_train_epochs = 3
        num_update_steps_per_epoch = len(train_dataloader)
        num_training_steps = num_train_epochs * num_update_steps_per_epoch

        # Linear decay of the learning rate to zero, without warm-up.
        lr_scheduler = get_scheduler(
            "linear",
            optimizer=optimizer,
            num_warmup_steps=0,
            num_training_steps=num_training_steps,
        )

        progress_bar = tqdm(range(num_training_steps))

        model.train()
        for epoch in range(num_train_epochs):
            for batch in train_dataloader:
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = model(**batch)
                loss = outputs.loss

                if train_relation!= relations[0]:
                    #EWC penalty
                    # lambda * sum_i F_i * (theta*_i - theta_i)^2 over all parameters,
                    # anchored at the values recorded after the previous relation.
                    for name, param in model.named_parameters():
                        fisher = fisher_dict[name]
                        optpar = optpar_dict[name]
                        loss += (fisher * (optpar - param).pow(2)).sum() * ewc_lambda

                loss.backward()
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
                progress_bar.update(1)
        progress_bar.close()

        # No anchor is needed after the last relation: nothing is trained afterwards.
        if train_relation!= relations[-1]:
            on_task_update(train_dataloader)

        model.eval() # put in testing mode (dropout modules are deactivated)
        for dev_relation in relations:
            dev_dataloader = DataLoader(
                dev_dict[dev_relation]['dev'],
                batch_size=64,
            )
            for batch in dev_dataloader:
                input_ids = tokenizer(batch['head'], padding=True, return_tensors="pt").to(device)
                with torch.no_grad():
                    generations = model.generate(**input_ids)
                decoded_gens= tokenizer.batch_decode(generations, skip_special_tokens=True)
                # Tails were stored tab-joined; split them back into reference lists.
                labels = [s.split('\t') for s in batch['tail']]
                metric.add_batch(predictions=decoded_gens, references=labels)
            results = metric.compute(max_order=2)
            # Unigram precision scaled by the brevity penalty. The key is spelled
            # 'blue-1' (sic); it is left unchanged so result files stay consistent.
            results['blue-1']=results['brevity_penalty']*results['precisions'][0]
            # Context manager guarantees the file is closed even if the write fails.
            with open(f'{output_dir}results.txt', "a") as f:
                f.write(f'{train_relation} dev on {dev_relation} \n {results} \n')

        #model.save_pretrained(f'{output_dir}{train_relation}')




  0%|          | 0/8379 [00:00<?, ?it/s]

You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
