In [1]:
import logging
import torch
import torch.nn as nn
import nltk
import numpy as np
from datasets import load_dataset, load_metric
from torch.utils.data.dataloader import DataLoader
from tqdm.auto import tqdm
from tqdm import tqdm as tqdm1

In [2]:
torch.cuda.is_available()

True

In [3]:
import transformers
from accelerate import Accelerator
from filelock import FileLock
from transformers import set_seed
from transformers.file_utils import is_offline_mode
from utils.arguments import parse_args
from multitask_model import MultitaskModel
#from preprocess import convert_to_features
from multitask_data_collator import MultitaskTrainer, NLPDataCollator
from multitask_eval import multitask_eval_fn
from checkpoint_model import save_model
from pathlib import Path

In [5]:
# from evaluate_bleu import *
from sklearn.metrics import f1_score, classification_report

In [6]:
set_seed(42)

In [None]:
import pandas as pd
import pickle
df_train = pickle.load(open('../../../arg-generation/multi-taks-counter-argument-generation/reddit_data/conclusion_and_ca_generation/preprocessed_train_conclusion_all.pkl','rb'))
# pd.read_csv('../../../arg-generation/multi-taks-counter-argument-generation/reddit_data/conclusion_and_ca_generation/preprocessed_train_conclusion_all.pkl')

df_train.to_csv('preprocessed_train_conclusion_all.csv')

df_validation = pickle.load(open('../../../arg-generation/multi-taks-counter-argument-generation/reddit_data/conclusion_and_ca_generation/sample_valid_conclusion_all.pkl','rb'))
df_validation.to_csv('sample_valid_conclusion_all.csv')

In [8]:
import pandas as pd
df_train=pd.read_csv('preprocessed_train_conclusion_all.csv')
df_validate=pd.read_csv('sample_valid_conclusion_all.csv')
df_train[:3]

Unnamed: 0.1,Unnamed: 0,post_id,split,comment_id,title,post,n_sentences,counter,bot_comment,counter_conclusion,counter_conclusions
0,0,t3_1u4mmo,train,t1_ceehlbn,I believe that churches and other religious in...,['i believe that a church is like any other pr...,4,1. it can be used as a chilling effect that ca...,False,Tax exemptions are a privilege and should not ...,Taxes are theft because the money is used for ...
1,22,t3_1u4mo5,train,t1_ceegi4s,I don't believe it is acceptable to attack the...,"['so it seems to be common that, when caught c...",8,you presumably love your so and hope to fix yo...,False,I believe that physical violence is never more...,I believe that physical violence against a par...
2,38,t3_1u4txm,train,t1_ceeioxh,"There is no viable alternative to capitalism, ...","[""i've tried researching discussions on this t...",16,the system you are describing is not capitalis...,False,"Capitalism is not the ideal system, there are ...","Capitalism is not the ideal system, there are ..."


In [37]:
tasks = ['counter_gen', 'title_gen']


In [29]:
df_train['title_input']=df_train.apply(lambda x:str(x['post'])+'<s>'+str(x['title']),axis=1)
df_train['counter_input']=df_train.apply(lambda x:str(x['post'])+'<s>'+str(x['counter']),axis=1)

df_validate['title_input']=df_validate.apply(lambda x:str(x['post'])+'<s>'+str(x['title']),axis=1)
df_validate['counter_input']=df_validate.apply(lambda x:str(x['post'])+'<s>'+str(x['counter']),axis=1)

In [46]:
df_train_new = df_train[['title_input','counter_input']]


df_validate_new = df_validate[['title_input','counter_input']]
# df_validate_title=df_validate[['post','title']]

df_validate_new

Unnamed: 0,title_input,counter_input
0,"['first off, i am studying video game developm...","['first off, i am studying video game developm..."
1,"['i grew up in a liberal environment, went to ...","['i grew up in a liberal environment, went to ..."
2,"[""hey all, been quite a while since i've done ...","[""hey all, been quite a while since i've done ..."
3,"[""of course it depends on what exactly you're ...","[""of course it depends on what exactly you're ..."
4,"['hi reddit.', 'firstly, i would like to prefa...","['hi reddit.', 'firstly, i would like to prefa..."
...,...,...
1995,['we now have several decades of data when it ...,['we now have several decades of data when it ...
1996,"[""we have branches of the military to fight ag...","[""we have branches of the military to fight ag..."
1997,['i thoroughly believe at this point in my lif...,['i thoroughly believe at this point in my lif...
1998,"[""at first, i thought it was only for 'nigger'...","[""at first, i thought it was only for 'nigger'..."


In [33]:
model_name = 'facebook/bart-base'

In [34]:
model_names = [model_name] * 2

In [35]:
class MultitaskBartModel(transformers.PreTrainedModel):
    def __init__(self, encoder, taskmodels_dict):
        """
        Setting MultitaskModel up as a PretrainedModel allows us
        to take better advantage of Trainer features
        """
        super().__init__(transformers.PretrainedConfig())

        self.encoder = encoder
        self.taskmodels_dict = nn.ModuleDict(taskmodels_dict)

    @classmethod
    def create(cls, model_name, model_type_dict, model_config_dict):
        """
        This creates a MultitaskModel using the model class and config objects
        from single-task models.

        We do this by creating each single-task model, and having them share
        the same encoder transformer.
        """
        shared_encoder = None
        taskmodels_dict = {}
        for task_name, model_type in model_type_dict.items():
            model = model_type.from_pretrained(
                model_name,
                config=model_config_dict[task_name],
            )
            if shared_encoder is None:
                shared_encoder = model.model.encoder
            else:
                model.model.encoder = shared_encoder
            taskmodels_dict[task_name] = model
        return cls(encoder=shared_encoder, taskmodels_dict=taskmodels_dict)
    
    @classmethod
    def load(cls, model_folder, model_type_dict, model_config_dict):
        """
        This loads a MultitaskModel using the model class and config objects
        from single-task models.
        """
        shared_encoder = None
        taskmodels_dict = {}
        for task_name, model_type in model_type_dict.items():
            model = model_type.from_pretrained(
                f"{model_folder}/{task_name}_model",
                config=model_config_dict[task_name],
            )
            if shared_encoder is None:
                shared_encoder = model.model.encoder
            else:
                model.model.encoder = shared_encoder
            taskmodels_dict[task_name] = model
        return cls(encoder=shared_encoder, taskmodels_dict=taskmodels_dict)

    def forward(self, task_name, **kwargs):
        return self.taskmodels_dict[task_name](**kwargs)
    
    def generate_text(self, task_name, input_, num_beams=1, early_stopping=False, max_length=512, length_penalty=1.0):
        return self.taskmodels_dict[task_name].generate(
            input_, 
            num_beams=num_beams, 
            early_stopping=early_stopping, 
            max_length=max_length, 
            length_penalty=length_penalty,
            #return_dict_in_generate=True, 
            #output_scores=True
        )

    def resize_token_embeddings(self, new_num_tokens):
        for task_name, model in self.taskmodels_dict.items():
            model.resize_token_embeddings(new_num_tokens)
        

In [36]:
num_labels = 12 #20 #50

In [38]:
multitask_model = MultitaskBartModel.create(
    model_name=model_names[0],
    model_type_dict={
        "counter_gen": transformers.BartForConditionalGeneration,
        "title_gen": transformers.BartForConditionalGeneration,
    },
    model_config_dict={
        "counter_gen": transformers.AutoConfig.from_pretrained(
            model_names[0], num_labels=num_labels
        ),
        "title_gen": transformers.AutoConfig.from_pretrained(
            model_names[1]
        ),
    },
)

Downloading:   0%|          | 0.00/558M [00:00<?, ?B/s]

In [40]:
print('same addresses for encoders?')
print(multitask_model.encoder.embed_tokens.weight.data_ptr())
print(
    multitask_model.taskmodels_dict[
        "counter_gen"
    ].model.encoder.embed_tokens.weight.data_ptr()
)
print(
    multitask_model.taskmodels_dict[
        "title_gen"
    ].model.encoder.embed_tokens.weight.data_ptr()
)

same addresses for encoders?
139863328723008
139863328723008
139863328723008


In [3]:
tokenizer.get_vocab

<bound method PreTrainedTokenizerFast.get_vocab of PreTrainedTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})>

In [4]:
tokenizer_bart.get_vocab

<bound method PreTrainedTokenizerFast.get_vocab of PreTrainedTokenizerFast(name_or_path='facebook/bart-base', vocab_size=50265, model_max_len=1024, is_fast=True, padding_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False)})>

In [None]:
num_added_tokens = tokenizer.add_special_tokens(special_tokens_dict)

In [70]:
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/facebook/bart-base/resolve/main/config.json from cache at /mnt/ceph/storage/data-tmp/current//akshitbhatia/.cache/huggingface/transformers/f5310d276a6d1648d00c32fadc8bf7b4607e0fbd5b404fc4a0045960aa2bdfdb.a243ed957122436adb0b8d8e9d20f896f45c174b6324d625ca0a20a84f72a910
Model config BartConfig {
  "_name_or_path": "bart-base",
  "activation_dropout": 0.1,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartModel"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 0,
  "classif_dropout": 0.1,
  "classifier_dropout": 0.0,
  "d_model": 768,
  "decoder_attention_heads": 12,
  "decoder_ffn_dim": 3072,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 12,
  "encoder_ffn_dim": 307

In [85]:
df_train_new['label']=[1]*len(df_train_new)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [88]:
from datasets import Dataset
dataset = Dataset.from_pandas(df_train_new)
# title_dataset = Dataset.from_pandas(df_train_new)

In [89]:
dataset

Dataset({
    features: ['title_input', 'counter_input', 'label'],
    num_rows: 25704
})

In [90]:
tokenized_counter = dataset.map(lambda a: tokenizer(a['counter_input'], padding=True,max_length=512,truncation=True),batched=True)
tokenized_title = dataset.map(lambda a: tokenizer(a['title_input'], padding=True,max_length=512,truncation=True),batched=True)

tokenized_dict = {'title_gen':tokenized_title,'counter_gen':tokenized_counter}

  0%|          | 0/26 [00:00<?, ?ba/s]

  0%|          | 0/26 [00:00<?, ?ba/s]

In [91]:
tokenized_title

Dataset({
    features: ['attention_mask', 'counter_input', 'input_ids', 'label', 'title_input'],
    num_rows: 25704
})

In [92]:
train_dataset = {
    'counter_gen':tokenized_counter, 'title_gen':tokenized_title
}

In [None]:
eval_dataset = {
    task_name: dataset["validation"] for task_name, dataset in features_dict.items()
}

# train

best parameters: lr 5e-05, epochs 7, bs 4, numbeams 10; f1: 0.7980, bleu: 0.4150

In [80]:
tokenized_title

Dataset({
    features: ['attention_mask', 'counter_input', 'input_ids', 'title_input'],
    num_rows: 25704
})

In [95]:
trainer = MultitaskTrainer(
    model=multitask_model,
    args=transformers.TrainingArguments(
        output_dir='multitask_trainer_output',
        overwrite_output_dir=True,
        learning_rate=5e-5,
        do_train=True,
        num_train_epochs=7,
        per_device_train_batch_size=4,
        save_steps=3000,
        report_to='none'
    ),
#     data_collator=transformers.DataCollatorWithPadding(tokenizer=tokenizer),
    train_dataset=tokenized_dict,
)

PyTorch: setting up devices


In [96]:
trainer.train()

***** Running training *****
  Num examples = 51408
  Num Epochs = 7
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 89964


IndexError: too many indices for tensor of dimension 1

In [None]:
def save_model(multitask_model, suffix=''):
    for task_name in ["input_comment_pairs", "tfidf_cluster"]:
        multitask_model.taskmodels_dict[task_name].config.to_json_file(
            f"./multitask_model{suffix}/{task_name}_model/config.json"
        )
        torch.save(
            multitask_model.taskmodels_dict[task_name].state_dict(),
            f"./multitask_model{suffix}/{task_name}_model/pytorch_model.bin",
        )
        tokenizer.save_pretrained(f"./multitask_model{suffix}/{task_name}_model/")

In [None]:
save_model(multitask_model)

# eval

In [None]:
multitask_model = MultitaskBartModel.load(
    model_folder='multitask_model_best',
    model_type_dict={
        "tfidf_cluster": transformers.BartForSequenceClassification,
        "input_comment_pairs": transformers.BartForConditionalGeneration,
    },
    model_config_dict={
        "tfidf_cluster": transformers.AutoConfig.from_pretrained(
            'multitask_model/tfidf_cluster_model', num_labels=num_labels
        ),
        "input_comment_pairs": transformers.AutoConfig.from_pretrained(
            'multitask_model/input_comment_pairs_model'
        ),
    },
)

In [None]:
_ = multitask_model.to('cuda')

In [None]:
_ = multitask_model.eval()

In [None]:
def eval_cluster_fn(multitask_model, model_name, features_dict, batch_size=8):
    metric = load_metric("f1")
    task_name = 'tfidf_cluster'
    val_len = len(features_dict[task_name]["validation"])
    
    preds_all = []
    refs_all = []

    for index in tqdm(range(0, val_len, batch_size)):
        input_ = features_dict[task_name]["validation"][
            index : min(index + batch_size, val_len)
        ]['input_ids']
        labels = features_dict[task_name]["validation"][
            index : min(index + batch_size, val_len)
        ]["labels"]
        attention_masks = features_dict[task_name]["validation"][
            index : min(index + batch_size, val_len)
        ]['attention_mask']            

        inputs={}
        inputs["input_ids"] = torch.LongTensor(input_).to(multitask_model.device)
        inputs["attention_mask"] = torch.LongTensor(attention_masks).to(multitask_model.device)
        logits = multitask_model("tfidf_cluster", **inputs)[0]
        preds_all.extend(torch.argmax(
            torch.FloatTensor(torch.softmax(logits, dim=1).detach().cpu().tolist()),
            dim=1,
        ))
        refs_all.extend(labels)

    metric.add_batch(predictions=preds_all, references=refs_all)       
    print(f"F1: {metric.compute(average='macro')}")
    return preds_all, refs_all

In [None]:
preds_all, refs_all = eval_cluster_fn(multitask_model, model_name, features_dict)

In [None]:
print(classification_report(refs_all, preds_all))

In [None]:
def eval_bleu_fn(multitask_model, model_name, features_dict, batch_size=8, num_beams=7):
    preds_all = []
    scores = []
    refs_all = []
    #in_all = []
    for task_name in ['input_comment_pairs']:
        val_len = len(features_dict[task_name]["validation"])
        
        for index in tqdm(range(0, val_len, batch_size)):            
            input_ = features_dict[task_name]["validation"][
                index : min(index + batch_size, val_len)
            ]['input_ids']
            labels = features_dict[task_name]["validation"][
                index : min(index + batch_size, val_len)
            ]["labels"]
            attention_masks = features_dict[task_name]["validation"][
                index : min(index + batch_size, val_len)
            ]['attention_mask']            
                       
            with torch.no_grad():
                outputs = multitask_model.generate_text("input_comment_pairs", input_.to(multitask_model.device), num_beams=num_beams, early_stopping=True, max_length=512)
            in_all.extend([tokenizer.decode(inp, skip_special_tokens=True) for inp in input_])
            #preds_all.extend([postprocess(tokenizer.decode(out, skip_special_tokens=True)) for out in outputs])
            preds_all.extend([tokenizer.decode(out, skip_special_tokens=True) for out in outputs])
            #preds_all.extend([tokenizer.decode(out, skip_special_tokens=True) for out in outputs.sequences])
            #scores.extend(outputs.sequences_scores)
            refs_all.extend([tokenizer.decode(ref, skip_special_tokens=True) for ref in labels])

    print_bleu(refs_all, preds_all)
    return preds_all, refs_all, in_all #, scores

In [None]:
preds_all, refs_all, in_all = eval_bleu_fn(multitask_model, model_name, features_dict, num_beams=1)

In [None]:
save_model(multitask_model, '_best')

In [None]:
with open('multitask-tfidf-tagging-preds.txt', 'w') as f:
    for pred in preds_all:
        f.write("%s\n" % pred)

In [None]:
c

In [None]:
c = 0
for i in range(len(refs_all)):
    print_ = True
    if refs_all[i] == preds_all[i]:
        print_ = False
        c+=1
    
    if print_:
        print(i+1, in_all[i])
        print('\tRef:',  refs_all[i])
        print('\tOur:', preds_all[i])
        print()