In [1]:
import pandas
import re, json
import csv

import torch
import torch.nn as nn
from datasets import load_metric,Dataset,DatasetDict, load_dataset, Sequence, Value
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer, BartForConditionalGeneration
from transformers import AutoTokenizer, Trainer

import evaluate

import numpy as np
import nltk
import os
import random
from sklearn.model_selection import train_test_split
from typing import List, Optional, Tuple, Union, Dict, Any
from jointbart_step2 import myBartForConditionalGeneration
from hg_utils import GenerationMixin

In [2]:
seed = 42
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
_numpy_rng = np.random.default_rng(seed)
random.seed(seed)
np.random.seed(seed)
torch.use_deterministic_algorithms(False)
os.environ['PYTHONHASHSEED'] = str(seed)
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [3]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [4]:
max_input_length = 256
max_target_length = 128

In [5]:
model_checkpoint = "hallucination-tagging-classifier"
metric = evaluate.load("rouge")
model = myBartForConditionalGeneration.from_pretrained(model_checkpoint).to(device)
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large", add_prefix_space=True)

In [6]:
for name, param in model.named_parameters():
    if name == 'classifier.weight' or name == 'classifier.bias':
        param.requires_grad=False

In [7]:
dataset = load_dataset('pvisnrt/special_samsum')
id2label =  {0: 'C', 1: 'M', 2: 'N', 3: 'O', 4: 'OB', 5: 'W'}
label2id = {'C': 0, 'M': 1, 'N': 2, 'O': 3, 'OB': 4, 'W': 5}

In [8]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'tags', 'tag_ids'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'tags', 'tag_ids'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'tags', 'tag_ids'],
        num_rows: 818
    })
})

In [9]:
# dataset['train'] = dataset['train'].cast_column("tag_ids", Sequence(Value("int32")))
# dataset['validation'] = dataset['validation'].cast_column("tag_ids", Sequence(Value("int32")))
# dataset['test'] = dataset['test'].cast_column("tag", Sequence(Value("int32")))

In [10]:
def tokenize_and_align_labels(examples):
    inputs = [doc for doc in examples['dialogue']]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, is_split_into_words=True, return_tensors='pt', padding='max_length')

    with tokenizer.as_target_tokenizer():
        tokenized_inputs = tokenizer(examples["summary"], max_length=max_target_length, truncation=True, is_split_into_words=True, return_tensors='pt', padding='max_length')

    labels = []
    summary_len = []
    tags_len = []
    tags_ids_len = []
    for i, label in enumerate(examples["tag_ids"]):
        summary_len.append(len(examples['summary'][i]))
        tags_len.append(len(examples['tags'][i]))
        tags_ids_len.append(len(examples['tag_ids'][i]))
        
        word_ids = tokenized_inputs.word_ids(batch_index=i)# Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    model_inputs['labels'] = tokenized_inputs['input_ids']

    for i, t in zip(model_inputs['labels'], labels):
        if len(i) != len(t):
            print("Issue")

    model_inputs["decoder_tags"] = labels
    
    model_inputs['summary_len'] = summary_len
    model_inputs['tags_len'] = tags_len
    model_inputs['tags_ids_len'] = tags_ids_len
     
    return model_inputs

In [11]:
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)

In [12]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'tags', 'tag_ids', 'input_ids', 'attention_mask', 'labels', 'decoder_tags', 'summary_len', 'tags_len', 'tags_ids_len'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'tags', 'tag_ids', 'input_ids', 'attention_mask', 'labels', 'decoder_tags', 'summary_len', 'tags_len', 'tags_ids_len'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'tags', 'tag_ids', 'input_ids', 'attention_mask', 'labels', 'decoder_tags', 'summary_len', 'tags_len', 'tags_ids_len'],
        num_rows: 818
    })
})

In [13]:
tokenized_datasets['train'] = tokenized_datasets['train'].remove_columns(['id', 'dialogue', 'summary', 'tags', 'tag_ids'])
tokenized_datasets['validation'] = tokenized_datasets['validation'].remove_columns(['id', 'dialogue', 'summary', 'tags', 'tag_ids'])
tokenized_datasets['test'] = tokenized_datasets['test'].remove_columns(['id', 'dialogue', 'summary', 'tags', 'tag_ids'])

tokenized_datasets['train'] = tokenized_datasets['train'].remove_columns(['tags_ids_len', 'summary_len', 'tags_len'])
tokenized_datasets['test'] = tokenized_datasets['test'].remove_columns(['tags_ids_len', 'summary_len', 'tags_len'])

tokenized_datasets['train'] = tokenized_datasets['train'].select(range(100))
tokenized_datasets['validation'] = tokenized_datasets['validation'].select(range(20))
tokenized_datasets['test'] = tokenized_datasets['test'].select(range(20))


In [14]:
tokenized_datasets['train']

Dataset({
    features: ['input_ids', 'attention_mask', 'labels', 'decoder_tags'],
    num_rows: 100
})

In [15]:
class MySeq2SeqTrainer(Seq2SeqTrainer):
    def prediction_step(
        self,
        model: nn.Module,
        inputs: Dict[str, Union[torch.Tensor, Any]],
        prediction_loss_only: bool,
        ignore_keys: Optional[List[str]] = None,
    ) -> Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]:
        """
        Perform an evaluation step on `model` using `inputs`.
        Subclass and override to inject custom behavior.
        Args:
            model (`nn.Module`):
                The model to evaluate.
            inputs (`Dict[str, Union[torch.Tensor, Any]]`):
                The inputs and targets of the model.
                The dictionary will be unpacked before being fed to the model. Most models expect the targets under the
                argument `labels`. Check your model's documentation for all accepted arguments.
            prediction_loss_only (`bool`):
                Whether or not to return the loss only.
        Return:
            Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss, logits and
            labels (each being optional).
        """
        summary_len = inputs['summary_len'].cpu().detach().tolist()
        tag_ids_len = inputs['tags_ids_len'].cpu().detach().tolist()
        tags_len = inputs['tags_len'].cpu().detach().tolist()
        inputs.pop('tags_ids_len')
        inputs.pop('summary_len')
        inputs.pop('tags_len')
        if not self.args.predict_with_generate or prediction_loss_only:
            return super().prediction_step(
                model, inputs, prediction_loss_only=prediction_loss_only, ignore_keys=ignore_keys
            )

        has_labels = "labels" in inputs
        inputs = self._prepare_inputs(inputs)
        
        # print("prediction_step inputs: {}".format(inputs.keys()))

        # XXX: adapt synced_gpus for fairscale as well
        gen_kwargs = self._gen_kwargs.copy()
        if gen_kwargs.get("max_length") is None and gen_kwargs.get("max_new_tokens") is None:
            gen_kwargs["max_length"] = self.model.config.max_length
        gen_kwargs["num_beams"] = (
            gen_kwargs["num_beams"] if gen_kwargs.get("num_beams") is not None else self.model.config.num_beams
        )
        # default_synced_gpus = True if is_deepspeed_zero3_enabled() else False
        default_synced_gpus = False
        gen_kwargs["synced_gpus"] = (
            gen_kwargs["synced_gpus"] if gen_kwargs.get("synced_gpus") is not None else default_synced_gpus
        )

        if "attention_mask" in inputs:
            gen_kwargs["attention_mask"] = inputs.get("attention_mask", None)
        if "global_attention_mask" in inputs:
            gen_kwargs["global_attention_mask"] = inputs.get("global_attention_mask", None)

        # prepare generation inputs
        # some encoder-decoder models can have varying encoder's and thus
        # varying model input names
        if hasattr(self.model, "encoder") and self.model.encoder.main_input_name != self.model.main_input_name:
            generation_inputs = inputs[self.model.encoder.main_input_name]
        else:
            generation_inputs = inputs[self.model.main_input_name]

        tags = inputs["decoder_tags"]
        gen_kwargs.update({"decoder_tags": tags})
        # print(f"Gen kwargs: {gen_kwargs}")
        # print(f"Gen inputs:{generation_inputs}")
         #generated_tokens = self.model.generate(
        #    generation_inputs,
        #    **gen_kwargs,
        #)
        
        gen_mix = GenerationMixin(model)
        generated_tokens, classification_ids = gen_mix.generate(generation_inputs, **gen_kwargs)
        
        dialog = tokenizer.batch_decode(generation_inputs, skip_special_tokens=True)
        print('-'*89)
        print('dialog: ', dialog)
        
        generated_summaries = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
        
        print('Generated Summaries:\n',*generated_summaries, sep='\n')
        print(f'Generated summary length: {generated_tokens.shape}')
        print(f"Gold summary length: {summary_len}")
        
        classification_labels = []
        classification_ids_lst = classification_ids.cpu().detach().tolist()
        for batch_classification_ids in classification_ids_lst:
            batch_classification_labels = []
            for classification_id in batch_classification_ids:
                classification_id = classification_id - 3
                if classification_id >= 0 and classification_id < len(id2label):
                    batch_classification_labels.append(id2label[classification_id])
            
            classification_labels.append(' '.join(batch_classification_labels))
                
        print('Generated Classification Labels:\n',*classification_labels, sep='\n')
        print(f'Generated classification tag length: {classification_ids.shape}')
        print(f"Gold Classification tag length: {tags_len}")
        print(f"Gold Classification tag ids length: {tag_ids_len}")
    
       
        # in case the batch is shorter than max length, the output should be padded
        if gen_kwargs.get("max_length") is not None and generated_tokens.shape[-1] < gen_kwargs["max_length"]:
            generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_kwargs["max_length"])
        elif gen_kwargs.get("max_new_tokens") is not None and generated_tokens.shape[-1] < (
            gen_kwargs["max_new_tokens"] + 1
        ):
            generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_kwargs["max_new_tokens"] + 1)

        with torch.no_grad():
            if has_labels:
                with self.compute_loss_context_manager():
                    outputs = model(**inputs) # lm_logits as output
                if self.label_smoother is not None:
                    loss = self.label_smoother(outputs, inputs["labels"]).mean().detach()
                else:
                    loss = (outputs["loss"] if isinstance(outputs, dict) else outputs[0]).mean().detach()
            else:
                loss = None

        if self.args.prediction_loss_only:
            return (loss, None, None)

        if has_labels:
            labels = inputs["labels"]
            if gen_kwargs.get("max_length") is not None and labels.shape[-1] < gen_kwargs["max_length"]:
                labels = self._pad_tensors_to_max_len(labels, gen_kwargs["max_length"])
            elif gen_kwargs.get("max_new_tokens") is not None and labels.shape[-1] < (
                gen_kwargs["max_new_tokens"] + 1
            ):
                labels = self._pad_tensors_to_max_len(labels, (gen_kwargs["max_new_tokens"] + 1))
        else:
            labels = None
        # print(labels)

        return (loss, generated_tokens, labels)

In [16]:
# training_args = Seq2SeqTrainingArguments(
#     output_dir="checkpoints/",
#     evaluation_strategy="epoch",
#     learning_rate=2e-5,
#     per_device_train_batch_size=2,
#     per_device_eval_batch_size=2,
#     weight_decay=0.01,
#     save_total_limit=4,
#     num_train_epochs=10,
#     predict_with_generate=True,
#     do_train=True,
#     do_eval=True,
#     fp16=True,
#     logging_steps=1,
#     save_strategy="epoch",
#     greater_is_better=True,
#     metric_for_best_model='Rouge1',
#     load_best_model_at_end=True,
#     seed=42,
#     generation_max_length=max_target_length,
# )

In [17]:
training_args = Seq2SeqTrainingArguments(
    output_dir="checkpoints/",
    logging_steps=1,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=2,
    predict_with_generate=True,
    fp16=True,
    load_best_model_at_end=True,
    metric_for_best_model="rouge1",
    seed=42,
    generation_max_length=max_target_length,
    dataloader_drop_last=True,
    remove_unused_columns =False
)

In [18]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [19]:
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    print(f"Generated summary: {decoded_preds[0]}")

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    print(f"Gold summary: {decoded_labels[0]}")

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [20]:
trainer = MySeq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [21]:
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mdevavratj[0m. Use [1m`wandb login --relogin`[0m to force relogin


You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,5.3323,4.466559,0.4094,0.1944,0.3453,0.346,23.6
2,3.4557,3.383375,0.4338,0.2237,0.375,0.3741,27.95


-----------------------------------------------------------------------------------------
dialog:  [" A : Hi Tom, are you busy tomorrow’s afternoon? B : I’m pretty sure I am. What’s up? A : Can you go with me to the animal shelter?. B : What do you want to do? A : I want to get a puppy for my son. B : That will make him so happy. A : Yeah, we’ve discussed it many times. I think he’s ready now. B : That’s good. Raising a dog is a tough issue. Like having a baby ; -) A : I'll get him one of those little dogs. B : One that won't grow up too big ; -) A : And eat too much ; -)) B : Do you know which one he would like? A : Oh, yes, I took him there last Monday. He showed me one that he really liked. B : I bet you had to drag him away. A : He wanted to take it home right away ; -). B : I wonder what he'll name it. A : He said he’d name it after his dead hamster – Lemmy - he's a great Motorhead fan : -)))"]
Generated Summaries:

 A wants to get a puppy for her son. She'll take him to the anima

-----------------------------------------------------------------------------------------
dialog:  [" Jane : Hello Vegano Resto : Hello, how may I help you today? Jane : I would like to make a reservation. Jane : For 6 people, tonight around 20 : 00 Vegano Resto : Let me just check. Vegano Resto : Ah, I'm afraid that there is no room at 20 : 00. Vegano Resto : However, I could offer you a table for six at 18 : 30 or at 21 : 00 Vegano Resto : Would either of those times suit you? Jane : Oh dear. Jane : Let me just ask my friends. Vegano Resto : No problem. Jane : 21 : 00 will be ok. Vegano Resto : Perfect. So tonight at 21 : 00 for six people under your name. Jane : great, thank you!"]
Generated Summaries:

 Jane would like to make a reservation for six people at 21 : 00. Vegano Resto will make the reservation for Jane at 18 : 30.
Generated summary length: torch.Size([1, 33])
Gold summary length: [14]
Generated Classification Labels:

O O O O O O O O O O O O O O O O O O O O O O O O O O 

-----------------------------------------------------------------------------------------
dialog:  [" Julia : What is your biggest dream Julia : I mean the kind that can be achieved James : Everyone say I have nice voice James : My mom liked very much when I was reading outloud James : I've had this dream for some time now, to become a voice actor James : Be a part of cartoon or video game as a voice actor reading a character Julia : Wow. Nice one. Julia : Btw you do have a nice voice Julia : I could listen to you as a radio speaker. James : Thanks James : I've worked in radio, but it was during college so I had little time for this Julia : Shame. James : I know. But nothing is lost. I still have microphone at home and with a bit of help I could make homemade radio station Julia : That's actually a great idea Julia : I cheer for you!"]
Generated Summaries:

 James has a good voice. He would like to become a voice actor. He has a microphone at home and could make a radio station.
Genera

-----------------------------------------------------------------------------------------
dialog:  [" Marla : <file_photo> Marla : look what I found under my bed Kiki : lol Tamara : is that someone's underwear? Marla : it certainly isn't mine, my ass is big but it isn't huge Kiki : it looks like male underwear Tamara : not necessarily, maybe some butch had fun in your room while you were gone Marla : ok but how can you leave your underwear after hooking up? wtf is wrong with people Kiki : she or he could be too wasted to notice Tamara : or maybe someone put their pants there to piss you off Marla : that makes no sense Marla : it's so fucking childish Kiki : if it's childish then it must have been your sister's idea Marla : she's 13, she doesn't have underwear that isn't pink Tamara : maybe it belonged to one of your exes? Kiki : she would have recognized it Marla : lol we're doing total CSI investigation on one pair of boxers : D Kiki : <file_gif> Tamara : lol Tamara : I think your sis

-----------------------------------------------------------------------------------------
dialog:  [' Norbert : we need to hurry to catch the tour Wendy : ok, am buying something. be right out! Norbert : ok. am not waiting long though. missed the last one because of you Wendy : just be patient for once. Norbert : im always patient Wendy : at the register now Norbert : alright']
Generated Summaries:

 Norbert and Wendy are buying something. They need to hurry to catch the tour.
Generated summary length: torch.Size([1, 20])
Gold summary length: [15]
Generated Classification Labels:

O O O O O O O O O O O O O O O O O O O O O
Generated classification tag length: torch.Size([1, 21])
Gold Classification tag length: [15]
Gold Classification tag ids length: [15]
-----------------------------------------------------------------------------------------
dialog:  [" Lidia : hi guys, how was your day? Cecil : amazing Lidia : where did you go? Cheryl : to the Jandia Peninsula Cheryl : sorry, Cecil i

Generated summary:  A wants to take her son to the animal shelter to get a puppy for him. She wants to get him one of those little dogs. She took him to the shelter last Monday and he really liked one of the dogs.
Gold summary:  A will go to the animal shelter tomorrow to get a puppy for her son. They already visited the shelter last Monday and the son chose the puppy.


TrainOutput(global_step=200, training_loss=5.848726259469986, metrics={'train_runtime': 368.7167, 'train_samples_per_second': 0.542, 'train_steps_per_second': 0.542, 'total_flos': 108358064025600.0, 'train_loss': 5.848726259469986, 'epoch': 2.0})

In [22]:
!nvidia-smi

Tue Nov 28 02:37:37 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 537.13                 Driver Version: 537.13       CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                     TCC/WDDM  | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 3070 Ti   WDDM  | 00000000:01:00.0 Off |                  N/A |
|  0%   43C    P2              34W / 310W |   7937MiB /  8192MiB |     49%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [23]:
trainer.evaluate(tokenized_datasets['test'])

KeyError: 'summary_len'

In [None]:
model.save_pretrained("summarizer_w_classifier_loss_frozen")