In [1]:
import pandas
import re, json
import csv

import torch
import torch.nn as nn
from datasets import load_metric,Dataset,DatasetDict, load_dataset, Sequence, Value
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer, BartForConditionalGeneration
from transformers import AutoTokenizer, Trainer

import evaluate

import numpy as np
import nltk
import os
import random
from sklearn.model_selection import train_test_split
from typing import List, Optional, Tuple, Union, Dict, Any
from jointbart_step2 import myBartForConditionalGeneration

In [2]:
seed = 42
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
_numpy_rng = np.random.default_rng(seed)
random.seed(seed)
np.random.seed(seed)
torch.use_deterministic_algorithms(False)
os.environ['PYTHONHASHSEED'] = str(seed)
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [3]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [4]:
max_input_length = 256
max_target_length = 128

In [5]:
model_checkpoint = "hallucination-tagging-classifier_5e-4"
metric = evaluate.load("rouge")
model = myBartForConditionalGeneration.from_pretrained(model_checkpoint).to(device)
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large", add_prefix_space=True)

In [6]:
for name, param in model.named_parameters():
    if name == 'classifier.weight' or name == 'classifier.bias':
        param.requires_grad=False

In [7]:
dataset = load_dataset('pvisnrt/special_samsum')

In [8]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'tags', 'tag_ids'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'tags', 'tag_ids'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'tags', 'tag_ids'],
        num_rows: 818
    })
})

In [9]:
# dataset['train'] = dataset['train'].cast_column("tag_ids", Sequence(Value("int32")))
# dataset['validation'] = dataset['validation'].cast_column("tag_ids", Sequence(Value("int32")))
# dataset['test'] = dataset['test'].cast_column("tag", Sequence(Value("int32")))

In [10]:
def tokenize_and_align_labels(examples):
    inputs = [doc for doc in examples['dialogue']]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, is_split_into_words=True, return_tensors='pt', padding='max_length')

    with tokenizer.as_target_tokenizer():
        tokenized_inputs = tokenizer(examples["summary"], max_length=max_target_length, truncation=True, is_split_into_words=True, return_tensors='pt', padding='max_length')

    labels = []
    for i, label in enumerate(examples["tag_ids"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)# Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    model_inputs['labels'] = tokenized_inputs['input_ids']

    for i, t in zip(model_inputs['labels'], labels):
        if len(i) != len(t):
            print("Issue")

    model_inputs["decoder_tags"] = labels
    
    return model_inputs

In [11]:
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)

In [12]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'tags', 'tag_ids', 'input_ids', 'attention_mask', 'labels', 'decoder_tags'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'tags', 'tag_ids', 'input_ids', 'attention_mask', 'labels', 'decoder_tags'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'tags', 'tag_ids', 'input_ids', 'attention_mask', 'labels', 'decoder_tags'],
        num_rows: 818
    })
})

In [13]:
tokenized_datasets['train'] = tokenized_datasets['train'].remove_columns(['id', 'dialogue', 'summary', 'tags', 'tag_ids'])
tokenized_datasets['validation'] = tokenized_datasets['validation'].remove_columns(['id', 'dialogue', 'summary', 'tags', 'tag_ids'])
tokenized_datasets['test'] = tokenized_datasets['test'].remove_columns(['id', 'dialogue', 'summary', 'tags', 'tag_ids'])

In [14]:
tokenized_datasets['train']

Dataset({
    features: ['input_ids', 'attention_mask', 'labels', 'decoder_tags'],
    num_rows: 14732
})

In [15]:
class MySeq2SeqTrainer(Seq2SeqTrainer):
    def prediction_step(
        self,
        model: nn.Module,
        inputs: Dict[str, Union[torch.Tensor, Any]],
        prediction_loss_only: bool,
        ignore_keys: Optional[List[str]] = None,
    ) -> Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]:
        """
        Perform an evaluation step on `model` using `inputs`.
        Subclass and override to inject custom behavior.
        Args:
            model (`nn.Module`):
                The model to evaluate.
            inputs (`Dict[str, Union[torch.Tensor, Any]]`):
                The inputs and targets of the model.
                The dictionary will be unpacked before being fed to the model. Most models expect the targets under the
                argument `labels`. Check your model's documentation for all accepted arguments.
            prediction_loss_only (`bool`):
                Whether or not to return the loss only.
        Return:
            Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss, logits and
            labels (each being optional).
        """

        if not self.args.predict_with_generate or prediction_loss_only:
            return super().prediction_step(
                model, inputs, prediction_loss_only=prediction_loss_only, ignore_keys=ignore_keys
            )

        has_labels = "labels" in inputs
        inputs = self._prepare_inputs(inputs)
        
        # print("prediction_step inputs: {}".format(inputs.keys()))

        # XXX: adapt synced_gpus for fairscale as well
        gen_kwargs = self._gen_kwargs.copy()
        if gen_kwargs.get("max_length") is None and gen_kwargs.get("max_new_tokens") is None:
            gen_kwargs["max_length"] = self.model.config.max_length
        gen_kwargs["num_beams"] = (
            gen_kwargs["num_beams"] if gen_kwargs.get("num_beams") is not None else self.model.config.num_beams
        )
        # default_synced_gpus = True if is_deepspeed_zero3_enabled() else False
        default_synced_gpus = False
        gen_kwargs["synced_gpus"] = (
            gen_kwargs["synced_gpus"] if gen_kwargs.get("synced_gpus") is not None else default_synced_gpus
        )

        if "attention_mask" in inputs:
            gen_kwargs["attention_mask"] = inputs.get("attention_mask", None)
        if "global_attention_mask" in inputs:
            gen_kwargs["global_attention_mask"] = inputs.get("global_attention_mask", None)

        # prepare generation inputs
        # some encoder-decoder models can have varying encoder's and thus
        # varying model input names
        if hasattr(self.model, "encoder") and self.model.encoder.main_input_name != self.model.main_input_name:
            generation_inputs = inputs[self.model.encoder.main_input_name]
        else:
            generation_inputs = inputs[self.model.main_input_name]

        tags = inputs["decoder_tags"]
        gen_kwargs.update({"decoder_tags": tags})
        # print(f"Gen kwargs: {gen_kwargs}")
        # print(f"Gen inputs:{generation_inputs}")
        generated_tokens = self.model.generate(
            generation_inputs,
            **gen_kwargs,
        )
        # in case the batch is shorter than max length, the output should be padded
        if gen_kwargs.get("max_length") is not None and generated_tokens.shape[-1] < gen_kwargs["max_length"]:
            generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_kwargs["max_length"])
        elif gen_kwargs.get("max_new_tokens") is not None and generated_tokens.shape[-1] < (
            gen_kwargs["max_new_tokens"] + 1
        ):
            generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_kwargs["max_new_tokens"] + 1)

        with torch.no_grad():
            if has_labels:
                with self.compute_loss_context_manager():
                    outputs = model(**inputs) # lm_logits as output
                if self.label_smoother is not None:
                    loss = self.label_smoother(outputs, inputs["labels"]).mean().detach()
                else:
                    loss = (outputs["loss"] if isinstance(outputs, dict) else outputs[0]).mean().detach()
            else:
                loss = None

        if self.args.prediction_loss_only:
            return (loss, None, None)

        if has_labels:
            labels = inputs["labels"]
            if gen_kwargs.get("max_length") is not None and labels.shape[-1] < gen_kwargs["max_length"]:
                labels = self._pad_tensors_to_max_len(labels, gen_kwargs["max_length"])
            elif gen_kwargs.get("max_new_tokens") is not None and labels.shape[-1] < (
                gen_kwargs["max_new_tokens"] + 1
            ):
                labels = self._pad_tensors_to_max_len(labels, (gen_kwargs["max_new_tokens"] + 1))
        else:
            labels = None
        # print(labels)

        return (loss, generated_tokens, labels)

In [16]:
# training_args = Seq2SeqTrainingArguments(
#     output_dir="checkpoints/",
#     evaluation_strategy="epoch",
#     learning_rate=2e-5,
#     per_device_train_batch_size=2,
#     per_device_eval_batch_size=2,
#     weight_decay=0.01,
#     save_total_limit=4,
#     num_train_epochs=10,
#     predict_with_generate=True,
#     do_train=True,
#     do_eval=True,
#     fp16=True,
#     logging_steps=1,
#     save_strategy="epoch",
#     greater_is_better=True,
#     metric_for_best_model='Rouge1',
#     load_best_model_at_end=True,
#     seed=42,
#     generation_max_length=max_target_length,
# )

In [17]:
training_args = Seq2SeqTrainingArguments(
    output_dir="checkpoints/",
    logging_steps=1,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=10,
    predict_with_generate=True,
    fp16=True,
    load_best_model_at_end=True,
    metric_for_best_model="rouge1",
    seed=42,
    generation_max_length=max_target_length,
    dataloader_drop_last=True,
)

In [18]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [19]:
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    print(f"Generated summary: {decoded_preds[0]}")

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    print(f"Gold summary: {decoded_labels[0]}")

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [20]:
trainer = MySeq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [21]:
trainer.train()

You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,0.4107,0.364735,0.4859,0.2455,0.4005,0.4003,25.6141
2,0.3445,0.300036,0.5222,0.2772,0.4318,0.4321,28.9735
3,0.3287,0.294353,0.5165,0.2726,0.4252,0.4248,28.1525
4,0.2353,0.292528,0.5305,0.2871,0.4393,0.4392,27.5451
5,0.2529,0.294038,0.5278,0.2849,0.4363,0.4361,28.9125
6,0.2719,0.294348,0.5347,0.292,0.4428,0.4427,28.5345
7,0.2275,0.294254,0.5314,0.285,0.4375,0.4373,28.5146
8,0.2386,0.294187,0.532,0.2892,0.4394,0.4394,28.4615
9,0.2275,0.295613,0.5347,0.2908,0.4419,0.4423,28.0637
10,0.2174,0.296355,0.5355,0.2903,0.4419,0.4421,28.5504


Generated summary:  A wants to get a puppy for her son. She took him to the animal shelter last Monday. He liked one that she showed him. He wanted to take it home right away.
Gold summary:  A will go to the animal shelter tomorrow to get a puppy for her son. They already visited the shelter last Monday and the son chose the puppy.
Generated summary:  A wants to get a puppy for her son. B will go with her to the animal shelter tomorrow afternoon. A took her son to the shelter last Monday and he liked the puppy. He wanted to take it home right away. He will name it Lemmy after his dead hamster.
Gold summary:  A will go to the animal shelter tomorrow to get a puppy for her son. They already visited the shelter last Monday and the son chose the puppy.
Generated summary:  A wants to get a puppy for her son. B agrees to go with her to the animal shelter tomorrow's afternoon. A will get him one of those little dogs. He will name it Lemmy after his dead hamster.
Gold summary:  A will go to th

TrainOutput(global_step=2300, training_loss=0.6204552578148634, metrics={'train_runtime': 1988.6734, 'train_samples_per_second': 74.08, 'train_steps_per_second': 1.157, 'total_flos': 7.97515351228416e+16, 'train_loss': 0.6204552578148634, 'epoch': 10.0})

In [1]:
!nvidia-smi

Sat Nov 25 09:16:13 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.12             Driver Version: 535.104.12   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100 80GB PCIe          Off | 00000000:A1:00.0 Off |                    0 |
| N/A   25C    P0              42W / 300W |      4MiB / 81920MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [22]:
trainer.evaluate(tokenized_datasets['test'])

Generated summary:  Amanda will ask Larry for Betty's number. He called her last time they were at the park.
Gold summary:  Hannah needs Betty's number but Amanda doesn't have it. She needs to contact Larry.


{'eval_loss': 0.30515652894973755,
 'eval_rouge1': 0.5179,
 'eval_rouge2': 0.2783,
 'eval_rougeL': 0.4321,
 'eval_rougeLsum': 0.4317,
 'eval_gen_len': 29.0119,
 'eval_runtime': 76.3435,
 'eval_samples_per_second': 10.728,
 'eval_steps_per_second': 0.17,
 'epoch': 10.0}

In [25]:
model.save_pretrained("summarizer_w_classifier_loss_frozen")