In [None]:
# Switch the working directory so the relative paths used below (e.g. "final_raw_data.csv") resolve.
# NOTE(review): absolute, environment-specific path — presumably a Google Drive folder mounted in Colab; confirm the drive is mounted before this cell runs.
%cd /content/drive/MyDrive/CSE_497/Final

Installing dependencies

In [None]:
!pip install datasets
!pip install evaluate
!pip install rouge_score
!pip install bert_score

In [None]:
import pandas as pd
import numpy as np
from transformers import BartForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer, BartTokenizer, Trainer, TrainingArguments, DataCollatorForSeq2Seq, AutoModelForSeq2SeqLM, AutoTokenizer, AutoConfig
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
import evaluate
import nltk
# Download the NLTK tokenizer data needed by nltk.sent_tokenize, which the ROUGE metric below uses to split text into sentences.
# NOTE(review): 'punkt_tab' is presumably the resource name newer NLTK releases look for — confirm against the installed version.
nltk.download('punkt')
nltk.download('punkt_tab')

Loading the dataset

In [None]:
# Read the raw problem/approach pairs and hold out 20% of the rows for evaluation (fixed seed for a reproducible split).
df = pd.read_csv("final_raw_data.csv")
train_df, eval_df = train_test_split(df, test_size=0.2, random_state=42)

# train_test_split shuffles the rows, so each split keeps a non-trivial pandas index.
# preserve_index=False stops that index from being stored as a stray "__index_level_0__" column.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
eval_dataset = Dataset.from_pandas(eval_df, preserve_index=False)
dataset = DatasetDict({"train": train_dataset, "eval": eval_dataset})

In [None]:
# Show the first rows of the raw frame as a quick sanity check of the loaded columns.
df.head()

Configuring the LLM

In [None]:
# Load the pre-trained model and the tokenizer identified by model_name.
model_name = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)


# Alternative: load a fine-tuned model and tokenizer for further training or evaluation.
# NOTE(review): this path points at "results_flan_long", while the training cell writes to "./results_bart_long" — confirm which checkpoint is intended before uncommenting.
# model_name = "./results_flan_long/checkpoint-7371"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

Moving the model to the GPU for faster training (falls back to the CPU when CUDA is unavailable)

In [None]:
import torch

# Pick CUDA when it is available, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Kept as the last expression so the cell still displays the model summary.
model.to(device)

Preprocessing the dataset before training

In [None]:
def preprocess(examples, max_input_length=512, max_target_length=1024, padding="max_length"):
    """Tokenize a batch of problem/approach pairs into model inputs and labels.

    Args:
        examples: Batch mapping with a 'problem' entry (source texts) and an
            'approach' entry (target texts), as passed by ``dataset.map``.
        max_input_length: Truncation length for the tokenized problems.
        max_target_length: Truncation length for the tokenized approaches.
        padding: Padding mode forwarded to the tokenizer. The default keeps
            every sequence padded to its maximum length; pass ``False`` (via
            ``fn_kwargs``) to leave sequences unpadded.

    Returns:
        The tokenizer output for the problems, with an added 'labels' entry
        holding the approach token ids in which every padding id is replaced
        by -100.
    """
    model_inputs = tokenizer(
        examples['problem'], max_length=max_input_length, truncation=True, padding=padding
    )
    targets = tokenizer(
        examples['approach'], max_length=max_target_length, truncation=True, padding=padding
    )

    # Look the pad id up once instead of once per token.
    pad_id = tokenizer.pad_token_id
    # compute_metrics treats -100 as the "not a real token" marker, so padding positions get that value.
    model_inputs['labels'] = [
        [-100 if token_id == pad_id else token_id for token_id in sequence]
        for sequence in targets['input_ids']
    ]
    return model_inputs

In [None]:
# Run preprocess over both splits in batches; preprocess indexes the 'problem' and 'approach' columns of each batch.
tokenized_datasets = dataset.map(preprocess, batched=True)

Defining Evaluation metrics

In [None]:
# Load each metric once up front.
bertscore = evaluate.load("bertscore")
rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")


def _decode_eval_pred(eval_pred):
    """Decode a ``(predictions, labels)`` pair from the trainer into text.

    Args:
        eval_pred: Object that unpacks into ``(predictions, labels)``;
            ``predictions`` may itself be a tuple whose first item holds the
            generated token ids.

    Returns:
        A ``(prediction_ids, decoded_preds, decoded_labels)`` tuple, where
        ``prediction_ids`` has every -100 replaced by the pad token id.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is not a real token id and cannot be decoded, so swap it for the pad id.
    # This is applied to predictions as well as labels, because both arrays can carry that filler value.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    return predictions, decoded_preds, decoded_labels


def compute_bertscore_metrics(eval_pred):
    """Return the mean BERTScore F1 of the predictions as ``{"bertscore_f1": value}``."""
    _, decoded_preds, decoded_labels = _decode_eval_pred(eval_pred)

    # BERTScore yields one precision/recall/F1 value per example; report the average F1.
    scores = bertscore.compute(predictions=decoded_preds, references=decoded_labels, lang="en")
    return {"bertscore_f1": round(float(np.mean(scores["f1"])), 4)}


def compute_rouge_metrics(eval_pred):
    """Return the ROUGE scores (scaled by 100) plus the mean generated length ``gen_len``."""
    prediction_ids, decoded_preds, decoded_labels = _decode_eval_pred(eval_pred)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    scores = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {key: value * 100 for key, value in scores.items()}

    # Mean number of non-padding tokens per prediction, counted on the cleaned ids so -100 filler is not counted as a token.
    result["gen_len"] = np.mean(
        [np.count_nonzero(ids != tokenizer.pad_token_id) for ids in prediction_ids]
    )

    return {k: round(v, 4) for k, v in result.items()}


def compute_bleu_metrics(eval_preds):
    """Return the corpus BLEU score and its scalar companions as a flat dict."""
    _, decoded_preds, decoded_labels = _decode_eval_pred(eval_preds)

    scores = bleu.compute(predictions=decoded_preds, references=decoded_labels)

    # Report scalars only, so every value can be logged and compared as a number.
    return {
        "bleu": round(scores["bleu"], 4),
        "bleu_brevity_penalty": round(scores["brevity_penalty"], 4),
        "bleu_length_ratio": round(scores["length_ratio"], 4),
    }


# Metric function handed to the trainer. Previously three functions shared the
# name compute_metrics, so the last definition (BLEU) silently replaced the others
# even though the training arguments ask for metric_for_best_model="rouge1".
# Select the metric explicitly here; swap in compute_bertscore_metrics or
# compute_bleu_metrics (and update metric_for_best_model to match) if needed.
compute_metrics = compute_rouge_metrics

In [None]:
# Build the seq2seq data collator from the tokenizer and model; it is handed to the trainer as data_collator below.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [None]:
training_args = Seq2SeqTrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="./results_bart_long",

    # Phases to run and how long to train.
    do_train=True,
    do_eval=True,
    num_train_epochs=5,

    # Per-device batch sizes.
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,

    # Evaluate and checkpoint once per epoch.
    eval_strategy="epoch",
    save_strategy="epoch",
    # NOTE(review): the name must match a key returned by compute_metrics; load_best_model_at_end is not set here — confirm that is intended.
    metric_for_best_model="rouge1",

    # Score evaluation on generated sequences.
    predict_with_generate=True,
    eval_accumulation_steps=2,

    # Logging, hub and device switches.
    logging_steps=100,
    push_to_hub=False,
    no_cuda=False,
)

In [None]:
# Wire the model, data, collator and metric function into a single trainer.
trainer = Seq2SeqTrainer(
    # What is trained and how text is tokenized.
    model=model,
    tokenizer=tokenizer,
    # Training configuration and batch assembly.
    args=training_args,
    data_collator=data_collator,
    # Tokenized splits produced by dataset.map(preprocess, ...).
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["eval"],
    # Scoring function run at each evaluation.
    compute_metrics=compute_metrics,
)

Begin training and save the final model explicitly

In [None]:
# Fine-tune the model with the trainer configured above.
trainer.train()

# Write the final model to its own directory, separate from the per-epoch checkpoints under the trainer's output_dir.
trainer.save_model("./fine_tuned_bart_final")

Manually evaluating the model

In [None]:
def generate_approach(problem, max_length=1024, min_length=None, max_input_length=512):
    """Generate an approach text for a single problem statement.

    Args:
        problem: The problem description; it is converted to a string before
            tokenization.
        max_length: Upper bound on the generated sequence length.
        min_length: Lower bound on the generated sequence length. ``None``
            (the default) leaves the choice to the model's own generation
            settings. The previous hard-coded ``min_length=1024`` equalled
            ``max_length`` and therefore forced every output to be exactly
            1024 tokens long.
        max_input_length: Truncation length for the tokenized problem; the
            default matches the 512-token limit used when tokenizing the
            training inputs.

    Returns:
        The decoded generated text with special tokens removed.
    """
    # Truncate over-long prompts instead of passing an arbitrarily long sequence to the model.
    inputs = tokenizer(
        str(problem),
        return_tensors="pt",
        truncation=True,
        max_length=max_input_length,
    ).to(model.device)

    generation_kwargs = {"max_length": max_length}
    if min_length is not None:
        generation_kwargs["min_length"] = min_length

    # Make sure the model is in inference mode (e.g. dropout disabled) before generating.
    model.eval()
    outputs = model.generate(**inputs, **generation_kwargs)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


# Smoke test: generate an approach for one hand-written problem and print the result.
problem = "Provide an approach for detecting spam emails."
generated_approach = generate_approach(problem)
print(generated_approach)