In [None]:
# Install the `evaluate` and `rouge_score` packages into the kernel's environment.
# `evaluate` is imported further down to load the "rouge" metric; `rouge_score` is
# presumably that metric's backend (TODO confirm). Versions are not pinned here.
%pip install evaluate rouge_score

# Stuff from other notebooks

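- Make sure to load the FLAN-T5 base model in bfloat16 before adding the PEFT adapter.
- The code cells below use `tokenizer`, `peft_model`, `FLAN_T5_ENCODER_CONTEXT_WINDOW_SIZE`, and `FLAN_T5_DECODER_CONTEXT_WINDOW_SIZE` without defining them, so these must already exist in the kernel (e.g. set up as in the other notebooks) before running.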

In [None]:
from datasets import load_dataset
# Load the AITA submissions-and-comments dataset by its repository id. Later cells
# index the returned object by split name (dataset['test']).
dataset = load_dataset("MattBoraske/reddit-AITA-submissions-and-comments-top-2500")

## Single sample test

In [None]:
from random import randrange

# Draw one example from the test split at a random position. No seed is set, so
# a different example may be shown on every run.
test_split = dataset['test']
sample_index = randrange(len(test_split))
sample = test_split[sample_index]
sample

In [None]:
# Tokenize the sample's ready-made prompt field, padding/truncating it to the
# encoder window size, then move the resulting ids to the GPU.
encoded_sample = tokenizer(
    sample['flanT5_instruction'],
    max_length=FLAN_T5_ENCODER_CONTEXT_WINDOW_SIZE,
    padding='max_length',
    truncation=True,
    return_tensors="pt",
)
input_ids = encoded_sample.input_ids.cuda()

In [None]:
# Generate a continuation for the single tokenized sample. The output length is
# capped at the decoder window size and a repetition penalty of 1.4 is applied.
generation_settings = dict(
    max_new_tokens=FLAN_T5_DECODER_CONTEXT_WINDOW_SIZE,
    repetition_penalty=1.4,
)
outputs = peft_model.generate(input_ids=input_ids, **generation_settings)

In [None]:
# Decode the first generated sequence back to text, dropping special tokens.
generated_ids = outputs[0].detach().cpu().numpy()
prediction = tokenizer.decode(generated_ids, skip_special_tokens=True)
print(f"Prediction:\n{prediction}")

In [None]:
import evaluate
import numpy as np
from datasets import load_from_disk
from tqdm import tqdm

# NOTE: `metric` and `evaluate_model` are deliberately not defined in this cell.
# Both are defined in the next code cell. The copy that used to live here was a
# verbatim duplicate that the later definition overrode before it was ever
# called, so keeping it only risked the two versions drifting apart.

## ROUGE Score Testing Loop

In [None]:
import evaluate
import numpy as np
from datasets import load_from_disk
from tqdm import tqdm

# Metric: ROUGE scorer loaded through the `evaluate` library. Its compute() method
# is called after the prediction loop with the collected predictions and references.
metric = evaluate.load("rouge")

# Prompt prefix prepended to every submission before tokenization. It is assigned
# here, ahead of evaluate_model, so the function also works on a fresh kernel
# (Restart & Run All). The results-saving cell assigns the same string again;
# keep the two in sync if the prompt is ever changed.
INSTRUCTION_PREFIX = "Classify the interpersonal conflict into one of the following categories. 'YTA' when the writer is causing the conflict. 'NTA' when the other person is causing the conflict. 'NAH' when both the writer and other person are not causing the conflict. 'ESH' when both the writer and other person are causing the conflict. 'INFO' if more information is needed for a judgement. Then, provide a short justification: "


def evaluate_model(model, sample):
    """Generate a prediction for one dataset sample and pair it with its reference.

    The prompt is ``INSTRUCTION_PREFIX + sample["submission_text"]``, truncated to
    ``FLAN_T5_ENCODER_CONTEXT_WINDOW_SIZE`` tokens.

    Args:
        model: Object exposing ``generate(input_ids=..., ...)``; the testing loop
            passes ``peft_model``.
        sample: Mapping with at least the keys ``"submission_text"`` (model
            input) and ``"top_comment_1"`` (reference text).

    Returns:
        Tuple ``(input_text, prediction, label)``: the raw submission text, the
        decoded model output, and the reference comment.

    Notes:
        * Relies on ``tokenizer``, ``FLAN_T5_ENCODER_CONTEXT_WINDOW_SIZE`` and
          ``FLAN_T5_DECODER_CONTEXT_WINDOW_SIZE`` already existing in the kernel.
        * Input ids are moved with ``.cuda()``, so a CUDA device is required.
        * Generation samples (``do_sample=True, top_p=0.9``), so predictions,
          and any scores computed from them, can differ from run to run.
    """
    # tokenize input
    input_text = sample["submission_text"]
    input_ids = tokenizer(
        INSTRUCTION_PREFIX + input_text,
        max_length=FLAN_T5_ENCODER_CONTEXT_WINDOW_SIZE,
        return_tensors="pt",
        truncation=True,
    ).input_ids.cuda()

    # generate and decode prediction (only the first returned sequence is used)
    outputs = model.generate(
        input_ids=input_ids,
        do_sample=True,
        top_p=0.9,
        max_new_tokens=FLAN_T5_DECODER_CONTEXT_WINDOW_SIZE,
    )
    prediction = tokenizer.decode(outputs[0].detach().cpu().numpy(), skip_special_tokens=True)

    # get label
    label = sample['top_comment_1']

    # return prediction and label
    return input_text, prediction, label

In [None]:
# Evaluate only the leading NUMBER_OF_SAMPLES rows of the test split.
NUMBER_OF_SAMPLES = 100
test_dataset = dataset['test'].select(range(NUMBER_OF_SAMPLES))

# Collect, in dataset order, each sample's input text, generated prediction and
# reference comment into three parallel lists.
input_texts = []
predictions = []
references = []
for sample in tqdm(test_dataset):
    source_text, generated_text, reference_text = evaluate_model(peft_model, sample)
    input_texts.append(source_text)
    predictions.append(generated_text)
    references.append(reference_text)

In [None]:
# Compute ROUGE scores for all predictions against their reference comments.
# The result variable keeps its existing spelling (`rogue`) so any other cell
# that refers to it keeps working.
rogue = metric.compute(predictions=predictions, references=references, use_stemmer=True)

# Percent-formatted scores; this dict is what gets written to the results JSON.
rouge_scores = {
    'ROUGE-1': f"{rogue['rouge1'] * 100:.2f}%",
    'ROUGE-2': f"{rogue['rouge2'] * 100:.2f}%",
    'ROUGE-L': f"{rogue['rougeL'] * 100:.2f}%",
    'ROUGE-Lsum': f"{rogue['rougeLsum'] * 100:.2f}%"
}

# Print each score with two decimals. The spec has to be `.2f` (precision 2);
# `2f` is only a minimum field width and would print six decimal places.
for metric_name in ('rouge1', 'rouge2', 'rougeL', 'rougeLsum'):
    print(f"{metric_name}: {rogue[metric_name] * 100:.2f}%")

In [None]:
import json

# Prompt prefix that is recorded alongside the scores in the output file.
INSTRUCTION_PREFIX = "Classify the interpersonal conflict into one of the following categories. 'YTA' when the writer is causing the conflict. 'NTA' when the other person is causing the conflict. 'NAH' when both the writer and other person are not causing the conflict. 'ESH' when both the writer and other person are causing the conflict. 'INFO' if more information is needed for a judgement. Then, provide a short justification: "

# One entry per evaluated sample, keyed 'Sample 1', 'Sample 2', ... in loop order.
results = {
    f'Sample {position}': {
        'Input Text': source_text,
        'Prediction': generated_text,
        'Reference': reference_text,
    }
    for position, (source_text, generated_text, reference_text)
    in enumerate(zip(input_texts, predictions, references), start=1)
}

# Bundle the prompt, the aggregate ROUGE scores and the per-sample records.
final_output = {
    'Instruction Prefix': INSTRUCTION_PREFIX,
    'ROUGE Scores': rouge_scores,
    'Results': results,
}

# Serialize with 4-space indentation and write to the (hard-coded) Drive path.
with open('/content/drive/MyDrive/WCU_THESIS/AITA_Fine_Tuning/flanT5_xxl_400_samples_training_100_testing_samples_results.json', 'w') as results_file:
    results_file.write(json.dumps(final_output, indent=4))
