# Reddit AITA Finetuned Model Evaluation

## Prepare environment

In [None]:
# Install the third-party packages used by this notebook into the kernel's environment.
# NOTE(review): no versions are pinned here, so a fresh run may pull different releases — consider pinning.
%pip install transformers accelerate datasets evaluate rouge_score unbabel-comet

In [2]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoModelForCausalLM, AutoTokenizer
import torch
from model_evaluator import Model_Evaluator # custom class for evaluating models - see model_evaluator.py

In [None]:
# Authenticate this session with the Hugging Face Hub before the dataset and model are loaded.
# NOTE(review): login() presumably prompts for an access token interactively — confirm;
# never paste a token directly into the notebook.
from huggingface_hub import login
login()

## Load testing dataset

In [3]:
# Load the AITA submissions/comments dataset, then keep a direct handle on its test split
dataset_repo = "MattBoraske/reddit-AITA-binary-submissions-and-comments-top-2k"
dataset = load_dataset(dataset_repo)
test_dataset = dataset["test"]

## Load model and tokenizer

In [6]:
# load model and tokenizer
#
# The repository name decides which architecture class is used:
#   * names containing 'flan-t5' -> AutoModelForSeq2SeqLM
#   * names containing 'llama-2' -> AutoModelForCausalLM
# Any other name raises ValueError so the notebook stops here instead of
# continuing with `model` undefined and failing later with a NameError.

hf_repo = "MattBoraske/flan-t5-xl-reddit-AITA-binary-top-2k"

# Resolve the model class before loading anything, so an unsupported
# repository fails fast. Matching is case-insensitive so names such as
# 'Flan-T5-...' are recognised as well.
repo_name = hf_repo.lower()
if 'flan-t5' in repo_name:
  model_class = AutoModelForSeq2SeqLM
elif 'llama-2' in repo_name:
  model_class = AutoModelForCausalLM
else:
  raise ValueError(
    f"Unsupported HuggingFace model repository '{hf_repo}': "
    "name must contain either 'flan-t5' or 'llama-2'"
  )

tokenizer = AutoTokenizer.from_pretrained(hf_repo)

# Both supported families are loaded with the same settings.
model = model_class.from_pretrained(
  hf_repo,
  device_map='auto',
  torch_dtype=torch.bfloat16
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

## Generate model predictions

In [8]:
# generate model predictions on test dataset

# File name handed to the evaluator.
# NOTE(review): presumably the path the predictions are written to — confirm in model_evaluator.py.
output_file = 'flan-t5-xl-reddit-AITA-binary-predictions.json'

# Use the `test_dataset` split bound in the dataset-loading cell rather than
# indexing `dataset['test']` a second time.
(
    submission_texts,
    predictions,
    references,
    predicted_classes,
    correct_classes,
    ambiguity_scores,
) = Model_Evaluator.get_model_predictions(
    model,
    tokenizer,
    test_dataset,
    output_file
)

Predicted AITA_classs: NTA	Correct AITA_classs: YTA
Predicted AITA_classs: NTA	Correct AITA_classs: NTA
Predicted AITA_classs: YTA	Correct AITA_classs: NTA
Predicted AITA_classs: YTA	Correct AITA_classs: YTA
Predicted AITA_classs: NTA	Correct AITA_classs: YTA
Predicted AITA_classs: NTA	Correct AITA_classs: NTA
Predicted AITA_classs: YTA	Correct AITA_classs: YTA
Predicted AITA_classs: NTA	Correct AITA_classs: NTA
Predicted AITA_classs: NTA	Correct AITA_classs: YTA
Predicted AITA_classs: NTA	Correct AITA_classs: YTA


## Evaluate model predictions

In [9]:
# evaluate model predictions

classification_type = 'binary'

# Output artifacts, one entry per report produced by the evaluator.
# NOTE(review): the order is kept exactly as before because the list is
# presumably consumed positionally — confirm in model_evaluator.py.
# The confusion-matrix entry is a (text, file name) pair; the text looks like
# the plot title — confirm.
output_files = [
    'classification_report.txt',
    ('Flan-T5-XL-Reddit-AITA-Binary-Top-2k Classifications', 'confusion_matrix.png'),
    'mcc.json',
    'ROUGE_scores.json',  # metric is spelled ROUGE (was misspelled 'ROGUE')
    'BLEU_scores.json',
    'COMET_scores.json'
]

# Every input below comes from the prediction cell's get_model_predictions() call.
Model_Evaluator.evaluate_model(
    submission_texts=submission_texts,
    predictions=predictions,
    references=references,
    AITA_classes=predicted_classes,
    correct_AITA_classes=correct_classes,
    ambiguity_scores=ambiguity_scores,
    classification_type=classification_type,
    output_files=output_files
)