# Reddit AITA Finetuned Model Evaluation

Ensure the following are set before running:
- Testing dataset (from HuggingFace repository)
- Model (from HuggingFace repository)
- Results directory
- Confusion matrix title

## Prepare environment

In [None]:
# Install the third-party packages this notebook depends on into the current kernel environment.
%pip install transformers accelerate datasets evaluate rouge_score unbabel-comet scikit-learn matplotlib seaborn

In [None]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoModelForCausalLM, AutoTokenizer
import torch
from model_evaluator import Model_Evaluator # custom class for evaluating models - see model_evaluator.py

In [None]:
# Authenticate with the HuggingFace Hub before loading the dataset and model.
# NOTE(review): presumably required because the repositories are private or gated - confirm.
from huggingface_hub import login
login()

In [None]:
from google.colab import drive

# Mount Google Drive (forcing a remount) and switch the working directory to the
# evaluation folder, so the relative results directory used later resolves there.
drive.mount('/content/drive', force_remount=True)
%cd /content/drive/MyDrive/WCU_THESIS/Evaluation

## Load testing dataset

In [None]:
# Load the Reddit AITA submissions-and-comments dataset from the HuggingFace Hub
# and select its 'test' split, which is the data the model is evaluated on.
dataset = load_dataset("MattBoraske/reddit-AITA-submissions-and-comments")
test_dataset = dataset['test']

## Load model and tokenizer

In [None]:
# Load the fine-tuned model and its tokenizer from the HuggingFace Hub.

hf_repo = "MattBoraske/llama-2-7b-chat-reddit-AITA"

tokenizer = AutoTokenizer.from_pretrained(hf_repo)

# Choose the auto class from the repository name: flan-t5 checkpoints use the
# seq2seq class, llama checkpoints use the causal-LM class. The match is
# case-insensitive so repository names such as 'Llama-2-...' are recognised too.
hf_repo_lowercase = hf_repo.lower()

if 'flan-t5' in hf_repo_lowercase:
  model_class = AutoModelForSeq2SeqLM
elif 'llama' in hf_repo_lowercase:
  model_class = AutoModelForCausalLM
else:
  # Fail here instead of printing and continuing: otherwise `model` is never
  # defined and the following cells die with an unrelated NameError.
  raise ValueError(
    f"Unsupported HuggingFace model repository '{hf_repo}': "
    "expected a flan-t5 or llama model"
  )

# Both model families are loaded with the same placement and dtype settings.
model = model_class.from_pretrained(
  hf_repo,
  device_map='auto',
  torch_dtype=torch.bfloat16
)

In [None]:
# Display the loaded model's configuration for inspection.
model.config

In [None]:
# Display the loaded model's generation configuration for inspection.
model.generation_config

## Generate Model Predictions

In [None]:
import os

# Directory (relative to the current working directory) that receives all
# evaluation results. Keep the trailing slash: the other cells build file paths
# by appending file names directly to this string.
results_directory = 'llama2-7b-chat-reddit-AITA-eval-results/'

# Create the directory up front so that writing result files into it cannot
# fail merely because it does not exist yet; a no-op when it already exists.
os.makedirs(results_directory, exist_ok=True)

In [None]:
# Run the fine-tuned model over the test split and collect its outputs.
# The JSON path is handed to the evaluator as its last positional argument
# (presumably the file the raw generation results are saved to - confirm in
# model_evaluator.py).
generation_results_path = f'{results_directory}generation_results.json'

generation_outputs = Model_Evaluator.get_model_predictions(
    model,
    tokenizer,
    test_dataset,
    generation_results_path,
)

# Unpack the six parallel result collections used by the evaluation step.
(
    submission_texts,
    predictions,
    references,
    predicted_classes,
    correct_classes,
    ambiguity_scores,
) = generation_outputs

## Evaluate Model Predictions

In [None]:
# Title for the confusion matrix made during evaluation; it is passed to the
# evaluator together with the confusion matrix image path.
confusion_matrix_title = 'Llama2-7b-Chat-Reddit-AITA Classifications'

In [None]:
# Evaluate the generated predictions and write every report into the results directory.

def _results_path(file_name):
    # Prefix a file name with the results directory (which ends in a slash).
    return f'{results_directory}{file_name}'


classification_type = 'binary'

# The list order is kept exactly as before; the confusion matrix entry is a
# (title, path) tuple, every other entry is a plain path.
# NOTE(review): 'ROGUE' looks like a misspelling of ROUGE; the file name is left
# unchanged so existing result paths keep working.
output_files = [
    _results_path('classification_report.txt'),
    (confusion_matrix_title, _results_path('confusion_matrix.png')),
    _results_path('mcc.json'),
    _results_path('ROGUE_scores.json'),
    _results_path('BLEU_scores.json'),
    _results_path('COMET_scores.json'),
]

evaluation_inputs = {
    'submission_texts': submission_texts,
    'predictions': predictions,
    'references': references,
    'AITA_classes': predicted_classes,
    'correct_AITA_classes': correct_classes,
    'ambiguity_scores': ambiguity_scores,
    'classification_type': classification_type,
    'output_files': output_files,
}

Model_Evaluator.evaluate_model(**evaluation_inputs)