# Evaluate finetuned Mistral Models

## Imports and environment variables

In [6]:
import torch
import os
import sys
from dotenv import load_dotenv

from peft import PeftModel

from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, AutoModelForCausalLM

# Load environment variables from the .env file so that the os.getenv(...)
# lookups further down (visible devices, model name, adapter folder) can
# pick them up.
load_dotenv()

# Make the parent directory importable so that the `rl_training_new`
# package (imported right after this block) can be resolved.
# NOTE: the path is resolved against the current working directory, not
# against the location of this notebook file; adjust it if the notebook is
# started from somewhere else.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

from rl_training_new.utils import find_best_window


from bert_score import score

# Restrict this process to the GPUs listed in the AVAILABLE_DEVICES variable.
# os.environ only accepts str values, so assigning the result of os.getenv()
# directly raises a TypeError when the variable is unset (None). In that case
# CUDA_VISIBLE_DEVICES is left untouched and a notice is printed instead.
# NOTE(review): this presumably has to run before the first CUDA call to take
# effect — confirm nothing earlier in the session initialises CUDA.
_available_devices = os.getenv("AVAILABLE_DEVICES")
if _available_devices is not None:
    os.environ["CUDA_VISIBLE_DEVICES"] = _available_devices
else:
    print("AVAILABLE_DEVICES is not set; leaving CUDA_VISIBLE_DEVICES unchanged.")

# Enable expandable CUDA segments
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Select the compute device: the GPU when CUDA is usable, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print("CUDA is available. Using GPU:", torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print("CUDA is not available. Using CPU.")


There are 1 GPU(s) available.
CUDA is available. Using GPU: NVIDIA L40S


In [7]:
# Evaluation settings read from the environment; each value is None when the
# corresponding variable is not defined.
MODEL = os.environ.get("GENERATION_MODEL_NAME")  # passed to from_pretrained when loading the base model
ALGORITHM = os.environ.get("EVAL_MODEL_ALGORITHM")
RL_TRAINED_ADAPTERS = os.environ.get("EVAL_MODEL_FOLDER")  # passed to PeftModel.from_pretrained as the adapter location

## Load model

In [None]:
# Load the base checkpoint named by MODEL with num_labels=1, then attach the
# adapter weights stored under RL_TRAINED_ADAPTERS on top of it.
# NOTE(review): MODEL comes from GENERATION_MODEL_NAME and the next section is
# about generating candidates, yet a sequence-classification model is loaded
# here (AutoModelForCausalLM is imported but unused) — confirm this matches how
# the adapters were trained.
base_model_test = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path=MODEL,
    num_labels=1,
)
new_model = PeftModel.from_pretrained(
    model=base_model_test,
    model_id=RL_TRAINED_ADAPTERS,
)

## Generate candidates

In [None]:
# Load the prompt dataset, split it into train / validation / test partitions
# (70 % / 15 % / 15 %), and write each partition to its own CSV file.

import pandas as pd
from sklearn.model_selection import train_test_split

# Read the full dataset (pandas' default separator is used for the input)
df = pd.read_csv("your_file.csv")

# Hold out 30 % of the rows; the hold-out is halved below into validation
# and test. The fixed random_state keeps both splits reproducible.
train_df, temp_df = train_test_split(df, random_state=42, test_size=0.3)
val_df, test_df = train_test_split(temp_df, random_state=42, test_size=0.5)

_partitions = (
    ("Train", "train.csv", train_df),
    ("Validation", "validation.csv", val_df),
    ("Test", "test.csv", test_df),
)

# Report how many rows ended up in each partition
for _label, _, _frame in _partitions:
    print(f"{_label}: {len(_frame)}")

# Persist the partitions without the index column.
# NOTE: the outputs are semicolon-separated, unlike the input file, so they
# must be read back with sep=';'.
for _, _filename, _frame in _partitions:
    _frame.to_csv(_filename, sep=';', index=False)

## ROUGE/BLEU on relevant sequence from the answer 

In [None]:
# Evaluation metric 1: ROUGE on relevant sequences
# Why not BLEU --> it penalizes missing n-grams, which is not what I'm looking for here







## BERT Score --> based on embedding similarity

In [None]:
# BERTScore: precision / recall / F1 computed from embedding similarity between
# each candidate and its reference, using the 'bert-base-uncased' model.
# NOTE(review): `candidates` and `references` are not defined in the cells shown
# above — they presumably come from the (still empty) generation step; they
# should be equally long lists of strings — TODO confirm.
P, R, F1 = score(
    cands=candidates,
    refs=references,
    lang='en',
    model_type='bert-base-uncased',
)

## G-Eval

In [None]:
# Evaluation metric 2: Evaluate whole answer on G-Eval

## Personal evaluation

In [None]:
# Generate about 10 answers with both models --> load them into the user interface and get reward score
# Also maybe do qualitative evaluation