# Evaluate finetuned Mistral Models

## Imports and environment variables

In [1]:
import torch
import os
import sys
from dotenv import load_dotenv
import ast
import evaluate

from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCaseParams, LLMTestCase

from distutils.util import strtobool

# from deepeval.models.azure_openai import AzureOpenAIModel

from peft import PeftModel

import pandas as pd

from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, AutoModelForCausalLM

from deepeval.models import AzureOpenAIModel
# from deepeval.metrics import AnswerRelevancyMetric

# Load environment variables from .env file
load_dotenv()

# Add the parent directory to the Python path
# __file__


# Adjust this path to point to the directory containing rl_training_new
module_path = os.path.abspath(os.path.join('..')) # or another relative path
if module_path not in sys.path:
    sys.path.append(module_path)

from rl_training_new.utils import find_best_window


from bert_score import score

# load the relevant devices available on the server
os.environ["CUDA_VISIBLE_DEVICES"] = os.getenv("AVAILABLE_DEVICES")

# Enable expandable CUDA segments
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# load cuda
if torch.cuda.is_available():
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print("CUDA is available. Using GPU:", torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print("CUDA is not available. Using CPU.")


  from .autonotebook import tqdm as notebook_tqdm


There are 1 GPU(s) available.
CUDA is available. Using GPU: NVIDIA L40S


In [2]:
def str_to_bool(s):
    return bool(strtobool(s))

In [3]:
MODEL = os.getenv("GENERATION_MODEL_NAME")
ALGORITHM = os.getenv("EVAL_MODEL_ALGORITHM")
RL_TRAINED_ADAPTERS = os.getenv("EVAL_MODEL_FOLDER")
EVAL_ANSWERS_CSV = os.getenv("EVAL_ANSWERS_CSV")
GENERATE_RESPONSES = str_to_bool(os.getenv("GENERATE_RESPONSES"))
OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
# RL_DATA_PATH = os.getenv("RL_DATA_PATH")
EVAL_FILE = os.getenv("EVAL_FILE")
NUM_RESPONSES_EVAL = int(os.getenv("NUM_RESPONSES_EVAL"))  # Number of responses per model

## Load model

In [4]:
# base_model = AutoModelForCausalLM.from_pretrained(MODEL)
base_model = AutoModelForCausalLM.from_pretrained(MODEL,  
                                            #  device_map="auto",  # For GPU/TPU acceleration
                                            device_map=None,
                                            torch_dtype=torch.bfloat16,
                                            #  load_in_4bit=True,
                                            quantization_config={
                                                "load_in_4bit": True,
                                                "bnb_4bit_compute_dtype": torch.bfloat16,
                                                "bnb_4bit_use_double_quant": True,
                                                "bnb_4bit_quant_type": "nf4"
                                                }
                                            )   # Optimize precision)
# base_model_new = AutoModelForCausalLM.from_pretrained(MODEL)
base_model_new = AutoModelForCausalLM.from_pretrained(MODEL,  
                                            #  device_map="auto",  # For GPU/TPU acceleration
                                            device_map=None,
                                            torch_dtype=torch.bfloat16,
                                            #  load_in_4bit=True,
                                            quantization_config={
                                                "load_in_4bit": True,
                                                "bnb_4bit_compute_dtype": torch.bfloat16,
                                                "bnb_4bit_use_double_quant": True,
                                                "bnb_4bit_quant_type": "nf4"
                                                }
                                            )   # Optimize precision)
new_model = PeftModel.from_pretrained(base_model_new, RL_TRAINED_ADAPTERS)

tokenizer = AutoTokenizer.from_pretrained(MODEL)

base_model.eval()
new_model.eval()

base_model.to(device)
new_model.to(device)


Loading checkpoint shards: 100%|██████████| 3/3 [00:10<00:00,  3.50s/it]
Loading checkpoint shards: 100%|██████████| 3/3 [00:10<00:00,  3.40s/it]


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MistralForCausalLM(
      (model): MistralModel(
        (embed_tokens): Embedding(32768, 4096)
        (layers): ModuleList(
          (0-31): 32 x MistralDecoderLayer(
            (self_attn): MistralAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=64, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=64, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_pro

## Get test dataset

In [5]:
df = pd.read_csv(EVAL_FILE, sep=';')

## Generate Model responses

In [6]:
# Function to generate response
def generate_response(prompt, tokenizer, model, max_length=1024):
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    input_length = inputs['input_ids'].shape[1]
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=input_length + max_length, do_sample=True, top_k=50)
        generated_ids = outputs[0][input_length:]
    return tokenizer.decode(generated_ids, skip_special_tokens=True)

In [7]:
if GENERATE_RESPONSES:
    # Generate multiple responses for each prompt
    for i in range(NUM_RESPONSES):
        df[f'response_base_model_{i+1}'] = df['prompt'].apply(lambda x: generate_response(x, tokenizer, base_model))
        df[f'response_new_model_{i+1}'] = df['prompt'].apply(lambda x: generate_response(x, tokenizer, new_model))

    # Show result: displaying first response columns for brevity
    response_cols = [f'response_new_model_{i+1}' for i in range(NUM_RESPONSES)] + \
                    [f'response_base_model_{i+1}' for i in range(NUM_RESPONSES)]
    print(df[['prompt'] + response_cols])

    # Store in CSV
    df.to_csv(EVAL_ANSWERS_CSV, index=False, sep=';')

else:
    df = pd.read_csv(EVAL_ANSWERS_CSV, sep=';')



# if GENERATE_RESPONSES:
#     # Apply both models
#     df['response_base_model'] = df['prompt'].apply(lambda x: generate_response(x, tokenizer, base_model))
#     df['response_new_model'] = df['prompt'].apply(lambda x: generate_response(x, tokenizer, new_model))

#     # Show result
#     print(df[['prompt', 'response_new_model', 'response_base_model']])

#     # store response df in csv
#     df.to_csv(EVAL_ANSWERS_CSV, index=False, sep=';')
# else:
#     pd.read_csv(EVAL_ANSWERS_CSV, sep=';')

NameError: name 'NUM_RESPONSES' is not defined

In [None]:
# print(df['response_base_model'][0])
# print(df['response_new_model'][0])

KeyError: 'response_base_model'

In [None]:

# Create list of column names
new_cols = [f'response_new_model_{j+1}' for j in range(NUM_RESPONSES)]
base_cols = [f'response_base_model_{j+1}' for j in range(NUM_RESPONSES)]

# Select columns and convert to list of lists (rows)
candidates_new = df[new_cols].values.tolist()
candidates_base = df[base_cols].values.tolist()

precon_text_list = df['precondition_texts'].to_list()
precon_pos_list = df["precondition_positions"].to_list()

references = []
for dict1, dict2 in zip(precon_text_list, precon_pos_list):
    dict1 = ast.literal_eval(dict1)
    dict2 = ast.literal_eval(dict2)
    combined = []
    for key in dict1.keys():  # or use sorted(dict1.keys()) if key order isn't guaranteed
        combined.append(str(dict1[key]) + '\n')
        combined.append(str(dict2[key]) + '\n\n')
    references.append(''.join(combined))

print(references)

['een overzicht van de uitgaven en de ontvangsten in de begrotingen voor het begrotingsjaar en de vier daarop aansluitende jaren.\nArtikel 2.23 sectie 4c IN Comptabiliteitswet 2016\n\nde budgettaire beschouwingen over het voorgenomen beleid voor de collectieve sector\nArtikel 2.23 sectie 4b IN Comptabiliteitswet 2016\n\nhet budgettaire totaalbeeld voor het betrokken begrotingsjaar en de vier daaropvolgende jaren van de rijksbegroting en de niet tot de rijksbegroting behorende budgetdisciplinesectoren\nArtikel 2.23 sectie 4a IN Comptabiliteitswet 2016\n\n', 'de verblijfsvergunning wordt verleend met ingang van de dag waarop de vreemdeling heeft aangetoond dat hij aan alle voorwaarden voldoet\nArtikel 26, sectie 1 IN Vreemdelingenwet 2024\n\nde verblijfsvergunning wordt verleend met ingang van een dag eerder dan de dag waarop de aanvraag is ontvangen\nArtikel 26, sectie 1 IN Vreemdelingenwet 2024\n\n', 'Begroting bevat begrotingsstaat\nArtikel 2.2 IN Comptabiliteitswet 2016\n\n', 'colleg

In [None]:
# Flatten the candidate lists
candidates_new_flat = [resp for row in candidates_new for resp in row]
candidates_base_flat = [resp for row in candidates_base for resp in row]

# Repeat each reference NUM_RESPONSES times to match the flattened predictions
references_flat = [ref for ref in references for _ in range(NUM_RESPONSES)]


## ROUGE/BLEU on relevant sequence from the answer 

In [None]:
# Evaluation metric 1: ROUGE on relevant sequences
# WHy not BLEU --> penalizes missing ngrams, not something I m looking for here

rouge = evaluate.load('rouge')


results_new = rouge.compute(predictions=candidates_new_flat, references=references_flat)
results_base = rouge.compute(predictions=candidates_base_flat, references=references_flat)


print(f"Results new: {results_new}")
print(f"Results base: {results_base}")

Results new: {'rouge1': np.float64(0.27001352892246916), 'rouge2': np.float64(0.09322759038840496), 'rougeL': np.float64(0.19553374880287844), 'rougeLsum': np.float64(0.2602304569036782)}
Results base: {'rouge1': np.float64(0.24465001162862643), 'rouge2': np.float64(0.0811503655720357), 'rougeL': np.float64(0.16359792413074326), 'rougeLsum': np.float64(0.23334599359548214)}


## BERT Score --> based on embedding similarity

In [None]:
# Maybe add BERTScore --> semantic similarity based on sentencetransformer
P_new, R_new, F1_new = score(
    candidates_new_flat, 
    references_flat, 
    model_type='answerdotai/ModernBERT-base', 
    num_layers=22,
    lang='nl')

P_base, R_base, F1_base = score(
    candidates_base_flat, 
    references_flat, 
    model_type='answerdotai/ModernBERT-base', 
    num_layers=22,
    lang='nl')

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
print(f"BERT Score metrics new: {P_new, R_new, F1_new}")
print(f"BERT Score metrics base: {P_base, R_base, F1_base}")

print(f"F1 base: {F1_base.mean()}, F1 new: {F1_new.mean()}")



BERT Score metrics new: (tensor([0.7933, 0.8294, 0.8098, 0.8270, 0.8279, 0.8057, 0.7394, 0.7518, 0.7185,
        0.7795, 0.7721, 0.7247, 0.7345, 0.7336, 0.7939, 0.8140, 0.7858, 0.7998,
        0.7510, 0.7587]), tensor([0.7802, 0.8213, 0.7809, 0.8345, 0.7905, 0.8432, 0.8291, 0.8384, 0.8564,
        0.8490, 0.8264, 0.8140, 0.8213, 0.8025, 0.8578, 0.8010, 0.8116, 0.8046,
        0.7838, 0.7920]), tensor([0.7867, 0.8253, 0.7951, 0.8307, 0.8088, 0.8240, 0.7816, 0.7927, 0.7814,
        0.8127, 0.7983, 0.7668, 0.7755, 0.7665, 0.8246, 0.8074, 0.7985, 0.8022,
        0.7671, 0.7750]))
BERT Score metrics base: (tensor([0.7348, 0.7843, 0.8411, 0.7709, 0.7690, 0.8050, 0.7043, 0.7389, 0.7662,
        0.7754, 0.7376, 0.6555, 0.6495, 0.7245, 0.6337, 0.7457, 0.8181, 0.8178,
        0.7632, 0.7629]), tensor([0.7778, 0.7803, 0.8784, 0.8060, 0.7952, 0.8540, 0.8341, 0.8437, 0.8502,
        0.8580, 0.8220, 0.7818, 0.7714, 0.8182, 0.8074, 0.7769, 0.8027, 0.8191,
        0.8001, 0.8091]), tensor([0.7557, 0.7

## G-Eval

In [None]:
# Evaluation metric 2: Evaluate whole answer on G-Eval

prompts = df['prompt'].tolist()

# setup variables for the Azure OpenAI API
endpoint = "https://openai-ds-instance-sweden.openai.azure.com/"
model_name = "gpt-4.1"
deployment = "deze-voor-alles"



subscription_key = os.getenv("AZURE_OPENAI_API_KEY")
api_version = "2024-12-01-preview"



azure_model = AzureOpenAIModel(
    api_version=api_version,
    azure_endpoint=endpoint,
    api_key=subscription_key,
)


criteria = """


Evalueer hoe goed een taalmodel presteerde in de taak van voorwaarde-extractie uit Nederlandse juridische teksten.

Voor elke lijst van precondities/subfacts werd de act/fact aan een taalmodel gegeven als onderdeel van een prompt, met de opdracht om alle bijbehorende subfact/preconditie(s) en hun respectieve positie(s) in de tekst terug te geven.

Uw taak is om per paar te evalueren hoe goed het model presteerde op twee punten (op een 4-punt Likert-schaal):

1. **Het vinden van alle relevante precondities in de tekst**
2. **Hoe duidelijk de positie in de tekst is die het model aanduidde**

Voor het modelantwoord krijgt u het hele antwoord voor een act/fact (die dus meredere precondities/subfacts kan beïnhouden) en moet u voor elke ground truth preconditie afzonderlijk evalueren of deze 
    a) aanwezig is in het antwoord en 
    b) of de positie ervan ook goed is aangegeven in het antwoord. 


""" #TODO: think about criteria...

precondition_extraction_metric = GEval(
    name="Precondition_Extraction",
    criteria=criteria,
    evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT],
    model=azure_model,

)





NameError: name 'AzureOpenAIModel' is not defined

In [None]:
# Store results
scores_new, reasons_new = [], []
scores_base, reasons_base = [], []

# Evaluate each prompt individually
for prompt, new_out, base_out, ref in zip(prompts, candidates_new_flat, candidates_base_flat, references_flat):
 
    # New model evaluation
    g_eval_new = LLMTestCase(
        input=prompt,
        actual_output=f"Extracted Preconditions: {new_out}",
        expected_output=f"Expected Preconditions: {ref}"
    )
    precondition_extraction_metric.measure(g_eval_new)
    scores_new.append(precondition_extraction_metric.score)
    reasons_new.append(precondition_extraction_metric.reason)

    # Base model evaluation
    g_eval_base = LLMTestCase(
        input=prompt,
        actual_output=f"Extracted Preconditions: {base_out}",
        expected_output=f"Expected Preconditions: {ref}"
    )   
    precondition_extraction_metric.measure(g_eval_base)
    scores_base.append(precondition_extraction_metric.score)
    reasons_base.append(precondition_extraction_metric.reason)

# Optionally: print or analyze results
print("New Model Scores:", scores_new)
print("Base Model Scores:", scores_base)

In [None]:
############ OLD VERSION ######################

# Now define your test case, actual_output is your LLM output
g_eval_new = LLMTestCase(input=prompts, actual_output=f"Extracetd Preconditions: {candidates_new}", expected_output=f"Expected preconditions: {references}")
g_eval_base = LLMTestCase(input=prompts, actual_output=f"Extracetd Preconditions: {candidates_base}", expected_output=f"Expected preconditions: {references}")


# Use G-Eval metric
precondition_extraction_metric.measure(g_eval_new)
print("Score:", precondition_extraction_metric.score)
print("Reason:", precondition_extraction_metric.reason)

precondition_extraction_metric.measure(g_eval_base)
print("Score:", precondition_extraction_metric.score)
print("Reason:", precondition_extraction_metric.reason)

Run this in the proper environment tomorrow:

https://deepeval.com/docs/metrics-introduction#using-azure-openai

deepeval set-azure-openai \
    --openai-endpoint=<endpoint> \ # e.g. https://example-resource.azure.openai.com/
    --openai-api-key=<api_key> \
    --openai-model-name=<model_name> \ # e.g. gpt-4o
    --deployment-name=<deployment_name> \  # e.g. Test Deployment
    --openai-api-version=<openai_api_version> \ # e.g. 2025-01-01-preview
    --model-version=<model_version> # e.g. 2024-11-20

## Personal evaluation

In [None]:
# Generate about 10 answers with both models --> load them into the user interface and get reward score
# Also maybe do qualitative evaluation