# Evaluate fine-tuned Mistral models

## Imports and environment variables

In [1]:
import torch
import os
import sys
from dotenv import load_dotenv
import ast
import evaluate

from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCaseParams, LLMTestCase

from distutils.util import strtobool

# from deepeval.models.azure_openai import AzureOpenAIModel

from peft import PeftModel

import pandas as pd

from transformers import AutoTokenizer, AutoModelForCausalLM

from deepeval.models import AzureOpenAIModel
# from deepeval.metrics import AnswerRelevancyMetric

# Pull the settings stored in the local .env file into the process environment
load_dotenv()

# Make the parent directory importable so that project-local packages
# (e.g. rl_training_new) can be resolved from this notebook.
# Adjust the relative path if the notebook is moved.
module_path = os.path.abspath('..')
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

# from rl_training_new.utils import find_best_window


from bert_score import score

# Restrict this process to the GPUs named in AVAILABLE_DEVICES.
# The override is skipped when the variable is not defined: assigning None to
# os.environ raises TypeError, which used to abort the whole setup cell.
available_devices = os.getenv("AVAILABLE_DEVICES")
if available_devices is not None:
    os.environ["CUDA_VISIBLE_DEVICES"] = available_devices

# Enable expandable CUDA segments in the PyTorch caching allocator
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Pick the compute device: the first visible GPU when CUDA is usable, else the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print("CUDA is available. Using GPU:", torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print("CUDA is not available. Using CPU.")


  from .autonotebook import tqdm as notebook_tqdm


There are 1 GPU(s) available.
CUDA is available. Using GPU: NVIDIA L40S


In [2]:
def str_to_bool(s):
    """Convert a textual truth value (e.g. from an environment variable) to bool.

    Accepts the same spellings as the former ``distutils.util.strtobool``
    (case-insensitive): y/yes/t/true/on/1 and n/no/f/false/off/0.
    Surrounding whitespace is ignored. Implemented locally because
    ``distutils`` is deprecated and no longer ships with Python 3.12+.

    Args:
        s: String holding the truth value.

    Returns:
        True or False.

    Raises:
        ValueError: If ``s`` is not one of the recognised spellings.
    """
    value = s.strip().lower()
    if value in ("y", "yes", "t", "true", "on", "1"):
        return True
    if value in ("n", "no", "f", "false", "off", "0"):
        return False
    raise ValueError(f"invalid truth value {s!r}")

In [3]:
# Runtime configuration, read from environment variables.
MODEL = os.getenv("GENERATION_MODEL_NAME")          # base model / tokenizer identifier
ALGORITHM = os.getenv("EVAL_MODEL_ALGORITHM")
RL_TRAINED_ADAPTERS = os.getenv("EVAL_MODEL_FOLDER")  # location of the trained PEFT adapters
EVAL_ANSWERS_CSV = os.getenv("EVAL_ANSWERS_CSV")      # CSV used to store/reload generated responses
# GENERATE_RESPONSES = str_to_bool(os.getenv("GENERATE_RESPONSES"))
OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
# RL_DATA_PATH = os.getenv("RL_DATA_PATH")
EVAL_FILE = os.getenv("EVAL_FILE")                    # CSV with the evaluation prompts

# Number of responses sampled per model and prompt.
# int(None) raises an opaque TypeError, so an unset or empty variable falls
# back to a single response per model.
_num_responses_raw = os.getenv("NUM_RESPONSES_EVAL")
NUM_RESPONSES_EVAL = int(_num_responses_raw) if _num_responses_raw else 1

## Load model

In [4]:
def _load_quantized_base(model_name):
    """Load ``model_name`` as a causal LM with 4-bit NF4 quantization.

    Both the reference model and the adapter-carrying model are built with
    this single helper so their loading options cannot drift apart.

    Args:
        model_name: Model identifier or path accepted by ``from_pretrained``.

    Returns:
        The model returned by ``AutoModelForCausalLM.from_pretrained``.
    """
    return AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map=None,
        torch_dtype=torch.bfloat16,
        quantization_config={
            "load_in_4bit": True,
            "bnb_4bit_compute_dtype": torch.bfloat16,
            "bnb_4bit_use_double_quant": True,
            "bnb_4bit_quant_type": "nf4",
        },
    )


# Reference model without adapters
base_model = _load_quantized_base(MODEL)

# A second, independent copy of the base weights carries the trained adapters.
# NOTE(review): presumably loaded separately so that attaching the adapters
# does not alter `base_model` - confirm before sharing one copy.
base_model_new = _load_quantized_base(MODEL)
new_model = PeftModel.from_pretrained(base_model_new, RL_TRAINED_ADAPTERS)

tokenizer = AutoTokenizer.from_pretrained(MODEL)

# Evaluation mode for both models
base_model.eval()
new_model.eval()

# NOTE(review): with a bitsandbytes quantization config the weights may already
# be placed on the GPU at load time, and some transformers versions reject
# `.to()` on 4-bit models - confirm these calls are still required.
base_model.to(device)
new_model.to(device)


Loading checkpoint shards: 100%|██████████| 3/3 [00:10<00:00,  3.65s/it]
Loading checkpoint shards: 100%|██████████| 3/3 [00:10<00:00,  3.57s/it]


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MistralForCausalLM(
      (model): MistralModel(
        (embed_tokens): Embedding(32768, 4096)
        (layers): ModuleList(
          (0-31): 32 x MistralDecoderLayer(
            (self_attn): MistralAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=64, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=64, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_pro

## Get test dataset

In [5]:
# Load the evaluation set from the semicolon-separated file named by EVAL_FILE.
# Later cells read the 'prompt', 'precondition_texts' and
# 'precondition_positions' columns from this frame.
df = pd.read_csv(EVAL_FILE, sep=';')

## Generate Model responses

In [6]:
def generate_response(prompt, tokenizer, model, max_length=1024):
    """Sample one completion for ``prompt`` and return only the new text.

    Args:
        prompt: Text the generation is conditioned on.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        model: Causal language model exposing ``generate()`` and ``parameters()``.
        max_length: Maximum number of NEW tokens to generate; the prompt
            length does not count towards this budget.

    Returns:
        The decoded continuation, without the prompt and without special tokens.
    """
    # Send the inputs to wherever the model lives instead of relying on a
    # notebook-level `device` global.
    model_device = next(model.parameters()).device
    inputs = tokenizer(prompt, return_tensors="pt").to(model_device)
    input_length = inputs["input_ids"].shape[1]

    # Pass an explicit padding id, falling back to EOS when the tokenizer
    # defines no pad token.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_length,  # same budget as input_length + max_length total tokens
            do_sample=True,
            top_k=50,
            pad_token_id=pad_token_id,
        )

    # Drop the echoed prompt tokens; keep only what the model produced
    generated_ids = outputs[0][input_length:]
    return tokenizer.decode(generated_ids, skip_special_tokens=True)

In [7]:
# Response columns expected for the configured number of samples per model
response_cols = [f'response_new_model_{i+1}' for i in range(NUM_RESPONSES_EVAL)] + \
                [f'response_base_model_{i+1}' for i in range(NUM_RESPONSES_EVAL)]

if not os.path.exists(EVAL_ANSWERS_CSV):
    # No cached answers yet: sample NUM_RESPONSES_EVAL responses per prompt
    # from each model. Sampling is unseeded, so a regenerated file will differ.
    for i in range(NUM_RESPONSES_EVAL):
        df[f'response_base_model_{i+1}'] = df['prompt'].apply(lambda x: generate_response(x, tokenizer, base_model))
        df[f'response_new_model_{i+1}'] = df['prompt'].apply(lambda x: generate_response(x, tokenizer, new_model))

    # Show result
    print(df[['prompt'] + response_cols])

    # Store in CSV so later runs can skip the expensive generation step
    df.to_csv(EVAL_ANSWERS_CSV, index=False, sep=';')

else:
    df = pd.read_csv(EVAL_ANSWERS_CSV, sep=';')

    # A cache written with a different NUM_RESPONSES_EVAL would otherwise only
    # fail later with a KeyError when the response columns are selected.
    missing_cols = [col for col in response_cols if col not in df.columns]
    if missing_cols:
        raise ValueError(
            f"{EVAL_ANSWERS_CSV} does not contain the response columns {missing_cols}; "
            "delete the file to regenerate it or set NUM_RESPONSES_EVAL to match the cached run."
        )

# An empty generation is written to CSV as an empty field and read back as NaN
# (a float). Normalise to plain strings so both branches hand identical types
# to the text metrics.
df[response_cols] = df[response_cols].fillna('').astype(str)



# if GENERATE_RESPONSES:
#     # Apply both models
#     df['response_base_model'] = df['prompt'].apply(lambda x: generate_response(x, tokenizer, base_model))
#     df['response_new_model'] = df['prompt'].apply(lambda x: generate_response(x, tokenizer, new_model))

#     # Show result
#     print(df[['prompt', 'response_new_model', 'response_base_model']])

#     # store response df in csv
#     df.to_csv(EVAL_ANSWERS_CSV, index=False, sep=';')
# else:
#     pd.read_csv(EVAL_ANSWERS_CSV, sep=';')

In [8]:
# print(df['response_base_model'][0])
# print(df['response_new_model'][0])

In [9]:

# Names of the per-sample response columns for each model
new_cols = [f'response_new_model_{k + 1}' for k in range(NUM_RESPONSES_EVAL)]
base_cols = [f'response_base_model_{k + 1}' for k in range(NUM_RESPONSES_EVAL)]

# One inner list per dataset row, holding that row's responses
candidates_new = df[new_cols].values.tolist()
candidates_base = df[base_cols].values.tolist()

# Both columns hold dict literals stored as text
precon_text_list = df['precondition_texts'].to_list()
precon_pos_list = df["precondition_positions"].to_list()

# Build one reference string per row: every precondition text is followed by
# its position (looked up under the same key), with a blank line after each pair.
references = []
for texts_literal, positions_literal in zip(precon_text_list, precon_pos_list):
    texts = ast.literal_eval(texts_literal)
    positions = ast.literal_eval(positions_literal)
    references.append(
        ''.join(
            str(text) + '\n' + str(positions[key]) + '\n\n'
            for key, text in texts.items()
        )
    )

print(references)

['een overzicht van de uitgaven en de ontvangsten in de begrotingen voor het begrotingsjaar en de vier daarop aansluitende jaren.\nArtikel 2.23 sectie 4c IN Comptabiliteitswet 2016\n\nde budgettaire beschouwingen over het voorgenomen beleid voor de collectieve sector\nArtikel 2.23 sectie 4b IN Comptabiliteitswet 2016\n\nhet budgettaire totaalbeeld voor het betrokken begrotingsjaar en de vier daaropvolgende jaren van de rijksbegroting en de niet tot de rijksbegroting behorende budgetdisciplinesectoren\nArtikel 2.23 sectie 4a IN Comptabiliteitswet 2016\n\n', 'de verblijfsvergunning wordt verleend met ingang van de dag waarop de vreemdeling heeft aangetoond dat hij aan alle voorwaarden voldoet\nArtikel 26, sectie 1 IN Vreemdelingenwet 2024\n\nde verblijfsvergunning wordt verleend met ingang van een dag eerder dan de dag waarop de aanvraag is ontvangen\nArtikel 26, sectie 1 IN Vreemdelingenwet 2024\n\n', 'Begroting bevat begrotingsstaat\nArtikel 2.2 IN Comptabiliteitswet 2016\n\n', 'colleg

In [21]:
# Flatten the per-row response lists into one long list per model (row-major)
candidates_new_flat = []
for row in candidates_new:
    candidates_new_flat.extend(row)

candidates_base_flat = []
for row in candidates_base:
    candidates_base_flat.extend(row)

# Repeat every reference NUM_RESPONSES_EVAL times so that position i in the
# flattened predictions lines up with position i in the references
references_flat = []
for ref in references:
    references_flat.extend([ref] * NUM_RESPONSES_EVAL)

print(candidates_new_flat)

print(references_flat)

# Both lengths should be identical
print(len(references_flat))
print(len(candidates_new_flat))

[' Subfact: budgettaire totaalbeeld \n  Positie: Artikel 4.3, vierde lid IN Rijksbegroting 2016\n\n                  Subfact: budgettaire beschouwingen\n  Positie: Artikel 4.3, vierde lid IN Rijksbegroting 2016\n\n                  Subfact: een overzicht van de uitgaven en ontvangsten\n  Positie: Artikel 4.3, vierde lid IN Rijksbegroting 2016', ' Subfact: Budgettaire totaalbeeld\n                  Positie: Artikel 4.4, vierde lid IN Comptabiliteitswet 2016\n\n                  Subfact: Budgettaire beschouwingen\n                  Positie: Artikel 4.4, vierde lid IN Comptabiliteitswet 2016\n\n                  Subfact: Overzicht van uitgaven en ontvangsten\n                  Positie: Artikel 4.4, vierde lid IN Comptabiliteitswet 2016', '----------\n\n                Subfact: Budgettaire totaalbeeld \n  Positie: Inhoud van het Artikel 4.4 van de Hoofdstuk 4 van de Comptabiliteitswet 2016\n\n              Subfact: Budgettaire beschouwingen \n              Positie: Inhoud van het Artikel 4

## ROUGE on the relevant sequences from the answer (BLEU deliberately not used)

In [11]:
# Evaluation metric 1: ROUGE on the relevant sequences.
# BLEU is not used: it penalizes missing n-grams, which is not what should be
# measured here.
rouge = evaluate.load('rouge')

# Score the new model first, then the base model, against the same references
results_new, results_base = (
    rouge.compute(predictions=predictions, references=references_flat)
    for predictions in (candidates_new_flat, candidates_base_flat)
)

print("Results new: " + str(results_new))
print("Results base: " + str(results_base))

Results new: {'rouge1': np.float64(0.26903899959926475), 'rouge2': np.float64(0.08075648293634999), 'rougeL': np.float64(0.19610885576452364), 'rougeLsum': np.float64(0.25988549214648693)}
Results base: {'rouge1': np.float64(0.2380949946009449), 'rouge2': np.float64(0.0951200057279944), 'rougeL': np.float64(0.15279703243016496), 'rougeLsum': np.float64(0.22638269227554414)}


## BERTScore: similarity based on contextual embeddings

In [12]:
# BERTScore: embedding-based similarity between each response and its reference.
# The same scorer settings are used for both models so the results are comparable.
bertscore_options = {
    "model_type": 'answerdotai/ModernBERT-base',
    "num_layers": 22,
    "lang": 'nl',
}

P_new, R_new, F1_new = score(candidates_new_flat, references_flat, **bertscore_options)

P_base, R_base, F1_base = score(candidates_base_flat, references_flat, **bertscore_options)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [13]:
# Raw per-response precision / recall / F1 tensors for each model
bertscore_new = (P_new, R_new, F1_new)
bertscore_base = (P_base, R_base, F1_base)
print(f"BERT Score metrics new: {bertscore_new}")
print(f"BERT Score metrics base: {bertscore_base}")

# Mean F1 per model as a single headline number
mean_f1_base = F1_base.mean()
mean_f1_new = F1_new.mean()
print(f"F1 base: {mean_f1_base}, F1 new: {mean_f1_new}")



BERT Score metrics new: (tensor([0.8326, 0.8375, 0.8130, 0.8244, 0.7785, 0.8020, 0.8106, 0.8212, 0.8264,
        0.8142, 0.7983, 0.7946, 0.8134, 0.8024, 0.7708, 0.8099, 0.8149, 0.7864,
        0.7789, 0.7603, 0.7087, 0.7537, 0.7002, 0.8246, 0.6824, 0.6269, 0.7587,
        0.7403, 0.6926, 0.7183, 0.8008, 0.8092, 0.7641, 0.7999, 0.7648, 0.7609,
        0.7746, 0.8002, 0.7682, 0.7822]), tensor([0.8096, 0.8064, 0.8182, 0.8040, 0.7976, 0.7822, 0.8075, 0.8123, 0.7996,
        0.8117, 0.8598, 0.8527, 0.8541, 0.8567, 0.8374, 0.8513, 0.8310, 0.8402,
        0.8516, 0.8377, 0.8530, 0.8259, 0.8119, 0.8758, 0.7947, 0.7866, 0.8298,
        0.8683, 0.7876, 0.8056, 0.8115, 0.7494, 0.7933, 0.7474, 0.8008, 0.7892,
        0.8143, 0.7981, 0.8053, 0.8137]), tensor([0.8210, 0.8217, 0.8156, 0.8141, 0.7880, 0.7920, 0.8091, 0.8167, 0.8128,
        0.8130, 0.8279, 0.8226, 0.8332, 0.8287, 0.8027, 0.8301, 0.8229, 0.8124,
        0.8136, 0.7971, 0.7742, 0.7882, 0.7519, 0.8494, 0.7342, 0.6977, 0.7927,
        0.7

## G-Eval

In [14]:
# Evaluation metric 2: judge each complete answer with G-Eval

# One prompt per dataset row
prompts = df['prompt'].to_list()

# Connection settings for the Azure OpenAI deployment that acts as the judge
endpoint = "https://openai-ds-instance-sweden.openai.azure.com/"
model_name = "gpt-4.1"  # kept for reference; not passed to the client below
deployment = "deze-voor-alles"
api_version = "2024-12-01-preview"
subscription_key = os.environ.get("AZURE_OPENAI_API_KEY")

azure_client_settings = {
    "openai_api_version": api_version,
    "azure_endpoint": endpoint,
    "api_key": subscription_key,
    "deployment_name": deployment,
}
azure_model = AzureOpenAIModel(**azure_client_settings)


# Dutch-language judging instructions handed to G-Eval as its `criteria`.
# They ask the judge to rate, on a 4-point Likert scale, (1) whether all
# relevant preconditions were found and (2) how clearly the model indicated
# their position in the text, checking every ground-truth precondition separately.
# NOTE(review): the scores recorded in this notebook are fractions between 0
# and 1, not 1-4 ratings - confirm how GEval maps the Likert instruction.
criteria = """


Evalueer hoe goed een taalmodel presteerde in de taak van voorwaarde-extractie uit Nederlandse juridische teksten.

Voor elke lijst van precondities/subfacts werd de act/fact aan een taalmodel gegeven als onderdeel van een prompt, met de opdracht om alle bijbehorende subfact/preconditie(s) en hun respectieve positie(s) in de tekst terug te geven.

Uw taak is om per paar te evalueren hoe goed het model presteerde op twee punten (op een 4-punt Likert-schaal):

1. **Het vinden van alle relevante precondities in de tekst**
2. **Hoe duidelijk de positie in de tekst is die het model aanduidde**

Voor het modelantwoord krijgt u het hele antwoord voor een act/fact (die dus meredere precondities/subfacts kan beïnhouden) en moet u voor elke ground truth preconditie afzonderlijk evalueren of deze 
    a) aanwezig is in het antwoord en 
    b) of de positie ervan ook goed is aangegeven in het antwoord. 


"""  # TODO: refine the evaluation criteria

# G-Eval metric that uses the Azure-hosted model as judge. The judge is given
# the test case's input, actual output and expected output.
precondition_extraction_metric = GEval(
    name="Precondition_Extraction",
    criteria=criteria,
    evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT],
    model=azure_model,
)


In [23]:
# TODO: double-check that evaluating the flattened lists is the desired setup.
# Flat position i belongs to dataset row i // NUM_RESPONSES_EVAL, because the
# responses are flattened row by row and every reference is repeated
# NUM_RESPONSES_EVAL times.

def _judge_answer(prompt_text, model_output, reference):
    """Score one model answer with the G-Eval metric.

    Args:
        prompt_text: Prompt the answer was generated for.
        model_output: Answer text produced by the model under evaluation.
        reference: Ground-truth preconditions and positions for that prompt.

    Returns:
        Tuple ``(score, reason)`` read from the metric after measuring.
    """
    test_case = LLMTestCase(
        input=prompt_text,
        actual_output=f"Extracted Preconditions: {model_output}",
        expected_output=f"Expected Preconditions: {reference}",
    )
    precondition_extraction_metric.measure(test_case)
    return precondition_extraction_metric.score, precondition_extraction_metric.reason


# zip() silently stops at the shortest input, so make misaligned lists loud
assert len(candidates_new_flat) == len(candidates_base_flat) == len(references_flat), \
    "flattened predictions and references must have the same length"

# Store results
scores_new, reasons_new = [], []
scores_base, reasons_base = [], []

# Evaluate every response individually: new model first, then base model
for i, (new_out, base_out, ref) in enumerate(zip(candidates_new_flat, candidates_base_flat, references_flat)):

    prompt = prompts[i // NUM_RESPONSES_EVAL]

    new_score, new_reason = _judge_answer(prompt, new_out, ref)
    scores_new.append(new_score)
    reasons_new.append(new_reason)

    base_score, base_reason = _judge_answer(prompt, base_out, ref)
    scores_base.append(base_score)
    reasons_base.append(base_reason)

# Print the collected scores for inspection
print("New Model Scores:", scores_new)
print("Base Model Scores:", scores_base)

40


New Model Scores: [0.18807970764133558, 0.1952574124802879, 0.19890130559121316, 0.25, 0.19399133515091155, 0.20758581873336315, 0.20070006516916536, 0.512097041916076, 0.25, 0.277729987149254, 0.13208213073183842, 0.10758581852443776, 0.10373268892182123, 0.10373268839975532, 0.09706877698846184, 0.1, 0.1, 0.10229773702863632, 0.10373268838636902, 0.1966615486368886, 0.19770226314280637, 0.1992999349904389, 0.26013242076506676, 0.19859363731633717, 0.1377540675801802, 0.20000000000000004, 0.20474258741873436, 0.20953494584723922, 0.19241418231126367, 0.20179862069353516, 0.0, 0.0, 0.0, 0.0, 0.09046505286741727, 0.1, 0.0, 0.022270014398423358, 0.0, 0.0]
Base Model Scores: [0.28506347230854856, 0.2970637542742415, 0.20179862106858773, 0.28698864638409305, 0.3073008942459313, 0.21480472036701054, 0.19241418241572636, 0.343466732515267, 0.19241418163225626, 0.2850634719832917, 0.14999999999999997, 0.1095349469397812, 0.14378234991142017, 0.18175744806385544, 0.11480472036701053, 0.1022977

In [24]:
def _average(values):
    """Return the arithmetic mean of ``values``."""
    return sum(values) / len(values)


# Mean G-Eval score per model
geval_scores_new_average = _average(scores_new)
geval_scores_base_average = _average(scores_base)

print("New Model Scores average:", geval_scores_new_average)
print("Base Model Scores average:", geval_scores_base_average)

# Judge explanations, for qualitative inspection
print(reasons_new)
print(reasons_base)

# Number of evaluated responses (displayed as the cell's output)
len(reasons_new)


New Model Scores average: 0.1458986119695314
Base Model Scores average: 0.1595106983601926
["De drie verwachte subfacts zijn inhoudelijk wel herkend in de Actual Output, maar de posities zijn volledig onjuist: alle subfacts worden geplaatst in Artikel 4.3, vierde lid van 'Rijksbegroting 2016', terwijl ze volgens de Expected Output in Artikel 2.23, sectie 4a-c van de Comptabiliteitswet 2016 horen. De namen van de subfacts zijn bovendien minder volledig geformuleerd dan in de Expected Output. De extractie is dus incompleet qua precisie en de positieaanduiding is onjuist en onduidelijk.", 'De Actual Output bevat de drie gevraagde subfacts qua naam, maar de posities zijn volledig onjuist: alle subfacts worden geplaatst in Artikel 4.4, vierde lid, terwijl ze volgens de Expected Output in Artikel 2.23, sectie 4a, 4b en 4c van de Comptabiliteitswet 2016 horen. De inhoud van de subfacts is bovendien te summier en niet volledig geformuleerd zoals in de Expected Output. Hierdoor is de extractie 

40

In [17]:
# ############ OLD VERSION ######################

# # Now define your test case, actual_output is your LLM output
# g_eval_new = LLMTestCase(input=prompts, actual_output=f"Extracetd Preconditions: {candidates_new}", expected_output=f"Expected preconditions: {references}")
# g_eval_base = LLMTestCase(input=prompts, actual_output=f"Extracetd Preconditions: {candidates_base}", expected_output=f"Expected preconditions: {references}")


# # Use G-Eval metric
# precondition_extraction_metric.measure(g_eval_new)
# print("Score:", precondition_extraction_metric.score)
# print("Reason:", precondition_extraction_metric.reason)

# precondition_extraction_metric.measure(g_eval_base)
# print("Score:", precondition_extraction_metric.score)
# print("Reason:", precondition_extraction_metric.reason)

## Personal evaluation

In [18]:
# Generate about 10 answers with both models --> load them into the user interface and get reward score
# Also maybe do qualitative evaluation