# Evaluate finetuned Mistral Models

## Imports and environment variables

In [1]:
import torch
import os
import sys
from dotenv import load_dotenv
import ast
import evaluate

from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCaseParams, LLMTestCase

from distutils.util import strtobool

# from deepeval.models.azure_openai import AzureOpenAIModel

from peft import PeftModel

import pandas as pd

from transformers import AutoTokenizer, AutoModelForCausalLM

from deepeval.models import AzureOpenAIModel
# from deepeval.metrics import AnswerRelevancyMetric

# Load environment variables from .env file
load_dotenv()

# Add the parent directory to the Python path
# __file__


# Adjust this path to point to the directory containing rl_training_new
module_path = os.path.abspath(os.path.join('..')) # or another relative path
if module_path not in sys.path:
    sys.path.append(module_path)

# from rl_training_new.utils import find_best_window


from bert_score import score

# load the relevant devices available on the server
os.environ["CUDA_VISIBLE_DEVICES"] = os.getenv("AVAILABLE_DEVICES")

# Enable expandable CUDA segments
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# load cuda
if torch.cuda.is_available():
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print("CUDA is available. Using GPU:", torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print("CUDA is not available. Using CPU.")


  from .autonotebook import tqdm as notebook_tqdm


There are 3 GPU(s) available.
CUDA is available. Using GPU: NVIDIA L40S


In [2]:
def str_to_bool(s):
    return bool(strtobool(s))

In [3]:
MODEL = os.getenv("GENERATION_MODEL_NAME")
ALGORITHM = os.getenv("EVAL_MODEL_ALGORITHM")
RL_TRAINED_ADAPTERS = os.getenv("EVAL_MODEL_FOLDER")
EVAL_ANSWERS_CSV = os.getenv("EVAL_ANSWERS_CSV")
# GENERATE_RESPONSES = str_to_bool(os.getenv("GENERATE_RESPONSES"))
OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
# RL_DATA_PATH = os.getenv("RL_DATA_PATH")
EVAL_FILE = os.getenv("EVAL_FILE")
NUM_RESPONSES_EVAL = int(os.getenv("NUM_RESPONSES_EVAL"))  # Number of responses per model

## Load model

In [4]:
# base_model = AutoModelForCausalLM.from_pretrained(MODEL)
base_model = AutoModelForCausalLM.from_pretrained(MODEL,  
                                            #  device_map="auto",  # For GPU/TPU acceleration
                                            device_map=None,
                                            torch_dtype=torch.bfloat16,
                                            #  load_in_4bit=True,
                                            quantization_config={
                                                "load_in_4bit": True,
                                                "bnb_4bit_compute_dtype": torch.bfloat16,
                                                "bnb_4bit_use_double_quant": True,
                                                "bnb_4bit_quant_type": "nf4"
                                                }
                                            )   # Optimize precision)
# base_model_new = AutoModelForCausalLM.from_pretrained(MODEL)
base_model_new = AutoModelForCausalLM.from_pretrained(MODEL,  
                                            #  device_map="auto",  # For GPU/TPU acceleration
                                            device_map=None,
                                            torch_dtype=torch.bfloat16,
                                            #  load_in_4bit=True,
                                            quantization_config={
                                                "load_in_4bit": True,
                                                "bnb_4bit_compute_dtype": torch.bfloat16,
                                                "bnb_4bit_use_double_quant": True,
                                                "bnb_4bit_quant_type": "nf4"
                                                }
                                            )   # Optimize precision)
new_model = PeftModel.from_pretrained(base_model_new, RL_TRAINED_ADAPTERS)

tokenizer = AutoTokenizer.from_pretrained(MODEL)

base_model.eval()
new_model.eval()

base_model.to(device)
new_model.to(device)


Loading checkpoint shards: 100%|██████████| 3/3 [00:11<00:00,  3.91s/it]
Loading checkpoint shards: 100%|██████████| 3/3 [00:11<00:00,  3.75s/it]


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MistralForCausalLM(
      (model): MistralModel(
        (embed_tokens): Embedding(32768, 4096)
        (layers): ModuleList(
          (0-31): 32 x MistralDecoderLayer(
            (self_attn): MistralAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=64, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=64, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_pro

## Get test dataset

In [5]:
df = pd.read_csv(EVAL_FILE, sep=';')

## Generate Model responses

In [6]:
# Function to generate response
def generate_response(prompt, tokenizer, model, max_length=1024):
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    input_length = inputs['input_ids'].shape[1]
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=input_length + max_length, do_sample=True, top_k=50)
        generated_ids = outputs[0][input_length:]
    return tokenizer.decode(generated_ids, skip_special_tokens=True)

In [7]:
if not os.path.exists(EVAL_ANSWERS_CSV):
    # Generate multiple responses for each prompt
    for i in range(NUM_RESPONSES_EVAL):
        df[f'response_base_model_{i+1}'] = df['prompt'].apply(lambda x: generate_response(x, tokenizer, base_model))
        df[f'response_new_model_{i+1}'] = df['prompt'].apply(lambda x: generate_response(x, tokenizer, new_model))

    # Show result: displaying first response columns for brevity
    response_cols = [f'response_new_model_{i+1}' for i in range(NUM_RESPONSES_EVAL)] + \
                    [f'response_base_model_{i+1}' for i in range(NUM_RESPONSES_EVAL)]
    print(df[['prompt'] + response_cols])

    # Store in CSV
    df.to_csv(EVAL_ANSWERS_CSV, index=False, sep=';')

else:
    df = pd.read_csv(EVAL_ANSWERS_CSV, sep=';')



# if GENERATE_RESPONSES:
#     # Apply both models
#     df['response_base_model'] = df['prompt'].apply(lambda x: generate_response(x, tokenizer, base_model))
#     df['response_new_model'] = df['prompt'].apply(lambda x: generate_response(x, tokenizer, new_model))

#     # Show result
#     print(df[['prompt', 'response_new_model', 'response_base_model']])

#     # store response df in csv
#     df.to_csv(EVAL_ANSWERS_CSV, index=False, sep=';')
# else:
#     pd.read_csv(EVAL_ANSWERS_CSV, sep=';')

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for o

                                              prompt  \
0  \n\n\n                --- Definitie ---\n\n   ...   
1  \n\n\n                --- Definitie ---\n\n   ...   
2  \n\n\n                --- Definitie ---\n\n   ...   
3  \n\n\n                --- Definitie ---\n\n   ...   

                                response_new_model_1  \
0   Preconditie: Een belanghebbende moet zijn woo...   
1   Subfact: Budgettaire totaalbeeld\n           ...   
2  -----------------------\n\n\n\n               ...   
3  1. Voorwaarden: Artikel 14, eerste lid, 1 IN V...   

                                response_new_model_2  \
0   Preconditie: Belanghebbende moet woonplaats h...   
1  --------------------------------------------\n...   
2  ..............\n\n                Preconditie:...   
3  1. Subfact: De verblijfsvergunning voor bepaal...   

                                response_new_model_3  \
0   Preconditie: De belanghebbende moet woonplaat...   
1         inhoud: <inhoud>\n\n               

In [8]:
# print(df['response_base_model'][0])
# print(df['response_new_model'][0])

In [9]:

# Create list of column names
new_cols = [f'response_new_model_{j+1}' for j in range(NUM_RESPONSES_EVAL)]
base_cols = [f'response_base_model_{j+1}' for j in range(NUM_RESPONSES_EVAL)]

# Select columns and convert to list of lists (rows)
candidates_new = df[new_cols].values.tolist()
candidates_base = df[base_cols].values.tolist()

precon_text_list = df['precondition_texts'].to_list()
precon_pos_list = df["precondition_positions"].to_list()

references = []
for dict1, dict2 in zip(precon_text_list, precon_pos_list):
    dict1 = ast.literal_eval(dict1)
    dict2 = ast.literal_eval(dict2)
    combined = []
    for key in dict1.keys():  # or use sorted(dict1.keys()) if key order isn't guaranteed
        combined.append(str(dict1[key]) + '\n')
        combined.append(str(dict2[key]) + '\n\n')
    references.append(''.join(combined))

print(references)

['vastgestelling hoogte algemene bijstand per kalendermaand\nArtikel 19, sectie 2 IN Participatiewet\n\n', 'een overzicht van de uitgaven en de ontvangsten in de begrotingen voor het begrotingsjaar en de vier daarop aansluitende jaren.\nArtikel 2.23 sectie 4c IN Comptabiliteitswet 2016\n\nde budgettaire beschouwingen over het voorgenomen beleid voor de collectieve sector\nArtikel 2.23 sectie 4b IN Comptabiliteitswet 2016\n\nhet budgettaire totaalbeeld voor het betrokken begrotingsjaar en de vier daaropvolgende jaren van de rijksbegroting en de niet tot de rijksbegroting behorende budgetdisciplinesectoren\nArtikel 2.23 sectie 4a IN Comptabiliteitswet 2016\n\n', 'Wij dienen de voorstellen van wet tot vaststelling van de begrotingsstaten op de derde dinsdag van september van het jaar voorafgaande aan het begrotingsjaar in.\nArtikel 2.23 sectie 1 IN Comptabiliteitswet 2016\n\nDe miljoenennota bevat in elk geval het budgettaire totaalbeeld, de budgettaire beschouwingen en een overzicht van 

In [10]:
# Flatten the candidate lists
candidates_new_flat = [resp for row in candidates_new for resp in row]
candidates_base_flat = [resp for row in candidates_base for resp in row]

# Repeat each reference NUM_RESPONSES times to match the flattened predictions
references_flat = [ref for ref in references for _ in range(NUM_RESPONSES_EVAL)]

print(candidates_new_flat)

print(references_flat)

print(len(references_flat))
print(len(candidates_new_flat))

[' Preconditie: Een belanghebbende moet zijn woonplaats hebben in de gemeente van het college waar hij wil toekennen een recht op algemene bijstand.\n\n                  Positie: Artikel 4a, Wet algemene bijstand 2022\n\n                  Preconditie: Het college ter verleening van het recht moet hebben toelating in uitzondering van de bepalingen van artikel 3, eerste lid van de Wet algemene bijstand 2022.\n\n                  Positie: Artikel 4a, Wet algemene bijstand 2022\n\n                  Preconditie: Het college moet voldoen aan de belangingheidswaakzaamheidscriteria, opgenomen in de Wet algemene bijstand 2022.\n\n                  Positie: Artikel 4a, Wet algemene bijstand 2022\n\n                  Preconditie: Het college moet zich voldoen aan de bepalingen van de Wet algemene bijstand 2022 en aan alle bepalingen van andere rechtshandelingen die met de toekennen van de bijstand dienen of kunnen dienen, zoals belastingenwetten en de Wet belastingrechtheidsvordering 1993.\n\n   

## ROUGE/BLEU on relevant sequence from the answer 

In [11]:
# Evaluation metric 1: ROUGE on relevant sequences
# WHy not BLEU --> penalizes missing ngrams, not something I m looking for here

rouge = evaluate.load('rouge')


results_new = rouge.compute(predictions=candidates_new_flat, references=references_flat)
results_base = rouge.compute(predictions=candidates_base_flat, references=references_flat)


print(f"Results new: {results_new}")
print(f"Results base: {results_base}")

Results new: {'rouge1': np.float64(0.37413471701599377), 'rouge2': np.float64(0.2061159018975033), 'rougeL': np.float64(0.27570939611442186), 'rougeLsum': np.float64(0.36307947042520583)}
Results base: {'rouge1': np.float64(0.3354711263032375), 'rouge2': np.float64(0.20485168254652764), 'rougeL': np.float64(0.23537551021117858), 'rougeLsum': np.float64(0.32474278011344454)}


## BERT Score --> based on embedding similarity

In [12]:
# Maybe add BERTScore --> semantic similarity based on sentencetransformer
P_new, R_new, F1_new = score(
    candidates_new_flat, 
    references_flat, 
    model_type='answerdotai/ModernBERT-base', 
    num_layers=22,
    lang='nl')

P_base, R_base, F1_base = score(
    candidates_base_flat, 
    references_flat, 
    model_type='answerdotai/ModernBERT-base', 
    num_layers=22,
    lang='nl')

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [13]:
print(f"BERT Score metrics new: {P_new, R_new, F1_new}")
print(f"BERT Score metrics base: {P_base, R_base, F1_base}")

print(f"F1 base: {F1_base.mean()}, F1 new: {F1_new.mean()}")



BERT Score metrics new: (tensor([0.6572, 0.7572, 0.6787, 0.6537, 0.7546, 0.6557, 0.7464, 0.6744, 0.6725,
        0.6584, 0.8219, 0.7862, 0.7956, 0.8051, 0.8491, 0.8390, 0.7381, 0.8618,
        0.8345, 0.8368, 0.8556, 0.7796, 0.8546, 0.8697, 0.8449, 0.8606, 0.8677,
        0.7820, 0.7182, 0.7977, 0.8103, 0.8522, 0.8223, 0.8321, 0.8194, 0.8355,
        0.8240, 0.8347, 0.7997, 0.8545]), tensor([0.7730, 0.8097, 0.7491, 0.7564, 0.8015, 0.7662, 0.8224, 0.7809, 0.7716,
        0.7836, 0.7940, 0.7776, 0.8121, 0.8039, 0.8812, 0.8882, 0.7970, 0.9138,
        0.9146, 0.8085, 0.8955, 0.8681, 0.8724, 0.8643, 0.8928, 0.8844, 0.8793,
        0.8366, 0.8400, 0.9020, 0.8350, 0.8673, 0.8140, 0.8571, 0.8176, 0.8700,
        0.8478, 0.8674, 0.8097, 0.7942]), tensor([0.7104, 0.7826, 0.7121, 0.7013, 0.7773, 0.7067, 0.7826, 0.7237, 0.7186,
        0.7156, 0.8077, 0.7819, 0.8038, 0.8045, 0.8649, 0.8629, 0.7664, 0.8870,
        0.8727, 0.8225, 0.8751, 0.8215, 0.8634, 0.8670, 0.8682, 0.8723, 0.8735,
        0.8

## G-Eval

In [14]:
if False:    
    # Evaluation metric 2: Evaluate whole answer on G-Eval

    prompts = df['prompt'].tolist()

    # setup variables for the Azure OpenAI API
    endpoint = "https://openai-ds-instance-sweden.openai.azure.com/"
    model_name = "gpt-4.1"
    deployment = "deze-voor-alles"



    subscription_key = os.getenv("AZURE_OPENAI_API_KEY")
    api_version = "2024-12-01-preview"



    azure_model = AzureOpenAIModel(
        openai_api_version=api_version,
        azure_endpoint=endpoint,
        api_key=subscription_key,
        deployment_name=deployment
    )


    criteria = """


    Evalueer hoe goed een taalmodel presteerde in de taak van voorwaarde-extractie uit Nederlandse juridische teksten.

    Voor elke lijst van precondities/subfacts werd de act/fact aan een taalmodel gegeven als onderdeel van een prompt, met de opdracht om alle bijbehorende subfact/preconditie(s) en hun respectieve positie(s) in de tekst terug te geven.

    Uw taak is om per paar te evalueren hoe goed het model presteerde op twee punten (op een 4-punt Likert-schaal):

    1. **Het vinden van alle relevante precondities in de tekst**
    2. **Hoe duidelijk de positie in de tekst is die het model aanduidde**

    Voor het modelantwoord krijgt u het hele antwoord voor een act/fact (die dus meredere precondities/subfacts kan beïnhouden) en moet u voor elke ground truth preconditie afzonderlijk evalueren of deze 
        a) aanwezig is in het antwoord en 
        b) of de positie ervan ook goed is aangegeven in het antwoord. 


    """ #TODO: think about criteria...

    precondition_extraction_metric = GEval(
        name="Precondition_Extraction",
        criteria=criteria,
        evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT],
        model=azure_model,
    )


In [15]:
if False:    

    # Store results
    scores_new, reasons_new = [], []
    scores_base, reasons_base = [], []

    # print(len(candidates_new_flat))

    # Evaluate each prompt individually
    for i, (new_out, base_out, ref) in enumerate(zip(candidates_new_flat, candidates_base_flat, references_flat)):

        prompt = prompts[i // NUM_RESPONSES_EVAL]
    
        # New model evaluation
        g_eval_new = LLMTestCase(
            input=prompt,
            actual_output=f"Extracted Preconditions: {new_out}",
            expected_output=f"Expected Preconditions: {ref}"
        )
        precondition_extraction_metric.measure(g_eval_new)
        scores_new.append(precondition_extraction_metric.score)
        reasons_new.append(precondition_extraction_metric.reason)

        # Base model evaluation
        g_eval_base = LLMTestCase(
            input=prompt,
            actual_output=f"Extracted Preconditions: {base_out}",
            expected_output=f"Expected Preconditions: {ref}"
        )   
        precondition_extraction_metric.measure(g_eval_base)
        scores_base.append(precondition_extraction_metric.score)
        reasons_base.append(precondition_extraction_metric.reason)

    # Optionally: print or analyze results
    print("New Model Scores:", scores_new)
    print("Base Model Scores:", scores_base)

In [None]:
if False:
    geval_scores_new_average = sum(scores_new) / len(scores_new)
    geval_scores_base_average = sum(scores_base) / len(scores_base)

    print("New Model Scores average:", geval_scores_new_average)
    print("Base Model Scores average:", geval_scores_base_average)


    print(reasons_new)
    print(reasons_base)

    len(reasons_new)


NameError: name 'scores_new' is not defined

In [None]:
# ############ OLD VERSION ######################

# # Now define your test case, actual_output is your LLM output
# g_eval_new = LLMTestCase(input=prompts, actual_output=f"Extracetd Preconditions: {candidates_new}", expected_output=f"Expected preconditions: {references}")
# g_eval_base = LLMTestCase(input=prompts, actual_output=f"Extracetd Preconditions: {candidates_base}", expected_output=f"Expected preconditions: {references}")


# # Use G-Eval metric
# precondition_extraction_metric.measure(g_eval_new)
# print("Score:", precondition_extraction_metric.score)
# print("Reason:", precondition_extraction_metric.reason)

# precondition_extraction_metric.measure(g_eval_base)
# print("Score:", precondition_extraction_metric.score)
# print("Reason:", precondition_extraction_metric.reason)

## Personal evaluation

In [None]:
# Generate about 10 answers with both models --> load them into the user interface and get reward score
# Also maybe do qualitative evaluation