In [1]:
from mlx_lm import load, generate
import json
from tqdm import tqdm
import csv
import difflib
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge import Rouge


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Instruction preamble that the evaluation loop prepends to every test
# question before generation. It is concatenated directly with the question
# (no separator is inserted), so the whitespace inside this literal is part
# of the prompt the models receive.
PROMPT_FILLING = ''' 
                    You are an advanced language model that receives questions and must generate answers. 
                        For each request, produce a clear and synthetic answer, maximum one paragraph long.
                        The answers must be direct and specific, without using bullet points or numbered lists.
                '''

In [3]:
# Path of the CSV results file: the header cell opens it in 'w' mode
# (truncating any previous run) and the evaluation loop appends one row per
# test item.
OUT_CSV = './data/eval_out.csv'

In [4]:
def run_prompt(prompt, model, tokenizer):
    """Generate a completion for ``prompt`` with the given model/tokenizer pair.

    Thin wrapper around ``generate`` that disables verbose output and hands
    back whatever ``generate`` returns, unchanged.
    """
    return generate(model, tokenizer, prompt=prompt, verbose=False)

In [5]:
# Load the test set: one JSON object per line (JSONL).
# The file is read as UTF-8 explicitly so the result does not depend on the
# platform's default encoding, and whitespace-only lines (e.g. a trailing
# blank line at the end of the file) are skipped because json.loads would
# raise on them.
with open("data/processed/test.jsonl", "r", encoding="utf-8") as f:
    test_data = [json.loads(line) for line in f if line.strip()]

In [6]:
# Both checkpoints are loaded into separate variables so the evaluation loop
# can query the base and the fine-tuned model with the same prompt.
base_checkpoint = './models/base/Phi-3-mini-128k-instruct-4bit'
finetuned_checkpoint = './models/fused/fused_Phi-3-mini-128k-instruct-4bit_2bs_4ls'

print('Loading base model...')
model_base, tokenizer_base = load(base_checkpoint)

print('Loading fine-tuned model...')
model_ft, tokenizer_ft = load(finetuned_checkpoint)

Loading base model...
Loading fine-tuned model...


In [7]:
# Prepare the output CSV: create (or truncate) the results file and write the
# column header. The evaluation loop appends rows in this same column order.
# UTF-8 is requested explicitly because the file will hold model-generated
# text; newline='' is what the csv module expects for files it writes.
with open(OUT_CSV, 'w', newline='', encoding='utf-8') as csvfile:
    csv_writer = csv.writer(csvfile)
    csv_writer.writerow([
        'Question Prompt', 'expected_answer',
        'Base - Answer', 'base_similarity', 'bleu_base',
        'Finetuned - Answer', 'ft_similarity', 'bleu_ft',
    ])

In [8]:
def _score_answer(expected, generated, tokenizer, smoothing):
    """Score one generated answer against the expected answer.

    Args:
        expected: Reference answer text.
        generated: Text produced by the model.
        tokenizer: Tokenizer exposing ``tokenize``; the same tokenizer is
            applied to both texts so the n-gram comparison is consistent.
        smoothing: NLTK smoothing function passed to ``sentence_bleu``.

    Returns:
        A ``(similarity, bleu)`` tuple: the ``difflib.SequenceMatcher`` ratio
        computed on the raw strings, and the smoothed sentence-level BLEU
        computed on the tokenized texts.
    """
    similarity = difflib.SequenceMatcher(None, expected, generated).ratio()
    reference_tokens = tokenizer.tokenize(expected)
    hypothesis_tokens = tokenizer.tokenize(generated)
    bleu = sentence_bleu([reference_tokens], hypothesis_tokens,
                         smoothing_function=smoothing)
    return similarity, bleu


# The smoothing function does not depend on the sample, so build it once
# instead of on every iteration.
smoothing = SmoothingFunction().method1

# Generation is slow; the progress bar shows how far the run has got.
for item in tqdm(test_data, desc='Evaluating'):
    prompt = item['prompt']
    expected_answer = item['completion']

    # Both models receive exactly the same instruction preamble + question.
    filled_prompt_b = PROMPT_FILLING + prompt

    answer_base = run_prompt(filled_prompt_b, model_base, tokenizer_base)
    answer_ft = run_prompt(filled_prompt_b, model_ft, tokenizer_ft)

    # Compare each generated answer with the expected one, tokenizing with
    # the tokenizer that belongs to the model being scored.
    base_similarity, bleu_base = _score_answer(
        expected_answer, answer_base, tokenizer_base, smoothing)
    ft_similarity, bleu_ft = _score_answer(
        expected_answer, answer_ft, tokenizer_ft, smoothing)

    # Append and close the file for every row so that rows already written
    # are on disk if the run is interrupted part-way through.
    with open(OUT_CSV, 'a', newline='', encoding='utf-8') as csvfile:
        csv_writer = csv.writer(csvfile)
        csv_writer.writerow([
            prompt, expected_answer,
            answer_base, base_similarity, bleu_base,
            answer_ft, ft_similarity, bleu_ft])

KeyboardInterrupt: 