In [1]:
import ollama
import pandas as pd
import evaluate
from scipy.stats import ks_2samp
from io import StringIO
import json

  from .autonotebook import tqdm as notebook_tqdm


## Config

In [2]:
# Ollama model tag passed as `model=` to every chat request in query_ollama.
MODEL_NAME = "qwen2.5:3b"

# Question-and-answer dataset, loaded once from a UTF-8 JSON file.
# run_qa_evaluation iterates it and reads item['question'] / item['answer'].
with open('qa_dataset.json', 'r', encoding='utf-8') as f:
    qa_dataset = json.load(f)

# Real data for the tabular comparison (not referenced by the cells visible here).
# NOTE(review): the directory name looks mis-encoded (possibly a non-ASCII folder
# name extracted with the wrong code page) — confirm it matches the folder on disk.
real_data = pd.read_excel("2022.11ÎË▒Ý▓╣│õÍð╬─Ê¹╩│/Shanghai_T1DM_Summary.xlsx")

## Funzioni per la valutazione del QA (domande e risposte)

In [3]:
def query_ollama(prompt):
    """Send one user prompt to the Ollama model and return the reply text.

    Args:
        prompt: Text sent as the content of a single "user" chat message.

    Returns:
        The value found at ``['message']['content']`` in the chat response.
    """
    user_turn = {"role": "user", "content": prompt}
    # MODEL_NAME is the notebook-level model tag defined in the config cell.
    reply = ollama.chat(model=MODEL_NAME, messages=[user_turn])
    return reply['message']['content']

def evaluate_qa(predictions, references):
    """Score predicted answers against reference answers with BERTScore.

    Args:
        predictions: Model answers, forwarded as ``predictions=`` to the metric.
        references: Expected answers, forwarded as ``references=`` to the metric.

    Returns:
        Whatever the loaded "bertscore" metric's ``compute`` returns when
        called with ``lang="it"``.
    """
    # The metric is loaded on every call, exactly as before.
    return evaluate.load("bertscore").compute(
        predictions=predictions,
        references=references,
        lang="it",
    )

def run_qa_evaluation():
    """Ask the model every question in ``qa_dataset`` and print the mean BERTScore F1.

    For each item the question is wrapped in a short Italian instruction,
    sent through ``query_ollama``, and the stripped reply is paired with the
    item's reference answer. All pairs are scored in one ``evaluate_qa`` call
    and the arithmetic mean of the returned ``'f1'`` values is printed,
    rounded to 4 decimals.

    Reads the notebook-level ``qa_dataset``; each item must provide the keys
    ``'question'`` and ``'answer'``.

    Returns:
        None. Results are reported via ``print`` only.
    """
    print("\n=== Valutazione Risposte a Domande ===")

    # Guard: an empty dataset would send empty lists to the scorer and then
    # divide by zero when averaging, so report it and stop early instead.
    if not qa_dataset:
        print("\nNessuna domanda nel dataset: valutazione saltata.")
        return

    predictions = []
    references = []

    for item in qa_dataset:
        prompt = f"Rispondi in modo conciso alla domanda: {item['question']}"
        response = query_ollama(prompt)
        # Only the model output is stripped; references are used as stored.
        predictions.append(response.strip())
        references.append(item['answer'])

    results = evaluate_qa(predictions, references)
    f1_scores = results['f1']
    f1_medio = sum(f1_scores) / len(f1_scores)

    print("\nBERTScore medio (F1):", round(f1_medio, 4))

In [None]:
# Run the QA evaluation over qa_dataset; the mean BERTScore F1 is printed by the function.
run_qa_evaluation()