# Bewertung mittels Verschiedenen Metriken 

In [38]:
import pandas as pd
import re
import string
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
import nltk
import evaluate
from bert_score import score
from transformers import BertTokenizer, BertModel
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
from transformers import BertTokenizer, BertForMaskedLM, BertModel
from bert_score import BERTScorer

MissingCUDAException: CUDA_HOME does not exist, unable to compile CUDA op(s)

In [None]:
import torch
print("CUDA verfügbar:", torch.cuda.is_available())
print("torch version:", torch.__version__)
print("CUDA version:", torch.version.cuda)
print("GPU-Name:", torch.cuda.get_device_name(0))


In [None]:
nltk.download('wordnet', force=True)
nltk.download('omw-1.4', force=True)

In [None]:
data = pd.read_csv('../docs/docs/results_finetuned_qwen_val_with_system_message_05_04_25.csv')

## Corpus cleanup

In [25]:
def normalize_answer(ans):
    ans = ans.lower()
    ans = re.sub(f"[{re.escape(string.punctuation)}]", "", ans)  # remove punctuation
    ans = ans.strip()
    return ans

In [26]:
data['correct_answer'] = data['correct_answer'].apply(normalize_answer)
data['model_output'] = data['model_output'].apply(normalize_answer)

# BLEU Score

In [27]:
smoothie = SmoothingFunction().method4  # handles edge cases like short answers

# Define functions for BLEU-1, BLEU-2, and BLEU-4
def bleu_1(gt, pred):
    return sentence_bleu([gt.split()], pred.split(), weights=(1.0, 0, 0, 0), smoothing_function=smoothie)

def bleu_2(gt, pred):
    return sentence_bleu([gt.split()], pred.split(), weights=(0.5, 0.5, 0, 0), smoothing_function=smoothie)

def bleu_4(gt, pred):
    return sentence_bleu([gt.split()], pred.split(), weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smoothie)


data['bleu'] = data.apply(lambda row: bleu_4(row['correct_answer'], row['model_output']), axis=1)


data['bleu1'] = data.apply(lambda row: bleu_1(row['correct_answer'], row['model_output']), axis=1)
data['bleu2'] = data.apply(lambda row: bleu_2(row['correct_answer'], row['model_output']), axis=1)
data['bleu4'] = data.apply(lambda row: bleu_4(row['correct_answer'], row['model_output']), axis=1)


In [28]:
mean_bleu = data['bleu'].mean()
mean_bleu

np.float64(0.40306083321666547)

# ROUGE Score

In [29]:
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

def compute_rouge(ref, pred):
    scores = scorer.score(ref, pred)
    return {
        'rouge1': scores['rouge1'].fmeasure,
        'rouge2': scores['rouge2'].fmeasure,
        'rougeL': scores['rougeL'].fmeasure
    }

In [30]:
rouge_scores = data.apply(lambda row: compute_rouge(row['correct_answer'], row['model_output']), axis=1)
rouge_df = pd.DataFrame(list(rouge_scores))
data = pd.concat([data, rouge_df], axis=1)

In [31]:
avg_rouge1 = data['rouge1'].mean()
avg_rouge2 = data['rouge2'].mean()
avg_rougeL = data['rougeL'].mean()

print(f"ROUGE-1: {avg_rouge1:.4f}")
print(f"ROUGE-2: {avg_rouge2:.4f}")
print(f"ROUGE-L: {avg_rougeL:.4f}")

TypeError: unsupported format string passed to Series.__format__

# METEOR Score

In [32]:
import evaluate
# Load the METEOR metric
meteor = evaluate.load("meteor")

# Compute METEOR score for each row individually
data['meteor'] = [
    meteor.compute(predictions=[pred], references=[ref])['meteor']
    for pred, ref in zip(data['model_output'], data['correct_answer'])
]

MissingCUDAException: CUDA_HOME does not exist, unable to compile CUDA op(s)

In [33]:
data.head(10)

Unnamed: 0,ID,question,correct_answer,model_output,bleu,bleu1,bleu2,bleu4,rouge1,rouge2,rougeL,rouge1.1,rouge2.1,rougeL.1
0,1,what have lost their nuclei?,neutrophils,lymphocytes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,whose nuclei have been lost?,neutrophils,lymphocytes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,are two small pulmonary arterioles packed with...,yes,yes,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0
3,4,what is acute viral hepatitis characterized by?,predominantly lymphocytic infiltrate,hepatocytes with intracytoplasmic vacuoles and...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,what do the cells have?,wavy nuclei,abundant cytoplasm,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,6,do the cells have wavy nuclei?,yes,yes,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0
6,7,do individual myocardial fibres have wavy nuclei?,no,yes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,8,where is this area in the body?,abdomen,abdomen,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0
8,9,what does this image show?,peritoneal carcinomatosis,cancer,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,10,does this image show peritoneal carcinomatosis...,yes,yes,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0


In [18]:
avg_meteor = data['meteor'].mean()
print(avg_meteor)

KeyError: 'meteor'

# BERTScore, F1, Precision, Recall

In [None]:
# Get lists of predictions and references
preds = data['model_output'].tolist()
refs = data['correct_answer'].tolist()

# Compute BERTScore
P, R, F1 = score(preds, refs, lang='en', rescale_with_baseline=False)

# Assign to DataFrame
data['bertscore_precision_roberta'] = P.tolist()
data['bertscore_recall_roberta'] = R.tolist()
data['bertscore_f1_roberta'] = F1.tolist()

# Optional: mean values
print("Mean BERTScore F1:", data['bertscore_f1_roberta'].mean())

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Get lists of predictions and references
preds = data['model_output'].tolist()
refs = data['correct_answer'].tolist()
scorer = BERTScorer(model_type='bert-base-uncased')
# Compute BERTScore
P, R, F1 = scorer.score(preds, refs)

# Assign to DataFrame
data['bertscore_precision_bbu'] = P.tolist()
data['bertscore_recall_bbu'] = R.tolist()
data['bertscore_f1_bbu'] = F1.tolist()

# Optional: mean values
print("Mean BERTScore F1:", data['bertscore_f1_bbu'].mean())

Mean BERTScore F1: 0.7753047066996475


In [None]:
# Lade Tokenizer und Model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")
model.eval()  # Modell in Inferenzmodus

# Stelle sicher, dass keine GPU verwendet wird (oder passe an, wenn du CUDA nutzen willst)
device = torch.device("cpu")
model.to(device)

# Funktion zur Berechnung eines Embeddings
def get_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()

# Leere Liste für die Scores
similarities = []

# Iteration über das ganze DataFrame
for ref, pred in tqdm(zip(data['correct_answer'], data['model_output']), total=len(data)):
    emb1 = get_embedding(ref)
    emb2 = get_embedding(pred)
    sim = cosine_similarity([emb1], [emb2])[0][0]
    similarities.append(sim)

# Füge die Ähnlichkeitswerte dem DataFrame hinzu
data['bert_similarity_cos_bbu'] = similarities
print(data['bert_similarity_cos_bbu'].mean())

100%|██████████| 6259/6259 [07:28<00:00, 13.96it/s]

                         correct_answer  \
0                           neutrophils   
1                           neutrophils   
2                                   yes   
3  predominantly lymphocytic infiltrate   
4                           wavy nuclei   

                            model_output  bert_similarity  
0  the cells in the centre of the tumour         0.617441  
1                                  cells         0.409428  
2                                    yes         1.000000  
3       a marked inflammatory infiltrate         0.564771  
4                       irregular nuclei         0.875140  





# Token-based F1 Score

In [16]:
from collections import Counter

def compute_token_f1(pred, ref):
    pred_tokens = pred.lower().split()
    ref_tokens = ref.lower().split()
    
    common = Counter(pred_tokens) & Counter(ref_tokens)
    num_same = sum(common.values())

    if num_same == 0:
        return 0.0

    precision = num_same / len(pred_tokens)
    recall = num_same / len(ref_tokens)
    f1 = 2 * precision * recall / (precision + recall)
    return f1
data['token_f1'] = data.apply(lambda row: compute_token_f1(row['model_output'], row['correct_answer']), axis=1)

In [17]:
print("Mean Token F1:", data['token_f1'].mean())

Mean Token F1: 0.565667794127871


# Evaluation using Chat-GPt 4o


In [18]:
from openai import OpenAI

client = OpenAI(api_key='KEY')


In [19]:
def evaluate_answer(question, prediction, reference):

    prompt = f"""
You are an expert medical examiner.

Question: {question}

Predicted Answer: {prediction}

Correct Answer: {reference}

Is the Predicted Answer the same as the Correct Answer? If the Predicted Answer is correct return 1, if not return 0. Do not add any explanation, only the number."Evaluation
"""
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.0
    )

    return response.choices[0].message.content.strip()


In [20]:
from tqdm import tqdm

# Optional: for better progress tracking
tqdm.pandas()

# Apply function to each row
data['gpt_eval'] = data.progress_apply(
    lambda row: evaluate_answer(row['question'], row['model_output'], row['correct_answer']),
    axis=1
)

  0%|          | 0/6259 [00:00<?, ?it/s]

  1%|          | 52/6259 [00:27<53:54,  1.92it/s]  


KeyboardInterrupt: 

In [21]:
data

Unnamed: 0,ID,question,correct_answer,model_output,bleu,bleu1,bleu2,bleu4,rouge1,rouge2,rougeL,meteor,bertscore_precision,bertscore_recall,bertscore_f1,token_f1
0,1,what have lost their nuclei?,neutrophils,the cells in the centre of the tumour,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.835960,0.807426,0.821445,0.000000
1,2,whose nuclei have been lost?,neutrophils,cells,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.869684,0.713799,0.784069,0.000000
2,3,are two small pulmonary arterioles packed with...,yes,yes,1.000000,1.000000,1.000000,1.000000,1.000000,0.0,1.000000,0.500000,1.000000,1.000000,1.000000,1.000000
3,4,what is acute viral hepatitis characterized by?,predominantly lymphocytic infiltrate,a marked inflammatory infiltrate,0.061033,0.250000,0.107482,0.061033,0.285714,0.0,0.285714,0.161290,0.910176,0.869850,0.889556,0.285714
4,5,what do the cells have?,wavy nuclei,irregular nuclei,0.067544,0.500000,0.186165,0.067544,0.500000,0.0,0.500000,0.250000,0.958191,0.892069,0.923948,0.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6254,6255,are many of the tumour cells separated by muco...,no,yes,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.995485,0.995485,0.995485,0.000000
6255,6256,what is also identified?,cells have wavy nuclei and a residual nerve fibre,a large number of atypical cells,0.027116,0.202177,0.066290,0.027116,0.266667,0.0,0.133333,0.114943,0.869317,0.857880,0.863561,0.266667
6256,6257,is a residual nerve fibre also identified?,no,no,1.000000,1.000000,1.000000,1.000000,1.000000,0.0,1.000000,0.500000,1.000000,1.000000,1.000000,1.000000
6257,6258,is broad base also identified?,no,no,1.000000,1.000000,1.000000,1.000000,1.000000,0.0,1.000000,0.500000,1.000000,1.000000,1.000000,1.000000
