In [31]:
import os
import os.path as osp
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import requests

# Directory that holds the evaluation artefacts. It defaults to the original
# location; set the DEEPHALLU_RESULT_DIR environment variable to run the
# notebook on another machine without editing this cell.
result_dir = os.environ.get("DEEPHALLU_RESULT_DIR", "/home/mou/Projects/DeepHallu/results")
result_csv_path = osp.join(result_dir, "results.csv")

# One row per evaluated sample; later cells read the 'category', 'answer_code',
# 'generated_text_code', 'judgment' and 'avg_entropy' columns from this frame.
result_df = pd.read_csv(result_csv_path)

# Preview the first rows as the cell output.
result_df.head()

Unnamed: 0,sample_id,category,question,answer,generated_text,avg_entropy,answer_code,generated_text_code,judgment
0,0,artwork,"Is this artwork created by linard, jacques? Pl...",Yes,"No, the artwork is not created by Jacques-Loui...",0.858832,1.0,0.0,0.0
1,1,artwork,Is this artwork created by bonino da campione?...,No,"No, the artwork in the image is not created by...",0.740347,0.0,0.0,1.0
2,2,artwork,Does this artwork belong to the type of religi...,Yes,"Yes, the artwork you've provided appears to be...",1.198427,1.0,1.0,1.0
3,3,artwork,Does this artwork belong to the type of mythol...,No,"No, the artwork you've provided does not depic...",0.987133,0.0,0.0,1.0
4,4,artwork,Is this artwork created by pomarancio? Please ...,Yes,"No, the artwork you've shown is not created by...",0.939775,1.0,0.0,0.0


In [32]:
# Use the row labelled 0 of the results table as the worked example for the
# judge prompt built in the next cell.
first_sample = result_df.loc[0]
question = first_sample["question"]
answer = first_sample["answer"]
generated_text = first_sample["generated_text"]

In [33]:
# Fill the judge template with the sample's question, its reference answer and
# the model's generated text. The template asks the judge to wrap its verdict
# in <judgment> tags so it can be parsed afterwards.
judge_prompt = f"""Please evaluate if the generated answer is correct.

Question: {question}
Ground Truth: {answer}
Generated Answer: {generated_text}

Please output in the following format:
- Judgment: <judgment>Correct/Incorrect/Unknown</judgment>
- Reasoning: <reasoning>Your analysis</reasoning>
- Suggestions: <suggestions>How to improve, optional</suggestions>
"""

# Echo the banner, a 50-character rule and the final prompt, one per line.
print("生成的评判 Prompt:", "=" * 50, judge_prompt, sep="\n")

生成的评判 Prompt:
Please evaluate if the generated answer is correct.

Question: Is this artwork created by linard, jacques? Please answer yes or no.
Ground Truth: Yes
Generated Answer: No, the artwork is not created by Jacques-Louis David. The painting you've shown is "Still Life with Fruit" by Willem Kalf, a Dutch still life painter. 

Please output in the following format:
- Judgment: <judgment>Correct/Incorrect/Unknown</judgment>
- Reasoning: <reasoning>Your analysis</reasoning>
- Suggestions: <suggestions>How to improve, optional</suggestions>



In [34]:
# Chat endpoint of the Ollama server that hosts the judge model.
url = "http://ollama.warhol.informatik.rwth-aachen.de/api/chat"

# Single-turn conversation carrying the judge prompt. "stream": False requests
# one complete reply instead of incremental chunks.
payload = {
    "model": "gpt-oss:120b",
    "messages": [
        { "role": "user", "content": judge_prompt}
    ],
    "stream": False
}

# Without a timeout requests waits forever on an unresponsive server.
# (connect, read) in seconds; the read limit is generous because a large
# model can take a while to answer. Tune as needed.
response = requests.post(url, json=payload, timeout=(10, 600))
try:
    print(f'{response.json()["message"]["content"]}')
except requests.exceptions.JSONDecodeError as e:
    # The body is not JSON at all (e.g. an HTML error page): show it verbatim.
    print(f"JSON解析错误: {e}")
    print("原始响应内容:")
    print(response.text)
except (KeyError, TypeError) as e:
    # Valid JSON but not the expected {"message": {"content": ...}} shape,
    # e.g. an error object returned by the server. Show what came back
    # instead of failing with a bare traceback.
    print(f"Unexpected response structure (HTTP {response.status_code}): {e!r}")
    print("原始响应内容:")
    print(response.text)

- Judgment: **Incorrect**

- Reasoning: The question asks for a simple yes/no answer to whether the artwork was created by Jacques Linard. The ground truth indicates that the correct answer is "yes." The generated answer states "No" and incorrectly identifies the painting as being by Willem Kalf, which directly contradicts the ground truth. Therefore, the answer does not match the expected response.

- Suggestions: 
  - Ensure the model outputs a concise "yes" or "no" as requested, without additional commentary unless explicitly asked.
  - Verify the artist attribution before responding; if uncertain, the model could respond with "unknown" rather than an incorrect definitive answer. 
  - Align the answer strictly with the provided ground truth when it is known.


In [35]:
def extract_judgment(response_text: str) -> int:
    """Extract the judge's verdict from an LLM reply.

    The verdict is looked up in the most specific place first:

    1. the content of a ``<judgment>...</judgment>`` tag (may span lines),
    2. the rest of a ``Judgment: ...`` line (markdown emphasis tolerated),
    3. the first verdict word anywhere in the reply.

    Matching is case-insensitive and on whole words. Restricting the search
    to the judgment field keeps words in the reasoning (e.g. "the Correct
    answer is ...") from overriding the actual verdict.

    Args:
        response_text: Text of the LLM reply.

    Returns:
        int: 1 for correct, 0 for incorrect, -1 for unknown. -1 is also
        returned when no verdict is found or when the judgment field names
        more than one verdict (e.g. the template was echoed unchanged).
    """
    import re

    flags = re.IGNORECASE
    label_codes = {"correct": 1, "incorrect": 0, "unknown": -1}
    label_pattern = r"\b(correct|incorrect|unknown)\b"

    # Candidate judgment fields, most specific first.
    tag_match = re.search(r"<judgment>(.*?)</judgment>", response_text, flags | re.DOTALL)
    line_match = re.search(r"judgment[\s*_]*:\s*(.*)", response_text, flags)

    for field in (tag_match, line_match):
        if field is None:
            continue
        labels = {word.lower() for word in re.findall(label_pattern, field.group(1), flags)}
        if len(labels) == 1:
            return label_codes[labels.pop()]
        if labels:
            # Several different verdicts in one field: ambiguous.
            return -1
        # Field present but without a verdict word: try the next strategy.

    # Last resort: the first verdict word anywhere in the reply. The word
    # boundary keeps "correct" from matching inside "incorrect".
    first_label = re.search(r"\b(correct|incorrect)\b", response_text, flags)
    if first_label:
        return label_codes[first_label.group(1).lower()]
    return -1

# Decode the chat response, take the assistant's message text and map it to a
# judgment code with extract_judgment.
chat_reply = response.json()
response_text = chat_reply["message"]["content"]
judgment = extract_judgment(response_text)
print(f"提取的判断结果: {judgment}")


提取的判断结果: 0


In [37]:
# Count how many rows carry each judgment code and report it as a share of
# all rows.
total_rows = len(result_df)
for label, code in (("Correct", 1), ("Incorrect", 0), ("Unknown", -1)):
    hits = sum(result_df['judgment'] == code)
    print(f"Judgment {label}: {hits} out of {total_rows}, ratio: {hits / total_rows}")

Judgment Correct: 1819 out of 2374, ratio: 0.7662173546756529
Judgment Incorrect: 555 out of 2374, ratio: 0.2337826453243471
Judgment Unknown: 0 out of 2374, ratio: 0.0


In [55]:
def _print_binary_report(frame):
    """Print the evaluation report for ``frame`` and return its metrics.

    Code 1 in 'generated_text_code' / 'answer_code' is treated as the positive
    class. The report consists of accuracy, precision, recall, F1, the
    confusion matrix and the mean 'avg_entropy' of the rows with judgment 1
    and with judgment 0.

    Args:
        frame: DataFrame with the columns 'generated_text_code',
            'answer_code', 'judgment' and 'avg_entropy'.

    Returns:
        tuple: (accuracy, tp, pred_positive, actual_positive, precision,
        recall, f1).
    """
    predicted = frame['generated_text_code']
    expected = frame['answer_code']

    accuracy = sum(predicted == expected) / len(frame)
    print(f"Accuracy: {accuracy:.4f}")

    # True positives and the two denominators.
    tp = sum((predicted == 1) & (expected == 1))
    pred_positive = sum(predicted == 1)    # rows predicted positive
    actual_positive = sum(expected == 1)   # rows that are actually positive

    # Precision: TP / (TP + FP) = TP / all predicted positives.
    precision = tp / pred_positive if pred_positive > 0 else 0.0
    print(f"Precision: {precision:.4f}")

    # Recall: TP / (TP + FN) = TP / all actual positives.
    recall = tp / actual_positive if actual_positive > 0 else 0.0
    print(f"Recall: {recall:.4f}")

    # F1: harmonic mean of precision and recall, 0.0 when both are zero.
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0
    print(f"F1 Score: {f1:.4f}")

    print(f"\nConfusion Matrix:")
    print(pd.crosstab(predicted, expected,
                      rownames=['Generated Text Code'], colnames=['Answer Code']))

    print(f"\nAverage entropy of non-hallucinated answers: {frame[frame['judgment'] == 1]['avg_entropy'].mean():.4f}")
    print(f"Average entropy of hallucinated answers: {frame[frame['judgment'] == 0]['avg_entropy'].mean():.4f}")

    return accuracy, tp, pred_positive, actual_positive, precision, recall, f1


# ========== Overall evaluation ==========
# The metrics stay available under their usual names after the report.
accuracy, tp, pred_positive, actual_positive, precision, recall, f1 = _print_binary_report(result_df)

# ========== Per-category evaluation ==========
categories = result_df['category'].unique()
for category in categories:
    print(f"\n{'='*25} Category: {category} {'='*25}")
    cat_df = result_df[result_df['category'] == category]
    (cat_accuracy, cat_tp, cat_pred_positive, cat_actual_positive,
     cat_precision, cat_recall, cat_f1) = _print_binary_report(cat_df)

Accuracy: 0.7662
Precision: 0.8141
Recall: 0.6900
F1 Score: 0.7469

Confusion Matrix:
Answer Code           0.0  1.0
Generated Text Code           
0.0                  1000  368
1.0                   187  819

Average entropy of non-hallucinated answers: 0.6413
Average entropy of hallucinated answers: 0.7359

Accuracy: 0.6750
Precision: 0.6667
Recall: 0.7000
F1 Score: 0.6829

Confusion Matrix:
Answer Code          0.0  1.0
Generated Text Code          
0.0                  130   60
1.0                   70  140

Average entropy of non-hallucinated answers: 0.9016
Average entropy of hallucinated answers: 0.9476

Accuracy: 0.7588
Precision: 0.9231
Recall: 0.5647
F1 Score: 0.7007

Confusion Matrix:
Answer Code          0.0  1.0
Generated Text Code          
0.0                  162   74
1.0                    8   96

Average entropy of non-hallucinated answers: 0.2774
Average entropy of hallucinated answers: 0.2603

Accuracy: 0.5500
Precision: 0.6250
Recall: 0.2500
F1 Score: 0.3571

Conf

In [59]:
# step_details.csv sits in the same results directory as results.csv.
step_details_csv_path = osp.join(result_dir, "step_details.csv")
step_details_df = pd.read_csv(step_details_csv_path)

# Preview the first rows as the cell output.
step_details_df.head()

Unnamed: 0,sample_id,category,step,entropy,top1_token,top1_prob,top1_token_id,top2_token,top2_prob,top2_token_id,top3_token,top3_prob,top3_token_id,top4_token,top4_prob,top4_token_id,top5_token,top5_prob,top5_token_id
0,0,artwork,0,0.879988,No,0.570474,1770,Yes,0.392975,5592,The,0.01895,415,Based,0.005484,17158,I,0.004166,315
1,0,artwork,1,0.077887,",",0.985565,28725,.,0.014068,28723,,0.000255,28705,pe,2.9e-05,386,to,1.6e-05,298
2,0,artwork,2,1.000442,the,0.49648,272,this,0.446246,456,that,0.017773,369,it,0.015082,378,I,0.00667,315
3,0,artwork,3,0.561032,artwork,0.879845,27261,artist,0.055217,7325,painting,0.040446,11514,image,0.009699,3469,name,0.003735,1141
4,0,artwork,4,1.553135,is,0.581078,349,in,0.143341,297,you,0.109461,368,depicted,0.050892,28264,was,0.024242,403
