In [None]:
import json
from pathlib import Path
from typing import List, Dict

# Import deepeval related modules
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCase, LLMTestCaseParams
from deepeval.metrics.g_eval import Rubric

import pandas as pd
from tqdm.auto import tqdm

In [None]:
# Set OpenAI API Key
import os
from getpass import getpass

# Prefer a key that is already exported in the environment.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

if not OPENAI_API_KEY:
    # Fall back to a hidden interactive prompt so the key is never hard-coded
    # in the notebook or echoed into a saved cell output.
    try:
        OPENAI_API_KEY = getpass("Enter your OpenAI API key: ").strip() or None
    except (EOFError, NotImplementedError):
        # Non-interactive execution (no stdin available): prompting is impossible.
        OPENAI_API_KEY = None

if OPENAI_API_KEY:
    # Export the key so libraries that look it up via the environment can find it.
    os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
else:
    print("Warning: OPENAI_API_KEY is not set; the evaluation metrics will fail until it is provided.")


In [None]:

import importlib.util
import subprocess
import sys

# PyMuPDF is imported as `fitz` further down. Only install it when it is missing,
# so that "Restart & Run All" neither needs network access nor re-runs pip when
# the package is already available in the kernel's environment.
if importlib.util.find_spec("fitz") is None:
    # sys.executable guarantees pip targets the interpreter backing this kernel.
    subprocess.check_call([sys.executable, "-m", "pip", "install",
                           "-i", "https://pypi.tuna.tsinghua.edu.cn/simple", "pymupdf"])

In [None]:
# Input / output locations (relative to the notebook's working directory)
INPUT_DIR = Path(r"./annual-reports")
OUTPUT_DIR = Path(r"./annual-reports_output")

# Per-report evaluation JSON files and the CSV summary are written here;
# the directory is created up front if it does not exist yet.
EVAL_OUTPUT_DIR = OUTPUT_DIR.joinpath("evaluations_geval")
EVAL_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Each stage keeps its generated reports in a sub-directory of OUTPUT_DIR
# that carries the stage's own name.
STAGE_DIRS = {
    stage: OUTPUT_DIR / stage
    for stage in (
        "stage1_baseline",
        "stage2_role_background",
        "stage3_constraints",
        "stage4_cot_laysummary",
    )
}


## Define Evaluation Metrics (Based on the Four Dimensions from the Paper)

In [None]:
# Set deepeval timeout
os.environ["DEEPEVAL_PER_TASK_TIMEOUT_SECONDS"] = "300"
os.environ["DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS"] = "120"

# Settings shared by all four metrics, kept in one place so they cannot drift apart.
# It is recommended to use a higher-performance model as an evaluator to simulate expert logic.
EVALUATOR_MODEL = "gpt-3.5-turbo"
PASS_THRESHOLD = 0.7

# 1. Numerical Factuality
# Compares the summary with the source report, so it needs CONTEXT.
numerical_factuality_metric = GEval(
    name="Numerical Factuality",
    criteria="""Evaluate whether the numerical data in the generated financial summary is consistent with the original annual report (text and embedded tables).
                1. Strictly check for "numerical hallucinations" or fabricated numbers;
                2. Verify that numbers are not incorrectly associated with mismatched indicators;
                3. Special attention: If the model converts long numerical strings into public-friendly units (e.g., "¥2.45 billion" instead of "2,450,123,000.00"), as long as the magnitude is accurate, it should be considered factual and audience-friendly.""",
    evaluation_params=[
        LLMTestCaseParams.ACTUAL_OUTPUT,
        LLMTestCaseParams.CONTEXT,
    ],
    threshold=PASS_THRESHOLD,
    model=EVALUATOR_MODEL,
    async_mode=False,
    rubric=[
        Rubric(score_range=(0, 3), expected_outcome="Obvious numerical errors, false fabrication, or serious indicator misguidance exist"),
        Rubric(score_range=(4, 6), expected_outcome="Core numbers are basically correct, but there are individual data deviations or unit conversion errors"),
        Rubric(score_range=(7, 8), expected_outcome="Numbers are accurate, highly consistent with the original text, and the data presentation is suitable for public understanding"),
        Rubric(score_range=(9, 10), expected_outcome="Numbers are completely accurate with no hallucinations, precisely extracting core financial indicators that support decision-making"),
    ],
)

# 2. Readability & Accessibility
# The metric name is required: later cells build result keys from `metric.name`.
# Judged on the summary alone; add LLMTestCaseParams.CONTEXT here if verbatim-copy
# detection against the source report is required.
readability_metric = GEval(
    name="Readability",
    criteria="""Evaluate whether the summary successfully converts obscure accounting terminology into business language that ordinary readers can understand.
    1. Focus on whether analogies or explanatory descriptions are used (e.g., explaining "goodwill impairment" as "the acquired company is worth less than expected");
    2. Simple verbatim extraction from the original text is strictly prohibited; semantic reconstruction must be performed;
    3. Language should be concise and barrier-free, ensuring that non-professional investors can understand how the company makes money.""",
    evaluation_params=[
        LLMTestCaseParams.ACTUAL_OUTPUT,
    ],
    threshold=PASS_THRESHOLD,
    model=EVALUATOR_MODEL,
    async_mode=False,
    rubric=[
        Rubric(score_range=(0, 3), expected_outcome="Filled with accounting terminology, directly extracted from the original text, extremely unfriendly to the general public"),
        Rubric(score_range=(4, 6), expected_outcome="Attempts to popularize, but still contains many professional terms, explanations are not straightforward enough"),
        Rubric(score_range=(7, 8), expected_outcome="Language is accessible, successfully removed most terminology barriers, suitable for ordinary investors to read"),
        Rubric(score_range=(9, 10), expected_outcome="High-level semantic reconstruction, using vivid analogies to resolve professional terminology, with extremely strong popularization and communication value"),
    ],
)

# 3. Coherence
# Narrative quality of the summary itself; no source context is needed.
coherence_metric = GEval(
    name="Coherence",
    criteria="""Evaluate whether the summary maintains clear narrative logic when processing financial reports that are hundreds of pages long.
    1. Check for logical gaps between paragraphs or information fragmentation caused by insufficient long-text processing capabilities;
    2. The summary should follow a clear contextual flow (e.g., business overview -> financial performance -> risk warnings);
    3. Ensure there are no contradictions or meaningless repetitive statements.""",
    evaluation_params=[
        LLMTestCaseParams.ACTUAL_OUTPUT,
    ],
    threshold=PASS_THRESHOLD,
    model=EVALUATOR_MODEL,
    async_mode=False,
    rubric=[
        Rubric(score_range=(0, 3), expected_outcome="Extremely chaotic logic, severe information fragmentation, readers cannot connect the company's overall operations"),
        Rubric(score_range=(4, 6), expected_outcome="Basically coherent, but some sections have awkward transitions, with certain logical jumps"),
        Rubric(score_range=(7, 8), expected_outcome="Clear logic, rigorous structure, successfully condensed massive long text into a coherent guide"),
        Rubric(score_range=(9, 10), expected_outcome="Extremely fluent narrative, impeccable contextual logic, demonstrating strong cross-chapter data correlation capabilities"),
    ],
)

# 4. Informativeness
# Coverage is measured against the source report, so it needs CONTEXT.
informativeness_metric = GEval(
    name="Informativeness",
    criteria="""Verify whether the summary completely covers core sections such as balance sheet, income statement, and risk factors.
    1. Must include core operational information sufficient to support public investment decisions;
    2. Focus on checking whether "negative indicators" and "risk warnings" are mandatorily retained to ensure the summary is objective and neutral;
    3. The evaluation standard is whether "simplified without losing key points" has been achieved, not omitting major financial changes while reducing length.""",
    evaluation_params=[
        LLMTestCaseParams.ACTUAL_OUTPUT,
        LLMTestCaseParams.CONTEXT,
    ],
    threshold=PASS_THRESHOLD,
    model=EVALUATOR_MODEL,
    async_mode=False,
    rubric=[
        Rubric(score_range=(0, 3), expected_outcome="Severe information deficiency, missing key sections such as income statement or risk warnings"),
        Rubric(score_range=(4, 6), expected_outcome="Covers basic data, but extraction of risk factors or business details is not comprehensive enough"),
        Rubric(score_range=(7, 8), expected_outcome="Complete content, comprehensively covers the three core financial statements and operational risks, with moderate information volume"),
        Rubric(score_range=(9, 10), expected_outcome="Extremely complete and well-prioritized, conveying the most decision-valuable financial information in a very short length"),
    ],
)

# Order matters only for display: result columns and printed details follow it.
all_metrics = [numerical_factuality_metric, readability_metric, coherence_metric, informativeness_metric]
print(f"Defined {len(all_metrics)} evaluation metrics (based on the four dimensions from the paper)")

In [None]:
import fitz  # pymupdf
import re
import unicodedata

def pdf_to_text(pdf_path: str) -> str:
    """Extract the plain text of a PDF, one chunk per non-blank page.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        The text of every page that contains non-whitespace characters,
        joined with a blank line between pages. Empty string if no page has text.
    """
    # The context manager closes the document (and its file handle) even if
    # extraction raises part-way through; the original left it open.
    with fitz.open(pdf_path) as doc:
        page_texts = [page.get_text("text") for page in doc]

    # Skip pages without extractable text (blank or image-only pages).
    return "\n\n".join(text for text in page_texts if text and text.strip())

def clean_text(t: str) -> str:
    """Normalise extracted PDF text.

    Steps, in order: NFKC-normalise, replace control characters (other than
    tab, newline and carriage return) with a space, drop lines consisting only
    of digits and whitespace, and collapse runs of three or more newlines into
    a single blank line.
    """
    normalized = unicodedata.normalize("NFKC", t)
    without_controls = re.sub(r"[\x00-\x08\x0b\x0c\x0e-\x1f]", " ", normalized)

    kept_lines = []
    for line in without_controls.splitlines():
        # Lines that hold nothing but a number are dropped.
        if re.fullmatch(r"\s*\d+\s*", line) is None:
            kept_lines.append(line)

    rejoined = "\n".join(kept_lines)
    return re.sub(r"\n{3,}", "\n\n", rejoined)

def get_original_text(report_file: Path) -> str:
    """Return the cleaned text of the source PDF that a report was generated from.

    The PDF name is the report's stem with "_short_report" removed plus ".pdf",
    looked up inside INPUT_DIR. When that file does not exist a warning is
    printed and an empty string is returned.
    """
    source_stem = report_file.stem.replace("_short_report", "")
    pdf_path = INPUT_DIR / f"{source_stem}.pdf"

    # Guard clause: nothing to extract without the source document.
    if not pdf_path.exists():
        print(f"Warning: Corresponding PDF file not found: {pdf_path}")
        return ""

    raw_text = pdf_to_text(str(pdf_path))
    return clean_text(raw_text)


## Execute Evaluation (Supports Multiple Stages)

In [None]:
# Score every generated report of every stage with each metric in all_metrics.
# One flat record per report is collected in all_evaluation_results and also
# written to its own JSON file under EVAL_OUTPUT_DIR.
all_evaluation_results = []

for stage_name, stage_dir in STAGE_DIRS.items():
    print(f"\n{'='*60}")
    print(f"Starting evaluation for stage: {stage_name}")
    print(f"{'='*60}")

    # Stages that have not been generated yet are skipped, not treated as errors.
    if not stage_dir.exists():
        print(f"Warning: Stage directory does not exist: {stage_dir}")
        continue

    report_files = sorted(stage_dir.glob("*_short_report.txt"))
    print(f"Found {len(report_files)} report files")

    for report_file in tqdm(report_files, desc=f"Evaluating {stage_name}"):
        # The generated summary under evaluation.
        actual_output = report_file.read_text(encoding="utf-8")

        # Source annual report, capped at the first 50,000 characters.
        original_text = get_original_text(report_file)
        if len(original_text) > 50000:
            original_text = original_text[:50000] + "\n\n[Text truncated...]"

        # An empty source text is passed as "no context" (None) rather than [""].
        test_case = LLMTestCase(
            input=f"Evaluate report file: {report_file.name}",
            actual_output=actual_output,
            context=[original_text] if original_text else None,
        )

        record = {
            "stage": stage_name,
            "file": report_file.name,
            "report_length": len(actual_output),
        }

        for metric in all_metrics:
            label = metric.name
            try:
                record[f"{label}_score"] = metric.measure(test_case, _show_indicator=True)
                record[f"{label}_success"] = metric.success
                record[f"{label}_reason"] = metric.reason

                # Cost is recorded only when the metric exposes a truthy value.
                cost = getattr(metric, 'evaluation_cost', None)
                if cost:
                    record[f"{label}_cost"] = cost

            except Exception as exc:
                # A failing metric must not abort the run: log it and store a
                # placeholder so the report still appears in the summary.
                print(f"Error evaluating {label} ({report_file.name}): {exc}")
                record[f"{label}_score"] = None
                record[f"{label}_success"] = False
                record[f"{label}_reason"] = f"Error: {str(exc)}"

        all_evaluation_results.append(record)

        # Persist this report's record immediately, one JSON file per report.
        record_path = EVAL_OUTPUT_DIR / f"{stage_name}_{report_file.stem}_evaluation.json"
        with record_path.open("w", encoding="utf-8") as fh:
            json.dump(record, fh, ensure_ascii=False, indent=2)

print(f"\nEvaluation completed, evaluated {len(all_evaluation_results)} reports")

## Results Summary and Analysis

In [None]:
# One row per evaluated report, one column per recorded field.
df_results = pd.DataFrame(all_evaluation_results)

# Show every column at full table width; cap individual cells at 100 characters.
pd.options.display.max_columns = None
pd.options.display.width = None
pd.options.display.max_colwidth = 100

print("Evaluation Results Summary:")
print(df_results.to_string())

# Persist the summary table as CSV (BOM-prefixed UTF-8).
csv_file = EVAL_OUTPUT_DIR / "evaluation_summary.csv"
df_results.to_csv(csv_file, encoding="utf-8-sig", index=False)
print(f"\nResults saved to: {csv_file}")

In [None]:
# Print score and reasoning of every metric for the first evaluated report.
if all_evaluation_results:
    first_result = all_evaluation_results[0]
    print(f"\nReport: {first_result['file']} (Stage: {first_result['stage']})")
    print("=" * 80)

    for metric in all_metrics:
        metric_name = metric.name
        score = first_result.get(f"{metric_name}_score")
        reason = first_result.get(f"{metric_name}_reason", "None")

        # One print call, one line per field, closed by a divider.
        print(
            f"\n【{metric_name}】",
            f"Score: {score}",
            f"Evaluation Reason: {reason}",
            "-" * 80,
            sep="\n",
        )


In [None]:
# Calculate statistics
score_columns = [col for col in df_results.columns if col.endswith("_score")]

# Failed evaluations are stored as None. A metric that failed for every report
# therefore produces an object-dtype column, which mean()/groupby().mean() may
# drop or reject depending on the pandas version. Coerce the scores to floats
# (None -> NaN) in a separate frame so df_results itself is left untouched.
numeric_scores = df_results[score_columns].apply(pd.to_numeric, errors="coerce")

print("\nAverage scores for all metrics across all stages:")
for col in score_columns:
    metric_name = col.replace("_score", "")
    avg_score = numeric_scores[col].mean()  # NaN when no report has a score
    print(f"{metric_name}: {avg_score:.3f}")

print("\nPass rates for each metric (>= threshold):")
success_columns = [col for col in df_results.columns if col.endswith("_success")]
for col in success_columns:
    metric_name = col.replace("_success", "")
    # Failed evaluations are recorded as success=False, so they count as not passed.
    pass_rate = df_results[col].sum() / len(df_results) * 100
    print(f"{metric_name}: {pass_rate:.1f}%")

# Statistics by stage
print("\nAverage score comparison by stage:")
if 'stage' in df_results.columns:
    # Group the coerced scores by the stage labels of the original frame.
    stage_stats = numeric_scores.groupby(df_results['stage']).mean()
    print(stage_stats.to_string())