In [None]:
import json
import re
import unicodedata
import sys
# Install/upgrade vllm using the interpreter that runs this kernel (sys.executable), so the package
# lands in the environment the notebook actually imports from.
# NOTE(review): the version is not pinned, so re-running this cell later may pull a different vllm release.
!{sys.executable} -m pip install -U vllm -v --timeout 100

In [None]:
import os
# These flags are assigned before vllm is imported below; keep that order.
# NOTE(review): they are the Hugging Face offline switches, so MODEL_DIR presumably has to exist
# locally already — confirm before running on a fresh machine.
os.environ["HF_HUB_OFFLINE"] = "1"
os.environ["TRANSFORMERS_OFFLINE"] = "1"

# NOTE: MODEL_DIR is assigned a second time in the tokenizer cell further down; keep both in sync.
MODEL_DIR = "./Qwen2.5-14B-Instruct"  # Change to your actual model directory (with config.json)

from vllm import LLM
# Engine instance shared by every later cell (passed to generate_report_for_text as `llm`).
llm = LLM(
    model=MODEL_DIR,
    tensor_parallel_size=2,   # You have 2 A800 GPUs, recommended=2; if error occurs, change to 1 first
    dtype="auto",
    # Same value as the MAX_MODEL_LEN constant defined in the tokenizer cell; change both together.
    max_model_len=32768,
)

print("llm loaded")


In [None]:
!pip -q install -U pymupdf
import sys
!{sys.executable} -m pip -q install -U pandas
!pip -q install -U transformers

In [None]:
from pathlib import Path

# Directory containing annual report PDFs
INPUT_DIR = Path("./annual-reports_Microsoft")
# Output directory (change to yours); created up front so later cells can write into it
OUTPUT_DIR = Path("./annual-reports_Microsoft_output")
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

# Sort the matched PDFs so every run processes them in the same order
pdf_paths = list(INPUT_DIR.glob("*.pdf"))
pdf_paths.sort()

# Cell output: number of PDFs found and the first few file names
(len(pdf_paths), [pdf.name for pdf in pdf_paths[:5]])


In [None]:
import fitz  # pymupdf

def pdf_to_text(pdf_path: str) -> str:
    """
    Extract the plain text of a PDF, page by page.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        The text of all pages that contain non-whitespace text, joined by a blank line.
        Pages whose extracted text is empty or whitespace-only are skipped, so the result
        is an empty string when no page yields text.
    """
    # Open the document as a context manager so it is closed even if extraction raises;
    # the previous version never closed it, leaking one handle per processed PDF.
    with fitz.open(pdf_path) as doc:
        page_texts = (page.get_text("text") for page in doc)
        # Consume the generator inside the `with` block, while the document is still open.
        parts = [text for text in page_texts if text and text.strip()]
    return "\n\n".join(parts)


In [None]:
# ========== Instruction Definitions for Four Prompt Evolution Stages ==========
def get_instruction_by_stage(stage: int) -> str:
    """
    Look up the summarization instruction for one prompt-evolution stage.

    Stages build on each other:
      1 - baseline request only
      2 - adds the analyst role and the target audience
      3 - adds strict content constraints
      4 - adds a step-by-step writing process and plain-language requirements

    Raises ValueError when `stage` is not one of 1-4.
    """
    # Paragraphs shared by several stages
    role = "You are a professional financial analyst. Please write a financial report summary in English for non-professional investors."
    audience = "Target audience: Non-professional investors who are not familiar with financial terminology and need easy-to-understand explanations."
    closing = "Please summarize this annual financial report in English. Output approximately 500 words."

    # Constraint bullets shared by stages 3 and 4
    risk_rule = '- Must include risk warnings (Negative News) from the report. If the original text does not mention any risks, clearly state "Risk information not disclosed in the original report"'
    objective_rule = "- Ensure the summary is objective and comprehensive, without omitting important information"
    missing_rule = '- If certain information is not present in the original text, clearly mark it as "Not disclosed in the original report"'

    # Stage 3: constraint conditions and format specifications
    stage3_constraints = "\n".join([
        "Strict constraints:",
        risk_rule,
        "- Must include key financial indicators and data",
        objective_rule,
        missing_rule,
    ])

    # Stage 4: explicit writing steps plus plain-language constraints
    writing_process = "\n".join([
        "Writing process (please follow these steps):",
        'Step 1: Identify technical terms - Find professional financial terms in the report (such as "Deferred Revenue", "Goodwill Impairment", "EBITDA", etc.)',
        "Step 2: Understand term meanings - Understand the professional meaning and business logic of each term",
        'Step 3: Simplify technical terms - Rewrite professional terms into easy-to-understand language (e.g., "Deferred Revenue" → "Customers paid in advance but services haven\'t been delivered yet"; "Goodwill Impairment" → "The acquired company is worth less than expected")',
        "Step 4: Logical organization - Understand the logical relationships between data, identify growth points and risk points",
        "Step 5: Write the summary - Use simplified language to write an easy-to-understand report summary",
    ])
    stage4_constraints = "\n".join([
        "Strict constraints:",
        risk_rule,
        "- Must include key financial indicators and data, and explain their meanings in plain language",
        objective_rule,
        missing_rule,
        "- Avoid using technical terms. If necessary, explain them in parentheses when first mentioned",
    ])
    stage4_closing = "Please follow the above steps to summarize this annual financial report in English. Output approximately 500 words."

    catalogue = (
        (1, closing),
        (2, "\n\n".join([role, audience, closing])),
        (3, "\n\n".join([role, audience, stage3_constraints, closing])),
        (4, "\n\n".join([role, audience, writing_process, stage4_constraints, stage4_closing])),
    )
    for number, text in catalogue:
        if stage == number:
            return text

    raise ValueError(f"Stage must be an integer between 1-4, current value: {stage}")



In [None]:
from transformers import AutoTokenizer
from vllm import SamplingParams

# NOTE: MODEL_DIR is also assigned in the model-loading cell above; keep both values identical so the
# tokenizer loaded here belongs to the same model directory that `llm` was created from.
MODEL_DIR = "./Qwen2.5-14B-Instruct"  # Change to your model directory
# Module-level tokenizer used by chunk_by_tokens and generate_report_for_text for token counting.
# NOTE(review): trust_remote_code=True presumably allows custom code shipped in MODEL_DIR to run —
# confirm it is actually required for this model.
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, trust_remote_code=True)

# Same value as max_model_len passed to LLM(...) in the model-loading cell; change both together.
MAX_MODEL_LEN = 32768
# Subtracted from MAX_MODEL_LEN when generate_report_for_text decides whether a document fits in one prompt.
RESERVE_FOR_OUTPUT = 1500
def clean_text(t: str) -> str:
    """
    Tidy raw extracted text before it is tokenized.

    Steps, in order:
      1. NFKC-normalize the text.
      2. Replace control characters (everything below 0x20 except tab, LF and CR) with a space.
      3. Drop lines that consist only of digits and optional surrounding whitespace.
      4. Collapse runs of three or more newlines into exactly two.
    """
    normalized = unicodedata.normalize("NFKC", t)
    normalized = re.sub(r"[\x00-\x08\x0b\x0c\x0e-\x1f]", " ", normalized)

    kept_lines = []
    for line in normalized.splitlines():
        # A digits-only line is discarded; everything else is kept verbatim
        if re.fullmatch(r"\s*\d+\s*", line) is None:
            kept_lines.append(line)

    return re.sub(r"\n{3,}", "\n\n", "\n".join(kept_lines))

def chunk_by_tokens(text, chunk_tokens=6000, overlap_tokens=300):
    """
    Split `text` into overlapping windows measured in tokens of the module-level `tokenizer`.

    Args:
        text: Text to split.
        chunk_tokens: Maximum number of tokens per chunk; must be positive.
        overlap_tokens: Number of tokens shared by consecutive chunks; must satisfy
            0 <= overlap_tokens < chunk_tokens.

    Returns:
        List of decoded chunk strings in document order. Empty when `text` encodes to no tokens.

    Raises:
        ValueError: If the window parameters are invalid. Previously an overlap larger than the
            chunk size silently produced no chunks, and a negative overlap silently skipped
            tokens between chunks.
    """
    if chunk_tokens <= 0:
        raise ValueError(f"chunk_tokens must be positive, got {chunk_tokens}")
    if not 0 <= overlap_tokens < chunk_tokens:
        raise ValueError(
            f"overlap_tokens must satisfy 0 <= overlap_tokens < chunk_tokens, "
            f"got overlap_tokens={overlap_tokens}, chunk_tokens={chunk_tokens}"
        )

    ids = tokenizer.encode(text)
    step = chunk_tokens - overlap_tokens  # each window starts `step` tokens after the previous one
    chunks = []
    for start in range(0, len(ids), step):
        end = min(start + chunk_tokens, len(ids))
        chunks.append(tokenizer.decode(ids[start:end]))
        # Stop once a window reaches the end; later starts would only re-emit the overlap tail
        if end >= len(ids):
            break
    return chunks

def build_one_shot_prompt(annual_text: str, instruction: str, stage: int = 1) -> str:
    """
    Assemble the single-pass prompt: optional role prefix, instruction, document, language reminder.

    The role prefix is only added for stage 2 and above; any lower stage gets no prefix.
    """
    role_prefix = (
        "You are a rigorous financial analyst. "
        "You must write reports based only on the given original text.\n\n"
    )
    language_reminder = (
        "IMPORTANT: You MUST write your response entirely in English. "
        "Do not use Chinese or any other language. "
        "Write the entire summary in English only."
    )

    prefix = role_prefix if stage >= 2 else ""

    return (
        f"{prefix}Instruction:\n{instruction}\n\n"
        f"Original Text:\n{annual_text}\n\n"
        f"{language_reminder}\n"
    )
def extract_first_json(text: str) -> dict:
    """
    Return the first valid JSON object embedded in `text`.

    The text is scanned for '{' characters and a JSON object is decoded starting at each
    candidate until one succeeds. Anything after the decoded object is ignored. The previous
    greedy regex spanned from the first '{' to the last '}', so trailing braces or a second
    object in the model output made decoding fail.

    Args:
        text: Raw model output that should contain a JSON object.

    Returns:
        The decoded object as a dict.

    Raises:
        ValueError: If no position in `text` starts a valid JSON object.
    """
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            # raw_decode parses exactly one JSON value beginning at `start` and tolerates trailing text
            obj, _ = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            start = text.find("{", start + 1)
            continue
        return obj
    raise ValueError("No JSON object found in output")

def dedup_keep_order(items):
    """
    Strip each string in `items` and return the distinct non-empty results in first-seen order.

    Entries that are not strings are ignored.
    """
    # A dict keeps insertion order, so its keys double as an ordered set
    ordered_unique = {}
    for item in items:
        if isinstance(item, str):
            stripped = item.strip()
            if stripped:
                ordered_unique.setdefault(stripped, None)
    return list(ordered_unique)

def normalize_card(card: dict) -> dict:
    """
    De-duplicate the string-list fields of an extraction card in place and return the same card.

    Only "facts", "major_events", "risks" and "quotes" are touched, and only when the field is
    present and holds a list; every other field is left as it is.
    """
    list_fields = ("facts", "major_events", "risks", "quotes")
    for field in list_fields:
        value = card.get(field)
        if isinstance(value, list):
            card[field] = dedup_keep_order(value)
    return card

def build_map_prompts(chunks, stage: int = 1):
    """
    Create one extraction prompt per chunk for the map step.

    Chunk ids are numbered from "c001" in input order. Stages 1-2 use the basic risk bullet;
    any higher stage uses the stricter wording that asks the model to mark segments without risks.
    """
    # The risk bullet depends only on the stage, so pick it once for all chunks
    if stage <= 2:
        risk_instruction = "risks: 0~5 risk points"
    else:
        risk_instruction = "risks: 0~5 risk points (must extract, if no risks please mark 'No risks mentioned in this segment')"

    def render(position, segment):
        """Fill the extraction template for a single chunk."""
        return "\n".join([
            "You will receive a segment of annual report text. Please strictly extract information only based on the original text, do not fabricate.",
            'Output only one JSON object (no markdown, no explanations, no "please confirm").',
            "Output the JSON marker <END_JSON> immediately after the JSON and stop.",
            "",
            "JSON fields as follows:",
            f'- chunk_id: "c{position:03d}"',
            "- facts: 3~8 key facts (preferably include numbers)",
            '- financial_metrics: If financial indicators appear, output as an array, e.g., {"name":"revenue","value":"...","period":"..."}',
            "- major_events: 0~5 major events",
            f"- {risk_instruction}",
            "- quotes: 2~5 short quotes from the original text (to support facts/numbers)",
            "",
            "Original Text:",
            f"{segment}",
            "<END_JSON>",
        ])

    return [render(position, segment) for position, segment in enumerate(chunks, 1)]


def build_reduce_prompt(cards, instruction: str, stage: int = 1) -> str:
    """
    Assemble the reduce-step prompt that turns the extracted cards into the final report.

    `cards` is a sequence of JSON strings, placed one per line under "Cards:". The rule list
    grows with the stage: stage 1 has the basic rules, stage 2 adds plain language, stage 3
    adds mandatory risks and key indicators, and any other value gets the stage-4 rules with
    the term-popularization requirement.
    """
    # Individual rule bullets, combined per stage below
    grounded = "- Only use facts/numbers/events that appear in the cards; do not add or guess."
    missing = '- If information is missing, write "Not disclosed in the original report".'
    missing_other = '- If other information is missing, write "Not disclosed in the original report".'
    no_repeat = "- No repetition: Do not repeat the same sentence/fact."
    plain_no_jargon = "- Use easy-to-understand language, avoid technical terms, target audience is non-professional investors."
    plain_only = "- Use easy-to-understand language, target audience is non-professional investors."
    risks_required = '- Must include risk information: If there are risk warnings in the cards, include all of them; if there is no risk information, clearly state "Risk information not disclosed in the original report".'
    metrics_required = "- Must include key financial indicators and data."
    metrics_explained = "- Must include key financial indicators and data, and explain their meanings in plain language."
    popularize = '- Popularize technical terms: Rewrite technical terms into easy-to-understand language (e.g., "Deferred Revenue" → "Customers paid in advance but services haven\'t been delivered yet"; "Goodwill Impairment" → "The acquired company is worth less than expected").'

    if stage == 1:
        # Stage 1: basic rules
        bullets = [grounded, missing, no_repeat]
    elif stage == 2:
        # Stage 2: add the reader perspective
        bullets = [grounded, missing, plain_no_jargon, no_repeat]
    elif stage == 3:
        # Stage 3: risks and key indicators become mandatory
        bullets = [grounded, risks_required, metrics_required, missing_other, plain_no_jargon, no_repeat]
    else:
        # Stage 4 (and any other value): add popularization requirements
        bullets = [grounded, risks_required, metrics_explained, popularize, missing_other, plain_only, no_repeat]

    rules = "\n".join(["Strict rules:", *bullets])
    cards_block = "\n".join(cards)

    sections = [
        "You will receive several JSON cards (extracted results from the annual report text).\n"
        "Please strictly follow the Instruction to write the final short report.",
        rules,
        f"Instruction:\n{instruction}",
        f"Cards:\n{cards_block}",
        "IMPORTANT: You MUST write your response entirely in English. Do not use Chinese or any other language. Write the entire report summary in English only.",
    ]
    return "\n\n".join(sections)



In [None]:
def _map_sampling(repetition_penalty: float) -> SamplingParams:
    """Build a map-step preset; the first attempt and the retry differ only in repetition penalty."""
    return SamplingParams(
        temperature=0.0,
        max_tokens=650,  # Don't make it too large, avoid going off track
        repetition_penalty=repetition_penalty,
        stop=["<END_JSON>"],  # generation ends at the marker the map prompt asks for
    )


# Single-pass summary, used when the whole document fits into one prompt
ONE_SHOT_SAMPLING = SamplingParams(temperature=0.2, max_tokens=1100, repetition_penalty=1.08)

# Per-chunk JSON extraction, first attempt
MAP_SAMPLING = _map_sampling(1.15)

# Per-chunk JSON extraction, retry after a parse failure (stronger repetition penalty)
MAP_RETRY_SAMPLING = _map_sampling(1.25)

# Final report written from the extracted cards
REDUCE_SAMPLING = SamplingParams(temperature=0.15, max_tokens=1200, repetition_penalty=1.12)

def generate_report_for_text(annual_text: str, llm, instruction: str,
                             chunk_tokens=6000, overlap_tokens=300, stage: int = 1):
    """
    Generate a short report for one annual report, in one pass when it fits, else via map-reduce.

    Args:
        annual_text: Cleaned text of the annual report.
        llm: Engine object exposing `generate(prompts, sampling_params)`.
        instruction: Stage-specific instruction text inserted into the prompts.
        chunk_tokens: Chunk size in tokens for the map-reduce path.
        overlap_tokens: Token overlap between consecutive chunks.
        stage: Prompt evolution stage (1-4), forwarded to the prompt builders.

    Returns:
        (report_text, meta). `meta` always has "mode", "total_tokens" (tokens of `annual_text`
        only), "chunks" and "stage"; the map-reduce path adds "bad_cards", the number of chunks
        whose JSON could not be parsed even after one retry.
    """
    total_tokens = len(tokenizer.encode(annual_text))
    prompt_budget = MAX_MODEL_LEN - RESERVE_FOR_OUTPUT

    # The document alone is a cheap lower bound. Only when it fits is the full prompt built and
    # measured, because the instruction and prompt scaffold also consume context. Checking the
    # document alone let near-limit documents eat into the space reserved for the output.
    if total_tokens <= prompt_budget:
        prompt = build_one_shot_prompt(annual_text, instruction, stage=stage)
        if len(tokenizer.encode(prompt)) <= prompt_budget:
            out = llm.generate([prompt], ONE_SHOT_SAMPLING)[0].outputs[0].text
            return out, {"mode": "one_shot", "total_tokens": total_tokens, "chunks": 0, "stage": stage}

    chunks = chunk_by_tokens(annual_text, chunk_tokens=chunk_tokens, overlap_tokens=overlap_tokens)
    map_prompts = build_map_prompts(chunks, stage=stage)

    def _parse_card(raw_text):
        """Return the parsed card, or None when the output holds no usable JSON."""
        try:
            return extract_first_json(raw_text)
        except Exception:
            # Any parse problem is treated the same way: retry once, then use a placeholder
            return None

    map_outputs = llm.generate(map_prompts, MAP_SAMPLING)
    cards_dicts = [_parse_card(o.outputs[0].text) for o in map_outputs]

    # Retry all failed chunks in a single batched call instead of one call per failure
    failed = [pos for pos, card in enumerate(cards_dicts) if card is None]
    if failed:
        retry_outputs = llm.generate([map_prompts[pos] for pos in failed], MAP_RETRY_SAMPLING)
        for pos, retry in zip(failed, retry_outputs):
            cards_dicts[pos] = _parse_card(retry.outputs[0].text)

    bad = 0
    for pos, card in enumerate(cards_dicts):
        chunk_id = f"c{pos + 1:03d}"
        if card is None:
            # Keep a placeholder so the reduce step sees that this chunk yielded nothing
            bad += 1
            card = {
                "chunk_id": chunk_id,
                "facts": [],
                "financial_metrics": [],
                "major_events": [],
                "risks": [],
                "quotes": [],
                "missing": ["Chunk extraction failed"]
            }
        card = normalize_card(card)
        card.setdefault("chunk_id", chunk_id)
        cards_dicts[pos] = card

    cards_clean_json = [json.dumps(c, ensure_ascii=False) for c in cards_dicts]

    reduce_prompt = build_reduce_prompt(cards_clean_json, instruction, stage=stage)
    report = llm.generate([reduce_prompt], REDUCE_SAMPLING)[0].outputs[0].text

    return report, {"mode": "map_reduce", "total_tokens": total_tokens, "chunks": len(chunks), "bad_cards": bad, "stage": stage}



In [None]:
from tqdm.auto import tqdm
import json

# ========== Configuration: Select Stages to Run ==========
# Can be set to 1, 2, 3, 4 or [1, 2, 3, 4] to run single or multiple stages
STAGES_TO_RUN = [1, 2, 3, 4]  # Run all four stages


# Stage name mapping (also used as the per-stage output sub-directory name)
STAGE_NAMES = {
    1: "stage1_baseline",
    2: "stage2_role_background",
    3: "stage3_constraints",
    4: "stage4_cot_laysummary"
}

# The comment above allows a bare int; wrap it so the loop below does not fail on `for ... in 1`
stages_to_run = [STAGES_TO_RUN] if isinstance(STAGES_TO_RUN, int) else list(STAGES_TO_RUN)

# Fail before any generation starts if an unknown stage was requested
unknown_stages = [s for s in stages_to_run if s not in STAGE_NAMES]
if unknown_stages:
    raise ValueError(f"Unknown stage(s) in STAGES_TO_RUN: {unknown_stages}; valid stages: {sorted(STAGE_NAMES)}")

results_all_stages = {}

# Cleaned text per PDF. Extraction does not depend on the stage, so each PDF is read and cleaned
# at most once instead of once per stage.
annual_text_cache = {}

for stage in stages_to_run:
    print(f"\n{'='*60}")
    print(f"Starting processing stage {stage}: {STAGE_NAMES[stage]}")
    print(f"{'='*60}\n")

    # Get instruction for current stage
    instruction = get_instruction_by_stage(stage)

    # Create independent output directory for each stage
    stage_output_dir = OUTPUT_DIR / STAGE_NAMES[stage]
    stage_output_dir.mkdir(parents=True, exist_ok=True)

    results = []

    for pdf_path in tqdm(pdf_paths, desc=f"Stage {stage}"):
        out_txt = stage_output_dir / (pdf_path.stem + "_short_report.txt")
        out_meta = stage_output_dir / (pdf_path.stem + "_meta.json")

        # Skip if results already exist (can delete this section to allow overwriting).
        # Both files are required, so a run interrupted between the two writes is redone.
        if out_txt.exists() and out_meta.exists():
            continue

        if pdf_path not in annual_text_cache:
            annual_text_cache[pdf_path] = clean_text(pdf_to_text(str(pdf_path)))
        annual_text = annual_text_cache[pdf_path]

        report, meta = generate_report_for_text(annual_text, llm, instruction, stage=stage)

        out_txt.write_text(report, encoding="utf-8")
        out_meta.write_text(json.dumps({"file": pdf_path.name, **meta}, ensure_ascii=False, indent=2), encoding="utf-8")
        results.append({"file": pdf_path.name, **meta})

    results_all_stages[stage] = results
    print(f"\nStage {stage} completed, processed {len(results)} files\n")

# Display result summary for all stages (only files generated in this run are counted)
print(f"\n{'='*60}")
print("All stages processing completed! Result summary:")
print(f"{'='*60}\n")
for stage, results in results_all_stages.items():
    print(f"Stage {stage} ({STAGE_NAMES[stage]}): {len(results)} files")
    if results:
        print(f"  Example: {results[0]['file']} - {results[0]['mode']} - tokens: {results[0]['total_tokens']}")
    print()
