In [None]:
import os, json, ast, time, math
import pandas as pd
import numpy as np
import openai, tiktoken
from PyPDF2 import PdfReader
from tqdm.auto import tqdm
from dotenv import load_dotenv
import matplotlib.pyplot as plt

# Read variables from a local .env file into the process environment, then
# hand the API key to the openai client (None when the variable is unset).
load_dotenv()
openai.api_key = os.environ.get("OPENAI_API_KEY")

# --- Input / output locations ---------------------------------------------
PDF_DIR = "data/tcfd_report_pdf_preprocessed"  # 54 PDFs
LABEL_XLSX = "data/tcfdÁ¨¨ÂõõÂ±§Êè≠Èú≤ÊåáÂºï.xlsx"
GROUND_TRUTH_XLSX = "data/answer/rank.xlsx"
OUT_DIR = "data/llm_question_answering_results"
os.makedirs(OUT_DIR, exist_ok=True)

# --- Model and chunking parameters ----------------------------------------
MODEL_NAME = "gpt-4o-mini"
TOKENIZER = tiktoken.encoding_for_model(MODEL_NAME)
CHUNK_TOKENS = 1500      # default window size (in tokens) for pdf_to_chunks
OVERLAP_TOKENS = 150     # default number of tokens shared by consecutive windows
MAX_WORKERS = 8          # not referenced by the cells visible in this notebook
SAVE_INTERVAL = 100      # buffered rows are appended to the CSV every N chunks


In [None]:
# Load the label sheet and keep only rows that carry both a label code and
# its definition.
df_labels = pd.read_excel(LABEL_XLSX)
df_labels = df_labels.dropna(subset=["Label", "Definition"])

# Label code -> definition text (a later row wins if a code is repeated).
label_mapping = {
    code: definition
    for code, definition in zip(df_labels["Label"], df_labels["Definition"])
}
all_label_list = list(label_mapping)
print(f"ÂÖ±ËºâÂÖ• {len(all_label_list)} ÂÄãÊ®ôÁ±§Ôºö", all_label_list[:10], "...")


ÂÖ±ËºâÂÖ• 91 ÂÄãÊ®ôÁ±§Ôºö ['G-1-1_1', 'G-1-1_2', 'G-1-2_3', 'G-1-2_4', 'G-1-3_5', 'G-2-1_6', 'G-2-1_7', 'G-2-1_8', 'G-2-2_9', 'G-2-2_10'] ...


In [None]:
def pdf_to_chunks(pdf_path: str,
                  max_tokens: int = CHUNK_TOKENS,
                  overlap_tokens: int = OVERLAP_TOKENS):
    """Lazily split the text of a PDF into overlapping token windows.

    The text of every page is extracted, joined with newlines, tokenized with
    ``TOKENIZER`` and cut into windows of at most ``max_tokens`` tokens, each
    window starting ``max_tokens - overlap_tokens`` tokens after the previous
    one.

    Args:
        pdf_path: Path of the PDF file to read.
        max_tokens: Maximum number of tokens per chunk (must be > 0).
        overlap_tokens: Tokens shared between consecutive chunks
            (must satisfy ``0 <= overlap_tokens < max_tokens``).

    Yields:
        ``(chunk_id, chunk_text)`` tuples, ``chunk_id`` counting from 0.
        Nothing is yielded when the PDF cannot be parsed (the error is
        printed) or when it contains no extractable text.

    Raises:
        ValueError: If the window parameters cannot make forward progress.
    """
    if max_tokens <= 0 or not 0 <= overlap_tokens < max_tokens:
        raise ValueError(
            "pdf_to_chunks requires max_tokens > 0 and "
            f"0 <= overlap_tokens < max_tokens (got {max_tokens}, {overlap_tokens})"
        )

    try:
        reader = PdfReader(pdf_path)
        pages = len(reader.pages)
        # extract_text() may return None for a page; treat that as empty text.
        full_text = "\n".join(p.extract_text() or "" for p in reader.pages)
    except Exception as e:
        # Best effort: report the broken file and yield nothing for it.
        print("‚ùå Ëß£ÊûêÂ§±Êïó:", pdf_path, e, flush=True)
        return

    # disallowed_special=() makes special-token look-alikes in the PDF text
    # encode as ordinary text instead of raising.
    tokens = TOKENIZER.encode(full_text, disallowed_special=())
    total_tok = len(tokens)
    step = max_tokens - overlap_tokens

    if total_tok == 0:
        est_chunks = 0
    elif total_tok <= max_tokens:
        est_chunks = 1
    else:
        est_chunks = math.ceil((total_tok - max_tokens) / step) + 1

    print(f"  üìÑ {os.path.basename(pdf_path)} | È†ÅÊï∏={pages} | tokens={total_tok} | ‰º∞Ë®à chunks={est_chunks}",
          flush=True)

    start = 0
    chunk_id = 0
    while start < total_tok:
        end = min(start + max_tokens, total_tok)
        yield chunk_id, TOKENIZER.decode(tokens[start:end])

        # Stop once the window has reached the end of the document. Without
        # this check, `start = end - overlap_tokens` stays below total_tok
        # forever and the final tail is re-yielded in an endless loop.
        if end >= total_tok:
            break

        chunk_id += 1
        start = end - overlap_tokens


In [None]:

def build_matched_label_json(label_list):
    """Serialize labels and their definitions as a JSON array string.

    Each element of ``label_list`` becomes an object with the keys ``label``
    and ``label_definition``; the definition is looked up in the module-level
    ``label_mapping`` and falls back to an empty string when the label is
    unknown. Non-ASCII characters are written as-is (``ensure_ascii=False``).
    """
    records = []
    for code in label_list:
        records.append(
            {"label": code, "label_definition": label_mapping.get(code, "")}
        )
    return json.dumps(records, ensure_ascii=False)

# JSON description of every loaded label, built once here and interpolated
# into the prompt of each LLM request.
ALL_LABEL_JSON = build_matched_label_json(all_label_list)
# Maximum number of API attempts per chunk in query_llm_for_verification.
MAX_RETRY = 5 

def query_llm_for_verification(chunk: str):
    """Ask the chat model which labels are disclosed in one report chunk.

    Builds a prompt containing the chunk text and ``ALL_LABEL_JSON`` and sends
    it to ``MODEL_NAME``. A failing request is retried up to ``MAX_RETRY``
    times with exponential backoff plus random jitter.

    Args:
        chunk: Text of the report chunk to evaluate.

    Returns:
        ``(content, latency_seconds)`` with the stripped reply text on
        success, or ``("Error", None)`` once every attempt has failed.
    """
    prompt = f"""
        ### ËÉåÊôØË≥áË®ä ###
        ‰Ω†ÊòØÊ∞£ÂÄôÁõ∏ÈóúË≤°ÂãôÊè≠Èú≤Ê®ôÊ∫ñÂ∞àÂÆ∂ÔºåÁÜüÊÇâ TCFD Á¨¨ÂõõÂ±§Êè≠Èú≤ÊåáÂºïÁöÑÊ®ôÊ∫ñËàáÂÆöÁæ©„ÄÇ‰Ω†Â∞áÊúÉÊî∂Âà∞‰∏ÄÊÆµÂ†±ÂëäÊõ∏ÂÖßÂÆπÔºå‰∏¶‰∏îÈúÄË¶ÅÂà§Êñ∑Ë©≤Â†±ÂëäÊõ∏ÂÖßÂÆπÊòØÂê¶Á¨¶ÂêàÊüê‰∫õÁâπÂÆöÊè≠Èú≤Ê®ôÊ∫ñ„ÄÇ

        Ë´ãÊ†πÊìö‰ª•‰∏ãÂ†±ÂëäÊõ∏ÂÖßÂÆπÈÄ≤Ë°åÂà§Êñ∑Ôºö
        {chunk}

        Ë´ãÂÉÖÈáùÂ∞ç‰ª•‰∏ãÊè≠Èú≤Ê®ôÊ∫ñÈÄ≤Ë°åË©ï‰º∞Ôºå‰∏çË¶ÅË©ï‰º∞ÊàñÂåÖÂê´ÂÖ∂‰ªñ‰ªª‰ΩïÊ®ôÊ∫ñÔºö
        {ALL_LABEL_JSON}

        ### ÂõûË¶ÜÊ†ºÂºè ###
        Ë´ãÂÉÖÂõûË¶ÜÁ¥î JSON Ê†ºÂºèÔºå‰∏çË¶ÅÂåÖÂê´‰ªª‰Ωï Markdown Ë™ûÊ≥ï„ÄÅÁ®ãÂºèÁ¢ºÂçÄÂ°äÊàñÈ°çÂ§ñË™™ÊòéÊñáÂ≠ó„ÄÇÊØèÂÄã JSON Áâ©‰ª∂ÂøÖÈ†àÂåÖÂê´‰ª•‰∏ãÊ¨Ñ‰ΩçÔºö
        1. chunk: stringÔºåÂ†±ÂëäÊõ∏ÂÖßÂÆπ„ÄÇ
        2. label: stringÔºåÂ∞çÊáâÁöÑÊè≠Èú≤Ê®ôÊ∫ñ‰ª£Á¢º„ÄÇ
        3. reason: stringÔºåË©≥Á¥∞Ë™™ÊòéÂà§Êñ∑ÁöÑÊé®ÁêÜÈÅéÁ®ãÔºåËß£ÈáãÁÇ∫‰ΩïË©≤Êè≠Èú≤Ê®ôÊ∫ñÊúâÊàñÊ≤íÊúâË¢´Êè≠Èú≤„ÄÇ
        4. is_disclosed: booleanÔºåËã•Â†±ÂëäÊõ∏‰∏≠ÊúâÊè≠Èú≤Ë©≤Ê®ôÊ∫ñÂâáÂõûË¶Ü 1ÔºõÊú™Êè≠Èú≤ÂâáÂõûË¶Ü 0„ÄÇ

        Ë´ãÂÉÖÈáùÂ∞çÊàëÊèê‰æõÁöÑÊ®ôÊ∫ñÂàóË°®‰∏≠ÁöÑÊ®ôÊ∫ñÊèê‰æõË©ï‰º∞Ôºå‰∏çË¶ÅÊ∑ªÂä†‰ªª‰ΩïÂÖ∂‰ªñÊ®ôÊ∫ñ„ÄÇ‰Ω†ÁöÑÂõûË¶ÜÊáâË©≤ÂåÖÂê´ÂêåÊ®£Êï∏ÈáèÁöÑ JSON Áâ©‰ª∂ÔºåÊØèÂÄãÂ∞çÊáâÂà∞ÊàëÊèê‰æõÁöÑ‰∏ÄÂÄãÊ®ôÊ∫ñ„ÄÇ

        [{', '.join([f'"{lb}"' for lb in all_label_list])}]
    """

    # Local estimate, used only for logging when the API reports no usage.
    # disallowed_special=() matches pdf_to_chunks: chunk text that happens to
    # contain special-token text must not raise here, outside the retry loop.
    est_in_tok = len(TOKENIZER.encode(prompt, disallowed_special=()))

    for retry in range(MAX_RETRY):
        try:
            start_t = time.time()
            resp = openai.chat.completions.create(
                model=MODEL_NAME,
                messages=[
                    {"role": "system", "content": "‰Ω†ÊòØÊ∞£ÂÄôÁõ∏ÈóúË≤°ÂãôÊè≠Èú≤Ê®ôÊ∫ñÂ∞àÂÆ∂„ÄÇ"},
                    {"role": "user",   "content": prompt}
                ],
                timeout=300,
            )
            latency = time.time() - start_t

            # Prefer the token counts reported by the API: they include the
            # system message and chat formatting, which the local estimate
            # of the user prompt alone does not.
            usage = getattr(resp, "usage", None)
            in_tok = usage.prompt_tokens if usage else est_in_tok
            out_tok = usage.completion_tokens if usage else "?"
            print(f"      ‚Ü™ LLM ok  | retry={retry} | in_tok={in_tok} | out_tok={out_tok} | "
                  f"latency={latency:.1f}s", flush=True)
            return resp.choices[0].message.content.strip(), latency
        except Exception as e:
            if retry == MAX_RETRY - 1:
                # Last attempt: report and give up without a pointless sleep.
                print(f"LLM failed on final attempt {retry+1}/{MAX_RETRY}: {e}", flush=True)
                break
            wait = (2 ** retry) + np.random.rand()
            print(f"‚ö†Ô∏è  LLM retry {retry+1}/{MAX_RETRY} after {wait:.1f}s ‚Üí {e}", flush=True)
            time.sleep(wait)
    return "Error", None


In [None]:
def _flush_rows(rows, path):
    """Append buffered result rows to the CSV at ``path`` (header only for a new file)."""
    pd.DataFrame(rows).to_csv(
        path, mode="a", index=False,
        header=not os.path.exists(path),
        encoding="utf-8",
    )


output_rows  = []
# NOTE(review): rows are appended to this file; if it is left over from an
# earlier run, the new rows are added after the old ones.
csv_path_tmp = os.path.join(OUT_DIR, "llm_raw_responses_tmp.csv")

pdf_files = sorted([f for f in os.listdir(PDF_DIR) if f.lower().endswith(".pdf")])
print("Âç≥Â∞áËôïÁêÜ PDF Êï∏ÈáèÔºö", len(pdf_files))

global_chunk_idx = 0

for pdf_idx, pdf_name in enumerate(tqdm(pdf_files, desc="PDF Ê™îÊ°à")):
    pdf_path = os.path.join(PDF_DIR, pdf_name)

    # Chunks are consumed as the generator produces them; they are never
    # materialized with list() first.
    for chunk_id, chunk_text in pdf_to_chunks(pdf_path):
        # Same encode options as pdf_to_chunks so special-token look-alikes
        # in the text cannot raise while measuring the length.
        tk_len = len(TOKENIZER.encode(chunk_text, disallowed_special=()))
        print(f"  ‚ñ∂ [{pdf_idx+1}/{len(pdf_files)}] {pdf_name} | local_chunk={chunk_id} "
            f"global={global_chunk_idx} | len_tok={tk_len}", flush=True)

        print(f"      ‚è≥ call LLM {time.strftime('%H:%M:%S')} ‚Ä¶", flush=True)
        response, latency = query_llm_for_verification(chunk_text)
        if latency is None:
            # All retries failed: latency is None, so it must not be
            # formatted with ':.1f'. The "Error" row is still recorded.
            print("    ‚úò LLM ÂõûÂÇ≥ Error", flush=True)
        else:
            print(f"      ‚úÖ done {time.strftime('%H:%M:%S')}  latency={latency:.1f}s", flush=True)

        output_rows.append(
            {
                "Filename": pdf_name,
                "Chunk_ID": chunk_id,
                "Chunk_Text": chunk_text,
                "LLM_Response": response,
            }
        )
        global_chunk_idx += 1

        # Periodic flush so a crash loses at most SAVE_INTERVAL chunks.
        if global_chunk_idx % SAVE_INTERVAL == 0:
            _flush_rows(output_rows, csv_path_tmp)
            print(f"üíæ flush @ {global_chunk_idx} chunks ‚Üí {csv_path_tmp}", flush=True)
            output_rows.clear()

    print(f"‚úÖ {pdf_name} ÂÆåÊàê | Á¥ØË®à chunks={global_chunk_idx}", flush=True)

# Write whatever is still buffered after the last full interval.
if output_rows:
    _flush_rows(output_rows, csv_path_tmp)
    print(f"üíæ ÊúÄÁµÇ flushÔºåÁ∏Ω chunks={global_chunk_idx}", flush=True)

print("üéâ ÂÖ®ÈÉ® PDF ËôïÁêÜÂÆåÊàêÔºÅCSV ‰ΩçÁΩÆÔºö", csv_path_tmp)


Âç≥Â∞áËôïÁêÜ PDF Êï∏ÈáèÔºö 54


PDF Ê™îÊ°à:   0%|          | 0/54 [00:00<?, ?it/s]

  üìÑ ‰∏äÊµ∑ÂïÜÈäÄ_2022.pdf | È†ÅÊï∏=47 | tokens=25412 | ‰º∞Ë®à chunks=19
  ‚ñ∂ [1/54] ‰∏äÊµ∑ÂïÜÈäÄ_2022.pdf | local_chunk=0 global=0 | len_tok=1500
      ‚Ü™ LLM ok  | retry=0 | in_tok=7043 | out_tok=1596 | latency=28.4s
  ‚ñ∂ [1/54] ‰∏äÊµ∑ÂïÜÈäÄ_2022.pdf | local_chunk=1 global=1 | len_tok=1500


In [None]:
def parse_llm_json(resp_str):
    """Parse one raw LLM reply into a list of result records.

    The reply is parsed as JSON first, so ``true`` / ``false`` / ``null`` are
    accepted; a surrounding Markdown code fence is removed beforehand. If JSON
    parsing fails, ``ast.literal_eval`` is tried so Python-literal style
    replies (single quotes) still work.

    Args:
        resp_str: Raw reply text. Non-string values (for example the NaN that
            ``read_csv`` produces for an empty cell) are treated as unparsable.

    Returns:
        A list of parsed records. A single top-level object is wrapped in a
        one-element list. An empty list is returned when the reply cannot be
        parsed or is not list-like.
    """
    if not isinstance(resp_str, str):
        return []

    text = resp_str.strip()

    # Drop a Markdown fence such as ```json ... ``` around the payload.
    if text.startswith("```"):
        first_newline = text.find("\n")
        text = text[first_newline + 1:] if first_newline != -1 else text[3:]
        text = text.strip()
        if text.endswith("```"):
            text = text[:-3].rstrip()

    try:
        parsed = json.loads(text)
    except ValueError:
        # Not valid JSON; fall back to Python literal syntax.
        try:
            parsed = ast.literal_eval(text)
        except Exception as e:
            print("‚ùå ÂõûÂÇ≥Ê†ºÂºèËß£ÊûêÂ§±ÊïóÔºö", e)
            return []

    if isinstance(parsed, dict):
        return [parsed]
    if isinstance(parsed, (list, tuple)):
        return list(parsed)
    return []

# Reload the raw responses from disk and parse each reply into a list of
# per-label records.
raw_df = pd.read_csv(csv_path_tmp)
raw_df["Parsed"] = raw_df["LLM_Response"].apply(parse_llm_json)

# One row per (chunk, label record); a chunk whose reply parsed to an empty
# list yields a single row with a missing record.
exploded_all = raw_df.explode("Parsed").reset_index(drop=True)
exploded_all["Label"] = exploded_all["Parsed"].apply(
    lambda rec: rec.get("label") if isinstance(rec, dict) else None
)
exploded_all["is_disclosed"] = exploded_all["Parsed"].apply(
    lambda rec: rec.get("is_disclosed") if isinstance(rec, dict) else None
)

# .copy() makes the subset an independent frame, so adding columns to
# exploded_df in later cells does not raise SettingWithCopyWarning.
exploded_df = exploded_all[
    ["Filename", "Chunk_ID", "Label", "is_disclosed"]
].copy()

print("Â±ïÈñãÂæåË≥áÊñôÁ≠ÜÊï∏Ôºö", len(exploded_df))
display(exploded_df.head(5))


In [None]:
def extract_institution(fn):
    """Return the part of a file name before its first '_'.

    Only the text before the first '.' is considered, so
    ``"Bank_2022.pdf"`` gives ``"Bank"``.
    """
    stem, _, _ = fn.partition('.')
    institution, _, _ = stem.partition('_')
    return institution

def extract_year(fn):
    """Return the second '_'-separated field of a file name, or None.

    Only the text before the first '.' is considered, so
    ``"Bank_2022.pdf"`` gives ``"2022"`` and ``"Bank.pdf"`` gives ``None``.
    """
    stem = fn.partition('.')[0]
    _, separator, remainder = stem.partition('_')
    if not separator:
        return None
    return remainder.partition('_')[0]

def _any_disclosed(answers):
    """Collapse the per-chunk answers of one report/label pair: 'Y' if any chunk says 'Y'."""
    return "Y" if (answers == "Y").any() else "N"


exploded_df["Institution"] = exploded_df["Filename"].map(extract_institution)
exploded_df["Year"] = exploded_df["Filename"].map(extract_year)
# Anything other than a value equal to 1 (including missing) counts as "N".
exploded_df["Answer"] = exploded_df["is_disclosed"].map(lambda x: "Y" if x == 1 else "N")

# A report is split into many chunks, so each (report, label) pair has one
# answer per chunk. The label counts as disclosed when ANY chunk reports it;
# taking the first value would keep only the first chunk and ignore the
# rest of the report.
pivot_df = (
    exploded_df
    .pivot_table(index=["Institution", "Year"], columns="Label",
                 values="Answer", aggfunc=_any_disclosed)
    .reset_index()
)
pivot_path = os.path.join(OUT_DIR, "pdf_direct_llm_pivot.csv")
pivot_df.to_csv(pivot_path, index=False, encoding="utf-8")
print("Â∑≤Áî¢Áîü pivot CSVÔºö", pivot_path)
display(pivot_df.head(3))


In [None]:
def _year_key(value):
    """Render a year as a plain digit string so 2022, 2022.0 and '2022' all match."""
    try:
        return str(int(float(value)))
    except (TypeError, ValueError, OverflowError):
        return str(value)


answer_df = pd.read_excel(GROUND_TRUTH_XLSX)
answer_df.columns = answer_df.columns.astype(str)
pivot_df.columns  = pivot_df.columns.astype(str)

# (institution, year) -> ground-truth row.
ans_lookup = {
    (str(r["Financial_Institutions"]), _year_key(r["Year"])): r
    for _, r in answer_df.iterrows()
}

# Label columns present in both the predictions and the ground truth.
common_q = [c for c in pivot_df.columns
            if c not in ("Institution", "Year") and c in answer_df.columns]

print(f"ÂÖ±ÂêåÊØîÂ∞çÊ¨Ñ‰ΩçÔºö{len(common_q)}")

total, correct = 0, 0
per_inst = {}
errors   = {}

for _, row in pivot_df.iterrows():
    inst  = str(row["Institution"])
    year  = _year_key(row["Year"])
    # NOTE(review): the previous code branched on an institution-name suffix
    # but both branches produced `inst`, so the name is used unchanged here.
    # Confirm whether a name normalization was intended.
    key   = (inst, year)
    if key not in ans_lookup:
        # No ground truth for this report; it is left out of the score.
        continue
    ans_row = ans_lookup[key]
    for q in common_q:
        pred = row[q]
        truth= ans_row[q]
        if pd.isna(truth):
            continue
        total += 1
        per_inst.setdefault(inst, {"correct":0,"total":0})
        per_inst[inst]["total"] += 1
        if pred == truth:
            correct += 1
            per_inst[inst]["correct"] += 1
        else:
            errors.setdefault(inst, []).append(
                {"Year": year, "Question": q,
                 "Pred": pred, "Truth": truth}
            )

# Guard the ratio: when no report matched the ground truth, total is 0.
overall_acc = f"{correct/total:.3%}" if total else "n/a"
print(f"ÂÖ®È´îÊ∫ñÁ¢∫ÁéáÔºö{correct}/{total} = {overall_acc}")
for inst, cnt in per_inst.items():
    acc = cnt["correct"]/cnt["total"] if cnt["total"] else np.nan
    print(f"{inst}: {acc:.3%}")

# Show at most five mismatches per institution.
for inst, err_list in errors.items():
    if err_list:
        print(f"\n{inst} Êúâ {len(err_list)} ÂÄã‰∏ç‰∏ÄËá¥Ê®£‰æãÔºö")
        for e in err_list[:5]:
            print(e)


In [None]:
# Per-label accuracy against the ground truth: for every label, the share of
# reports whose predicted answer equals the ground-truth answer. (Averaging
# is_disclosed over chunks would give a disclosure rate, not an accuracy.)
label_stats = {}   # label -> [n_correct, n_compared]
for _, row in pivot_df.iterrows():
    ans_row = ans_lookup.get((str(row["Institution"]), str(row["Year"])))
    if ans_row is None:
        continue
    for q in common_q:
        truth = ans_row[q]
        if pd.isna(truth):
            continue
        stat = label_stats.setdefault(q, [0, 0])
        stat[1] += 1
        if row[q] == truth:
            stat[0] += 1

label_acc = (
    pd.DataFrame(
        [{"Label": lb, "Accuracy": n_ok / n_all}
         for lb, (n_ok, n_all) in label_stats.items()],
        columns=["Label", "Accuracy"],
    )
    .sort_values("Label")
)

plt.rcParams["font.sans-serif"] = ["Microsoft JhengHei"]
plt.figure(figsize=(18,6))
plt.bar(label_acc["Label"], label_acc["Accuracy"])
plt.ylim(0,1)
plt.xlabel("È°åÁõÆÊ®ôÁ±§"); plt.ylabel("Ê≠£Á¢∫Áéá"); plt.title("ÂêÑÈ°åÁõÆÊ≠£Á¢∫Áéá")
plt.xticks(rotation=45, ha="right")
plt.tight_layout()
plt.show()
