In [2]:
import os
import ast
import pandas as pd
import os
import ast
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# 在某個 Cell (或在 load_answer 裡面) 建立對應表
name_map = {
    "台灣企銀": "臺企銀",
    # 如果未來還有其它名稱不一致，可在此陸續添加
    # "台企銀": "臺企銀",
    # "台北富邦銀行": "北富銀",
    # ...
}
def load_answer(institution, year, latest_answer_df=None, answer_path="data/answer/rank.xlsx"):
    """
    根據金融機構名稱與年度，從 rank.xlsx 中載入該機構當年的正解(ground truth)。
    如果已經有讀進來的 DataFrame (latest_answer_df)，可以傳進來，否則就自動讀取。
    """
    # 如果沒有傳入 DataFrame，就在函式內讀取
    if latest_answer_df is None:
        latest_answer_df = pd.read_excel(answer_path)

    # 1. 檢查是不是「金控」 => 符合你原先的特殊處理 (若有需要)
    if institution[2:4] == "金控":
        institution = institution[0:2] + "金"

    # 2. 檢查在 name_map 裡是否有對應
    if institution in name_map:
        institution = name_map[institution]

    # 確保 year 為數字
    year = int(year)

    # 3. 從最新的 answer_df 選出該機構 + 年度
    answer_for_institution = latest_answer_df[latest_answer_df['Financial_Institutions'] == institution]
    answer_for_institution = answer_for_institution[answer_for_institution['Year'] == year]

    if not answer_for_institution.empty:
        columns_to_print = answer_for_institution.loc[:, "Q1":"Q82"].iloc[0].to_dict()
    else:
        raise ValueError(f"No matching answers found for institution {institution} in year {year}")

    return columns_to_print


In [5]:
def create_accuracy_dataframe(answer, report_dict):
    """
    回傳一個 DataFrame：
    index = 問題 Q1 ~ Q82
    columns = ['Answer', 'Report_Match'] 分別表示「答案的值」和「報告是否偵測到對應類別」(0或1)
    """
    # 取出所有問題的 QID（字典的 key）
    question_ids = sorted(answer.keys())

    # 建立空的 DataFrame，以問題 id 當索引
    df = pd.DataFrame(index=question_ids, columns=['Answer', 'Report_Match'])

    # 'Answer' 欄位
    df['Answer'] = [answer[qid] if qid in answer else 0.0 for qid in question_ids]

    # 從 report_dict 中取得所有 matched_categories
    matched_categories = set()
    for entry in report_dict:
        if 'Matched_Categories' in entry:
            for category in entry['Matched_Categories']:
                clean_category = category.replace('#', '').split('_')[-1]  # 移除# 並取最後
                matched_categories.add(clean_category)

    # 'Report_Match' 欄位：如果 question_ids = "Q1", 實際檢查 "1" 是否在 matched_categories
    df['Report_Match'] = [
        1 if qid[1:] in matched_categories else 0 for qid in question_ids
    ]

    return df


In [10]:
def batch_accuracy_analysis(
    directory="data/tcfd_report_pdf_chunks_matching_result_第四層",
    answer_path="data/answer/rank.xlsx",
    output_dir="data/results"
):
    """
    批次讀取指定資料夾底下的所有 *matched_chunks.csv，
    計算每家金融機構對應年度的準確率，以及整體平均準確率，
    並將各種結果輸出到 output_dir (預設 data/results)。
    同時回傳:
      - results (list of dict) => 每份報告的詳細資訊
      - combined_df => 所有機構的題目粒度分析
      - question_accuracy => 針對每個題目的平均答對率 (Series)
      - results_summary => 每家機構(年)整體準確率 (DataFrame)
    """

    # 先將答案檔整份讀進 DataFrame，後續就不用每次重複讀取
    latest_answer_df = pd.read_excel(answer_path)

    # 儲存每份報告的結果
    results = []

    # 走訪資料夾中的所有 CSV 檔案
    for filename in os.listdir(directory):
        if filename.endswith("_matched_chunks.csv"):
            # 解析檔名，取得 [機構名稱, 年度]。格式例如：
            # "永豐銀行_2022_300_50_matched_chunks.csv"
            parts = filename.split("_")
            institution = parts[0]  # e.g. '永豐銀行'
            year = parts[1]        # e.g. '2022'

            # 讀取該 CSV 檔
            file_path = os.path.join(directory, filename)
            df_csv = pd.read_csv(file_path)

            # 轉成 list of dict 結構
            report_dict = []
            for _, row in df_csv.iterrows():
                report_dict.append({
                    'Filename': row['Filename'],
                    'Chunk_ID': row['Chunk_ID'],
                    'Chunk_Text': row['Chunk_Text'],
                    'Matched_Categories': ast.literal_eval(row['Matched_Categories']),
                    'Embedding': ast.literal_eval(row['Embedding'])
                })

            # 載入該機構、該年度的 ground truth
            try:
                answer_dict = load_answer(institution, year, latest_answer_df)
            except ValueError as e:
                # 若找不到答案，就跳過或印出警告
                print(e)
                continue

            # 建立題目分析 DataFrame
            df_ans = create_accuracy_dataframe(answer_dict, report_dict)

            # 計算準確率
            df_ans['correct'] = (df_ans['Answer'] == df_ans['Report_Match']).astype(int)
            accuracy = df_ans['correct'].mean()

            # 儲存結果
            results.append({
                'institution': institution,
                'year': year,
                'accuracy': accuracy,
                'df_detail': df_ans  # 後續若要做更多分析，可以保留
            })

    if not results:
        print("No results found. Please check your files and directory path.")
        return None, None, None, None

    # -----------------------
    # 列印並計算整體分析
    # -----------------------
    print("===== 各家金融機構（每份報告）準確率 =====")
    for res in results:
        print(f"{res['institution']}-{res['year']}: accuracy = {res['accuracy']:.2%}")

    overall_accuracy = sum(r['accuracy'] for r in results) / len(results)
    print("====================================")
    print(f"整體平均準確率: {overall_accuracy:.2%}\n")

    # 2. 合併所有機構資料（combined_df）
    combined_list = []
    for res in results:
        df_tmp = res['df_detail'].copy()
        df_tmp['institution'] = res['institution']
        df_tmp['year'] = res['year']
        df_tmp['QID'] = df_tmp.index  # 將索引 Q1, Q2, ... 移到新欄位
        combined_list.append(df_tmp)
    combined_df = pd.concat(combined_list, axis=0).reset_index(drop=True)

    # 3. 分析：哪些題目最容易錯 (question_accuracy)
    question_accuracy = combined_df.groupby('QID')['correct'].mean().sort_values()

    print("===== 各題平均答對率（由低到高） =====")
    for qid, acc in question_accuracy.items():
        print(f"{qid}: {acc:.2%}")

    # 4. 各家機構(每份報告)整體準確率 (results_summary)
    results_summary = pd.DataFrame([
        {
            "institution": r['institution'],
            "year": r['year'],
            "accuracy": r['accuracy']
        }
        for r in results
    ])
    # 計算整體平均
    overall_accuracy = results_summary['accuracy'].mean()

    # 在末端新增一列
    # 以下示範兩種方式：

    # 方式 A：直接用 loc 新增一列
    results_summary.loc[len(results_summary)] = ["ALL", "", overall_accuracy]

    # 或者 方式 B：concat 多一列 DataFrame
    # extra_row = pd.DataFrame([["ALL", "", overall_accuracy]], 
    #                          columns=["institution", "year", "accuracy"])
    # results_summary = pd.concat([results_summary, extra_row], ignore_index=True)

    # 將含有總平均的 results_summary 輸出到 CSV (覆蓋掉舊檔或另存新檔)
    results_summary.to_csv("results_summary_with_avg.csv", index=False, encoding='utf-8-sig')

    print("完成新增總平均並輸出檔案！")

    # -----------------------------------------------------------
    # 將結果輸出到 data/results (或自訂目錄) 以便後續分析/記錄
    # -----------------------------------------------------------
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # (a) 各家機構的詳細答題 CSV
    for r in results:
        out_name = f"{r['institution']}_{r['year']}_accuracy_detail.csv"
        out_path = os.path.join(output_dir, out_name)
        r['df_detail'].to_csv(out_path, index=True, encoding='utf-8-sig')

    # (b) combined_df => 彙整所有機構
    combined_out_path = os.path.join(output_dir, "combined_analysis.csv")
    combined_df.to_csv(combined_out_path, index=False, encoding='utf-8-sig')

    # (c) question_accuracy => 各題目的平均答對率
    qa_out_path = os.path.join(output_dir, "question_accuracy.csv")
    # question_accuracy 是一個 Series，直接轉成 DataFrame再輸出
    question_accuracy.to_frame("accuracy").to_csv(qa_out_path, index=True, encoding='utf-8-sig')

    # (d) results_summary => 各家機構(每份報告)的整體準確度
    results_summary_path = os.path.join(output_dir, "results_summary.csv")
    results_summary.to_csv(results_summary_path, index=False, encoding='utf-8-sig')

    return results, combined_df, question_accuracy, results_summary


# 執行範例
# results, combined_df, question_accuracy, results_summary = batch_accuracy_analysis()

In [8]:
def generate_plots(
    results_summary: pd.DataFrame,
    question_accuracy: pd.Series,
    output_dir: str = "data/results"
):
    """
    根據分析結果，產生多張圖表並輸出成 PNG。
      1. 各家金融機構(年)整體準確率 (Bar Chart)
      2. 各題平均答對率 (Bar Chart)
      3. 各題答對率分布 (Histogram / Boxplot)
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # ---- 1. 各家金融機構(每份報告)整體準確率 ----
    # 先合併 institution + year 做為一個標籤 (若有需要)
    results_summary = results_summary.copy()
    results_summary['institution_year'] = (
        results_summary['institution'].astype(str) + "_" + results_summary['year'].astype(str)
    )

    plt.figure(figsize=(8, 6))
    sns.barplot(
        data=results_summary,
        x='institution_year',
        y='accuracy',
        palette="Blues_d"
    )
    plt.xticks(rotation=45, ha='right')
    plt.ylim(0, 1)
    plt.xlabel("Institution_Year")
    plt.ylabel("Accuracy")
    plt.title("各家金融機構(年)整體準確率")
    plt.tight_layout()
    out_path_1 = os.path.join(output_dir, "institution_year_accuracy.png")
    plt.savefig(out_path_1, dpi=200)
    plt.close()

    # ---- 2. 各題平均答對率 (Bar Chart，從低到高) ----
    question_df = question_accuracy.reset_index()
    question_df.columns = ['QID', 'accuracy']  # 原先是 (index=QID, values=accuracy)
    question_df = question_df.sort_values('accuracy', ascending=True)

    plt.figure(figsize=(10, 6))
    sns.barplot(
        data=question_df,
        x='accuracy',
        y='QID',
        palette="Reds_r"
    )
    plt.xlim(0, 1)
    plt.xlabel("平均答對率")
    plt.ylabel("題目 (QID)")
    plt.title("各題平均答對率（由低到高）")
    plt.tight_layout()
    out_path_2 = os.path.join(output_dir, "question_accuracy_bar.png")
    plt.savefig(out_path_2, dpi=200)
    plt.close()

    # ---- 3. 各題答對率分布 (Histogram / Boxplot) ----
    plt.figure(figsize=(6, 4))
    sns.histplot(question_df['accuracy'], bins=10, color='green')
    plt.xlabel("答對率")
    plt.title("各題答對率分布 (Histogram)")
    plt.tight_layout()
    out_path_3 = os.path.join(output_dir, "question_accuracy_hist.png")
    plt.savefig(out_path_3, dpi=200)
    plt.close()

    # 也可以做 Boxplot
    plt.figure(figsize=(2, 6))
    sns.boxplot(y=question_df['accuracy'], color='lightblue')
    plt.ylim(0, 1)
    plt.title("各題答對率分布 (Boxplot)")
    plt.tight_layout()
    out_path_4 = os.path.join(output_dir, "question_accuracy_box.png")
    plt.savefig(out_path_4, dpi=200)
    plt.close()

    print(f"已在 {output_dir} 輸出多張分析圖表！")


In [11]:

# 執行主分析
results, combined_df, question_accuracy, results_summary = batch_accuracy_analysis(
    directory="data/tcfd_report_pdf_chunks_matching_result_第四層", 
    answer_path="data/answer/rank.xlsx",
    output_dir="data/results"
)

# 繪製圖表
if results is not None:
    generate_plots(
        results_summary=results_summary,
        question_accuracy=question_accuracy,
        output_dir="data/results"  # 可自訂
    )


===== 各家金融機構（每份報告）準確率 =====
上海商銀-2022: accuracy = 81.32%
中信金控-2021: accuracy = 35.16%
中信金控-2022: accuracy = 74.73%
中信銀行-2022: accuracy = 52.75%
中輸銀行-2022: accuracy = 69.23%
元大金控-2021: accuracy = 84.62%
元大金控-2022: accuracy = 80.22%
元大銀行-2022: accuracy = 79.12%
兆豐金控-2021: accuracy = 80.22%
兆豐金控-2022: accuracy = 83.52%
兆豐銀行-2022: accuracy = 72.53%
凱基銀行-2022: accuracy = 72.53%
北富銀-2022: accuracy = 72.53%
台中銀行-2022: accuracy = 71.43%
台新金控-2022: accuracy = 79.12%
台新銀行-2022: accuracy = 73.63%
台灣企銀-2021: accuracy = 67.03%
台灣企銀-2022: accuracy = 79.12%
合庫金控-2022: accuracy = 89.01%
國泰金控-2021: accuracy = 81.32%
國泰金控-2022: accuracy = 62.64%
國泰銀行-2022: accuracy = 82.42%
國票金控-2022: accuracy = 76.92%
安泰銀行-2022: accuracy = 78.02%
富邦金控-2020: accuracy = 71.43%
富邦金控-2021: accuracy = 69.23%
富邦金控-2022: accuracy = 90.11%
彰化銀行-2020: accuracy = 60.44%
彰化銀行-2021: accuracy = 56.04%
彰化銀行-2022: accuracy = 71.43%
新光金控-2021: accuracy = 74.73%
新光金控-2022: accuracy = 80.22%
新光銀行-2022: accuracy = 78.02%
星展銀行-2022: accur


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()

已在 data/results 輸出多張分析圖表！


  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.savefig(out_path_3, dpi=200)
  plt.savefig(out_path_3, dpi=200)
  plt.savefig(out_path_3, dpi=200)
  plt.savefig(out_path_3, dpi=200)
  plt.savefig(out_path_3, dpi=200)
  plt.savefig(out_path_3, dpi=200)
  plt.savefig(out_path_3, dpi=200)
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.savefig(out_path_4, dpi=200)
  plt.savefig(out_path_4, dpi=200)
  plt.savefig(out_path_4, dpi=200)
  plt.savefig(out_path_4, dpi=200)
  plt.savefig(out_path_4, dpi=200)
  plt.savefig(out_path_4, dpi=200)
  plt.savefig(out_path_4, dpi=200)
