In [2]:
import pandas as pd
import os
import json

# --- 1. Set file paths ---
# ✅ Please manually modify your input CSV file path
Document_ID = "g33a"
input_csv_path = fr"C:\Users\User\Desktop\Compliance-GraphRAG\UnstructuredData_Transformation_Pipeline\SPM\csv_data\{Document_ID}.csv"

# ✅ Please manually set your output CSV file path
output_csv_path = fr"C:\Users\User\Desktop\Compliance-GraphRAG\UnstructuredData_Transformation_Pipeline\graphrag_csv\{Document_ID}.csv"


# Check if input file exists
if not os.path.exists(input_csv_path):
    print(f"❌ Error: Input file not found {input_csv_path}")
else:
    # Ensure output directory exists
    output_dir = os.path.dirname(output_csv_path)
    if output_dir and not os.path.exists(output_dir):
        os.makedirs(output_dir)
        print(f"Created directory: {output_dir}")

    print(f"Reading file: {input_csv_path}")
    print(f"Output file: {output_csv_path}")

    # --- 2. Read CSV and aggregate ---
    df = pd.read_csv(input_csv_path)
    
    aggregated_rows = []
    current_section_row = None

    for index, row in df.iterrows():
        # 確保 level 是數字進行比較
        try:
            is_level_one = int(row['level']) == 1
        except:
            is_level_one = False

        if is_level_one:
            if current_section_row is not None:
                aggregated_rows.append(current_section_row)
            current_section_row = row.to_dict()
            
            # --- ✅ 修正邏輯開始 ---
            # 1. 安全取得字串 (避免 "nan")
            cid = str(row['clause_id']) if pd.notna(row.get('clause_id')) else ""
            title = str(row['title']) if pd.notna(row.get('title')) else ""

            if cid.endswith(".0"):
                cid = cid[:-2]
            
            cid = cid.strip()
            title = title.strip()

            # 2. 根據 ID 是否為數字開頭決定格式
            if cid and cid[0].isdigit():
                # 數字開頭 -> 加點 (例如: "1. Introduction")
                current_section_row['section_title'] = f"{cid}. {title}".strip()
            else:
                # 文字開頭 -> 空白 (例如: "ANNEX 1 Repealed")
                current_section_row['section_title'] = f"{cid} {title}".strip()
            # --- ✅ 修正邏輯結束 ---

        elif current_section_row is not None:
            # 將子章節內容合併到主章節
            current_section_row['full_text'] += "\n\n" + str(row['full_text'])
            # 更新結束頁碼
            if row['page_end'] > current_section_row['page_end']:
                current_section_row['page_end'] = row['page_end']

    # 加入最後一筆
    if current_section_row is not None:
        aggregated_rows.append(current_section_row)

    # --- 3. Convert to GraphRAG format ---
    if aggregated_rows:
        df_aggregated = pd.DataFrame(aggregated_rows)
        
        # Create columns required by GraphRAG
        df_graphrag = pd.DataFrame()
        
        # Core columns
        df_graphrag['text'] = df_aggregated['full_text']
        df_graphrag['doc_title'] = df_aggregated.apply(
            lambda row: f"{row['document_id']}: {row['doc_title']}", 
            axis=1
        )
        
        # Split metadata into separate columns
        df_graphrag['version'] = df_aggregated['version']
        df_graphrag['author'] = df_aggregated['author']
        df_graphrag['section_title'] = df_aggregated['section_title']

        # --- 4. Save as GraphRAG formatted CSV ---
        df_graphrag.to_csv(output_csv_path, index=False, encoding='utf-8')
        print(f"✅ Success! Saved GraphRAG formatted data to: {output_csv_path}")
        
        # Display result in Notebook
        display(df_graphrag.head())
    else:
        print("ℹ️ No section with level = 1 found in the file.")

Reading file: C:\Users\User\Desktop\Compliance-GraphRAG\UnstructuredData_Transformation_Pipeline\SPM\csv_data\g33a.csv
Output file: C:\Users\User\Desktop\Compliance-GraphRAG\UnstructuredData_Transformation_Pipeline\graphrag_csv\g33a.csv
✅ Success! Saved GraphRAG formatted data to: C:\Users\User\Desktop\Compliance-GraphRAG\UnstructuredData_Transformation_Pipeline\graphrag_csv\g33a.csv


Unnamed: 0,text,doc_title,version,author,section_title
0,1. Introduction\n\n1.1 The current HKMA Guidel...,g33a: nan,,,1. Introduction
1,2. Customer acceptance policy\n\n2.1 This is a...,g33a: nan,,,2. Customer acceptance policy
2,3. Customer due diligence\n\n3.1 This section ...,g33a: nan,,,3. Customer due diligence
3,4. Corporate customers\n\n4.1 This section sup...,g33a: nan,,,4. Corporate customers
4,5. Trust and nominee accounts\n\n5.1 This sect...,g33a: nan,,,5. Trust and nominee accounts
