In [2]:
import pandas as pd
import os
import json

# --- 1. Set file paths ---
# ✅ Please manually modify your input CSV file path
Document_ID = "SA-1"
input_csv_path = fr"C:\Users\User\Desktop\Microsoft_GraphRAG\UnstructuredData_Transformation_Pipeline\SPM\csv_data\{Document_ID}.csv"

# ✅ Please manually set your output CSV file path
output_csv_path = fr"C:\Users\User\Desktop\Microsoft_GraphRAG\UnstructuredData_Transformation_Pipeline\graphrag_csv\{Document_ID}.csv"


def aggregate_sections(df):
    """Collapse a flat table of document rows into one record per level-1 section.

    A row whose ``level`` equals 1 opens a new section. Every following row,
    until the next level-1 row, is folded into that section: its ``full_text``
    is appended (separated by a blank line) and the section's ``page_end`` is
    raised to the largest value seen. Rows that appear before the first
    level-1 row belong to no section and are ignored.

    Args:
        df: DataFrame with at least the columns ``level`` and ``full_text``.
            ``page_end`` is optional; it is only tracked when present.

    Returns:
        A list of dicts, one per section, in document order. Each dict holds
        the heading row's columns, with ``full_text`` replaced by the
        aggregated text and an extra ``section_title`` key containing the
        heading text. The list is empty when no level-1 row exists.

    Raises:
        KeyError: if ``level`` or ``full_text`` is missing from ``df``.
    """
    missing = [col for col in ("level", "full_text") if col not in df.columns]
    if missing:
        raise KeyError(f"Input data is missing required column(s): {missing}")

    track_pages = "page_end" in df.columns
    sections = []
    # Text fragments are collected per section and joined once at the end,
    # which avoids repeated string concatenation inside the loop.
    section_parts = []

    for record in df.to_dict("records"):
        raw_text = record["full_text"]
        # Empty CSV cells arrive as NaN; treat them as "no text" rather than
        # letting the literal string "nan" leak into the output. Non-string
        # values (e.g. a numeric-only heading) are converted to text.
        text = "" if pd.isna(raw_text) else str(raw_text)

        if record["level"] == 1:
            section = dict(record)
            section["section_title"] = text
            sections.append(section)
            section_parts.append([text] if text else [])
            continue

        if not sections:
            # Content ahead of the first level-1 heading has no section.
            continue

        if text:
            section_parts[-1].append(text)

        if track_pages:
            page = record["page_end"]
            current_page = sections[-1]["page_end"]
            # A NaN on either side makes a plain ">" comparison False, so a
            # missing page on the heading row must be handled explicitly.
            if not pd.isna(page) and (pd.isna(current_page) or page > current_page):
                sections[-1]["page_end"] = page

    for section, parts in zip(sections, section_parts):
        section["full_text"] = "\n\n".join(parts)

    return sections


def to_graphrag_frame(df_aggregated):
    """Project aggregated section records onto the columns written for GraphRAG.

    Args:
        df_aggregated: DataFrame of aggregated sections containing the columns
            ``full_text``, ``document_id``, ``doc_title``, ``version``,
            ``author`` and ``section_title``.

    Returns:
        A new DataFrame with the columns ``text``, ``doc_title`` (formatted as
        ``"<document_id>: <doc_title>"``), ``version``, ``author`` and
        ``section_title``, sharing the index of ``df_aggregated``.

    Raises:
        KeyError: if any required column is missing.
    """
    required = ("full_text", "document_id", "doc_title", "version", "author", "section_title")
    missing = [col for col in required if col not in df_aggregated.columns]
    if missing:
        raise KeyError(f"Aggregated data is missing required column(s): {missing}")

    # Pairwise formatting over two columns; same result as a row-wise apply
    # without building a Series for every row.
    combined_titles = [
        f"{doc_id}: {title}"
        for doc_id, title in zip(df_aggregated["document_id"], df_aggregated["doc_title"])
    ]

    return pd.DataFrame({
        # Core columns
        "text": df_aggregated["full_text"],
        "doc_title": combined_titles,
        # Metadata kept as separate columns
        "version": df_aggregated["version"],
        "author": df_aggregated["author"],
        "section_title": df_aggregated["section_title"],
    })


# Check if input file exists (a directory at that path is not a valid input)
if not os.path.isfile(input_csv_path):
    print(f"❌ Error: Input file not found {input_csv_path}")
else:
    # Ensure output directory exists
    output_dir = os.path.dirname(output_csv_path)
    if output_dir and not os.path.exists(output_dir):
        # exist_ok guards against the directory appearing between the check
        # above and this call.
        os.makedirs(output_dir, exist_ok=True)
        print(f"Created directory: {output_dir}")

    print(f"Reading file: {input_csv_path}")
    print(f"Output file: {output_csv_path}")

    # --- 2. Read CSV and aggregate ---
    df = pd.read_csv(input_csv_path)
    aggregated_rows = aggregate_sections(df)

    # --- 3. Convert to GraphRAG format ---
    if aggregated_rows:
        df_aggregated = pd.DataFrame(aggregated_rows)
        df_graphrag = to_graphrag_frame(df_aggregated)

        # --- 4. Save as GraphRAG formatted CSV ---
        df_graphrag.to_csv(output_csv_path, index=False, encoding='utf-8')
        print(f"✅ Success! Saved GraphRAG formatted data to: {output_csv_path}")

        # Display result in Notebook
        display(df_graphrag.head())
    else:
        print("ℹ️ No section with level = 1 found in the file.")

Reading file: C:\Users\User\Desktop\Microsoft_GraphRAG\UnstructuredData_Transformation_Pipeline\SPM\csv_data\SA-1.csv
Output file: C:\Users\User\Desktop\Microsoft_GraphRAG\UnstructuredData_Transformation_Pipeline\graphrag_csv\SA-1.csv
✅ Success! Saved GraphRAG formatted data to: C:\Users\User\Desktop\Microsoft_GraphRAG\UnstructuredData_Transformation_Pipeline\graphrag_csv\SA-1.csv


Unnamed: 0,text,doc_title,version,author,section_title
0,1. Supervisory framework\n\n1.1 Introduction\n...,SA-1: Risk-based Supervisory Approach,V.2-23.12.2022,HKMA,1. Supervisory framework
1,2. The eight types of inherent risk\n\n2.1 Cre...,SA-1: Risk-based Supervisory Approach,V.2-23.12.2022,HKMA,2. The eight types of inherent risk
2,3. Four elements of a sound risk management sy...,SA-1: Risk-based Supervisory Approach,V.2-23.12.2022,HKMA,3. Four elements of a sound risk management sy...
3,4. Rating risk management\n\n4.1 Factors consi...,SA-1: Risk-based Supervisory Approach,V.2-23.12.2022,HKMA,4. Rating risk management
