## 1. Import Required Libraries

In [24]:
import os
import json
from pathlib import Path
from dotenv import load_dotenv

from openai import OpenAI

# Import our config
import sys
sys.path.append(str(Path.cwd().parent))
from src.config import load_config

## 2. Load Configuration and Environment Variables

In [25]:
# Load environment variables from .env file
project_root = Path.cwd().parent
env_path = project_root / ".env"

if not env_path.exists():
    raise FileNotFoundError(
        f"'.env' file not found at {env_path}\n"
        "Please copy .env.example to .env and add your OpenAI API key."
    )

# Load from specific path
load_dotenv(env_path, override=True)

# Load project config
config = load_config()

# Get OpenAI API key from environment
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Verify API key is set
if not OPENAI_API_KEY:
    raise ValueError("Missing required environment variable: OPENAI_API_KEY")

print("✓ Environment variables loaded")
print(f"  API Key: {OPENAI_API_KEY[:10]}...{OPENAI_API_KEY[-4:]}")

✓ Environment variables loaded
  API Key: sk-proj-cj...__0A


## 3. Set Up Paths

In [26]:
# Get paths
md_dir = project_root / config.paths.markdown
prompts_dir = project_root / "prompts"

# Target markdown file
target_file = md_dir / "P075941_1.md"
project_id = "P075941"

print(f"Markdown directory: {md_dir}")
print(f"Prompts directory: {prompts_dir}")
print(f"Target file: {target_file}")
print(f"File exists: {target_file.exists()}")

Markdown directory: /Users/lauren/repos/PAD2Skills/data/silver/pads_md
Prompts directory: /Users/lauren/repos/PAD2Skills/prompts
Target file: /Users/lauren/repos/PAD2Skills/data/silver/pads_md/P075941_1.md
File exists: True


## 4. Load Markdown Content

In [27]:
# Load markdown file
if not target_file.exists():
    raise FileNotFoundError(f"Markdown file not found: {target_file}")

markdown_content = target_file.read_text(encoding="utf-8")

# Prepare input message for custom GPT
input_message = f"Project ID: {project_id}\n\n{markdown_content}"

print(f"✓ Markdown content loaded ({len(markdown_content)} chars)")
print(f"Input message length: {len(input_message)} chars")
print(f"\nMarkdown preview (first 500 chars):")
print("=" * 60)
print(markdown_content[:500])
print("...")

✓ Markdown content loaded (736923 chars)
Input message length: 736944 chars

Markdown preview (first 500 chars):
Public Disclosure Authorized

Public Disclosure Authorized

Public Disclosure Authorized Public Disclosure Authorized

Public Disclosure Authorized

Public Disclosure Authorized

Document of The World Bank

## FOR OFFICIAL USE ONLY

## INTERNATIONAL DEVELOPMENT ASSOCIATION

## PROJECT APPRAISAL DOCUMENT

ON  A PROPOSED CREDIT IN THE AMOUNT OF  SDR 37.80  MILLION (US$56.65 MILLION EQUIVALENT) AND A PROPOSED GRANT IN THE AMOUNT OF SDR 37.80 MILLION (US$ 56.65 MILLION EQUIVALENT) TO  THE  REPUBLIC 
...


## 5. Initialize OpenAI Client

In [28]:
# Initialize OpenAI client
client = OpenAI()

print("✓ OpenAI client initialized")

✓ OpenAI client initialized


## 6. Send Request to Identify Document Sections

In [None]:
print("Sending request to custom GPT...")
print(f"Input length: {len(input_message)} chars")
print()

# Call custom GPT with prompt ID
response = client.responses.create(
    prompt={
        "id": "pmpt_6950b4992fcc8194b89fc2d87be08bf8088afcd3c3f3a4d7",
        "version": "6"
    },
    input=[
        {"role": "user", "content": input_message}
    ],
    reasoning={
        "summary": "auto"
    },
    store=True,
    include=[
        "reasoning.encrypted_content",
        "web_search_call.action.sources"
    ]
)

Sending request to custom GPT...
Input length: 736944 chars

✓ Response received
  Response ID: resp_0731195c302b01bb006951bbf10bdc8196ab439d52324f7d54
  Status: completed


In [37]:
# Extract the text from the response
# response.output is a list: [ResponseReasoningItem, ResponseOutputMessage]
# The message content is in output[1].content[0].text
for item in response.output:
    if hasattr(item, 'content') and hasattr(item, 'role'):
        result = item.content[0].text
        break

print("✓ Response received")
print(f"  Response ID: {response.id}")
print(f"  Status: {response.status}")


✓ Response received
  Response ID: resp_0731195c302b01bb006951bbf10bdc8196ab439d52324f7d54
  Status: completed


## 7. Save Results to File

In [38]:
# Save result directly to JSON file
output_dir = project_root / "data" / "silver" / "document_sections"
output_dir.mkdir(parents=True, exist_ok=True)

output_file = output_dir / f"{project_id}_sections.json"

with open(output_file, 'w', encoding='utf-8') as f:
    f.write(result)

print(f"✓ Saved sections to: {output_file}")
print(f"  File size: {output_file.stat().st_size / 1024:.2f} KB")


✓ Saved sections to: /Users/lauren/repos/PAD2Skills/data/silver/document_sections/P075941_sections.json
  File size: 2.73 KB


## 8. Read and Parse Section JSON

In [39]:
# Read and parse the sections JSON file
with open(output_file, 'r', encoding='utf-8') as f:
    sections_data = json.load(f)

sections = sections_data['sections']

print(f"✓ Loaded {len(sections)} sections")
for section in sections[:5]:  # Show first 5
    print(f"  {section['section_id']}: {section['section_title']}")
if len(sections) > 5:
    print(f"  ... and {len(sections) - 5} more")


✓ Loaded 16 sections
  0: STRATEGIC CONTEXT
  1: PROJECT DEVELOPMENT OBJECTIVES
  2: PROJECT DESCRIPTION
  3: IMPLEMENTATION
  4: KEY RISKS AND MITIGATION MEASURES
  ... and 11 more


## 9. Split Markdown into Section Chunks

In [41]:
import re

def to_snake_case(text):
    """Convert text to lower snake case"""
    # Replace spaces and special chars with underscores
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', '_', text)
    return text.lower()

def normalize_whitespace(text):
    """Normalize multiple spaces to single space"""
    return re.sub(r'\s+', ' ', text)

def find_header_in_markdown(markdown, header_text):
    """Find header in markdown, handling whitespace differences"""
    # Try exact match first
    pos = markdown.find(header_text)
    if pos != -1:
        return pos
    
    # Try with normalized whitespace
    normalized_header = normalize_whitespace(header_text)
    
    # Search using regex to match any amount of whitespace
    pattern = re.escape(normalized_header).replace(r'\ ', r'\s+')
    match = re.search(pattern, markdown)
    
    if match:
        return match.start()
    
    return -1

# Create output directory for chunks
chunks_dir = md_dir.parent / "pads_md_chunks"
chunks_dir.mkdir(parents=True, exist_ok=True)

print(f"Splitting markdown into {len(sections)} section chunks...")
print(f"Output directory: {chunks_dir}")
print()

# Split the markdown by sections
saved_chunks = []
for i, section in enumerate(sections):
    header_text = section['header_text']
    section_id = section['section_id']
    section_title = section['section_title']
    
    # Find the start position of this section (with fuzzy whitespace matching)
    start_pos = find_header_in_markdown(markdown_content, header_text)
    
    if start_pos == -1:
        print(f"⚠ Warning: Could not find header '{header_text}' in markdown")
        continue
    
    # Find the end position (start of next section, or end of document)
    if i < len(sections) - 1:
        next_header = sections[i + 1]['header_text']
        end_pos = find_header_in_markdown(markdown_content[start_pos + len(header_text):], next_header)
        if end_pos == -1:
            end_pos = len(markdown_content)
        else:
            end_pos = start_pos + len(header_text) + end_pos
    else:
        end_pos = len(markdown_content)
    
    # Extract the section content
    section_content = markdown_content[start_pos:end_pos].rstrip()
    
    # Generate filename
    snake_title = to_snake_case(section_title)
    filename = f"{project_id}_{section_id}_{snake_title}.md"
    chunk_file = chunks_dir / filename
    
    # Save the chunk
    chunk_file.write_text(section_content, encoding='utf-8')
    saved_chunks.append({
        'file': filename,
        'size': len(section_content),
        'title': section_title
    })
    
    print(f"✓ Saved: {filename} ({len(section_content)} chars)")

print()
print(f"✓ Saved {len(saved_chunks)} section chunks to {chunks_dir}")


Splitting markdown into 16 section chunks...
Output directory: /Users/lauren/repos/PAD2Skills/data/silver/pads_md_chunks

✓ Saved: P075941_0_strategic_context.md (31030 chars)
✓ Saved: P075941_1_project_development_objectives.md (2769 chars)
✓ Saved: P075941_2_project_description.md (17649 chars)
✓ Saved: P075941_3_implementation.md (8975 chars)
✓ Saved: P075941_4_key_risks_and_mitigation_measures.md (10570 chars)
✓ Saved: P075941_5_appraisal_summary.md (48636 chars)
✓ Saved: P075941_6_results_framework_and_monitoring.md (15964 chars)
✓ Saved: P075941_7_detailed_project_description.md (22009 chars)
✓ Saved: P075941_8_implementation_arrangements_regional_rusumo_falls_hydroelectric_project.md (101151 chars)
✓ Saved: P075941_9_operational_risk_assessment_framework_oraf.md (116773 chars)
✓ Saved: P075941_10_economic_and_financial_analysis_implementation_arrangements.md (189523 chars)
✓ Saved: P075941_11_power_supply_options_for_the_nile_equatorial_lakes_region_nel.md (10695 chars)
✓ Saved:

## 10. Verify Chunks Created

In [None]:
# List all created chunk files
chunk_files = sorted(chunks_dir.glob(f"{project_id}_*.md"))

print(f"Created {len(chunk_files)} chunk files:")
print("=" * 80)

for chunk_file in chunk_files:
    size_kb = chunk_file.stat().st_size / 1024
    print(f"  {chunk_file.name:60s} {size_kb:6.2f} KB")

print("=" * 80)
print(f"Total chunks: {len(chunk_files)}")
