# Generate PAD Summary

Generate a summary of a single project's PAD by passing its `project_id` and the concatenated `pad_text` (the project's abbreviations table followed by its first 4 chunks, indices 0–3) to the OpenAI Responses API.

## Import Required Libraries

In [1]:
import os
import json
from pathlib import Path
from dotenv import load_dotenv

from openai import OpenAI

# Import our config
import sys
sys.path.append(str(Path.cwd().parent))
from src.config import load_config

## Load Configuration and Environment Variables

In [2]:
# Load environment variables and project configuration.
#
# The project root is taken to be the parent of the current working
# directory, and the .env file is expected to live directly inside it.
project_root = Path.cwd().parent
env_path = project_root / ".env"

# Fail early with an actionable message instead of a confusing auth error later.
if not env_path.exists():
    raise FileNotFoundError(
        f"'.env' file not found at {env_path}\n"
        "Please copy .env.example to .env and add your OpenAI API key."
    )

# override=True: values from .env replace variables already set in the process
# environment, so the file is the single source of truth for this notebook.
load_dotenv(env_path, override=True)

# Load project config
config = load_config()

# Get OpenAI API key from environment
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Verify API key is set (None and the empty string are both rejected)
if not OPENAI_API_KEY:
    raise ValueError("Missing required environment variable: OPENAI_API_KEY")

print("✓ Environment variables loaded")
# Security: never echo any portion of the key. Cell outputs are saved inside
# the .ipynb file and are easily committed or shared, so even a prefix/suffix
# fragment is a leak. Report only that the key is present and how long it is.
print(f"  API Key: set ({len(OPENAI_API_KEY)} chars, value hidden)")

✓ Environment variables loaded
  API Key: sk-proj-cj...__0A


## Set Up Paths

In [3]:
# Input and output locations, all under <project_root>/data/silver:
#   - chunks_dir: markdown chunks of the PADs (input)
#   - abbr_dir:   per-project abbreviation tables (input)
#   - output_dir: where generated summaries are written (output)
chunks_dir = project_root / "data" / "silver" / "pads_md_chunks"
abbr_dir = project_root / "data" / "silver" / "abbreviations_md"
output_dir = project_root / "data" / "silver" / "pad_summaries"

# Only the output location is created; parents=True builds missing ancestors
# and exist_ok=True makes re-running this cell harmless.
output_dir.mkdir(parents=True, exist_ok=True)

print(
    f"Chunks directory: {chunks_dir}",
    f"Abbreviations directory: {abbr_dir}",
    f"Output directory: {output_dir}",
    sep="\n",
)

Chunks directory: /Users/lauren/repos/PAD2Skills/data/silver/pads_md_chunks
Abbreviations directory: /Users/lauren/repos/PAD2Skills/data/silver/abbreviations_md
Output directory: /Users/lauren/repos/PAD2Skills/data/silver/pad_summaries


## Select Project to Process

In [4]:
# Identifier of the project whose PAD is summarized by the cells below.
# It is used as the filename prefix for the abbreviations file, the chunk
# files, and the saved summary.
project_id = "P075941"

print(f"Processing project: {project_id}")

Processing project: P075941


## Load Abbreviations

In [5]:
# The abbreviations table for a project lives at <abbr_dir>/<project_id>_abbr.md.
abbr_file = abbr_dir / f"{project_id}_abbr.md"

# The abbreviations are optional context: when the file is missing we fall
# back to an empty string so the downstream concatenation can simply skip it.
if not abbr_file.exists():
    abbreviations_text = ""
    print(f"⚠ No abbreviations file found: {abbr_file}")
    print("  Proceeding without abbreviations context")
else:
    abbreviations_text = abbr_file.read_text(encoding='utf-8')
    print(f"✓ Loaded abbreviations file: {abbr_file.name}")
    print(f"  Size: {len(abbreviations_text)} chars")
    # Show just the head of the table to confirm the right file was read.
    print(f"  Preview:\n{abbreviations_text[:200]}...")

✓ Loaded abbreviations file: P075941_abbr.md
  Size: 4506 chars
  Preview:
Abbreviation | Definition
--- | ---
AfDB | Africa Development Bank
BoQ | Bill of Quantities
CAG | Controller and Auditor General
CAS | Country Assistance Strategy
CBWS | Comprehensive Basin-wide Study...


## Load First 4 Chunks (0, 1, 2, 3)

In [7]:
# Find all chunk files (sorted once; the filtered list below inherits the order)
chunk_files = sorted(chunks_dir.glob("*.md"))

# Keep only this project's chunks. Chunk files are named
# "<project_id>_<index>_<section>.md", so the match includes the trailing
# underscore: matching on the bare project_id would also count files of any
# other project whose id merely starts with the same characters.
project_prefix = f"{project_id}_"
project_chunks = [f for f in chunk_files if f.stem.startswith(project_prefix)]

print(f"Found {len(project_chunks)} chunks for {project_id}")

# Load first 4 chunks (0, 1, 2, 3), in index order.
chunks_to_load = []
for i in range(4):
    # The trailing underscore keeps index 1 from matching 10, 11, ...
    chunk_pattern = f"{project_id}_{i}_"
    matching_chunks = [f for f in project_chunks if f.stem.startswith(chunk_pattern)]

    if matching_chunks:
        # If several files share an index, the first in sorted order is used.
        chunk_file = matching_chunks[0]
        chunk_text = chunk_file.read_text(encoding='utf-8')
        chunks_to_load.append(chunk_text)
        print(f"✓ Loaded chunk {i}: {chunk_file.name} ({len(chunk_text)} chars)")
    else:
        # A missing index is reported but not fatal; the summary is then
        # generated from whichever chunks were found.
        print(f"⚠ No chunk {i} found for {project_id}")

print(f"\nTotal chunks loaded: {len(chunks_to_load)}")

Found 17 chunks for P075941
✓ Loaded chunk 0: P075941_0_strategic_context.md (31030 chars)
✓ Loaded chunk 1: P075941_1_project_development_objectives.md (2769 chars)
✓ Loaded chunk 2: P075941_2_project_description.md (17649 chars)
✓ Loaded chunk 3: P075941_3_implementation.md (8975 chars)

Total chunks loaded: 4


## Concatenate Abbreviations and Chunks

In [8]:
# Assemble the text sent to the model: the abbreviations table (only when one
# was loaded) followed by the loaded chunks, separated by blank lines.
has_abbreviations = bool(abbreviations_text)

pad_text_parts = [abbreviations_text] if has_abbreviations else []
pad_text_parts += chunks_to_load

pad_text = "\n\n".join(pad_text_parts)

# Report what went into pad_text so size surprises are visible before the API call.
print("✓ Concatenated pad_text")
print(f"  Total size: {len(pad_text)} chars")
print(f"  Components: {len(pad_text_parts)} parts")
if has_abbreviations:
    print(f"    - Abbreviations: {len(abbreviations_text)} chars")
print(f"    - Chunks: {len(chunks_to_load)} chunks")

# Head of the combined text, as a quick sanity check
print(f"\n  Preview (first 500 chars):\n{pad_text[:500]}...")

✓ Concatenated pad_text
  Total size: 64937 chars
  Components: 5 parts
    - Abbreviations: 4506 chars
    - Chunks: 4 chunks

  Preview (first 500 chars):
Abbreviation | Definition
--- | ---
AfDB | Africa Development Bank
BoQ | Bill of Quantities
CAG | Controller and Auditor General
CAS | Country Assistance Strategy
CBWS | Comprehensive Basin-wide Study
COD | Commercial Operation Date
COMESA | Common Market for Eastern and Southern Africa
CSO | Civil Society Organization
DARESCO | District Electric Supply Company Ltd.
DflD | Department for International Development, UK
DP | Development Partner
DRC | Democratic Republic of Congo
E&M | Electrical an...


## Initialize OpenAI Client

In [9]:
# Create the OpenAI client used for the summary request below.
# No api_key argument is passed here, so the SDK is expected to pick up
# OPENAI_API_KEY from the process environment — TODO confirm against SDK docs.
client = OpenAI()

print("✓ OpenAI client initialized")

✓ OpenAI client initialized


## Call API to Generate Summary

In [16]:
# The user message carries both inputs as labelled fields in one string.
input_message = f"project_id: {project_id}\npad_text: {pad_text}"

print(
    f"Calling API with:",
    f"  project_id: {project_id}",
    f"  pad_text length: {len(pad_text)} chars",
    "",
    sep="\n",
)

# Reference to the stored prompt (id + pinned version) that the request runs against.
stored_prompt = {
    "id": "pmpt_6958a3a9da908190b195df7be708793008cf7519acf777ba",
    "version": "3"
}
user_messages = [{"role": "user", "content": input_message}]

# Request options are passed through exactly as configured:
#   - reasoning summary is explicitly None
#   - store=False
#   - include lists extra fields to return with the response
# NOTE(review): the precise effect of these options is defined by the OpenAI
# Responses API, not by this notebook — confirm against the API reference.
response = client.responses.create(
    prompt=stored_prompt,
    input=user_messages,
    reasoning={"summary": None},
    store=False,
    include=[
        "reasoning.encrypted_content",
        "web_search_call.action.sources"
    ],
)

print("✓ API call completed")

Calling API with:
  project_id: P075941
  pad_text length: 64937 chars

✓ API call completed


## Extract and Display Response

In [17]:
# Pull the generated text out of the response object; later cells reuse it.
response_text = response.output_text

# Frame the summary between two 80-character rules so it stands out in the output.
divider = "=" * 80
print("Response received:", divider, response_text, divider, sep="\n")
print(f"\nResponse length: {len(response_text)} chars")

Response received:
The Rusumo Falls Hydroelectric Project is a regional run-of-river power scheme on the Kagera River. It will be developed and shared by Burundi, Rwanda and Tanzania in the Nile Equatorial Lakes region. These countries face acute power deficits, low access to electricity, high-cost thermal generation, and constrained economic opportunities. The project aims to increase the supply of electricity to the national grids of Rwanda, Tanzania and Burundi. It will finance an 80 MW run-of-river power plant, transmission facilities to connect the plant to national grids, and related institutional measures including a Special Purpose Vehicle, mitigation of social and environmental impacts, and livelihood and local area development programs. The Rusumo Power Company Limited (RPCL) will own the plant and the NELSAP Rusumo Project Implementation Unit (RPIU) will implement the power plant complex on RPCL’s behalf. An Owner's Engineer will provide technical oversight. A Project Steeri

## Save Response to File

In [18]:
# Persist the summary as <output_dir>/<project_id>_summary.txt (UTF-8).
# write_text replaces any existing file with the same name.
output_file = output_dir / f"{project_id}_summary.txt"
output_file.write_text(response_text, encoding='utf-8')

print(
    f"✓ Saved summary to: {output_file}",
    f"  Size: {len(response_text)} chars",
    sep="\n",
)

✓ Saved summary to: /Users/lauren/repos/PAD2Skills/data/silver/pad_summaries/P075941_summary.txt
  Size: 1816 chars
