# Extract Occupations and Skills from PAD Chunks

Extract occupations and skills from markdown chunks using OpenAI custom GPT.

## 1. Import Required Libraries

In [7]:
import os
import json
from pathlib import Path
from dotenv import load_dotenv

from openai import OpenAI

# Import our config
import sys
sys.path.append(str(Path.cwd().parent))
from src.config import load_config

## 2. Load Configuration and Environment Variables

In [8]:
# Load environment variables from .env file
project_root = Path.cwd().parent
env_path = project_root / ".env"

if not env_path.exists():
    raise FileNotFoundError(
        f"'.env' file not found at {env_path}\n"
        "Please copy .env.example to .env and add your OpenAI API key."
    )

# Load from specific path
load_dotenv(env_path, override=True)

# Load project config
config = load_config()

# Get OpenAI API key from environment
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Verify API key is set
if not OPENAI_API_KEY:
    raise ValueError("Missing required environment variable: OPENAI_API_KEY")

print("✓ Environment variables loaded")
print(f"  API Key: {OPENAI_API_KEY[:10]}...{OPENAI_API_KEY[-4:]}")

✓ Environment variables loaded
  API Key: sk-proj-cj...__0A


## 3. Set Up Paths

In [9]:
# Get paths
md_dir = project_root / config.paths.markdown
chunks_dir = md_dir.parent / "pads_md_chunks"
output_dir = project_root / "data" / "silver" / "occupations_skills_json"

# Create output directory
output_dir.mkdir(parents=True, exist_ok=True)

print(f"Chunks directory: {chunks_dir}")
print(f"Output directory: {output_dir}")
print(f"Chunks exist: {chunks_dir.exists()}")

Chunks directory: /Users/lauren/repos/PAD2Skills/data/silver/pads_md_chunks
Output directory: /Users/lauren/repos/PAD2Skills/data/silver/occupations_skills_json
Chunks exist: True


## 4. Load Chunk Files

In [10]:
# Find all markdown chunk files
chunk_files = sorted(chunks_dir.glob("*.md"))

print(f"Found {len(chunk_files)} chunk files")
print("\nFirst 5 chunks:")
for chunk_file in chunk_files[:5]:
    size_kb = chunk_file.stat().st_size / 1024
    print(f"  {chunk_file.name:60s} {size_kb:6.2f} KB")

if len(chunk_files) > 5:
    print(f"  ... and {len(chunk_files) - 5} more")

Found 16 chunk files

First 5 chunks:
  P075941_0_strategic_context.md                                30.30 KB
  P075941_10_economic_and_financial_analysis_implementation_arrangements.md 185.08 KB
  P075941_11_power_supply_options_for_the_nile_equatorial_lakes_region_nel.md  10.44 KB
  P075941_12_summary_of_the_power_sectors_in_burundi_rwanda_and_tanzania.md  32.54 KB
  P075941_13_implementation_support_team.md                      5.18 KB
  ... and 11 more


## 5. Initialize OpenAI Client

In [11]:
# Initialize OpenAI client
client = OpenAI()

print("✓ OpenAI client initialized")

✓ OpenAI client initialized


## 6. Process Each Chunk

In [None]:
print(f"Processing {len(chunk_files)} chunks...")
print()

processed_chunks = []

for i, chunk_file in enumerate(chunk_files, 1):
    # Parse filename: {project_id}_{section_id}_{snake_title}.md
    filename_parts = chunk_file.stem.split('_', 2)
    project_id = filename_parts[0]
    section_id = filename_parts[1]
    
    # Read chunk content
    chunk_text = chunk_file.read_text(encoding='utf-8')
    
    print(f"[{i}/{len(chunk_files)}] Processing: {chunk_file.name}")
    print(f"  Project ID: {project_id}, Section ID: {section_id}")
    print(f"  Chunk size: {len(chunk_text)} chars")
    
    # Prepare input for custom GPT
    input_message = f"project_id: {project_id}\nsection_id: {section_id}\nchunk_text: {chunk_text}"
    
    # Call custom GPT with prompt ID
    response = client.responses.create(
        prompt={
            "id": "pmpt_6950c224bab0819486a7f38e0ae0109b08192593c3d4b4af",
            "version": "13"
        },
        input=[
            {"role": "user", "content": input_message}
        ],
        reasoning={
            "summary": None
        },
        store=False,
        include=[
            "reasoning.encrypted_content",
            "web_search_call.action.sources"
        ]
    )
    
    # Extract the text from the response
    result = None
    for item in response.output:
        if hasattr(item, 'content') and hasattr(item, 'role'):
            result = item.content[0].text
            break
    
    # Save result to file
    output_file = output_dir / f"{project_id}_{section_id}_occupations.json"
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(result)
    
    processed_chunks.append({
        'chunk_file': chunk_file.name,
        'project_id': project_id,
        'section_id': section_id,
        'output_file': output_file.name,
        'response_id': response.id
    })
    
    print(f"  ✓ Saved to: {output_file.name}")
    print()

print("=" * 80)
print(f"✓ Processed {len(processed_chunks)} chunks")
print(f"✓ Results saved to: {output_dir}")

Processing 16 chunks...

[1/16] Processing: P075941_0_strategic_context.md
  Project ID: P075941, Section ID: 0
  Chunk size: 31030 chars


NameError: name 'null' is not defined

## 7. Verify Output Files

In [None]:
# List all output files
output_files = sorted(output_dir.glob("*_occupations.json"))

print(f"Created {len(output_files)} output files:")
print("=" * 80)

for output_file in output_files[:10]:  # Show first 10
    size_kb = output_file.stat().st_size / 1024
    print(f"  {output_file.name:60s} {size_kb:6.2f} KB")

if len(output_files) > 10:
    print(f"  ... and {len(output_files) - 10} more")

print("=" * 80)
print(f"Total output files: {len(output_files)}")