# Pipeline Runner

Run utilities and pipeline steps for selected projects.

## 0. Setup

### 0.01 Import Required Libraries

In [12]:
import os
from pathlib import Path
from dotenv import load_dotenv

import pandas as pd

# Import our config
import sys
sys.path.append(str(Path.cwd().parent))
from src.config import load_config

### 0.02 Load Configuration and Environment Variables

In [13]:
# Load environment variables from the project's .env file and the project config.
# project_root is the parent of the kernel's working directory
# (assumes the notebook is run from a folder directly under the repo root).
project_root = Path.cwd().parent
env_path = project_root / ".env"

# Fail fast with setup instructions when the .env file is missing.
if not env_path.exists():
    raise FileNotFoundError(
        f"'.env' file not found at {env_path}\n"
        "Please copy .env.example to .env and add your OpenAI API key."
    )

# Load from the explicit path; override=True lets values from .env replace
# variables that are already set in the process environment.
load_dotenv(env_path, override=True)

# Load project config
config = load_config()

# Get OpenAI API key from environment
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Verify API key is set (None and empty string are both rejected)
if not OPENAI_API_KEY:
    raise ValueError("Missing required environment variable: OPENAI_API_KEY")

print("✓ Environment variables loaded")
# Do not echo any part of the key: cell output is stored inside the notebook
# file and is shared or committed together with it.
print("  API Key: set (value hidden)")

✓ Environment variables loaded
  API Key: sk-proj-cj...__0A


### 0.03 Set Up Paths

In [14]:
# Bronze-layer inputs: the project data folder and the two CSV files inside it
project_data_dir = project_root.joinpath("data", "bronze", "project_data")
all_project_details_path = project_data_dir.joinpath("all_project_details.csv")
project_summary_path = project_data_dir.joinpath("project_summary.csv")

# Echo the resolved locations and whether both CSVs are present on disk
print(
    f"Project data directory: {project_data_dir}",
    f"All project details path: {all_project_details_path}",
    f"Project summary path: {project_summary_path}",
    f"Files exist: {all_project_details_path.exists()} / {project_summary_path.exists()}",
    sep="\n",
)

Project data directory: /Users/lauren/repos/PAD2Skills/data/bronze/project_data
All project details path: /Users/lauren/repos/PAD2Skills/data/bronze/project_data/all_project_details.csv
Project summary path: /Users/lauren/repos/PAD2Skills/data/bronze/project_data/project_summary.csv
Files exist: True / True


## 1. Get Project Details

### 1.01 Load Project Data

In [15]:
# Load both bronze CSVs into DataFrames
all_project_details = pd.read_csv(filepath_or_buffer=all_project_details_path)
project_summary = pd.read_csv(filepath_or_buffer=project_summary_path)

# Report the (rows, columns) shape of each frame
print(
    f"All project details: {all_project_details.shape[0]} rows, {all_project_details.shape[1]} columns",
    f"Project summary: {project_summary.shape[0]} rows, {project_summary.shape[1]} columns",
    sep="\n",
)

All project details: 123 rows, 20 columns
Project summary: 123 rows, 4 columns


### 1.02 Convert Column Names to snake_case

In [16]:
# Normalise the column labels of all_project_details to snake_case:
# trim surrounding whitespace, lower-case, turn every run of non-word
# characters and/or underscores into a single "_", then drop any "_" left at
# either end of the name.
all_project_details.columns = (
    all_project_details.columns
    .str.strip()
    .str.lower()
    .str.replace(r"[\W_]+", "_", regex=True)
    .str.strip("_")
)

print("Column names after standardization:")
print(all_project_details.columns.tolist())

Column names after standardization:
['project_id', 'status', 'team_leader', 'borrower_2', 'country', 'disclosure_date', 'approval_date', 'effective_date', 'total_project_cost_1', 'implementing_agency', 'region', 'fiscal_year_3', 'commitment_amount', 'environmental_category', 'environmental_and_social_risk', 'closing_date', 'last_stage_reached', 'last_update_date', 'consultant_services_required', 'associated_projects']


### 1.03 Merge and Filter Projects

In [18]:
# Attach the project_summary columns to the detail rows. The left join keeps
# every row of all_project_details, including rows with no summary match.
# validate="many_to_one" makes pandas raise a MergeError if project_summary
# holds duplicate project_id values, which would otherwise silently duplicate
# detail rows in the merged frame.
all_projects = all_project_details.merge(
    project_summary,
    on="project_id",
    how="left",
    validate="many_to_one",
)

print(f"Merged data: {all_projects.shape[0]} rows, {all_projects.shape[1]} columns")

# Keep only projects with downloaded PADs. Rows without a summary match carry
# NaN in pads_downloaded and are dropped too, because NaN >= 1 is False.
# .copy() yields an independent frame, so later modifications cannot trigger
# pandas chained-assignment warnings.
all_projects = all_projects.loc[all_projects["pads_downloaded"] >= 1].copy()

print(f"Projects with downloaded PADs: {all_projects.shape[0]} rows")
print("\nFirst few projects:")
print(all_projects.head())

Merged data: 123 rows, 23 columns
Projects with downloaded PADs: 98 rows

First few projects:
  project_id  status                                        team_leader  \
1    P119893  Closed                    Abdulhakim Mohammed Abdisubhan    
3    P173506  Active  Didier Makoso Tsasa , Fabrice Karl Bertholet, ...   
4    P176731  Active  Janina Franco , Abdulhakim Mohammed Abdisubhan...   
5    P507759  Active                     Jenny Jing Chao , Maria Arango   
6    P180547  Active   Monali Ranade , Dana Rysankova, Alona Kazantseva   

                                          borrower_2  \
1            Federal Democratic Republic of Ethiopia   
3                       DEMOCRATIC REPUBLIC OF CONGO   
4            Federal Democratic Republic of Ethiopia   
5                             Republic of Mozambique   
6  Common Market for Eastern and Southern Africa ...   

                         country    disclosure_date  \
1                       Ethiopia  December 22, 2011   
3  Congo

## 2. Select Projects

### 2.01 Select the First 10 Projects or Define Your Own List

In [19]:
# Project IDs of the first ten rows of all_projects, as a plain Python list
selected_projects = all_projects["project_id"].iloc[:10].tolist()

# List the selection, one ID per line
print(f"Selected {len(selected_projects)} projects:")
for project_id in selected_projects:
    print(f"  {project_id}")

Selected 10 projects:
  P119893
  P173506
  P176731
  P507759
  P180547
  P505856
  P181341
  P075941
  P160708
  P153743


In [31]:
# Hand-picked selection: the first ten project IDs plus one extra project of
# interest. Running this cell replaces any selected_projects value assigned
# earlier in the notebook.
selected_projects = [
    "P119893",
    "P173506",
    "P176731",
    "P507759",
    "P180547",
    "P505856",
    "P181341",
    "P075941",
    "P160708",
    "P153743",
    "P511453",
]

### 2.02 Filter Projects DataFrame

In [32]:
# Keep the rows of all_projects whose project_id is in selected_projects.
# .copy() makes projects_df an independent frame rather than a slice of
# all_projects.
projects_df = all_projects[all_projects["project_id"].isin(selected_projects)].copy()

# isin() ignores IDs that have no matching row, so a mistyped or filtered-out
# project would vanish without notice. Report such IDs in selection order.
found_ids = set(projects_df["project_id"])
missing_projects = [pid for pid in selected_projects if pid not in found_ids]
if missing_projects:
    print(f"⚠ Selected projects not found in all_projects: {missing_projects}")

print(f"Filtered to {len(projects_df)} projects:")
print(projects_df[["project_id", "status", "country"]].to_string(index=False))

Filtered to 11 projects:
project_id status                       country
   P119893 Closed                      Ethiopia
   P173506 Active Congo, Democratic Republic of
   P176731 Active                      Ethiopia
   P507759 Active                    Mozambique
   P180547 Active   Eastern and Southern Africa
   P505856 Active                    Seychelles
   P181341 Active  Somalia, Federal Republic of
   P075941 Closed   Eastern and Southern Africa
   P160708 Active    Western and Central Africa
   P153743 Closed                         Niger
   P511453 Active                        Guinea


### 2.03 Save Selected Projects Data

In [33]:
# Destination folder in the silver layer; created (with parents) when absent
output_dir = project_root.joinpath("data", "silver", "selected_projects_data")
output_dir.mkdir(parents=True, exist_ok=True)

# Write the filtered frame as CSV without the index column
output_path = output_dir.joinpath("selected_projects.csv")
projects_df.to_csv(output_path, index=False)

print(f"Saved {len(projects_df)} projects to:", f"  {output_path}", sep="\n")

Saved 11 projects to:
  /Users/lauren/repos/PAD2Skills/data/silver/selected_projects_data/selected_projects.csv


## 3. Convert PDFs to Markdown

### 3.01 Import PDF Conversion Module

In [23]:
# Bring in the project's PDF conversion entry point from the local `src`
# package (the setup cell appends the project root to sys.path).
from src.pdf_conversion.converter import convert_pdfs

# Reached only when the import above did not raise.
print("✓ PDF conversion module imported")

  from .autonotebook import tqdm as notebook_tqdm


✓ PDF conversion module imported


### 3.02 Set Up PDF Conversion Paths

In [34]:
# Input (raw PDFs) and output (Markdown) folders, both taken from the project
# config and resolved against the project root
pdf_dir = project_root.joinpath(config.paths.raw_pdfs)
markdown_dir = project_root.joinpath(config.paths.markdown)

print(
    f"PDF directory: {pdf_dir}",
    f"Markdown directory: {markdown_dir}",
    f"PDF directory exists: {pdf_dir.exists()}",
    sep="\n",
)

PDF directory: /Users/lauren/repos/PAD2Skills/data/bronze/pads_pdf
Markdown directory: /Users/lauren/repos/PAD2Skills/data/silver/pads_md
PDF directory exists: True


### 3.03 Convert PDFs for Selected Projects

In [35]:
# Convert PDFs for each selected project.
# Only the file named "<project_id>_1.pdf" is considered for each project.
for project_id in selected_projects:
    pdf_file = pdf_dir / f"{project_id}_1.pdf"

    # Skip if PDF doesn't exist
    if not pdf_file.exists():
        print(f"⚠ PDF not found: {project_id}")
        continue

    # Convert this single PDF using the src utility; overwrite=False is
    # passed so an existing output is reported as skipped (see below).
    results = convert_pdfs(
        pdf_dir=pdf_dir,
        output_dir=markdown_dir,
        specific_pdf=pdf_file.name,
        overwrite=False,
        accurate_tables=True
    )

    # Report result. `results` is read as a mapping with "converted",
    # "skipped" and "failed" entries; the first truthy one decides the message.
    if results["converted"]:
        print(f"✓ Converted: {project_id}")
    elif results["skipped"]:
        print(f"○ Skipped (already exists): {project_id}")
    elif results["failed"]:
        # Presumably a list of (file, error) pairs - confirm against convert_pdfs.
        error = results["failed"][0][1]
        print(f"✗ Failed: {project_id} - {error}")
    else:
        # All three entries are empty: say so instead of silently dropping
        # the project from the log.
        print(f"? No result reported: {project_id}")

○ Skipped (already exists): P119893
○ Skipped (already exists): P173506
○ Skipped (already exists): P176731
○ Skipped (already exists): P507759
○ Skipped (already exists): P180547
○ Skipped (already exists): P505856
○ Skipped (already exists): P181341
○ Skipped (already exists): P075941
○ Skipped (already exists): P160708
○ Skipped (already exists): P153743
○ Skipped (already exists): P511453


## 4. Extract Document Sections

### 4.01 Import Section Extraction Module

In [28]:
# Bring in the project's section extraction entry point from the local `src`
# package (the setup cell appends the project root to sys.path).
from src.extraction.extractor import extract_all_sections

# Reached only when the import above did not raise.
print("✓ Section extraction module imported")

✓ Section extraction module imported


### 4.02 Set Up Section Extraction Paths

In [29]:
# Silver-layer folder that receives the extracted document sections
sections_output_dir = project_root.joinpath("data", "silver", "document_sections")

# Show the input (Markdown) and output (sections) locations
print(
    f"Markdown directory: {markdown_dir}",
    f"Sections output directory: {sections_output_dir}",
    sep="\n",
)

Markdown directory: /Users/lauren/repos/PAD2Skills/data/silver/pads_md
Sections output directory: /Users/lauren/repos/PAD2Skills/data/silver/document_sections


### 4.03 Extract Sections for Selected Projects

In [36]:
# Extract sections for each selected project.
# Only the file named "<project_id>_1.md" is considered for each project.
for project_id in selected_projects:
    md_file = markdown_dir / f"{project_id}_1.md"

    # Skip if markdown doesn't exist
    if not md_file.exists():
        print(f"⚠ Markdown not found: {project_id}")
        continue

    # Extract sections from this single file using the src utility;
    # overwrite=False is passed so existing output is reported as skipped.
    results = extract_all_sections(
        markdown_dir=markdown_dir,
        output_dir=sections_output_dir,
        specific_file=md_file.name,
        overwrite=False
    )

    # Report result. `results` is read as a mapping with "extracted",
    # "skipped" and "failed" entries; the first truthy one decides the message.
    if results["extracted"]:
        print(f"✓ Extracted sections: {project_id}")
    elif results["skipped"]:
        print(f"○ Skipped (already exists): {project_id}")
    elif results["failed"]:
        # Presumably a list of (file, error) pairs - confirm against
        # extract_all_sections.
        error = results["failed"][0][1]
        print(f"✗ Failed: {project_id} - {error}")
    else:
        # All three entries are empty: say so instead of silently dropping
        # the project from the log.
        print(f"? No result reported: {project_id}")

○ Skipped (already exists): P119893
○ Skipped (already exists): P173506


2026-01-02 17:52:03,376 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"


✓ Extracted sections: P176731
○ Skipped (already exists): P507759
○ Skipped (already exists): P180547
○ Skipped (already exists): P505856
○ Skipped (already exists): P181341
○ Skipped (already exists): P075941
○ Skipped (already exists): P160708
○ Skipped (already exists): P153743
○ Skipped (already exists): P511453


## 5. Extract Abbreviations

### 5.01 Import Abbreviation Extraction Module

In [37]:
# Bring in the project's abbreviation extraction entry point from the local
# `src` package (the setup cell appends the project root to sys.path).
from src.extraction.extractor import extract_all_abbreviations

# Reached only when the import above did not raise.
print("✓ Abbreviation extraction module imported")

✓ Abbreviation extraction module imported


### 5.02 Set Up Abbreviation Extraction Paths

In [38]:
# Silver-layer folder that receives the extracted abbreviations
abbreviations_output_dir = project_root.joinpath("data", "silver", "abbreviations_md")

# Show the input (Markdown) and output (abbreviations) locations
print(
    f"Markdown directory: {markdown_dir}",
    f"Abbreviations output directory: {abbreviations_output_dir}",
    sep="\n",
)

Markdown directory: /Users/lauren/repos/PAD2Skills/data/silver/pads_md
Abbreviations output directory: /Users/lauren/repos/PAD2Skills/data/silver/abbreviations_md


### 5.03 Extract Abbreviations for Selected Projects

In [39]:
# Extract abbreviations for each selected project.
# Only the file named "<project_id>_1.md" is considered for each project.
for project_id in selected_projects:
    md_file = markdown_dir / f"{project_id}_1.md"

    # Skip if markdown doesn't exist
    if not md_file.exists():
        print(f"⚠ Markdown not found: {project_id}")
        continue

    # Extract abbreviations from this single file using the src utility;
    # overwrite=False is passed so existing output is reported as skipped.
    results = extract_all_abbreviations(
        markdown_dir=markdown_dir,
        output_dir=abbreviations_output_dir,
        specific_file=md_file.name,
        overwrite=False
    )

    # Report result. `results` is read as a mapping with "extracted",
    # "skipped" and "failed" entries; the first truthy one decides the message.
    if results["extracted"]:
        print(f"✓ Extracted abbreviations: {project_id}")
    elif results["skipped"]:
        print(f"○ Skipped (already exists): {project_id}")
    elif results["failed"]:
        # Presumably a list of (file, error) pairs - confirm against
        # extract_all_abbreviations.
        error = results["failed"][0][1]
        print(f"✗ Failed: {project_id} - {error}")
    else:
        # All three entries are empty: say so instead of silently dropping
        # the project from the log.
        print(f"? No result reported: {project_id}")

2026-01-02 17:52:43,157 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"


✓ Extracted abbreviations: P119893


2026-01-02 17:53:24,731 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"


✓ Extracted abbreviations: P173506


2026-01-02 17:53:46,955 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"


✓ Extracted abbreviations: P176731


2026-01-02 17:54:21,001 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"


✓ Extracted abbreviations: P507759


2026-01-02 17:55:06,728 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"


✓ Extracted abbreviations: P180547


2026-01-02 17:55:17,786 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"


✓ Extracted abbreviations: P505856


2026-01-02 17:56:00,093 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"


✓ Extracted abbreviations: P181341


2026-01-02 17:56:25,306 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 502 Bad Gateway"
2026-01-02 17:56:25,308 - INFO - Retrying request to /responses in 0.449558 seconds
2026-01-02 17:57:27,568 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"


✓ Extracted abbreviations: P075941


2026-01-02 17:58:15,495 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"


✓ Extracted abbreviations: P160708


2026-01-02 17:58:49,697 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"


✓ Extracted abbreviations: P153743


2026-01-02 17:59:42,334 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"


✓ Extracted abbreviations: P511453
