# Pipeline Runner

Run utilities and pipeline steps for selected projects.

## 0. Setup

### 0.01 Import Required Libraries

In [1]:
import os
from pathlib import Path
from dotenv import load_dotenv

import pandas as pd

# Import our config.
# The parent of the current working directory is appended to sys.path before
# the import so that the `src` package can be resolved; the order of these
# three statements therefore matters.
# NOTE(review): assumes the notebook is run from a folder directly under the
# project root — confirm.
import sys
sys.path.append(str(Path.cwd().parent))
from src.config import load_config

### 0.02 Load Configuration and Environment Variables

In [2]:
# Load environment variables from the .env file that sits in the project root.
# project_root is taken as the parent of the current working directory.
project_root = Path.cwd().parent
env_path = project_root / ".env"

if not env_path.exists():
    raise FileNotFoundError(
        f"'.env' file not found at {env_path}\n"
        "Please copy .env.example to .env and add your OpenAI API key."
    )

# Load from specific path (override=True is passed so the file's values win)
load_dotenv(env_path, override=True)

# Load project config
config = load_config()

# Get OpenAI API key from environment
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Verify API key is set (an empty string counts as missing)
if not OPENAI_API_KEY:
    raise ValueError("Missing required environment variable: OPENAI_API_KEY")

print("✓ Environment variables loaded")
# Do not echo any part of the secret: cell outputs are stored inside the
# notebook file and travel with it. The length is enough to confirm a value
# was picked up.
print(f"  API Key: set ({len(OPENAI_API_KEY)} characters)")

✓ Environment variables loaded
  API Key: [redacted]


### 0.03 Set Up Paths

In [3]:
# Locate the bronze-layer project data and the two CSV inputs inside it
project_data_dir = project_root.joinpath("data", "bronze", "project_data")
all_project_details_path = project_data_dir.joinpath("all_project_details.csv")
project_summary_path = project_data_dir.joinpath("project_summary.csv")

# One print call, one line per item, so the report reads as a single block
print(
    f"Project data directory: {project_data_dir}",
    f"All project details path: {all_project_details_path}",
    f"Project summary path: {project_summary_path}",
    f"Files exist: {all_project_details_path.exists()} / {project_summary_path.exists()}",
    sep="\n",
)

Project data directory: /Users/lauren/repos/PAD2Skills/data/bronze/project_data
All project details path: /Users/lauren/repos/PAD2Skills/data/bronze/project_data/all_project_details.csv
Project summary path: /Users/lauren/repos/PAD2Skills/data/bronze/project_data/project_summary.csv
Files exist: True / True


## 1. Get Project Details

### 1.01 Load Project Data

In [4]:
# Load both bronze CSVs into DataFrames
all_project_details = pd.read_csv(all_project_details_path)
project_summary = pd.read_csv(project_summary_path)

# Report the size of each frame (row count, then column count)
print(
    f"All project details: {len(all_project_details)} rows, {len(all_project_details.columns)} columns",
    f"Project summary: {len(project_summary)} rows, {len(project_summary.columns)} columns",
    sep="\n",
)

All project details: 123 rows, 20 columns
Project summary: 123 rows, 4 columns


### 1.02 Convert column names to snake_case

In [5]:
# Normalise every column label of all_project_details to snake_case
all_project_details.columns = (
    all_project_details.columns
    .str.strip()
    .str.lower()
    # every non-word character (spaces included) becomes an underscore
    .str.replace(r'\W', '_', regex=True)
    # runs of underscores collapse to one
    .str.replace(r'_{2,}', '_', regex=True)
    # no leading or trailing underscore is kept
    .str.strip('_')
)

print(
    "Column names after standardization:",
    list(all_project_details.columns),
    sep="\n",
)

Column names after standardization:
['project_id', 'status', 'team_leader', 'borrower_2', 'country', 'disclosure_date', 'approval_date', 'effective_date', 'total_project_cost_1', 'implementing_agency', 'region', 'fiscal_year_3', 'commitment_amount', 'environmental_category', 'environmental_and_social_risk', 'closing_date', 'last_stage_reached', 'last_update_date', 'consultant_services_required', 'associated_projects']


### 1.03 Merge and Filter Projects

In [6]:
# Attach the summary columns to the details frame; a left join keeps every
# row of all_project_details.
all_projects = all_project_details.merge(project_summary, how="left", on="project_id")

print(f"Merged data: {len(all_projects)} rows, {len(all_projects.columns)} columns")

# Keep only projects with downloaded PADs (pads_downloaded of at least 1)
all_projects = all_projects.loc[lambda frame: frame["pads_downloaded"] >= 1]

print(
    f"Projects with downloaded PADs: {len(all_projects)} rows",
    "\nFirst few projects:",
    all_projects.head(),
    sep="\n",
)

Merged data: 123 rows, 23 columns
Projects with downloaded PADs: 98 rows

First few projects:
  project_id  status                                        team_leader  \
1    P119893  Closed                    Abdulhakim Mohammed Abdisubhan    
3    P173506  Active  Didier Makoso Tsasa , Fabrice Karl Bertholet, ...   
4    P176731  Active  Janina Franco , Abdulhakim Mohammed Abdisubhan...   
5    P507759  Active                     Jenny Jing Chao , Maria Arango   
6    P180547  Active   Monali Ranade , Dana Rysankova, Alona Kazantseva   

                                          borrower_2  \
1            Federal Democratic Republic of Ethiopia   
3                       DEMOCRATIC REPUBLIC OF CONGO   
4            Federal Democratic Republic of Ethiopia   
5                             Republic of Mozambique   
6  Common Market for Eastern and Southern Africa ...   

                         country    disclosure_date  \
1                       Ethiopia  December 22, 2011   
3  Congo

## 2. Select Projects

### 2.01 Select first 10 Projects or define your own list

In [7]:
# Take the project IDs from the first 10 rows of all_projects
selected_projects = all_projects["project_id"].iloc[:10].tolist()

# Header line, then one indented ID per line
print(f"Selected {len(selected_projects)} projects:")
for project_id in selected_projects:
    print(f"  {project_id}")

Selected 10 projects:
  P119893
  P173506
  P176731
  P507759
  P180547
  P505856
  P181341
  P075941
  P160708
  P153743


In [8]:
# Custom list: the ten IDs printed by the previous cell plus one additional
# project. Assigning here replaces whatever selected_projects held before.
selected_projects = [
    'P119893',
    'P173506',
    'P176731',
    'P507759',
    'P180547',
    'P505856',
    'P181341',
    'P075941',
    'P160708',
    'P153743',
    'P511453',  # the extra, "important" project
]

### 2.02 Filter Projects DataFrame

In [9]:
# Filter projects dataframe for selected projects.
# isin() simply ignores IDs that do not occur in all_projects, so a typo in
# the custom list (or a project removed by the earlier PAD filter) would
# otherwise disappear without any message. Report those IDs explicitly.
known_project_ids = set(all_projects["project_id"])
missing_projects = [
    str(candidate) for candidate in selected_projects
    if candidate not in known_project_ids
]
if missing_projects:
    missing_list = ", ".join(missing_projects)
    print(f"⚠ Selected IDs not present in all_projects (ignored): {missing_list}")

# .copy() makes projects_df an independent frame rather than a slice of
# all_projects, so later assignments to it cannot raise SettingWithCopyWarning.
projects_df = all_projects[all_projects["project_id"].isin(selected_projects)].copy()

print(f"Filtered to {len(projects_df)} projects:")
print(projects_df[["project_id", "status", "country"]].to_string(index=False))

Filtered to 11 projects:
project_id status                       country
   P119893 Closed                      Ethiopia
   P173506 Active Congo, Democratic Republic of
   P176731 Active                      Ethiopia
   P507759 Active                    Mozambique
   P180547 Active   Eastern and Southern Africa
   P505856 Active                    Seychelles
   P181341 Active  Somalia, Federal Republic of
   P075941 Closed   Eastern and Southern Africa
   P160708 Active    Western and Central Africa
   P153743 Closed                         Niger
   P511453 Active                        Guinea


### 2.03 Save Selected Projects Data

In [10]:
# Write the filtered frame to the silver layer, creating the folder if needed
output_dir = project_root.joinpath("data", "silver", "selected_projects_data")
output_dir.mkdir(parents=True, exist_ok=True)

output_path = output_dir.joinpath("selected_projects.csv")
projects_df.to_csv(output_path, index=False)

print(
    f"Saved {len(projects_df)} projects to:",
    f"  {output_path}",
    sep="\n",
)

Saved 11 projects to:
  /Users/lauren/repos/PAD2Skills/data/silver/selected_projects_data/selected_projects.csv


## 3. Pipeline Step #1: Convert PDFs to Markdown

### 3.01 Import PDF Conversion Module

In [11]:
# Project-local import for the PDF-to-Markdown step. It resolves only after
# the first code cell has appended the project root to sys.path.
from src.pdf_conversion.converter import convert_pdfs

# Confirmation line; reaching it means the import above did not raise.
print("✓ PDF conversion module imported")

  from .autonotebook import tqdm as notebook_tqdm


✓ PDF conversion module imported


### 3.02 Set Up PDF Conversion Paths

In [12]:
# Resolve the PDF input folder and Markdown output folder from the project config
pdf_dir = project_root.joinpath(config.paths.raw_pdfs)
markdown_dir = project_root.joinpath(config.paths.markdown)

print(
    f"PDF directory: {pdf_dir}",
    f"Markdown directory: {markdown_dir}",
    f"PDF directory exists: {pdf_dir.exists()}",
    sep="\n",
)

PDF directory: /Users/lauren/repos/PAD2Skills/data/bronze/pads_pdf
Markdown directory: /Users/lauren/repos/PAD2Skills/data/silver/pads_md
PDF directory exists: True


### 3.03 Convert PDFs for Selected Projects

In [13]:
# Convert PDFs for each selected project.
# The PDF for a project is looked up as "<project_id>_1.pdf" inside pdf_dir.
for project_id in selected_projects:
    pdf_file = pdf_dir / f"{project_id}_1.pdf"

    # Skip if PDF doesn't exist
    if not pdf_file.exists():
        print(f"⚠ PDF not found: {project_id}")
        continue

    # Convert single PDF using the src utility
    results = convert_pdfs(
        pdf_dir=pdf_dir,
        output_dir=markdown_dir,
        specific_pdf=pdf_file.name,
        overwrite=False,
        accurate_tables=True
    )

    # Report result. Only the "converted", "skipped" and "failed" entries of
    # the returned mapping are consulted, in that order of precedence.
    if results["converted"]:
        print(f"✓ Converted: {project_id}")
    elif results["skipped"]:
        print(f"○ Skipped (already exists): {project_id}")
    elif results["failed"]:
        # Each failure entry is indexed as [1] for the error text
        error = results["failed"][0][1]
        print(f"✗ Failed: {project_id} - {error}")
    else:
        # All three entries were empty: say so rather than leaving the
        # project out of the log with no explanation.
        print(f"⚠ No result reported: {project_id}")

○ Skipped (already exists): P119893
○ Skipped (already exists): P173506
○ Skipped (already exists): P176731
○ Skipped (already exists): P507759
○ Skipped (already exists): P180547
○ Skipped (already exists): P505856
○ Skipped (already exists): P181341
○ Skipped (already exists): P075941
○ Skipped (already exists): P160708
○ Skipped (already exists): P153743
○ Skipped (already exists): P511453


## 4. Pipeline Step #2: Extract Document Sections

### 4.01 Import Section Extraction Module

In [14]:
# Project-local import for the section-extraction step; depends on the
# sys.path change made in the first code cell.
from src.extraction.extractor import extract_all_sections

# Confirmation line; reaching it means the import above did not raise.
print("✓ Section extraction module imported")

✓ Section extraction module imported


### 4.02 Set Up Section Extraction Paths

In [15]:
# Output folder for extracted sections; the input is markdown_dir from the PDF step
sections_output_dir = project_root.joinpath("data", "silver", "document_sections")

print(
    f"Markdown directory: {markdown_dir}",
    f"Sections output directory: {sections_output_dir}",
    sep="\n",
)

Markdown directory: /Users/lauren/repos/PAD2Skills/data/silver/pads_md
Sections output directory: /Users/lauren/repos/PAD2Skills/data/silver/document_sections


### 4.03 Extract Sections for Selected Projects

In [16]:
# Extract sections for each selected project.
# The Markdown for a project is looked up as "<project_id>_1.md" in markdown_dir.
for project_id in selected_projects:
    md_file = markdown_dir / f"{project_id}_1.md"

    # Skip if markdown doesn't exist
    if not md_file.exists():
        print(f"⚠ Markdown not found: {project_id}")
        continue

    # Extract sections using the src utility
    results = extract_all_sections(
        markdown_dir=markdown_dir,
        output_dir=sections_output_dir,
        specific_file=md_file.name,
        overwrite=False
    )

    # Report result. Only the "extracted", "skipped" and "failed" entries of
    # the returned mapping are consulted, in that order of precedence.
    if results["extracted"]:
        print(f"✓ Extracted sections: {project_id}")
    elif results["skipped"]:
        print(f"○ Skipped (already exists): {project_id}")
    elif results["failed"]:
        # Each failure entry is indexed as [1] for the error text
        error = results["failed"][0][1]
        print(f"✗ Failed: {project_id} - {error}")
    else:
        # All three entries were empty: make that visible in the log
        print(f"⚠ No result reported: {project_id}")

○ Skipped (already exists): P119893
○ Skipped (already exists): P173506
○ Skipped (already exists): P176731
○ Skipped (already exists): P507759
○ Skipped (already exists): P180547
○ Skipped (already exists): P505856
○ Skipped (already exists): P181341
○ Skipped (already exists): P075941
○ Skipped (already exists): P160708
○ Skipped (already exists): P153743
○ Skipped (already exists): P511453


## 5. Pipeline Step #3: Extract Abbreviations

### 5.01 Import Abbreviation Extraction Module

In [17]:
# Project-local import for the abbreviation step, taken from the same
# src.extraction.extractor module as the section-extraction helper.
from src.extraction.extractor import extract_all_abbreviations

# Confirmation line; reaching it means the import above did not raise.
print("✓ Abbreviation extraction module imported")

✓ Abbreviation extraction module imported


### 5.02 Set Up Abbreviation Extraction Paths

In [18]:
# Output folder for extracted abbreviations; the input is again markdown_dir
abbreviations_output_dir = project_root.joinpath("data", "silver", "abbreviations_md")

print(
    f"Markdown directory: {markdown_dir}",
    f"Abbreviations output directory: {abbreviations_output_dir}",
    sep="\n",
)

Markdown directory: /Users/lauren/repos/PAD2Skills/data/silver/pads_md
Abbreviations output directory: /Users/lauren/repos/PAD2Skills/data/silver/abbreviations_md


### 5.03 Extract Abbreviations for Selected Projects

In [19]:
# Extract abbreviations for each selected project.
# The Markdown for a project is looked up as "<project_id>_1.md" in markdown_dir.
for project_id in selected_projects:
    md_file = markdown_dir / f"{project_id}_1.md"

    # Skip if markdown doesn't exist
    if not md_file.exists():
        print(f"⚠ Markdown not found: {project_id}")
        continue

    # Extract abbreviations using the src utility
    results = extract_all_abbreviations(
        markdown_dir=markdown_dir,
        output_dir=abbreviations_output_dir,
        specific_file=md_file.name,
        overwrite=False
    )

    # Report result. Only the "extracted", "skipped" and "failed" entries of
    # the returned mapping are consulted, in that order of precedence.
    if results["extracted"]:
        print(f"✓ Extracted abbreviations: {project_id}")
    elif results["skipped"]:
        print(f"○ Skipped (already exists): {project_id}")
    elif results["failed"]:
        # Each failure entry is indexed as [1] for the error text
        error = results["failed"][0][1]
        print(f"✗ Failed: {project_id} - {error}")
    else:
        # All three entries were empty: make that visible in the log
        print(f"⚠ No result reported: {project_id}")

○ Skipped (already exists): P119893
○ Skipped (already exists): P173506
○ Skipped (already exists): P176731
○ Skipped (already exists): P507759
○ Skipped (already exists): P180547
○ Skipped (already exists): P505856
○ Skipped (already exists): P181341
○ Skipped (already exists): P075941
○ Skipped (already exists): P160708
○ Skipped (already exists): P153743
○ Skipped (already exists): P511453


## 6. Pipeline Step #4: Create Chunked Markdown Files

### 6.01 Import Chunking Module

In [20]:
# Project-local import for the chunking step, also provided by
# src.extraction.extractor.
from src.extraction.extractor import create_chunks

# Confirmation line; reaching it means the import above did not raise.
print("✓ Chunking module imported")

✓ Chunking module imported


### 6.02 Set Up Chunking Paths

In [21]:
# Output folder for chunk files; inputs are the Markdown and sections folders
chunks_output_dir = project_root.joinpath("data", "silver", "pads_md_chunks")

print(
    f"Markdown directory: {markdown_dir}",
    f"Sections directory: {sections_output_dir}",
    f"Chunks output directory: {chunks_output_dir}",
    sep="\n",
)

Markdown directory: /Users/lauren/repos/PAD2Skills/data/silver/pads_md
Sections directory: /Users/lauren/repos/PAD2Skills/data/silver/document_sections
Chunks output directory: /Users/lauren/repos/PAD2Skills/data/silver/pads_md_chunks


### 6.03 Create Chunks for Selected Projects

In [22]:
# Create chunks for each selected project.
# The Markdown for a project is looked up as "<project_id>_1.md" in markdown_dir.
for project_id in selected_projects:
    md_file = markdown_dir / f"{project_id}_1.md"

    # Skip if markdown doesn't exist
    if not md_file.exists():
        print(f"⚠ Markdown not found: {project_id}")
        continue

    # Create chunks using the src utility
    results = create_chunks(
        markdown_dir=markdown_dir,
        sections_dir=sections_output_dir,
        output_dir=chunks_output_dir,
        specific_file=md_file.name,
        overwrite=False
    )

    # Report result. Only the "chunked", "skipped" and "failed" entries of
    # the returned mapping are consulted, in that order of precedence.
    if results["chunked"]:
        # Count the files in the output folder whose name starts with this
        # project's ID; this counts what is on disk, not what this call wrote.
        chunk_files = list(chunks_output_dir.glob(f"{project_id}_*.md"))
        print(f"✓ Created chunks: {project_id} ({len(chunk_files)} chunks)")
    elif results["skipped"]:
        print(f"○ Skipped (no sections or already exists): {project_id}")
    elif results["failed"]:
        # Each failure entry is indexed as [1] for the error text
        error = results["failed"][0][1]
        print(f"✗ Failed: {project_id} - {error}")
    else:
        # All three entries were empty: make that visible in the log
        print(f"⚠ No result reported: {project_id}")

✓ Created chunks: P119893 (11 chunks)
✓ Created chunks: P173506 (10 chunks)
✓ Created chunks: P176731 (11 chunks)
✓ Created chunks: P507759 (7 chunks)
✓ Created chunks: P180547 (16 chunks)
✓ Created chunks: P505856 (6 chunks)
✓ Created chunks: P181341 (16 chunks)
✓ Created chunks: P075941 (17 chunks)
✓ Created chunks: P160708 (21 chunks)
✓ Created chunks: P153743 (11 chunks)
✓ Created chunks: P511453 (8 chunks)


## 7. Pipeline Step #5: Generate PAD Summaries

### 7.01 Import Summary Generation Module

In [23]:
# Project-local import for the summary-generation step; depends on the
# sys.path change made in the first code cell.
from src.extraction.summarizer import generate_all_summaries

# Confirmation line; reaching it means the import above did not raise.
print("✓ Summary generation module imported")

✓ Summary generation module imported


### 7.02 Set Up Summary Generation Paths

In [24]:
# Output folder for PAD summaries; inputs are the chunk and abbreviation folders
summaries_output_dir = project_root.joinpath("data", "silver", "pad_summaries")

print(
    f"Chunks directory: {chunks_output_dir}",
    f"Abbreviations directory: {abbreviations_output_dir}",
    f"Summaries output directory: {summaries_output_dir}",
    sep="\n",
)

Chunks directory: /Users/lauren/repos/PAD2Skills/data/silver/pads_md_chunks
Abbreviations directory: /Users/lauren/repos/PAD2Skills/data/silver/abbreviations_md
Summaries output directory: /Users/lauren/repos/PAD2Skills/data/silver/pad_summaries


### 7.03 Generate Summaries for Selected Projects

In [25]:
# Generate summaries for each selected project.
for project_id in selected_projects:
    # Check if chunks exist for this project
    chunk_files = list(chunks_output_dir.glob(f"{project_id}_*.md"))

    if not chunk_files:
        print(f"⚠ No chunks found: {project_id}")
        continue

    # Generate summary using the src utility.
    # NOTE(review): num_chunks=4 is a hard-coded setting whose meaning is
    # defined by generate_all_summaries — confirm and consider naming it.
    results = generate_all_summaries(
        chunks_dir=chunks_output_dir,
        output_dir=summaries_output_dir,
        abbr_dir=abbreviations_output_dir,
        specific_project=project_id,
        num_chunks=4,
        overwrite=False
    )

    # Report result. Only the "generated", "skipped" and "failed" entries of
    # the returned mapping are consulted, in that order of precedence.
    if results["generated"]:
        # The output filename is assumed here, not taken from the result, so
        # guard the read: a different name must not abort the whole loop.
        summary_file = summaries_output_dir / f"{project_id}_summary.txt"
        if summary_file.exists():
            summary_text = summary_file.read_text(encoding="utf-8")
            word_count = len(summary_text.split())
            print(f"✓ Generated summary: {project_id} ({word_count} words)")
        else:
            print(f"✓ Generated summary: {project_id} (expected file {summary_file.name} not found)")
    elif results["skipped"]:
        print(f"○ Skipped (already exists): {project_id}")
    elif results["failed"]:
        # Each failure entry is indexed as [1] for the error text
        error = results["failed"][0][1]
        print(f"✗ Failed: {project_id} - {error}")
    else:
        # All three entries were empty: make that visible in the log
        print(f"⚠ No result reported: {project_id}")

2026-01-02 22:37:17,383 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"


✓ Generated summary: P119893 (286 words)


2026-01-02 22:38:43,691 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"


✓ Generated summary: P173506 (286 words)


2026-01-02 22:39:39,986 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"


✓ Generated summary: P176731 (247 words)


2026-01-02 22:41:17,405 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"


✓ Generated summary: P507759 (259 words)


2026-01-02 22:42:32,319 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"


✓ Generated summary: P180547 (250 words)


2026-01-02 22:43:20,673 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"


✓ Generated summary: P505856 (277 words)


2026-01-02 22:44:17,072 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"


✓ Generated summary: P181341 (250 words)
○ Skipped (already exists): P075941


2026-01-02 22:44:52,179 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"


✓ Generated summary: P160708 (269 words)


2026-01-02 22:45:39,144 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"


✓ Generated summary: P153743 (268 words)


2026-01-02 22:46:25,189 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"


✓ Generated summary: P511453 (265 words)


## 8. Pipeline Step #6: Extract Occupations and Skills

### 8.01 Import Occupations Extraction Module

In [26]:
# Project-local import for the occupations/skills extraction step; depends on
# the sys.path change made in the first code cell.
from src.extraction.occupations_extractor import extract_all_occupations

# Confirmation line; reaching it means the import above did not raise.
print("✓ Occupations extraction module imported")

✓ Occupations extraction module imported


### 8.02 Set Up Occupations Extraction Paths

In [27]:
# Output folder for occupation/skill JSON; inputs are the chunk and abbreviation folders
occupations_output_dir = project_root.joinpath("data", "silver", "occupations_skills_json")

print(
    f"Chunks directory: {chunks_output_dir}",
    f"Abbreviations directory: {abbreviations_output_dir}",
    f"Occupations output directory: {occupations_output_dir}",
    sep="\n",
)

Chunks directory: /Users/lauren/repos/PAD2Skills/data/silver/pads_md_chunks
Abbreviations directory: /Users/lauren/repos/PAD2Skills/data/silver/abbreviations_md
Occupations output directory: /Users/lauren/repos/PAD2Skills/data/silver/occupations_skills_json


### 8.03 Extract Occupations for Selected Projects

In [28]:
# Extract occupations for each selected project.
for project_id in selected_projects:
    # Check if chunks exist for this project
    chunk_files = list(chunks_output_dir.glob(f"{project_id}_*.md"))

    if not chunk_files:
        print(f"⚠ No chunks found: {project_id}")
        continue

    # Extract occupations using the src utility
    results = extract_all_occupations(
        chunks_dir=chunks_output_dir,
        output_dir=occupations_output_dir,
        abbr_dir=abbreviations_output_dir,
        specific_project=project_id,
        overwrite=False
    )

    # Report result. Only the "generated", "skipped" and "failed" entries of
    # the returned mapping are consulted, in that order of precedence.
    if results["generated"]:
        # The file count comes straight from the result; the directory glob
        # that used to be computed here was never read and has been removed.
        print(f"✓ Extracted occupations: {project_id} ({len(results['generated'])} files)")
    elif results["skipped"]:
        print(f"○ Skipped (already exists): {project_id}")
    elif results["failed"]:
        # This branch only runs when "failed" is non-empty, so the first
        # entry can be indexed directly.
        error = results["failed"][0][1]
        print(f"✗ Failed: {project_id} - {error}")
    else:
        # All three entries were empty: make that visible in the log
        print(f"⚠ No result reported: {project_id}")

2026-01-02 23:23:03,589 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-02 23:23:52,024 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-02 23:24:36,063 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-02 23:26:09,853 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-02 23:28:37,932 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-02 23:29:39,476 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-02 23:32:30,695 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-02 23:34:29,994 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-02 23:36:11,679 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-02 23:38:08,470 - INFO - HTTP Request:

✓ Extracted occupations: P119893 (11 files)


2026-01-02 23:42:05,842 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-02 23:45:44,163 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-02 23:48:07,733 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-02 23:50:06,418 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-02 23:50:11,726 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-02 23:51:31,207 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-02 23:53:50,884 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-02 23:56:12,987 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-02 23:57:10,571 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-02 23:57:14,974 - INFO - HTTP Request:

✓ Extracted occupations: P173506 (10 files)


2026-01-02 23:59:07,812 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-02 23:59:13,454 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-03 00:01:58,386 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-03 00:03:55,186 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-03 00:06:59,511 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-03 00:07:15,221 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-03 00:09:17,856 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-03 00:11:00,963 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-03 00:12:53,208 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-03 00:14:13,494 - INFO - HTTP Request:

✓ Extracted occupations: P176731 (11 files)


2026-01-03 00:18:07,996 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-03 00:22:33,985 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-03 00:24:11,421 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-03 00:26:45,076 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-03 00:26:58,850 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-03 00:28:11,250 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-03 00:30:16,281 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"


✓ Extracted occupations: P507759 (7 files)


2026-01-03 00:31:58,013 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-03 00:33:48,739 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-03 00:36:33,478 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-03 00:39:07,846 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-03 00:41:53,794 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-03 00:43:45,819 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-03 00:45:04,527 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-03 00:48:39,278 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-03 00:50:22,943 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-03 00:52:21,114 - INFO - HTTP Request:

✓ Extracted occupations: P180547 (16 files)


2026-01-03 01:02:23,721 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-03 01:03:57,082 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-03 01:07:48,309 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-03 01:09:51,163 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-03 01:12:09,132 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-03 01:13:08,939 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"


✓ Extracted occupations: P505856 (6 files)


2026-01-03 01:15:13,249 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-03 01:17:19,337 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-03 01:24:58,018 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-03 01:27:14,261 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-03 01:29:31,744 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-03 01:30:57,208 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-03 01:32:37,812 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-03 01:36:07,102 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-03 01:38:11,341 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-03 01:42:01,238 - INFO - HTTP Request:

✓ Extracted occupations: P181341 (16 files)
○ Skipped (already exists): P075941


2026-01-03 01:54:25,410 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-03 01:55:45,444 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-03 01:57:25,350 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-03 01:59:58,320 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-03 02:01:24,735 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-03 02:01:39,560 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-03 02:02:56,195 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-03 02:03:00,905 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-03 02:03:08,073 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-03 02:04:28,225 - INFO - HTTP Request:

✓ Extracted occupations: P160708 (21 files)


2026-01-03 02:26:50,972 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-03 02:28:40,848 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-03 02:29:25,451 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-03 02:32:13,036 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-03 02:34:29,630 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-03 02:35:31,530 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-03 02:37:18,985 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-03 02:38:39,559 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-03 02:40:36,183 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-03 02:43:12,243 - INFO - HTTP Request:

✓ Extracted occupations: P153743 (11 files)


2026-01-03 02:46:14,622 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-03 02:50:05,948 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-03 02:52:44,510 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-03 02:54:38,441 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-03 02:57:35,356 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-03 02:59:51,587 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-03 03:02:07,886 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
2026-01-03 03:02:13,209 - INFO - HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"


✓ Extracted occupations: P511453 (8 files)


In [None]:
# Re-run occupation extraction for P075941 with overwrite.
# overwrite=True is passed deliberately, unlike the loop in the previous
# cell, which reported this project as skipped.
project_id = "P075941"

results = extract_all_occupations(
    chunks_dir=chunks_output_dir,
    output_dir=occupations_output_dir,
    abbr_dir=abbreviations_output_dir,
    specific_project=project_id,
    overwrite=True
)

# Report every outcome so a re-run never finishes without a message.
if results["generated"]:
    print(f"✓ Re-extracted occupations: {project_id} ({len(results['generated'])} files)")
elif results["failed"]:
    # This branch only runs when "failed" is non-empty, so the first entry
    # can be indexed directly.
    error = results["failed"][0][1]
    print(f"✗ Failed: {project_id} - {error}")
else:
    # Neither generated nor failed (e.g. reported as skipped, or nothing to do)
    print(f"⚠ No files generated and no failure reported: {project_id}")

## 9. Pipeline Step #7: Match PAD Occupations to ESCO

### 9.01 Import ESCO Matching Modules

In [10]:
# Project-local imports for the ESCO matching step: one helper to prepare the
# ESCO reference data and one to match PAD occupations against it.
from src.matching.esco_prepare import prepare_esco_data
from src.matching.pad_matcher import match_pad_to_esco

# Confirmation line; reaching it means the imports above did not raise.
print("✓ ESCO matching modules imported")

  from .autonotebook import tqdm as notebook_tqdm


✓ ESCO matching modules imported


### 9.02 Prepare ESCO Data (Run Once)

In [11]:
# One-time ESCO preparation (prepared CSV + embeddings).
# Inputs are read from data/bronze/esco, outputs are written under data/silver.
esco_csv = project_root.joinpath("data", "bronze", "esco", "occupations_en.csv")
esco_relations_csv = project_root.joinpath("data", "bronze", "esco", "occupationSkillRelations_en.csv")
esco_output_csv = project_root.joinpath("data", "silver", "esco_occupations_prepared.csv")
esco_embeddings_file = project_root.joinpath("data", "silver", "embeddings", "esco_embeddings.npy")

# Preparation runs only when at least one of the two output files is missing;
# when both are already on disk the cell just reports that it skipped.
if not (esco_output_csv.exists() and esco_embeddings_file.exists()):
    print("Preparing ESCO data with embeddings...")
    prepare_esco_data(
        esco_csv=esco_csv,
        esco_relations_csv=esco_relations_csv,
        output_csv=esco_output_csv,
        embeddings_file=esco_embeddings_file,
        model_name="intfloat/e5-small-v2",
        overwrite_embeddings=False,
    )
    print("✓ ESCO data prepared successfully")
else:
    print("○ ESCO data already prepared (skipping)")

○ ESCO data already prepared (skipping)


### 9.03 Set Up PAD Matching Paths

In [12]:
# Directories used by the PAD matching step; all three live under data/silver
pad_occupations_dir, esco_matching_csv_dir, esco_matching_json_dir = (
    project_root / "data" / "silver" / folder_name
    for folder_name in (
        "occupations_skills_json",
        "esco_matching_csv",
        "esco_matching_json",
    )
)

# Echo every input/output location in one go, one "label: path" line each
print(
    "\n".join(
        f"{label}: {location}"
        for label, location in (
            ("PAD occupations directory", pad_occupations_dir),
            ("ESCO prepared CSV", esco_output_csv),
            ("ESCO embeddings", esco_embeddings_file),
            ("Matching CSV output", esco_matching_csv_dir),
            ("Matching JSON output", esco_matching_json_dir),
        )
    )
)

PAD occupations directory: /Users/lauren/repos/PAD2Skills/data/silver/occupations_skills_json
ESCO prepared CSV: /Users/lauren/repos/PAD2Skills/data/silver/esco_occupations_prepared.csv
ESCO embeddings: /Users/lauren/repos/PAD2Skills/data/silver/embeddings/esco_embeddings.npy
Matching CSV output: /Users/lauren/repos/PAD2Skills/data/silver/esco_matching_csv
Matching JSON output: /Users/lauren/repos/PAD2Skills/data/silver/esco_matching_json


### 9.04 Match PAD Occupations to ESCO for Selected Projects

In [13]:
# Set overwrite parameter
overwrite_matching = False  # Set to True to force re-matching

# Matching settings, named once so they are easy to find and tune.
# The model name is the same one passed to prepare_esco_data in section 9.02;
# NOTE(review): the two presumably have to stay identical so PAD queries and
# the stored ESCO embeddings come from the same model — confirm.
matching_model_name = "intfloat/e5-small-v2"
matching_top_k = 20        # passed as top_k
matching_chunk_size = 75   # passed as chunk_size
# Loop-invariant output root, built once instead of on every iteration
matching_output_dir = project_root / "data" / "silver"

# Projects whose matching raised, collected for the summary after the loop
matching_failures = []

# Match PAD occupations to ESCO for each selected project
for project_id in selected_projects:
    # Check if occupation JSON files exist for this project
    occupation_files = list(pad_occupations_dir.glob(f"{project_id}_*.json"))

    if not occupation_files:
        print(f"⚠ No occupation files found: {project_id}")
        continue

    # Match PAD occupations to ESCO
    try:
        match_pad_to_esco(
            pad_occupations_dir=pad_occupations_dir,
            project_id=project_id,
            esco_csv=esco_output_csv,
            esco_embeddings=esco_embeddings_file,
            output_dir=matching_output_dir,
            model_name=matching_model_name,
            top_k=matching_top_k,
            chunk_size=matching_chunk_size,
            save_diagnostics=True,
            overwrite=overwrite_matching
        )
        print(f"✓ Matched occupations: {project_id}")
    except Exception as e:
        # Deliberately best-effort: one failing project must not stop the rest.
        # The exception type is included because str(e) alone can be empty or
        # ambiguous (a KeyError, for example, renders as just the missing key).
        matching_failures.append(project_id)
        print(f"✗ Failed: {project_id} - {type(e).__name__}: {e}")

# Per-project output is long, so repeat any failures once at the very end
if matching_failures:
    print(
        f"⚠ Matching failed for {len(matching_failures)} project(s): "
        f"{', '.join(map(str, matching_failures))}"
    )

Deleted existing CSV: P119893_esco_matches.csv
Deleted 2 existing JSON chunk files
Loading ESCO data from /Users/lauren/repos/PAD2Skills/data/silver/esco_occupations_prepared.csv...
✓ Loaded 3,037 ESCO occupations
Loading ESCO embeddings from /Users/lauren/repos/PAD2Skills/data/silver/embeddings/esco_embeddings.npy...
✓ Loaded embeddings: shape=(3037, 384), size=4.45 MB
Loading PAD occupations for project P119893...
✓ Loaded 134 PAD occupation extractions
Loading model: intfloat/e5-small-v2...
  Max sequence length: 512
  Embedding dimension: 384
Encoding 134 PAD occupation queries...


Batches: 100%|██████████| 3/3 [00:03<00:00,  1.27s/it]


✓ Encoded PAD occupations: shape=(134, 384), size=0.20 MB
Computing similarity scores...
✓ Computed similarity matrix: shape=(134, 3037)
  Score range: [0.6596, 0.8911], mean=0.7519
Finding top 20 matches for each PAD occupation...
✓ Found top 20 matches for all 134 PAD occupations
✓ Created results DataFrame: shape=(134, 109)
✓ Saved CSV results to: /Users/lauren/repos/PAD2Skills/data/silver/esco_matching_csv/P119893_esco_matches.csv
  File size: 1362.26 KB
✓ Saved diagnostics to: /Users/lauren/repos/PAD2Skills/data/silver/esco_matching_csv/diagnostics/P119893_esco_matches_diagnostics.csv
  File size: 107.79 KB
Splitting 134 records into 2 chunk(s) of up to 75 records
  ✓ Saved chunk 1/2: P119893_000-074_esco_matches.json (75 records, 431.92 KB)
  ✓ Saved chunk 2/2: P119893_075-133_esco_matches.json (59 records, 343.23 KB)
✓ Saved 2 JSON file(s) to: /Users/lauren/repos/PAD2Skills/data/silver/esco_matching_json
✓ Matched occupations: P119893
Deleted existing CSV: P173506_esco_matches.c

Batches: 100%|██████████| 3/3 [00:04<00:00,  1.38s/it]


✓ Encoded PAD occupations: shape=(138, 384), size=0.20 MB
Computing similarity scores...
✓ Computed similarity matrix: shape=(138, 3037)
  Score range: [0.6670, 0.8962], mean=0.7546
Finding top 20 matches for each PAD occupation...
✓ Found top 20 matches for all 138 PAD occupations
✓ Created results DataFrame: shape=(138, 109)
✓ Saved CSV results to: /Users/lauren/repos/PAD2Skills/data/silver/esco_matching_csv/P173506_esco_matches.csv
  File size: 1404.18 KB
✓ Saved diagnostics to: /Users/lauren/repos/PAD2Skills/data/silver/esco_matching_csv/diagnostics/P173506_esco_matches_diagnostics.csv
  File size: 119.31 KB
Splitting 138 records into 2 chunk(s) of up to 75 records
  ✓ Saved chunk 1/2: P173506_000-074_esco_matches.json (75 records, 431.47 KB)
  ✓ Saved chunk 2/2: P173506_075-137_esco_matches.json (63 records, 373.56 KB)
✓ Saved 2 JSON file(s) to: /Users/lauren/repos/PAD2Skills/data/silver/esco_matching_json
✓ Matched occupations: P173506
Deleted existing CSV: P176731_esco_matches.c

Batches: 100%|██████████| 2/2 [00:03<00:00,  1.82s/it]


✓ Encoded PAD occupations: shape=(113, 384), size=0.17 MB
Computing similarity scores...
✓ Computed similarity matrix: shape=(113, 3037)
  Score range: [0.6730, 0.8874], mean=0.7532
Finding top 20 matches for each PAD occupation...
✓ Found top 20 matches for all 113 PAD occupations
✓ Created results DataFrame: shape=(113, 109)
✓ Saved CSV results to: /Users/lauren/repos/PAD2Skills/data/silver/esco_matching_csv/P176731_esco_matches.csv
  File size: 1145.10 KB
✓ Saved diagnostics to: /Users/lauren/repos/PAD2Skills/data/silver/esco_matching_csv/diagnostics/P176731_esco_matches_diagnostics.csv
  File size: 91.65 KB
Splitting 113 records into 2 chunk(s) of up to 75 records
  ✓ Saved chunk 1/2: P176731_000-074_esco_matches.json (75 records, 431.31 KB)
  ✓ Saved chunk 2/2: P176731_075-112_esco_matches.json (38 records, 220.33 KB)
✓ Saved 2 JSON file(s) to: /Users/lauren/repos/PAD2Skills/data/silver/esco_matching_json
✓ Matched occupations: P176731
Deleted existing CSV: P507759_esco_matches.cs

Batches: 100%|██████████| 2/2 [00:02<00:00,  1.34s/it]


✓ Encoded PAD occupations: shape=(89, 384), size=0.13 MB
Computing similarity scores...
✓ Computed similarity matrix: shape=(89, 3037)
  Score range: [0.6767, 0.8785], mean=0.7549
Finding top 20 matches for each PAD occupation...
✓ Found top 20 matches for all 89 PAD occupations
✓ Created results DataFrame: shape=(89, 109)
✓ Saved CSV results to: /Users/lauren/repos/PAD2Skills/data/silver/esco_matching_csv/P507759_esco_matches.csv
  File size: 906.52 KB
✓ Saved diagnostics to: /Users/lauren/repos/PAD2Skills/data/silver/esco_matching_csv/diagnostics/P507759_esco_matches_diagnostics.csv
  File size: 79.05 KB
Splitting 89 records into 2 chunk(s) of up to 75 records
  ✓ Saved chunk 1/2: P507759_000-074_esco_matches.json (75 records, 438.46 KB)
  ✓ Saved chunk 2/2: P507759_075-088_esco_matches.json (14 records, 82.46 KB)
✓ Saved 2 JSON file(s) to: /Users/lauren/repos/PAD2Skills/data/silver/esco_matching_json
✓ Matched occupations: P507759
Deleted existing CSV: P180547_esco_matches.csv
Delet

Batches: 100%|██████████| 4/4 [00:05<00:00,  1.41s/it]


✓ Encoded PAD occupations: shape=(213, 384), size=0.31 MB
Computing similarity scores...
✓ Computed similarity matrix: shape=(213, 3037)
  Score range: [0.6681, 0.8842], mean=0.7497
Finding top 20 matches for each PAD occupation...
✓ Found top 20 matches for all 213 PAD occupations
✓ Created results DataFrame: shape=(213, 109)
✓ Saved CSV results to: /Users/lauren/repos/PAD2Skills/data/silver/esco_matching_csv/P180547_esco_matches.csv
  File size: 2223.44 KB
✓ Saved diagnostics to: /Users/lauren/repos/PAD2Skills/data/silver/esco_matching_csv/diagnostics/P180547_esco_matches_diagnostics.csv
  File size: 204.78 KB
Splitting 213 records into 3 chunk(s) of up to 75 records
  ✓ Saved chunk 1/3: P180547_000-074_esco_matches.json (75 records, 444.51 KB)
  ✓ Saved chunk 2/3: P180547_075-149_esco_matches.json (75 records, 451.93 KB)
  ✓ Saved chunk 3/3: P180547_150-212_esco_matches.json (63 records, 377.26 KB)
✓ Saved 3 JSON file(s) to: /Users/lauren/repos/PAD2Skills/data/silver/esco_matching_j

Batches: 100%|██████████| 1/1 [00:02<00:00,  2.16s/it]


✓ Encoded PAD occupations: shape=(59, 384), size=0.09 MB
Computing similarity scores...
✓ Computed similarity matrix: shape=(59, 3037)
  Score range: [0.6691, 0.8621], mean=0.7515
Finding top 20 matches for each PAD occupation...
✓ Found top 20 matches for all 59 PAD occupations
✓ Created results DataFrame: shape=(59, 109)
✓ Saved CSV results to: /Users/lauren/repos/PAD2Skills/data/silver/esco_matching_csv/P505856_esco_matches.csv
  File size: 610.08 KB
✓ Saved diagnostics to: /Users/lauren/repos/PAD2Skills/data/silver/esco_matching_csv/diagnostics/P505856_esco_matches_diagnostics.csv
  File size: 49.19 KB
Splitting 59 records into 1 chunk(s) of up to 75 records
  ✓ Saved chunk 1/1: P505856_000-058_esco_matches.json (59 records, 343.74 KB)
✓ Saved 1 JSON file(s) to: /Users/lauren/repos/PAD2Skills/data/silver/esco_matching_json
✓ Matched occupations: P505856
Deleted existing CSV: P181341_esco_matches.csv
Deleted 3 existing JSON chunk files
Loading ESCO data from /Users/lauren/repos/PAD2

Batches: 100%|██████████| 4/4 [00:06<00:00,  1.50s/it]


✓ Encoded PAD occupations: shape=(216, 384), size=0.32 MB
Computing similarity scores...
✓ Computed similarity matrix: shape=(216, 3037)
  Score range: [0.6514, 0.8861], mean=0.7501
Finding top 20 matches for each PAD occupation...
✓ Found top 20 matches for all 216 PAD occupations
✓ Created results DataFrame: shape=(216, 109)
✓ Saved CSV results to: /Users/lauren/repos/PAD2Skills/data/silver/esco_matching_csv/P181341_esco_matches.csv
  File size: 2252.77 KB
✓ Saved diagnostics to: /Users/lauren/repos/PAD2Skills/data/silver/esco_matching_csv/diagnostics/P181341_esco_matches_diagnostics.csv
  File size: 199.33 KB
Splitting 216 records into 3 chunk(s) of up to 75 records
  ✓ Saved chunk 1/3: P181341_000-074_esco_matches.json (75 records, 440.64 KB)
  ✓ Saved chunk 2/3: P181341_075-149_esco_matches.json (75 records, 444.46 KB)
  ✓ Saved chunk 3/3: P181341_150-215_esco_matches.json (66 records, 394.77 KB)
✓ Saved 3 JSON file(s) to: /Users/lauren/repos/PAD2Skills/data/silver/esco_matching_j

Batches: 100%|██████████| 3/3 [00:04<00:00,  1.62s/it]


✓ Encoded PAD occupations: shape=(164, 384), size=0.24 MB
Computing similarity scores...
✓ Computed similarity matrix: shape=(164, 3037)
  Score range: [0.6324, 0.8967], mean=0.7528
Finding top 20 matches for each PAD occupation...
✓ Found top 20 matches for all 164 PAD occupations
✓ Created results DataFrame: shape=(164, 109)
✓ Saved CSV results to: /Users/lauren/repos/PAD2Skills/data/silver/esco_matching_csv/P075941_esco_matches.csv
  File size: 1647.33 KB
✓ Saved diagnostics to: /Users/lauren/repos/PAD2Skills/data/silver/esco_matching_csv/diagnostics/P075941_esco_matches_diagnostics.csv
  File size: 129.90 KB
Splitting 164 records into 3 chunk(s) of up to 75 records
  ✓ Saved chunk 1/3: P075941_000-074_esco_matches.json (75 records, 428.20 KB)
  ✓ Saved chunk 2/3: P075941_075-149_esco_matches.json (75 records, 429.74 KB)
  ✓ Saved chunk 3/3: P075941_150-163_esco_matches.json (14 records, 78.80 KB)
✓ Saved 3 JSON file(s) to: /Users/lauren/repos/PAD2Skills/data/silver/esco_matching_js

Batches: 100%|██████████| 3/3 [00:05<00:00,  1.96s/it]


✓ Encoded PAD occupations: shape=(191, 384), size=0.28 MB
Computing similarity scores...
✓ Computed similarity matrix: shape=(191, 3037)
  Score range: [0.6625, 0.8756], mean=0.7498
Finding top 20 matches for each PAD occupation...
✓ Found top 20 matches for all 191 PAD occupations
✓ Created results DataFrame: shape=(191, 109)
✓ Saved CSV results to: /Users/lauren/repos/PAD2Skills/data/silver/esco_matching_csv/P160708_esco_matches.csv
  File size: 2009.55 KB
✓ Saved diagnostics to: /Users/lauren/repos/PAD2Skills/data/silver/esco_matching_csv/diagnostics/P160708_esco_matches_diagnostics.csv
  File size: 174.14 KB
Splitting 191 records into 3 chunk(s) of up to 75 records
  ✓ Saved chunk 1/3: P160708_000-074_esco_matches.json (75 records, 439.29 KB)
  ✓ Saved chunk 2/3: P160708_075-149_esco_matches.json (75 records, 462.06 KB)
  ✓ Saved chunk 3/3: P160708_150-190_esco_matches.json (41 records, 248.32 KB)
✓ Saved 3 JSON file(s) to: /Users/lauren/repos/PAD2Skills/data/silver/esco_matching_j

Batches: 100%|██████████| 2/2 [00:03<00:00,  1.86s/it]


✓ Encoded PAD occupations: shape=(127, 384), size=0.19 MB
Computing similarity scores...
✓ Computed similarity matrix: shape=(127, 3037)
  Score range: [0.6734, 0.8931], mean=0.7558
Finding top 20 matches for each PAD occupation...
✓ Found top 20 matches for all 127 PAD occupations
✓ Created results DataFrame: shape=(127, 109)
✓ Saved CSV results to: /Users/lauren/repos/PAD2Skills/data/silver/esco_matching_csv/P153743_esco_matches.csv
  File size: 1255.94 KB
✓ Saved diagnostics to: /Users/lauren/repos/PAD2Skills/data/silver/esco_matching_csv/diagnostics/P153743_esco_matches_diagnostics.csv
  File size: 104.29 KB
Splitting 127 records into 2 chunk(s) of up to 75 records
  ✓ Saved chunk 1/2: P153743_000-074_esco_matches.json (75 records, 423.85 KB)
  ✓ Saved chunk 2/2: P153743_075-126_esco_matches.json (52 records, 289.72 KB)
✓ Saved 2 JSON file(s) to: /Users/lauren/repos/PAD2Skills/data/silver/esco_matching_json
✓ Matched occupations: P153743
Deleted existing CSV: P511453_esco_matches.c

Batches: 100%|██████████| 2/2 [00:03<00:00,  1.62s/it]


✓ Encoded PAD occupations: shape=(118, 384), size=0.17 MB
Computing similarity scores...
✓ Computed similarity matrix: shape=(118, 3037)
  Score range: [0.6669, 0.8813], mean=0.7523
Finding top 20 matches for each PAD occupation...
✓ Found top 20 matches for all 118 PAD occupations
✓ Created results DataFrame: shape=(118, 109)
✓ Saved CSV results to: /Users/lauren/repos/PAD2Skills/data/silver/esco_matching_csv/P511453_esco_matches.csv
  File size: 1193.23 KB
✓ Saved diagnostics to: /Users/lauren/repos/PAD2Skills/data/silver/esco_matching_csv/diagnostics/P511453_esco_matches_diagnostics.csv
  File size: 113.93 KB
Splitting 118 records into 2 chunk(s) of up to 75 records
  ✓ Saved chunk 1/2: P511453_000-074_esco_matches.json (75 records, 438.05 KB)
  ✓ Saved chunk 2/2: P511453_075-117_esco_matches.json (43 records, 251.13 KB)
✓ Saved 2 JSON file(s) to: /Users/lauren/repos/PAD2Skills/data/silver/esco_matching_json
✓ Matched occupations: P511453
