# Match PAD Occupations to ESCO

Use embedding model to match extracted PAD occupations to ESCO occupation taxonomy.

## 0. Setup

### 0.01 Import Required Libraries

In [1]:
import pandas as pd
from pathlib import Path

# Import our config
import sys
sys.path.append(str(Path.cwd().parent))
from src.config import load_config

### 0.02 Set Up Paths

In [2]:
project_root = Path.cwd().parent
config = load_config()

# ESCO data paths
esco_dir = project_root / "data" / "bronze" / "esco"
occupations_file = esco_dir / "occupations_en.csv"
skills_relations_file = esco_dir / "occupationSkillRelations_en.csv"

print(f"ESCO directory: {esco_dir}")
print(f"Occupations file exists: {occupations_file.exists()}")
print(f"Skills relations file exists: {skills_relations_file.exists()}")

ESCO directory: /Users/lauren/repos/PAD2Skills/data/bronze/esco
Occupations file exists: True
Skills relations file exists: True


## 1. Prepare ESCO Data

### 1.01 Read ESCO Occupations Data

In [3]:
# Read ESCO occupations
occ_df = pd.read_csv(occupations_file)

print(f"✓ Loaded {len(occ_df):,} ESCO occupations")
print(f"\nColumns: {list(occ_df.columns)}")
print(f"\nFirst few rows:")
occ_df.head()

✓ Loaded 3,043 ESCO occupations

Columns: ['conceptType', 'conceptUri', 'iscoGroup', 'preferredLabel', 'altLabels', 'hiddenLabels', 'status', 'modifiedDate', 'regulatedProfessionNote', 'scopeNote', 'definition', 'inScheme', 'description', 'code', 'naceCode']

First few rows:


Unnamed: 0,conceptType,conceptUri,iscoGroup,preferredLabel,altLabels,hiddenLabels,status,modifiedDate,regulatedProfessionNote,scopeNote,definition,inScheme,description,code,naceCode
0,Occupation,http://data.europa.eu/esco/occupation/00030d09...,2654,technical director,director of technical arts\ntechnical supervis...,,released,2024-01-25T11:28:50.295Z,http://data.europa.eu/esco/regulated-professio...,,,http://data.europa.eu/esco/concept-scheme/memb...,Technical directors realise the artistic visio...,2654.1.7,http://data.europa.eu/ux2/nace2.1/9031
1,Occupation,http://data.europa.eu/esco/occupation/000e93a3...,8121,metal drawing machine operator,wire drawer\nforming machine operative\ndraw m...,,released,2024-01-23T10:09:32.099Z,http://data.europa.eu/esco/regulated-professio...,,,http://data.europa.eu/esco/concept-scheme/memb...,Metal drawing machine operators set up and ope...,8121.4,http://data.europa.eu/ux2/nace2.1/242
2,Occupation,http://data.europa.eu/esco/occupation/0019b951...,7543,precision device inspector,precision device quality control supervisor\np...,,released,2024-01-25T15:00:12.188Z,http://data.europa.eu/esco/regulated-professio...,,,http://data.europa.eu/esco/concept-scheme/memb...,Precision device inspectors make sure precisio...,7543.10.3,http://data.europa.eu/ux2/nace2.1/2651
3,Occupation,http://data.europa.eu/esco/occupation/0022f466...,3155,air traffic safety technician,air traffic safety electronics hardware specia...,,released,2024-01-29T16:01:13.998Z,http://data.europa.eu/esco/regulated-professio...,,,http://data.europa.eu/esco/concept-scheme/memb...,Air traffic safety technicians provide technic...,3155.1,http://data.europa.eu/ux2/nace2.1/5223
4,Occupation,http://data.europa.eu/esco/occupation/002da35b...,2431,hospitality revenue manager,yield manager\nhospitality yields manager\nhos...,,released,2024-01-11T10:28:45.871Z,http://data.europa.eu/esco/regulated-professio...,,,http://data.europa.eu/esco/concept-scheme/memb...,Hospitality revenue managers maximise revenue ...,2431.9,"http://data.europa.eu/ux2/nace2.1/701,\nhttp:/..."


### 1.02 Read ESCO Skills Relations Data

In [4]:
# Read ESCO skills relations
skills_df = pd.read_csv(skills_relations_file)

print(f"✓ Loaded {len(skills_df):,} skill relations")
print(f"\nColumns: {list(skills_df.columns)}")
print(f"\nFirst few rows:")
skills_df.head()

✓ Loaded 126,051 skill relations

Columns: ['occupationUri', 'occupationLabel', 'relationType', 'skillType', 'skillUri', 'skillLabel']

First few rows:


Unnamed: 0,occupationUri,occupationLabel,relationType,skillType,skillUri,skillLabel
0,http://data.europa.eu/esco/occupation/00030d09...,technical director,essential,knowledge,http://data.europa.eu/esco/skill/fed5b267-73fa...,theatre techniques
1,http://data.europa.eu/esco/occupation/00030d09...,technical director,essential,skill/competence,http://data.europa.eu/esco/skill/05bc7677-5a64...,organise rehearsals
2,http://data.europa.eu/esco/occupation/00030d09...,technical director,essential,skill/competence,http://data.europa.eu/esco/skill/271a36a0-bc7a...,write risk assessment on performing arts produ...
3,http://data.europa.eu/esco/occupation/00030d09...,technical director,essential,skill/competence,http://data.europa.eu/esco/skill/47ed1d37-971b...,coordinate with creative departments
4,http://data.europa.eu/esco/occupation/00030d09...,technical director,essential,skill/competence,http://data.europa.eu/esco/skill/591dd514-735b...,adapt to artists' creative demands


### 1.03 Filter Skills Relations

In [5]:
# Filter for essential skills/competences only
skills_filtered = skills_df[
    (skills_df['relationType'] == 'essential') & 
    (skills_df['skillType'] == 'skill/competence')
].copy()

print(f"✓ Filtered to {len(skills_filtered):,} essential skill/competence relations")
print(f"  (from {len(skills_df):,} total relations)")
print(f"\nUnique occupations: {skills_filtered['occupationUri'].nunique():,}")
print(f"Unique skills: {skills_filtered['skillUri'].nunique():,}")
print(f"\nSample filtered relations:")
skills_filtered.head()

✓ Filtered to 51,155 essential skill/competence relations
  (from 126,051 total relations)

Unique occupations: 3,037
Unique skills: 8,752

Sample filtered relations:


Unnamed: 0,occupationUri,occupationLabel,relationType,skillType,skillUri,skillLabel
1,http://data.europa.eu/esco/occupation/00030d09...,technical director,essential,skill/competence,http://data.europa.eu/esco/skill/05bc7677-5a64...,organise rehearsals
2,http://data.europa.eu/esco/occupation/00030d09...,technical director,essential,skill/competence,http://data.europa.eu/esco/skill/271a36a0-bc7a...,write risk assessment on performing arts produ...
3,http://data.europa.eu/esco/occupation/00030d09...,technical director,essential,skill/competence,http://data.europa.eu/esco/skill/47ed1d37-971b...,coordinate with creative departments
4,http://data.europa.eu/esco/occupation/00030d09...,technical director,essential,skill/competence,http://data.europa.eu/esco/skill/591dd514-735b...,adapt to artists' creative demands
5,http://data.europa.eu/esco/occupation/00030d09...,technical director,essential,skill/competence,http://data.europa.eu/esco/skill/860be36a-d19b...,negotiate health and safety issues with third ...


### 1.04 Merge Skills onto Occupations

In [6]:
# Merge skills onto occupations
# occupationUri in skills -> conceptUri in occupations
merged_df = occ_df.merge(
    skills_filtered,
    right_on='occupationUri',
    left_on='conceptUri',
    how='left'
)

print(f"✓ Merged skills onto occupations: {len(merged_df):,} rows")
print(f"\nColumns after merge: {list(merged_df.columns)}")
print(f"\nSample merged data:")
merged_df.head()

✓ Merged skills onto occupations: 51,209 rows

Columns after merge: ['conceptType', 'conceptUri', 'iscoGroup', 'preferredLabel', 'altLabels', 'hiddenLabels', 'status', 'modifiedDate', 'regulatedProfessionNote', 'scopeNote', 'definition', 'inScheme', 'description', 'code', 'naceCode', 'occupationUri', 'occupationLabel', 'relationType', 'skillType', 'skillUri', 'skillLabel']

Sample merged data:


Unnamed: 0,conceptType,conceptUri,iscoGroup,preferredLabel,altLabels,hiddenLabels,status,modifiedDate,regulatedProfessionNote,scopeNote,...,inScheme,description,code,naceCode,occupationUri,occupationLabel,relationType,skillType,skillUri,skillLabel
0,Occupation,http://data.europa.eu/esco/occupation/00030d09...,2654,technical director,director of technical arts\ntechnical supervis...,,released,2024-01-25T11:28:50.295Z,http://data.europa.eu/esco/regulated-professio...,,...,http://data.europa.eu/esco/concept-scheme/memb...,Technical directors realise the artistic visio...,2654.1.7,http://data.europa.eu/ux2/nace2.1/9031,http://data.europa.eu/esco/occupation/00030d09...,technical director,essential,skill/competence,http://data.europa.eu/esco/skill/05bc7677-5a64...,organise rehearsals
1,Occupation,http://data.europa.eu/esco/occupation/00030d09...,2654,technical director,director of technical arts\ntechnical supervis...,,released,2024-01-25T11:28:50.295Z,http://data.europa.eu/esco/regulated-professio...,,...,http://data.europa.eu/esco/concept-scheme/memb...,Technical directors realise the artistic visio...,2654.1.7,http://data.europa.eu/ux2/nace2.1/9031,http://data.europa.eu/esco/occupation/00030d09...,technical director,essential,skill/competence,http://data.europa.eu/esco/skill/271a36a0-bc7a...,write risk assessment on performing arts produ...
2,Occupation,http://data.europa.eu/esco/occupation/00030d09...,2654,technical director,director of technical arts\ntechnical supervis...,,released,2024-01-25T11:28:50.295Z,http://data.europa.eu/esco/regulated-professio...,,...,http://data.europa.eu/esco/concept-scheme/memb...,Technical directors realise the artistic visio...,2654.1.7,http://data.europa.eu/ux2/nace2.1/9031,http://data.europa.eu/esco/occupation/00030d09...,technical director,essential,skill/competence,http://data.europa.eu/esco/skill/47ed1d37-971b...,coordinate with creative departments
3,Occupation,http://data.europa.eu/esco/occupation/00030d09...,2654,technical director,director of technical arts\ntechnical supervis...,,released,2024-01-25T11:28:50.295Z,http://data.europa.eu/esco/regulated-professio...,,...,http://data.europa.eu/esco/concept-scheme/memb...,Technical directors realise the artistic visio...,2654.1.7,http://data.europa.eu/ux2/nace2.1/9031,http://data.europa.eu/esco/occupation/00030d09...,technical director,essential,skill/competence,http://data.europa.eu/esco/skill/591dd514-735b...,adapt to artists' creative demands
4,Occupation,http://data.europa.eu/esco/occupation/00030d09...,2654,technical director,director of technical arts\ntechnical supervis...,,released,2024-01-25T11:28:50.295Z,http://data.europa.eu/esco/regulated-professio...,,...,http://data.europa.eu/esco/concept-scheme/memb...,Technical directors realise the artistic visio...,2654.1.7,http://data.europa.eu/ux2/nace2.1/9031,http://data.europa.eu/esco/occupation/00030d09...,technical director,essential,skill/competence,http://data.europa.eu/esco/skill/860be36a-d19b...,negotiate health and safety issues with third ...


### 1.05 Flatten Skills by Occupation

In [7]:
# Group by occupation and combine skills into comma-separated list
flattened_df = merged_df.groupby('occupationUri').agg({
    'conceptUri': 'first',  # Occupation URI from occupations table
    'preferredLabel': 'first',
    'altLabels': 'first',
    'description': 'first',
    'skillLabel': lambda x: ', '.join(x.dropna().astype(str))
}).reset_index()

# Rename skillLabel column to be clearer
flattened_df = flattened_df.rename(columns={'skillLabel': 'skills_list'})

print(f"✓ Flattened to {len(flattened_df):,} unique occupations")
print(f"\nColumns: {list(flattened_df.columns)}")
print(f"\nNote: occupationUri is kept as a column")
print(f"\nSample flattened data:")
flattened_df[['occupationUri', 'preferredLabel', 'skills_list']].head()

✓ Flattened to 3,037 unique occupations

Columns: ['occupationUri', 'conceptUri', 'preferredLabel', 'altLabels', 'description', 'skills_list']

Note: occupationUri is kept as a column

Sample flattened data:


Unnamed: 0,occupationUri,preferredLabel,skills_list
0,http://data.europa.eu/esco/occupation/00030d09...,technical director,"organise rehearsals, write risk assessment on ..."
1,http://data.europa.eu/esco/occupation/000e93a3...,metal drawing machine operator,"monitor gauge, remove inadequate workpieces, t..."
2,http://data.europa.eu/esco/occupation/0019b951...,precision device inspector,"monitor machine operations, read assembly draw..."
3,http://data.europa.eu/esco/occupation/0022f466...,air traffic safety technician,"implement airside safety procedures, install e..."
4,http://data.europa.eu/esco/occupation/002da35b...,hospitality revenue manager,"produce statistical financial records, comply ..."


### 1.06 Combine Fields into Single Text Column

In [8]:
# Combine preferredLabel, altLabels, description, and skills into single text column
# Prioritize most important info first to minimize impact of model truncation
def combine_fields(row):
    """Combine multiple fields into prioritized space-separated string"""
    parts = []
    
    # 1. Add preferredLabel (most important)
    if pd.notna(row['preferredLabel']):
        parts.append(str(row['preferredLabel']))
    
    # 2. Add first 5 altLabels
    first_alt_labels = []
    remaining_alt_labels = []
    if pd.notna(row['altLabels']):
        alt_labels_str = str(row['altLabels'])
        # Split by newline first, then by comma if no newlines
        if '\n' in alt_labels_str:
            alt_labels_list = [label.strip() for label in alt_labels_str.split('\n') if label.strip()]
        else:
            alt_labels_list = [label.strip() for label in alt_labels_str.split(',') if label.strip()]
        
        first_alt_labels = alt_labels_list[:5]
        remaining_alt_labels = alt_labels_list[5:]
        
        if first_alt_labels:
            parts.append(' '.join(first_alt_labels))
    
    # 3. Add description
    if pd.notna(row['description']):
        parts.append(str(row['description']))
    
    # 4. Add skills_list truncated to 1500 characters
    if pd.notna(row['skills_list']):
        skills_str = str(row['skills_list'])
        if len(skills_str) > 1500:
            skills_str = skills_str[:1500]
        parts.append(skills_str)
    
    # 5. Add remaining altLabels (6+) if they exist
    if remaining_alt_labels:
        parts.append(' '.join(remaining_alt_labels))
    
    return ' '.join(parts)

# Create combined text column
flattened_df['combined_text'] = flattened_df.apply(combine_fields, axis=1)

print(f"✓ Created combined_text column with prioritized fields")
print(f"\nSample combined text (first 500 chars):")
print("=" * 80)
print(flattened_df['combined_text'].iloc[0][:500])
print("...")
print("=" * 80)
print(f"\nFinal dataset shape: {flattened_df.shape}")
print(f"Columns: {list(flattened_df.columns)}")

✓ Created combined_text column with prioritized fields

Sample combined text (first 500 chars):
technical director director of technical arts technical supervisor head of technical technical and operations director technical manager Technical directors realise the artistic visions of the creators within technical constraints. They coordinate the operations of various production units, such as scene, wardrobe, sound and lighting, and make-up. They adapt the prototype and study the feasibility, implementation, operation and technical monitoring of the artistic project. They are also responsi
...

Final dataset shape: (3037, 7)
Columns: ['occupationUri', 'conceptUri', 'preferredLabel', 'altLabels', 'description', 'skills_list', 'combined_text']


### 1.07 Inspect Final Dataset

In [9]:
# Display summary statistics
print("Dataset Summary:")
print("=" * 80)
print(f"Total occupations: {len(flattened_df):,}")
print(f"\nCombined text length statistics:")
print(flattened_df['combined_text'].str.len().describe())

print(f"\nSample rows:")
flattened_df.head()

Dataset Summary:
Total occupations: 3,037

Combined text length statistics:
count    3037.000000
mean     1111.472176
std       433.540204
min       252.000000
25%       798.000000
50%      1016.000000
75%      1333.000000
max      3119.000000
Name: combined_text, dtype: float64

Sample rows:


Unnamed: 0,occupationUri,conceptUri,preferredLabel,altLabels,description,skills_list,combined_text
0,http://data.europa.eu/esco/occupation/00030d09...,http://data.europa.eu/esco/occupation/00030d09...,technical director,director of technical arts\ntechnical supervis...,Technical directors realise the artistic visio...,"organise rehearsals, write risk assessment on ...",technical director director of technical arts ...
1,http://data.europa.eu/esco/occupation/000e93a3...,http://data.europa.eu/esco/occupation/000e93a3...,metal drawing machine operator,wire drawer\nforming machine operative\ndraw m...,Metal drawing machine operators set up and ope...,"monitor gauge, remove inadequate workpieces, t...",metal drawing machine operator wire drawer for...
2,http://data.europa.eu/esco/occupation/0019b951...,http://data.europa.eu/esco/occupation/0019b951...,precision device inspector,precision device quality control supervisor\np...,Precision device inspectors make sure precisio...,"monitor machine operations, read assembly draw...",precision device inspector precision device qu...
3,http://data.europa.eu/esco/occupation/0022f466...,http://data.europa.eu/esco/occupation/0022f466...,air traffic safety technician,air traffic safety electronics hardware specia...,Air traffic safety technicians provide technic...,"implement airside safety procedures, install e...",air traffic safety technician air traffic safe...
4,http://data.europa.eu/esco/occupation/002da35b...,http://data.europa.eu/esco/occupation/002da35b...,hospitality revenue manager,yield manager\nhospitality yields manager\nhos...,Hospitality revenue managers maximise revenue ...,"produce statistical financial records, comply ...",hospitality revenue manager yield manager hosp...


### 1.08 Analyze Combined Text Character Lengths

In [10]:
# Calculate character lengths
flattened_df['text_length'] = flattened_df['combined_text'].str.len()

# Summary statistics
print("Combined Text Character Length Summary:")
print("=" * 80)
print(f"\nTotal occupations: {len(flattened_df):,}")
print(f"\nCharacter count statistics:")
print(flattened_df['text_length'].describe())

print(f"\nAdditional metrics:")
print(f"  Minimum length: {flattened_df['text_length'].min():,} characters")
print(f"  Maximum length: {flattened_df['text_length'].max():,} characters")
print(f"  Total characters: {flattened_df['text_length'].sum():,} characters")
print(f"  Average length: {flattened_df['text_length'].mean():.1f} characters")
print(f"  Median length: {flattened_df['text_length'].median():.1f} characters")

# Distribution insights
print(f"\nLength distribution:")
print(f"  < 500 chars: {(flattened_df['text_length'] < 500).sum():,} occupations ({(flattened_df['text_length'] < 500).sum() / len(flattened_df) * 100:.1f}%)")
print(f"  500-1000 chars: {((flattened_df['text_length'] >= 500) & (flattened_df['text_length'] < 1000)).sum():,} occupations")
print(f"  1000-2000 chars: {((flattened_df['text_length'] >= 1000) & (flattened_df['text_length'] < 2000)).sum():,} occupations")
print(f"  2000+ chars: {(flattened_df['text_length'] >= 2000).sum():,} occupations ({(flattened_df['text_length'] >= 2000).sum() / len(flattened_df) * 100:.1f}%)")


Combined Text Character Length Summary:

Total occupations: 3,037

Character count statistics:
count    3037.000000
mean     1111.472176
std       433.540204
min       252.000000
25%       798.000000
50%      1016.000000
75%      1333.000000
max      3119.000000
Name: text_length, dtype: float64

Additional metrics:
  Minimum length: 252 characters
  Maximum length: 3,119 characters
  Total characters: 3,375,541 characters
  Average length: 1111.5 characters
  Median length: 1016.0 characters

Length distribution:
  < 500 chars: 70 occupations (2.3%)
  500-1000 chars: 1,393 occupations
  1000-2000 chars: 1,416 occupations
  2000+ chars: 158 occupations (5.2%)


### 1.09 Prepare Final Dataset for Export

In [11]:
# Extract ESCO UUID from conceptUri (the official ESCO identifier)
flattened_df['esco_id'] = flattened_df['conceptUri'].apply(lambda uri: uri.split('/')[-1])

# Select only the columns we need (including description for JSON output)
export_df = flattened_df[['esco_id', 'conceptUri', 'preferredLabel', 'description', 'combined_text']].copy()

print(f"✓ Created export dataset with {len(export_df):,} rows")
print(f"  ESCO ID is the UUID from conceptUri (official ESCO identifier)")
print(f"\nColumns: {list(export_df.columns)}")
print(f"\nSample export data:")
export_df.head()

✓ Created export dataset with 3,037 rows
  ESCO ID is the UUID from conceptUri (official ESCO identifier)

Columns: ['esco_id', 'conceptUri', 'preferredLabel', 'description', 'combined_text']

Sample export data:


Unnamed: 0,esco_id,conceptUri,preferredLabel,description,combined_text
0,00030d09-2b3a-4efd-87cc-c4ea39d27c34,http://data.europa.eu/esco/occupation/00030d09...,technical director,Technical directors realise the artistic visio...,technical director director of technical arts ...
1,000e93a3-d956-4e45-aacb-f12c83fedf84,http://data.europa.eu/esco/occupation/000e93a3...,metal drawing machine operator,Metal drawing machine operators set up and ope...,metal drawing machine operator wire drawer for...
2,0019b951-c699-4191-8208-9822882d150c,http://data.europa.eu/esco/occupation/0019b951...,precision device inspector,Precision device inspectors make sure precisio...,precision device inspector precision device qu...
3,0022f466-426c-41a4-ac96-a235c945cf97,http://data.europa.eu/esco/occupation/0022f466...,air traffic safety technician,Air traffic safety technicians provide technic...,air traffic safety technician air traffic safe...
4,002da35b-7808-43f3-83bf-63596b8b351f,http://data.europa.eu/esco/occupation/002da35b...,hospitality revenue manager,Hospitality revenue managers maximise revenue ...,hospitality revenue manager yield manager hosp...


### 1.10 Save to CSV

In [12]:
# Save to silver directory
output_dir = project_root / "data" / "silver"
output_file = output_dir / "esco_occupations_prepared.csv"

export_df.to_csv(output_file, index=False)

print(f"✓ Saved to: {output_file}")
print(f"  File size: {output_file.stat().st_size / 1024:.2f} KB")
print(f"  Rows: {len(export_df):,}")
print(f"  Columns: {list(export_df.columns)}")

✓ Saved to: /Users/lauren/repos/PAD2Skills/data/silver/esco_occupations_prepared.csv
  File size: 4602.02 KB
  Rows: 3,037
  Columns: ['esco_id', 'conceptUri', 'preferredLabel', 'description', 'combined_text']


In [13]:
# Explore rows with long lengths
long_rows = flattened_df[flattened_df['text_length'] > 2000]
long_rows.head()

Unnamed: 0,occupationUri,conceptUri,preferredLabel,altLabels,description,skills_list,combined_text,text_length,esco_id
8,http://data.europa.eu/esco/occupation/006cc1f9...,http://data.europa.eu/esco/occupation/006cc1f9...,physiotherapist,respiratory therapist\nmasseuse\nrehabilitatio...,Physiotherapists are autonomous health profess...,"contribute to quality physiotherapy services, ...",physiotherapist respiratory therapist masseuse...,2611,006cc1f9-2841-41c3-991a-dc3f2f3bd533
13,http://data.europa.eu/esco/occupation/009d29de...,http://data.europa.eu/esco/occupation/009d29de...,rental service representative in other machine...,rental sales desk supervisor in other machiner...,Rental service representatives in other machin...,"guarantee customer satisfaction, perform multi...",rental service representative in other machine...,2145,009d29de-5872-43be-8d9b-abd27f8c99f1
49,http://data.europa.eu/esco/occupation/034cad59...,http://data.europa.eu/esco/occupation/034cad59...,special educational needs teacher secondary sc...,secondary school special education teacher\nsp...,Special educational needs teachers at secondar...,"guarantee students' safety, assign homework, m...",special educational needs teacher secondary sc...,2161,034cad59-e666-4770-a7ef-a337257f8072
94,http://data.europa.eu/esco/occupation/068df7d1...,http://data.europa.eu/esco/occupation/068df7d1...,performance lighting designer,lighting designer and technician\nlighting tec...,Performance lighting designers develop a light...,analyse the artistic concept based on stage ac...,performance lighting designer lighting designe...,2316,068df7d1-516f-4829-b339-294cb0cf6318
115,http://data.europa.eu/esco/occupation/08984bec...,http://data.europa.eu/esco/occupation/08984bec...,specialised veterinarian,zoological medicine veterinarian\nemergency me...,Specialised veterinarians are professionals wi...,certify the performance of veterinary procedur...,specialised veterinarian zoological medicine v...,2710,08984bec-31d8-4bb0-aa2f-d557761ff029


## 2. Prepare PAD Occupation Data

### 2.01 Load PAD Occupation JSON Files

In [14]:
import json

# Path to PAD occupations JSON files
pad_occs_dir = project_root / "data" / "silver" / "occupations_skills_json"

# Find all P075941_*.json files
json_files = sorted(pad_occs_dir.glob("P075941_*_occupations.json"))

print(f"Found {len(json_files)} PAD occupation JSON files")
print(f"\nFirst 5 files:")
for f in json_files[:5]:
    print(f"  {f.name}")
if len(json_files) > 5:
    print(f"  ... and {len(json_files) - 5} more")

Found 16 PAD occupation JSON files

First 5 files:
  P075941_0_occupations.json
  P075941_10_occupations.json
  P075941_11_occupations.json
  P075941_12_occupations.json
  P075941_13_occupations.json
  ... and 11 more


### 2.02 Read and Concatenate JSON Files

In [15]:
# Read all JSON files and collect extractions
all_extractions = []

for json_file in json_files:
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    # Check if extractions exist and are not null
    if data.get('extractions') is not None:
        # Each extraction is a dictionary, add project_id and section_id to it
        for extraction in data['extractions']:
            extraction['project_id'] = data['project_id']
            extraction['section_id'] = data['section_id']
            all_extractions.append(extraction)

print(f"✓ Loaded {len(all_extractions):,} total occupation extractions")
print(f"  from {len(json_files)} JSON files")

# Convert to DataFrame
pad_occs_df = pd.DataFrame(all_extractions)

print(f"\n✓ Created DataFrame with {len(pad_occs_df):,} rows")
print(f"\nColumns: {list(pad_occs_df.columns)}")
print(f"\nFirst few rows:")
pad_occs_df.head()

✓ Loaded 190 total occupation extractions
  from 16 JSON files

✓ Created DataFrame with 190 rows

Columns: ['extraction_id', 'identified_occupation', 'activity_description_in_pad', 'skills_needed_for_activity', 'source_material_quote', 'project_id', 'section_id']

First few rows:


Unnamed: 0,extraction_id,identified_occupation,activity_description_in_pad,skills_needed_for_activity,source_material_quote,project_id,section_id
0,1,Transmission Line Construction Engineer,Build power system interconnections to form re...,"[build power system interconnections, form the...",The Project will also build important power sy...,P075941,0
1,2,Electrical Transmission Line Installer,Build transmission interconnections and form t...,"[build power system interconnections, form the...",The Project will also build important power sy...,P075941,0
2,3,Hydroelectric Power Plant Construction Engineer,Increase regional power generation capacity to...,"[increase regional generation, help meet regio...",It will play a critical role in increasing reg...,P075941,0
3,4,Water Resources Development Specialist,Develop and manage river resources in an integ...,[develop the resources of the Nyabarongo River...,This collaboration will ensure that the resour...,P075941,0
4,5,Environmental Management Specialist,Ensure integrated and environmentally sustaina...,[ensure development is integrated and environm...,This collaboration will ensure that the resour...,P075941,0


### 2.03 Inspect PAD Occupations Dataset

In [16]:
# Display summary information
print("PAD Occupations Dataset Summary:")
print("=" * 80)
print(f"Total extractions: {len(pad_occs_df):,}")
print(f"Unique occupations: {pad_occs_df['identified_occupation'].nunique():,}")
print(f"Unique project-section combinations: {pad_occs_df.groupby(['project_id', 'section_id']).ngroups:,}")

print(f"\nSample occupations:")
print(pad_occs_df['identified_occupation'].value_counts().head(10))

print(f"\nDataFrame info:")
pad_occs_df.info()

PAD Occupations Dataset Summary:
Total extractions: 190
Unique occupations: 160
Unique project-section combinations: 14

Sample occupations:
identified_occupation
Procurement Specialist                     6
Owner's Engineer                           4
Monitoring and Evaluation Specialist       4
Communications Specialist                  4
Financial Management Specialist            4
Construction Supervisor                    3
Resettlement Specialist                    3
Transmission Line Construction Engineer    2
Procurement Officer                        2
Geotechnical Engineer                      2
Name: count, dtype: int64

DataFrame info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 190 entries, 0 to 189
Data columns (total 7 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   extraction_id                190 non-null    int64 
 1   identified_occupation        190 non-null    object
 2   activit

### 2.04 Prepare PAD Occupations for Export

In [17]:
# Create three-digit ID with leading zeros
pad_occs_df['pad_id'] = [f"{i:03d}" for i in range(len(pad_occs_df))]

# Create combined text column
def combine_pad_fields(row):
    """Combine PAD occupation fields into single text column"""
    parts = []
    
    # Add identified_occupation
    if pd.notna(row['identified_occupation']):
        parts.append(str(row['identified_occupation']))
    
    # Add activity_description_in_pad
    if pd.notna(row['activity_description_in_pad']):
        parts.append(str(row['activity_description_in_pad']))
    
    # Add skills_needed_for_activity (remove list brackets and quotes)
    skills = row['skills_needed_for_activity']
    if skills is not None and skills is not pd.NA:
        # If it's a list, join with commas
        if isinstance(skills, list):
            skills_str = ', '.join(str(s) for s in skills)
        else:
            # If it's a string representation of a list, clean it up
            skills_str = str(skills).strip('[]').replace("'", "").replace('"', '')
        parts.append(skills_str)
    
    return ' '.join(parts)

# Create combined text column
pad_occs_df['combined_text'] = pad_occs_df.apply(combine_pad_fields, axis=1)

print(f"✓ Created pad_id and combined_text columns")
print(f"\nSample combined text (first 500 chars):")
print("=" * 80)
print(pad_occs_df['combined_text'].iloc[0][:500])
print("...")
print("=" * 80)
print(f"\nColumns: {list(pad_occs_df.columns)}")
print(f"\nSample rows with new columns:")
pad_occs_df[['pad_id', 'identified_occupation', 'combined_text']].head()

✓ Created pad_id and combined_text columns

Sample combined text (first 500 chars):
Transmission Line Construction Engineer Build power system interconnections to form regional transmission backbone build power system interconnections, form the transmission backbone linking Burundi, Rwanda, and Tanzania
...

Columns: ['extraction_id', 'identified_occupation', 'activity_description_in_pad', 'skills_needed_for_activity', 'source_material_quote', 'project_id', 'section_id', 'pad_id', 'combined_text']

Sample rows with new columns:


Unnamed: 0,pad_id,identified_occupation,combined_text
0,0,Transmission Line Construction Engineer,Transmission Line Construction Engineer Build ...
1,1,Electrical Transmission Line Installer,Electrical Transmission Line Installer Build t...
2,2,Hydroelectric Power Plant Construction Engineer,Hydroelectric Power Plant Construction Enginee...
3,3,Water Resources Development Specialist,Water Resources Development Specialist Develop...
4,4,Environmental Management Specialist,Environmental Management Specialist Ensure int...


### 2.05 Save PAD Occupations to CSV

In [18]:
# Create output directory
output_dir = project_root / "data" / "silver" / "occupation_skills_csv"
output_dir.mkdir(parents=True, exist_ok=True)

# Use project_id for filename
project_id = pad_occs_df['project_id'].unique()[0]
output_file = output_dir / f"{project_id}_pad_occupations_prepared.csv"

# Save to CSV
pad_occs_df.to_csv(output_file, index=False)

print(f"✓ Saved to: {output_file}")
print(f"  File size: {output_file.stat().st_size / 1024:.2f} KB")
print(f"  Rows: {len(pad_occs_df):,}")
print(f"  Columns: {list(pad_occs_df.columns)}")

✓ Saved to: /Users/lauren/repos/PAD2Skills/data/silver/occupation_skills_csv/P075941_pad_occupations_prepared.csv
  File size: 145.82 KB
  Rows: 190
  Columns: ['extraction_id', 'identified_occupation', 'activity_description_in_pad', 'skills_needed_for_activity', 'source_material_quote', 'project_id', 'section_id', 'pad_id', 'combined_text']


## 3. Occupation Retrieval

### 3.01 Load Prepared CSV Files

In [19]:
# Load ESCO occupations
esco_csv = project_root / "data" / "silver" / "esco_occupations_prepared.csv"
esco_df = pd.read_csv(esco_csv)

print(f"✓ Loaded ESCO occupations: {len(esco_df):,} rows")
print(f"  Columns: {list(esco_df.columns)}")

# Load PAD occupations (using project_id from previous section)
pad_csv = project_root / "data" / "silver" / "occupation_skills_csv" / f"{project_id}_pad_occupations_prepared.csv"
pad_df = pd.read_csv(pad_csv)

print(f"\n✓ Loaded PAD occupations: {len(pad_df):,} rows")
print(f"  Columns: {list(pad_df.columns)}")

print(f"\nReady for embedding-based matching")
print(f"  ESCO: {len(esco_df):,} occupations to match against")
print(f"  PAD: {len(pad_df):,} queries to match")

✓ Loaded ESCO occupations: 3,037 rows
  Columns: ['esco_id', 'conceptUri', 'preferredLabel', 'description', 'combined_text']

✓ Loaded PAD occupations: 190 rows
  Columns: ['extraction_id', 'identified_occupation', 'activity_description_in_pad', 'skills_needed_for_activity', 'source_material_quote', 'project_id', 'section_id', 'pad_id', 'combined_text']

Ready for embedding-based matching
  ESCO: 3,037 occupations to match against
  PAD: 190 queries to match


### 3.02 Load Sentence Transformer Model

In [20]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Load model - intfloat/e5-small-v2 is optimized for semantic search
model = SentenceTransformer("intfloat/e5-small-v2")

print(f"✓ Loaded model: intfloat/e5-small-v2")
print(f"  Max sequence length: {model.max_seq_length}")
print(f"  Embedding dimension: {model.get_sentence_embedding_dimension()}")

  from .autonotebook import tqdm as notebook_tqdm


✓ Loaded model: intfloat/e5-small-v2
  Max sequence length: 512
  Embedding dimension: 384


### 3.03 Encode ESCO Occupations (Passages)

In [21]:
# Set to True to force re-encoding even if cached embeddings exist
overwrite_embeddings = False

# Prepare ESCO texts with "passage: " prefix for e5 model
esco_texts = ["passage: " + text for text in esco_df['combined_text'].tolist()]

print(f"Encoding {len(esco_texts):,} ESCO occupations...")
print(f"Sample passage (first 200 chars):")
print(esco_texts[0][:200])
print("...")

# Check if cached embeddings exist
embeddings_dir = project_root / "data" / "silver" / "embeddings"
embeddings_dir.mkdir(parents=True, exist_ok=True)
esco_embeddings_file = embeddings_dir / "esco_embeddings.npy"

if esco_embeddings_file.exists() and not overwrite_embeddings:
    print(f"\n✓ Loading cached ESCO embeddings from: {esco_embeddings_file}")
    E = np.load(esco_embeddings_file)
    print(f"  Loaded embeddings shape: {E.shape}")
else:
    if overwrite_embeddings and esco_embeddings_file.exists():
        print(f"\n⚠ Overwriting existing embeddings (overwrite_embeddings=True)")
    else:
        print(f"\nNo cached embeddings found. Encoding...")
    
    # Encode ESCO texts (normalized for cosine similarity via dot product)
    E = model.encode(esco_texts, normalize_embeddings=True, batch_size=64, show_progress_bar=True)
    
    # Save embeddings for future use
    np.save(esco_embeddings_file, E)
    print(f"\n✓ Saved embeddings to: {esco_embeddings_file}")

print(f"\n✓ ESCO embeddings ready")
print(f"  Embeddings shape: {E.shape}")
print(f"  Memory size: {E.nbytes / 1024 / 1024:.2f} MB")

Encoding 3,037 ESCO occupations...
Sample passage (first 200 chars):
passage: technical director director of technical arts technical supervisor head of technical technical and operations director technical manager Technical directors realise the artistic visions of th
...

✓ Loading cached ESCO embeddings from: /Users/lauren/repos/PAD2Skills/data/silver/embeddings/esco_embeddings.npy
  Loaded embeddings shape: (3037, 384)

✓ ESCO embeddings ready
  Embeddings shape: (3037, 384)
  Memory size: 4.45 MB


### 3.04 Encode PAD Occupations (Queries)

In [22]:
# Prepare PAD texts with "query: " prefix for e5 model
queries = ["query: " + text for text in pad_df['combined_text'].tolist()]

print(f"Encoding {len(queries):,} PAD occupation queries...")
print(f"Sample query (first 200 chars):")
print(queries[0][:200])
print("...")

# Encode PAD queries (normalized for cosine similarity via dot product)
Q = model.encode(queries, normalize_embeddings=True, batch_size=64, show_progress_bar=True)

print(f"\n✓ Encoded PAD occupations")
print(f"  Embeddings shape: {Q.shape}")
print(f"  Memory size: {Q.nbytes / 1024 / 1024:.2f} MB")

Encoding 190 PAD occupation queries...
Sample query (first 200 chars):
query: Transmission Line Construction Engineer Build power system interconnections to form regional transmission backbone build power system interconnections, form the transmission backbone linking Bu
...


Batches: 100%|██████████| 3/3 [00:05<00:00,  1.71s/it]


✓ Encoded PAD occupations
  Embeddings shape: (190, 384)
  Memory size: 0.28 MB





### 3.05 Compute Similarities and Get Top 20 Matches

In [23]:
# Compute cosine similarity scores (normalized embeddings -> dot product)
print("Computing similarity scores...")
scores = Q @ E.T

print(f"✓ Computed similarity matrix")
print(f"  Shape: {scores.shape} (PAD queries × ESCO passages)")
print(f"  Min score: {scores.min():.4f}")
print(f"  Max score: {scores.max():.4f}")
print(f"  Mean score: {scores.mean():.4f}")

# Get top 20 matches for each PAD occupation
print(f"\nFinding top 20 matches for each PAD occupation...")
topk_indices = np.argsort(-scores, axis=1)[:, :20]  # Top 20 indices per query
topk_scores = np.take_along_axis(scores, topk_indices, axis=1)  # Corresponding scores

print(f"✓ Found top 20 matches for all {len(pad_df):,} PAD occupations")
print(f"\nSample top 20 for first PAD occupation:")
print(f"  Indices: {topk_indices[0]}")
print(f"  Scores: {topk_scores[0]}")

Computing similarity scores...
✓ Computed similarity matrix
  Shape: (190, 3037) (PAD queries × ESCO passages)
  Min score: 0.6609
  Max score: 0.9085
  Mean score: 0.7556

Finding top 20 matches for each PAD occupation...
✓ Found top 20 matches for all 190 PAD occupations

Sample top 20 for first PAD occupation:
  Indices: [1547 1101   44  593 1387 2452 2871 2706 2104  384  156 2526 1669 1578
 3011 2919 2488 1749 1534 2710]
  Scores: [0.82189405 0.8163478  0.81102157 0.80630004 0.80594254 0.8008526
 0.8005297  0.7991286  0.7977431  0.79741347 0.7970016  0.79456615
 0.7940226  0.79381347 0.79322124 0.79297626 0.79255885 0.79238546
 0.7922157  0.79032075]


### 3.06 Create Results DataFrame with Top 20 Matches

In [24]:
# Start with PAD occupation data
results_df = pad_df.copy()

# Add top 20 ESCO matches as columns
for rank in range(20):
    # Get ESCO IDs (UUIDs) for this rank
    esco_ids = [esco_df.iloc[idx]['esco_id'] for idx in topk_indices[:, rank]]
    results_df[f'match_{rank+1}_esco_id'] = esco_ids
    
    # Get ESCO URIs for this rank
    esco_uris = [esco_df.iloc[idx]['conceptUri'] for idx in topk_indices[:, rank]]
    results_df[f'match_{rank+1}_uri'] = esco_uris
    
    # Get ESCO occupation labels for this rank
    esco_labels = [esco_df.iloc[idx]['preferredLabel'] for idx in topk_indices[:, rank]]
    results_df[f'match_{rank+1}_occupation'] = esco_labels
    
    # Get ESCO descriptions for this rank (description field, not combined_text)
    esco_descriptions = [esco_df.iloc[idx]['description'] for idx in topk_indices[:, rank]]
    results_df[f'match_{rank+1}_description'] = esco_descriptions
    
    # Get similarity scores for this rank
    results_df[f'match_{rank+1}_score'] = topk_scores[:, rank]

print(f"✓ Created results DataFrame with top 20 matches")
print(f"  Shape: {results_df.shape}")
print(f"  Original PAD columns: {len(pad_df.columns)}")
print(f"  New match columns: {results_df.shape[1] - len(pad_df.columns)} (100 columns: esco_id, URI, label, description, score × 20 ranks)")
print(f"  Note: esco_id is the official ESCO UUID extracted from conceptUri in Section 1")
print(f"\nNew columns added:")
print([col for col in results_df.columns if col.startswith('match_')][:10])
print("...")

print(f"\nSample result (first PAD occupation with top 3 matches):")
sample = results_df.iloc[0]
print(f"PAD: {sample['identified_occupation']}")
for i in range(3):
    print(f"  Match {i+1}: {sample[f'match_{i+1}_occupation']} (esco_id: {sample[f'match_{i+1}_esco_id']}, score: {sample[f'match_{i+1}_score']:.4f})")

✓ Created results DataFrame with top 20 matches
  Shape: (190, 109)
  Original PAD columns: 9
  New match columns: 100 (100 columns: esco_id, URI, label, description, score × 20 ranks)
  Note: esco_id is the official ESCO UUID extracted from conceptUri in Section 1

New columns added:
['match_1_esco_id', 'match_1_uri', 'match_1_occupation', 'match_1_description', 'match_1_score', 'match_2_esco_id', 'match_2_uri', 'match_2_occupation', 'match_2_description', 'match_2_score']
...

Sample result (first PAD occupation with top 3 matches):
PAD: Transmission Line Construction Engineer
  Match 1: electrical transmission system operator (esco_id: 7cdeb653-8f3d-4921-832b-b95f9d700a86, score: 0.8219)
  Match 2: electric power generation engineer (esco_id: 58db3ac6-5217-4d46-8a4c-126598be1d13, score: 0.8163)
  Match 3: telecommunications engineer (esco_id: 02eb0ae6-ecdd-4602-9c8e-60ffe6dbe1e2, score: 0.8110)


  results_df[f'match_{rank+1}_description'] = esco_descriptions
  results_df[f'match_{rank+1}_score'] = topk_scores[:, rank]


### 3.07 Save Results to CSV

In [None]:
# Save to silver/esco_matching_csv directory
output_dir = project_root / "data" / "silver" / "esco_matching_csv"
output_dir.mkdir(parents=True, exist_ok=True)

# Use project_id for filename
project_id = results_df['project_id'].unique()[0]
output_file = output_dir / f"{project_id}_esco_matches.csv"
results_df.to_csv(output_file, index=False)

print(f"✓ Saved results to: {output_file}")
print(f"  File size: {output_file.stat().st_size / 1024:.2f} KB")
print(f"  Rows: {len(results_df):,}")
print(f"  Columns: {len(results_df.columns)}")
print(f"\nResults include:")
print(f"  - All original PAD occupation fields")
print(f"  - Top 20 ESCO matches with IDs, labels, and similarity scores")


✓ Saved results to: /Users/lauren/repos/PAD2Skills/data/silver/esco_matching_csv/P075941_esco_matches.csv
  File size: 1903.30 KB
  Rows: 190
  Columns: 109

Results include:
  - All original PAD occupation fields
  - Top 20 ESCO matches with IDs, labels, and similarity scores


In [30]:
# Create and save diagnostics file
diagnostics_dir = output_dir / "diagnostics"
diagnostics_dir.mkdir(parents=True, exist_ok=True)

# Select only diagnostic columns
diag_cols = ['identified_occupation', 'source_material_quote'] + [f'match_{i}_occupation' for i in range(1, 21)]
diagnostics_df = results_df[diag_cols].copy()

diagnostics_file = diagnostics_dir / f"{project_id}_esco_matches_diagnostics.csv"
diagnostics_df.to_csv(diagnostics_file, index=False)

print(f"\n✓ Saved diagnostics to: {diagnostics_file}")
print(f"  File size: {diagnostics_file.stat().st_size / 1024:.2f} KB")
print(f"  Rows: {len(diagnostics_df):,}")
print(f"  Columns: {len(diagnostics_df.columns)} (identified_occupation, source_material_quote, match_1-20_occupation)")


✓ Saved diagnostics to: /Users/lauren/repos/PAD2Skills/data/silver/esco_matching_csv/diagnostics/P075941_esco_matches_diagnostics.csv
  File size: 156.62 KB
  Rows: 190
  Columns: 22 (identified_occupation, source_material_quote, match_1-20_occupation)


In [26]:
# Display combined_text, matches, and scores .head()
results_df.loc[:, 'combined_text':].head()

Unnamed: 0,combined_text,match_1_esco_id,match_1_uri,match_1_occupation,match_1_description,match_1_score,match_2_esco_id,match_2_uri,match_2_occupation,match_2_description,...,match_19_esco_id,match_19_uri,match_19_occupation,match_19_description,match_19_score,match_20_esco_id,match_20_uri,match_20_occupation,match_20_description,match_20_score
0,Transmission Line Construction Engineer Build ...,7cdeb653-8f3d-4921-832b-b95f9d700a86,http://data.europa.eu/esco/occupation/7cdeb653...,electrical transmission system operator,Electrical transmission system operators trans...,0.821894,58db3ac6-5217-4d46-8a4c-126598be1d13,http://data.europa.eu/esco/occupation/58db3ac6...,electric power generation engineer,Electric power generation engineers design and...,...,7bc02fa7-7aaa-41a6-be5b-ab7a7b6684d3,http://data.europa.eu/esco/occupation/7bc02fa7...,gas distribution engineer,Gas distribution engineers design and construc...,0.792216,e12f08fb-4748-4388-9489-b647df60332a,http://data.europa.eu/esco/occupation/e12f08fb...,hydropower engineer,"Hydropower engineers research, design and plan...",0.790321
1,Electrical Transmission Line Installer Build t...,7cdeb653-8f3d-4921-832b-b95f9d700a86,http://data.europa.eu/esco/occupation/7cdeb653...,electrical transmission system operator,Electrical transmission system operators trans...,0.823216,f5ba8fb4-1387-459a-9b22-59feb891e229,http://data.europa.eu/esco/occupation/f5ba8fb4...,electrical cable assembler,Electrical cable assembler manipulate cables a...,...,d694b54f-208a-4154-9fe1-d3d335992467,http://data.europa.eu/esco/occupation/d694b54f...,import export specialist in electronic and tel...,Import export specialists in electronic and te...,0.790698,5dbb9cf0-b226-402c-a295-2f42ef05ff8b,http://data.europa.eu/esco/occupation/5dbb9cf0...,domestic electrician,Domestic electricians install and maintain ele...,0.790106
2,Hydroelectric Power Plant Construction Enginee...,58db3ac6-5217-4d46-8a4c-126598be1d13,http://data.europa.eu/esco/occupation/58db3ac6...,electric power generation engineer,Electric power generation engineers design and...,0.857467,0e99c929-364f-4b0a-8a64-2aab42420f00,http://data.europa.eu/esco/occupation/0e99c929...,renewable energy engineer,Renewable energy engineers research alternativ...,...,ba490796-46f7-45b7-827b-db6882fca3e1,http://data.europa.eu/esco/occupation/ba490796...,onshore wind energy engineer,"Onshore wind energy engineers design, install ...",0.812998,a377fafb-7e6e-4925-bc64-8099b58aafb8,http://data.europa.eu/esco/occupation/a377fafb...,drainage engineer,Drainage engineers design and construct draina...,0.812186
3,Water Resources Development Specialist Develop...,fcf9024e-277c-43c7-8ec8-8903e5c9c576,http://data.europa.eu/esco/occupation/fcf9024e...,water engineer,Water engineers research and develop methods f...,0.826864,fab474ea-a4a9-4c11-9548-1d50ad594a34,http://data.europa.eu/esco/occupation/fab474ea...,natural resources consultant,Natural resources consultant provide advice on...,...,0e99c929-364f-4b0a-8a64-2aab42420f00,http://data.europa.eu/esco/occupation/0e99c929...,renewable energy engineer,Renewable energy engineers research alternativ...,0.79618,e12f08fb-4748-4388-9489-b647df60332a,http://data.europa.eu/esco/occupation/e12f08fb...,hydropower engineer,"Hydropower engineers research, design and plan...",0.795815
4,Environmental Management Specialist Ensure int...,4b214899-ed00-4156-b269-a23df0fb1093,http://data.europa.eu/esco/occupation/4b214899...,environmental protection manager,Environmental protection managers provide advi...,0.859926,32548f7e-8c25-4c49-9f22-78dee3543704,http://data.europa.eu/esco/occupation/32548f7e...,environmental mining engineer,Environmental mining engineers oversee the env...,...,bff716c9-42c7-471e-8af0-582f799fb29a,http://data.europa.eu/esco/occupation/bff716c9...,aquaculture environmental analyst,"Aquaculture environmental analysts assess, pla...",0.824342,a005f8a2-2d8a-433e-8d89-13575f822fd4,http://data.europa.eu/esco/occupation/a005f8a2...,energy manager,Energy managers coordinate the energy use in a...,0.824173


## 4. Prepare JSON Output

### 4.01 Load Matching Results CSV

In [31]:
# Load the matching results CSV
matching_csv = project_root / "data" / "silver" / "esco_matching_csv" / f"{project_id}_esco_matches.csv"
matches_df = pd.read_csv(matching_csv)

print(f"✓ Loaded matching results: {len(matches_df):,} rows")
print(f"  Columns: {len(matches_df.columns)}")
print(f"\nSample row:")
matches_df.head(1)

✓ Loaded matching results: 190 rows
  Columns: 109

Sample row:


Unnamed: 0,extraction_id,identified_occupation,activity_description_in_pad,skills_needed_for_activity,source_material_quote,project_id,section_id,pad_id,combined_text,match_1_esco_id,...,match_19_esco_id,match_19_uri,match_19_occupation,match_19_description,match_19_score,match_20_esco_id,match_20_uri,match_20_occupation,match_20_description,match_20_score
0,1,Transmission Line Construction Engineer,Build power system interconnections to form re...,"['build power system interconnections', 'form ...",The Project will also build important power sy...,P075941,0,0,Transmission Line Construction Engineer Build ...,7cdeb653-8f3d-4921-832b-b95f9d700a86,...,7bc02fa7-7aaa-41a6-be5b-ab7a7b6684d3,http://data.europa.eu/esco/occupation/7bc02fa7...,gas distribution engineer,Gas distribution engineers design and construc...,0.792216,e12f08fb-4748-4388-9489-b647df60332a,http://data.europa.eu/esco/occupation/e12f08fb...,hydropower engineer,"Hydropower engineers research, design and plan...",0.790321


### 4.02 Transform to JSON Format

In [None]:
# Transform each row into the specified JSON format
records = []

for idx, row in matches_df.iterrows():
    # Build esco_candidates array with top 10 matches
    esco_candidates = []
    for rank in range(1, 11):  # Top 10 matches
        # Use esco_id directly from Section 1 (no need to extract from URI)
        candidate = {
            "rank": rank,
            "esco_id": row[f'match_{rank}_esco_id'],  # UUID from Section 1
            "label": row[f'match_{rank}_occupation'],
            "description": row[f'match_{rank}_description'],
            "similarity_score": round(float(row[f'match_{rank}_score']), 2)
        }
        esco_candidates.append(candidate)
    
    # Build the record
    record = {
        "record_id": row['pad_id'],
        "pad_occupation": row['identified_occupation'],
        "pad_activity": row['activity_description_in_pad'] if pd.notna(row['activity_description_in_pad']) else "",
        "pad_quote": row['source_material_quote'] if pd.notna(row['source_material_quote']) else "",
        "esco_candidates": esco_candidates
    }
    records.append(record)

print(f"✓ Transformed {len(records):,} records to JSON format")
print(f"\nSample record (first with top 3 candidates):")
print("=" * 80)
sample = records[0].copy()
sample['esco_candidates'] = sample['esco_candidates'][:3]  # Show only first 3
print(json.dumps(sample, indent=2))
print("...")

✓ Transformed 190 records to JSON format

Sample record (first with top 3 candidates):
{
  "record_id": 0,
  "pad_occupation": "Transmission Line Construction Engineer",
  "pad_activity": "Build power system interconnections to form regional transmission backbone",
  "pad_quote": "The Project will also build important power system interconnections, which will help form the transmission backbone linking Burundi, Rwanda, and Tanzania.",
  "esco_candidates": [
    {
      "rank": 1,
      "esco_id": "7cdeb653-8f3d-4921-832b-b95f9d700a86",
      "label": "electrical transmission system operator",
      "description": "Electrical transmission system operators transport energy in the form of electrical power. They transmit electrical power from generation plants over an interconnected network, an electrical grid, to electricity distribution stations.",
      "similarity_score": 0.82189405
    },
    {
      "rank": 2,
      "esco_id": "58db3ac6-5217-4d46-8a4c-126598be1d13",
      "label": "e

### 4.03 Save to JSON File

In [35]:
# Save to JSON files, chunked into files of 75 records each
json_output_dir = project_root / "data" / "silver" / "esco_matching_json"
json_output_dir.mkdir(parents=True, exist_ok=True)

# Split records into chunks of 75
chunk_size = 75
num_chunks = (len(records) + chunk_size - 1) // chunk_size  # Ceiling division

print(f"Splitting {len(records):,} records into {num_chunks} chunk(s) of up to {chunk_size} records each")
print()

saved_files = []
for i in range(num_chunks):
    start_idx = i * chunk_size
    end_idx = min((i + 1) * chunk_size, len(records))
    chunk_records = records[start_idx:end_idx]
    
    # Format filename with leading zeros: projectid_start-end_esco_matches.json
    chunk_filename = f"{project_id}_{start_idx:03d}-{end_idx-1:03d}_esco_matches.json"
    chunk_file = json_output_dir / chunk_filename
    
    with open(chunk_file, 'w', encoding='utf-8') as f:
        json.dump(chunk_records, f, indent=2, ensure_ascii=False)
    
    saved_files.append(chunk_filename)
    print(f"✓ Saved chunk {i+1}/{num_chunks}: {chunk_filename}")
    print(f"  Records: {len(chunk_records)} (indices {start_idx}-{end_idx-1})")
    print(f"  File size: {chunk_file.stat().st_size / 1024:.2f} KB")
    print()

print("=" * 80)
print(f"✓ Saved {num_chunks} JSON file(s) to: {json_output_dir}")
print(f"  Total records: {len(records):,}")
print(f"  Files created: {', '.join(saved_files)}")
print(f"\nEach record contains:")
print(f"  - PAD occupation details (record_id, occupation, activity, quote)")
print(f"  - Top 10 ESCO candidates with ID, label, description, and similarity score")

Splitting 190 records into 3 chunk(s) of up to 75 records each

✓ Saved chunk 1/3: P075941_000-074_esco_matches.json
  Records: 75 (indices 0-74)
  File size: 431.41 KB

✓ Saved chunk 2/3: P075941_075-149_esco_matches.json
  Records: 75 (indices 75-149)
  File size: 427.32 KB

✓ Saved chunk 3/3: P075941_150-189_esco_matches.json
  Records: 40 (indices 150-189)
  File size: 242.72 KB

✓ Saved 3 JSON file(s) to: /Users/lauren/repos/PAD2Skills/data/silver/esco_matching_json
  Total records: 190
  Files created: P075941_000-074_esco_matches.json, P075941_075-149_esco_matches.json, P075941_150-189_esco_matches.json

Each record contains:
  - PAD occupation details (record_id, occupation, activity, quote)
  - Top 10 ESCO candidates with ID, label, description, and similarity score
