# Match PAD Occupations to ESCO

Use embedding model to match extracted PAD occupations to ESCO occupation taxonomy.

## 0. Setup

### 0.01 Import Required Libraries

In [22]:
import pandas as pd
from pathlib import Path

# Import our config
import sys
sys.path.append(str(Path.cwd().parent))
from src.config import load_config

### 0.02 Set Up Paths

In [23]:
project_root = Path.cwd().parent
config = load_config()

# ESCO data paths
esco_dir = project_root / "data" / "bronze" / "esco"
occupations_file = esco_dir / "occupations_en.csv"
skills_relations_file = esco_dir / "occupationSkillRelations_en.csv"

print(f"ESCO directory: {esco_dir}")
print(f"Occupations file exists: {occupations_file.exists()}")
print(f"Skills relations file exists: {skills_relations_file.exists()}")

ESCO directory: /Users/lauren/repos/PAD2Skills/data/bronze/esco
Occupations file exists: True
Skills relations file exists: True


## 1. Prepare ESCO Data

### 1.01 Read ESCO Occupations Data

In [24]:
# Read ESCO occupations
occ_df = pd.read_csv(occupations_file)

print(f"✓ Loaded {len(occ_df):,} ESCO occupations")
print(f"\nColumns: {list(occ_df.columns)}")
print(f"\nFirst few rows:")
occ_df.head()

✓ Loaded 3,043 ESCO occupations

Columns: ['conceptType', 'conceptUri', 'iscoGroup', 'preferredLabel', 'altLabels', 'hiddenLabels', 'status', 'modifiedDate', 'regulatedProfessionNote', 'scopeNote', 'definition', 'inScheme', 'description', 'code', 'naceCode']

First few rows:


Unnamed: 0,conceptType,conceptUri,iscoGroup,preferredLabel,altLabels,hiddenLabels,status,modifiedDate,regulatedProfessionNote,scopeNote,definition,inScheme,description,code,naceCode
0,Occupation,http://data.europa.eu/esco/occupation/00030d09...,2654,technical director,director of technical arts\ntechnical supervis...,,released,2024-01-25T11:28:50.295Z,http://data.europa.eu/esco/regulated-professio...,,,http://data.europa.eu/esco/concept-scheme/memb...,Technical directors realise the artistic visio...,2654.1.7,http://data.europa.eu/ux2/nace2.1/9031
1,Occupation,http://data.europa.eu/esco/occupation/000e93a3...,8121,metal drawing machine operator,wire drawer\nforming machine operative\ndraw m...,,released,2024-01-23T10:09:32.099Z,http://data.europa.eu/esco/regulated-professio...,,,http://data.europa.eu/esco/concept-scheme/memb...,Metal drawing machine operators set up and ope...,8121.4,http://data.europa.eu/ux2/nace2.1/242
2,Occupation,http://data.europa.eu/esco/occupation/0019b951...,7543,precision device inspector,precision device quality control supervisor\np...,,released,2024-01-25T15:00:12.188Z,http://data.europa.eu/esco/regulated-professio...,,,http://data.europa.eu/esco/concept-scheme/memb...,Precision device inspectors make sure precisio...,7543.10.3,http://data.europa.eu/ux2/nace2.1/2651
3,Occupation,http://data.europa.eu/esco/occupation/0022f466...,3155,air traffic safety technician,air traffic safety electronics hardware specia...,,released,2024-01-29T16:01:13.998Z,http://data.europa.eu/esco/regulated-professio...,,,http://data.europa.eu/esco/concept-scheme/memb...,Air traffic safety technicians provide technic...,3155.1,http://data.europa.eu/ux2/nace2.1/5223
4,Occupation,http://data.europa.eu/esco/occupation/002da35b...,2431,hospitality revenue manager,yield manager\nhospitality yields manager\nhos...,,released,2024-01-11T10:28:45.871Z,http://data.europa.eu/esco/regulated-professio...,,,http://data.europa.eu/esco/concept-scheme/memb...,Hospitality revenue managers maximise revenue ...,2431.9,"http://data.europa.eu/ux2/nace2.1/701,\nhttp:/..."


### 1.02 Read ESCO Skills Relations Data

In [25]:
# Read ESCO skills relations
skills_df = pd.read_csv(skills_relations_file)

print(f"✓ Loaded {len(skills_df):,} skill relations")
print(f"\nColumns: {list(skills_df.columns)}")
print(f"\nFirst few rows:")
skills_df.head()

✓ Loaded 126,051 skill relations

Columns: ['occupationUri', 'occupationLabel', 'relationType', 'skillType', 'skillUri', 'skillLabel']

First few rows:


Unnamed: 0,occupationUri,occupationLabel,relationType,skillType,skillUri,skillLabel
0,http://data.europa.eu/esco/occupation/00030d09...,technical director,essential,knowledge,http://data.europa.eu/esco/skill/fed5b267-73fa...,theatre techniques
1,http://data.europa.eu/esco/occupation/00030d09...,technical director,essential,skill/competence,http://data.europa.eu/esco/skill/05bc7677-5a64...,organise rehearsals
2,http://data.europa.eu/esco/occupation/00030d09...,technical director,essential,skill/competence,http://data.europa.eu/esco/skill/271a36a0-bc7a...,write risk assessment on performing arts produ...
3,http://data.europa.eu/esco/occupation/00030d09...,technical director,essential,skill/competence,http://data.europa.eu/esco/skill/47ed1d37-971b...,coordinate with creative departments
4,http://data.europa.eu/esco/occupation/00030d09...,technical director,essential,skill/competence,http://data.europa.eu/esco/skill/591dd514-735b...,adapt to artists' creative demands


### 1.03 Filter Skills Relations

In [26]:
# Filter for essential skills/competences only
skills_filtered = skills_df[
    (skills_df['relationType'] == 'essential') & 
    (skills_df['skillType'] == 'skill/competence')
].copy()

print(f"✓ Filtered to {len(skills_filtered):,} essential skill/competence relations")
print(f"  (from {len(skills_df):,} total relations)")
print(f"\nUnique occupations: {skills_filtered['occupationUri'].nunique():,}")
print(f"Unique skills: {skills_filtered['skillUri'].nunique():,}")
print(f"\nSample filtered relations:")
skills_filtered.head()

✓ Filtered to 51,155 essential skill/competence relations
  (from 126,051 total relations)

Unique occupations: 3,037
Unique skills: 8,752

Sample filtered relations:


Unnamed: 0,occupationUri,occupationLabel,relationType,skillType,skillUri,skillLabel
1,http://data.europa.eu/esco/occupation/00030d09...,technical director,essential,skill/competence,http://data.europa.eu/esco/skill/05bc7677-5a64...,organise rehearsals
2,http://data.europa.eu/esco/occupation/00030d09...,technical director,essential,skill/competence,http://data.europa.eu/esco/skill/271a36a0-bc7a...,write risk assessment on performing arts produ...
3,http://data.europa.eu/esco/occupation/00030d09...,technical director,essential,skill/competence,http://data.europa.eu/esco/skill/47ed1d37-971b...,coordinate with creative departments
4,http://data.europa.eu/esco/occupation/00030d09...,technical director,essential,skill/competence,http://data.europa.eu/esco/skill/591dd514-735b...,adapt to artists' creative demands
5,http://data.europa.eu/esco/occupation/00030d09...,technical director,essential,skill/competence,http://data.europa.eu/esco/skill/860be36a-d19b...,negotiate health and safety issues with third ...


### 1.04 Merge Skills onto Occupations

In [27]:
# Merge skills onto occupations
# occupationUri in skills -> conceptUri in occupations
merged_df = occ_df.merge(
    skills_filtered,
    right_on='occupationUri',
    left_on='conceptUri',
    how='left'
)

print(f"✓ Merged skills onto occupations: {len(merged_df):,} rows")
print(f"\nColumns after merge: {list(merged_df.columns)}")
print(f"\nSample merged data:")
merged_df.head()

✓ Merged skills onto occupations: 51,209 rows

Columns after merge: ['conceptType', 'conceptUri', 'iscoGroup', 'preferredLabel', 'altLabels', 'hiddenLabels', 'status', 'modifiedDate', 'regulatedProfessionNote', 'scopeNote', 'definition', 'inScheme', 'description', 'code', 'naceCode', 'occupationUri', 'occupationLabel', 'relationType', 'skillType', 'skillUri', 'skillLabel']

Sample merged data:


Unnamed: 0,conceptType,conceptUri,iscoGroup,preferredLabel,altLabels,hiddenLabels,status,modifiedDate,regulatedProfessionNote,scopeNote,...,inScheme,description,code,naceCode,occupationUri,occupationLabel,relationType,skillType,skillUri,skillLabel
0,Occupation,http://data.europa.eu/esco/occupation/00030d09...,2654,technical director,director of technical arts\ntechnical supervis...,,released,2024-01-25T11:28:50.295Z,http://data.europa.eu/esco/regulated-professio...,,...,http://data.europa.eu/esco/concept-scheme/memb...,Technical directors realise the artistic visio...,2654.1.7,http://data.europa.eu/ux2/nace2.1/9031,http://data.europa.eu/esco/occupation/00030d09...,technical director,essential,skill/competence,http://data.europa.eu/esco/skill/05bc7677-5a64...,organise rehearsals
1,Occupation,http://data.europa.eu/esco/occupation/00030d09...,2654,technical director,director of technical arts\ntechnical supervis...,,released,2024-01-25T11:28:50.295Z,http://data.europa.eu/esco/regulated-professio...,,...,http://data.europa.eu/esco/concept-scheme/memb...,Technical directors realise the artistic visio...,2654.1.7,http://data.europa.eu/ux2/nace2.1/9031,http://data.europa.eu/esco/occupation/00030d09...,technical director,essential,skill/competence,http://data.europa.eu/esco/skill/271a36a0-bc7a...,write risk assessment on performing arts produ...
2,Occupation,http://data.europa.eu/esco/occupation/00030d09...,2654,technical director,director of technical arts\ntechnical supervis...,,released,2024-01-25T11:28:50.295Z,http://data.europa.eu/esco/regulated-professio...,,...,http://data.europa.eu/esco/concept-scheme/memb...,Technical directors realise the artistic visio...,2654.1.7,http://data.europa.eu/ux2/nace2.1/9031,http://data.europa.eu/esco/occupation/00030d09...,technical director,essential,skill/competence,http://data.europa.eu/esco/skill/47ed1d37-971b...,coordinate with creative departments
3,Occupation,http://data.europa.eu/esco/occupation/00030d09...,2654,technical director,director of technical arts\ntechnical supervis...,,released,2024-01-25T11:28:50.295Z,http://data.europa.eu/esco/regulated-professio...,,...,http://data.europa.eu/esco/concept-scheme/memb...,Technical directors realise the artistic visio...,2654.1.7,http://data.europa.eu/ux2/nace2.1/9031,http://data.europa.eu/esco/occupation/00030d09...,technical director,essential,skill/competence,http://data.europa.eu/esco/skill/591dd514-735b...,adapt to artists' creative demands
4,Occupation,http://data.europa.eu/esco/occupation/00030d09...,2654,technical director,director of technical arts\ntechnical supervis...,,released,2024-01-25T11:28:50.295Z,http://data.europa.eu/esco/regulated-professio...,,...,http://data.europa.eu/esco/concept-scheme/memb...,Technical directors realise the artistic visio...,2654.1.7,http://data.europa.eu/ux2/nace2.1/9031,http://data.europa.eu/esco/occupation/00030d09...,technical director,essential,skill/competence,http://data.europa.eu/esco/skill/860be36a-d19b...,negotiate health and safety issues with third ...


### 1.05 Flatten Skills by Occupation

In [28]:
# Group by occupation and combine skills into comma-separated list
flattened_df = merged_df.groupby('occupationUri').agg({
    'conceptUri': 'first',  # Occupation URI from occupations table
    'preferredLabel': 'first',
    'altLabels': 'first',
    'description': 'first',
    'skillLabel': lambda x: ', '.join(x.dropna().astype(str))
}).reset_index()

# Rename skillLabel column to be clearer
flattened_df = flattened_df.rename(columns={'skillLabel': 'skills_list'})

print(f"✓ Flattened to {len(flattened_df):,} unique occupations")
print(f"\nColumns: {list(flattened_df.columns)}")
print(f"\nNote: occupationUri is kept as a column")
print(f"\nSample flattened data:")
flattened_df[['occupationUri', 'preferredLabel', 'skills_list']].head()

✓ Flattened to 3,037 unique occupations

Columns: ['occupationUri', 'conceptUri', 'preferredLabel', 'altLabels', 'description', 'skills_list']

Note: occupationUri is kept as a column

Sample flattened data:


Unnamed: 0,occupationUri,preferredLabel,skills_list
0,http://data.europa.eu/esco/occupation/00030d09...,technical director,"organise rehearsals, write risk assessment on ..."
1,http://data.europa.eu/esco/occupation/000e93a3...,metal drawing machine operator,"monitor gauge, remove inadequate workpieces, t..."
2,http://data.europa.eu/esco/occupation/0019b951...,precision device inspector,"monitor machine operations, read assembly draw..."
3,http://data.europa.eu/esco/occupation/0022f466...,air traffic safety technician,"implement airside safety procedures, install e..."
4,http://data.europa.eu/esco/occupation/002da35b...,hospitality revenue manager,"produce statistical financial records, comply ..."


### 1.06 Combine Fields into Single Text Column

In [29]:
# Combine preferredLabel, altLabels, description, and skills into single text column
def combine_fields(row):
    """Combine multiple fields into single space-separated string"""
    parts = []
    
    # Add each field if it exists and is not null
    if pd.notna(row['preferredLabel']):
        parts.append(str(row['preferredLabel']))
    
    if pd.notna(row['altLabels']):
        parts.append(str(row['altLabels']))
    
    if pd.notna(row['description']):
        parts.append(str(row['description']))
    
    if pd.notna(row['skills_list']):
        parts.append(str(row['skills_list']))
    
    return ' '.join(parts)

# Create combined text column
flattened_df['combined_text'] = flattened_df.apply(combine_fields, axis=1)

print(f"✓ Created combined_text column")
print(f"\nSample combined text (first 500 chars):")
print("=" * 80)
print(flattened_df['combined_text'].iloc[0][:500])
print("...")
print("=" * 80)
print(f"\nFinal dataset shape: {flattened_df.shape}")
print(f"Columns: {list(flattened_df.columns)}")

✓ Created combined_text column

Sample combined text (first 500 chars):
technical director director of technical arts
technical supervisor
head of technical
technical and operations director
technical manager
head of technical department Technical directors realise the artistic visions of the creators within technical constraints. They coordinate the operations of various production units, such as scene, wardrobe, sound and lighting, and make-up. They adapt the prototype and study the feasibility, implementation, operation and technical monitoring of the artistic pr
...

Final dataset shape: (3037, 7)
Columns: ['occupationUri', 'conceptUri', 'preferredLabel', 'altLabels', 'description', 'skills_list', 'combined_text']


### 1.07 Inspect Final Dataset

In [30]:
# Display summary statistics
print("Dataset Summary:")
print("=" * 80)
print(f"Total occupations: {len(flattened_df):,}")
print(f"\nCombined text length statistics:")
print(flattened_df['combined_text'].str.len().describe())

print(f"\nSample rows:")
flattened_df.head()

Dataset Summary:
Total occupations: 3,037

Combined text length statistics:
count    3037.000000
mean     1128.527494
std       485.669579
min       252.000000
25%       798.000000
50%      1016.000000
75%      1333.000000
max      3633.000000
Name: combined_text, dtype: float64

Sample rows:


Unnamed: 0,occupationUri,conceptUri,preferredLabel,altLabels,description,skills_list,combined_text
0,http://data.europa.eu/esco/occupation/00030d09...,http://data.europa.eu/esco/occupation/00030d09...,technical director,director of technical arts\ntechnical supervis...,Technical directors realise the artistic visio...,"organise rehearsals, write risk assessment on ...",technical director director of technical arts\...
1,http://data.europa.eu/esco/occupation/000e93a3...,http://data.europa.eu/esco/occupation/000e93a3...,metal drawing machine operator,wire drawer\nforming machine operative\ndraw m...,Metal drawing machine operators set up and ope...,"monitor gauge, remove inadequate workpieces, t...",metal drawing machine operator wire drawer\nfo...
2,http://data.europa.eu/esco/occupation/0019b951...,http://data.europa.eu/esco/occupation/0019b951...,precision device inspector,precision device quality control supervisor\np...,Precision device inspectors make sure precisio...,"monitor machine operations, read assembly draw...",precision device inspector precision device qu...
3,http://data.europa.eu/esco/occupation/0022f466...,http://data.europa.eu/esco/occupation/0022f466...,air traffic safety technician,air traffic safety electronics hardware specia...,Air traffic safety technicians provide technic...,"implement airside safety procedures, install e...",air traffic safety technician air traffic safe...
4,http://data.europa.eu/esco/occupation/002da35b...,http://data.europa.eu/esco/occupation/002da35b...,hospitality revenue manager,yield manager\nhospitality yields manager\nhos...,Hospitality revenue managers maximise revenue ...,"produce statistical financial records, comply ...",hospitality revenue manager yield manager\nhos...


### 1.08 Analyze Combined Text Character Lengths

In [12]:
# Calculate character lengths
flattened_df['text_length'] = flattened_df['combined_text'].str.len()

# Summary statistics
print("Combined Text Character Length Summary:")
print("=" * 80)
print(f"\nTotal occupations: {len(flattened_df):,}")
print(f"\nCharacter count statistics:")
print(flattened_df['text_length'].describe())

print(f"\nAdditional metrics:")
print(f"  Minimum length: {flattened_df['text_length'].min():,} characters")
print(f"  Maximum length: {flattened_df['text_length'].max():,} characters")
print(f"  Total characters: {flattened_df['text_length'].sum():,} characters")
print(f"  Average length: {flattened_df['text_length'].mean():.1f} characters")
print(f"  Median length: {flattened_df['text_length'].median():.1f} characters")

# Distribution insights
print(f"\nLength distribution:")
print(f"  < 500 chars: {(flattened_df['text_length'] < 500).sum():,} occupations ({(flattened_df['text_length'] < 500).sum() / len(flattened_df) * 100:.1f}%)")
print(f"  500-1000 chars: {((flattened_df['text_length'] >= 500) & (flattened_df['text_length'] < 1000)).sum():,} occupations")
print(f"  1000-2000 chars: {((flattened_df['text_length'] >= 1000) & (flattened_df['text_length'] < 2000)).sum():,} occupations")
print(f"  2000+ chars: {(flattened_df['text_length'] >= 2000).sum():,} occupations ({(flattened_df['text_length'] >= 2000).sum() / len(flattened_df) * 100:.1f}%)")


Combined Text Character Length Summary:

Total occupations: 3,037

Character count statistics:
count    3037.000000
mean     1128.527494
std       485.669579
min       252.000000
25%       798.000000
50%      1016.000000
75%      1333.000000
max      3633.000000
Name: text_length, dtype: float64

Additional metrics:
  Minimum length: 252 characters
  Maximum length: 3,633 characters
  Total characters: 3,427,338 characters
  Average length: 1128.5 characters
  Median length: 1016.0 characters

Length distribution:
  < 500 chars: 70 occupations (2.3%)
  500-1000 chars: 1,393 occupations
  1000-2000 chars: 1,395 occupations
  2000+ chars: 179 occupations (5.9%)


### 1.09 Prepare Final Dataset for Export

In [31]:
# Create four-digit unique ID with leading zeros
flattened_df['esco_id'] = [f"{i:04d}" for i in range(len(flattened_df))]

# Select only the columns we need
export_df = flattened_df[['esco_id', 'occupationUri', 'preferredLabel', 'combined_text']].copy()

print(f"✓ Created export dataset with {len(export_df):,} rows")
print(f"\nColumns: {list(export_df.columns)}")
print(f"\nSample export data:")
export_df.head()

✓ Created export dataset with 3,037 rows

Columns: ['esco_id', 'occupationUri', 'preferredLabel', 'combined_text']

Sample export data:


Unnamed: 0,esco_id,occupationUri,preferredLabel,combined_text
0,0,http://data.europa.eu/esco/occupation/00030d09...,technical director,technical director director of technical arts\...
1,1,http://data.europa.eu/esco/occupation/000e93a3...,metal drawing machine operator,metal drawing machine operator wire drawer\nfo...
2,2,http://data.europa.eu/esco/occupation/0019b951...,precision device inspector,precision device inspector precision device qu...
3,3,http://data.europa.eu/esco/occupation/0022f466...,air traffic safety technician,air traffic safety technician air traffic safe...
4,4,http://data.europa.eu/esco/occupation/002da35b...,hospitality revenue manager,hospitality revenue manager yield manager\nhos...


### 1.10 Save to CSV

In [32]:
# Save to silver directory
output_dir = project_root / "data" / "silver"
output_file = output_dir / "esco_occupations_prepared.csv"

export_df.to_csv(output_file, index=False)

print(f"✓ Saved to: {output_file}")
print(f"  File size: {output_file.stat().st_size / 1024:.2f} KB")
print(f"  Rows: {len(export_df):,}")
print(f"  Columns: {list(export_df.columns)}")

✓ Saved to: /Users/lauren/repos/PAD2Skills/data/silver/esco_occupations_prepared.csv
  File size: 3665.71 KB
  Rows: 3,037
  Columns: ['esco_id', 'occupationUri', 'preferredLabel', 'combined_text']


In [15]:
# Explore rows with long lengths
long_rows = flattened_df[flattened_df['text_length'] > 2000]
long_rows.head()

Unnamed: 0,occupationUri,conceptUri,preferredLabel,altLabels,description,skills_list,combined_text,text_length
8,http://data.europa.eu/esco/occupation/006cc1f9...,http://data.europa.eu/esco/occupation/006cc1f9...,physiotherapist,respiratory therapist\nmasseuse\nrehabilitatio...,Physiotherapists are autonomous health profess...,"contribute to quality physiotherapy services, ...",physiotherapist respiratory therapist\nmasseus...,3536
13,http://data.europa.eu/esco/occupation/009d29de...,http://data.europa.eu/esco/occupation/009d29de...,rental service representative in other machine...,rental sales desk supervisor in other machiner...,Rental service representatives in other machin...,"guarantee customer satisfaction, perform multi...",rental service representative in other machine...,2145
49,http://data.europa.eu/esco/occupation/034cad59...,http://data.europa.eu/esco/occupation/034cad59...,special educational needs teacher secondary sc...,secondary school special education teacher\nsp...,Special educational needs teachers at secondar...,"guarantee students' safety, assign homework, m...",special educational needs teacher secondary sc...,2161
94,http://data.europa.eu/esco/occupation/068df7d1...,http://data.europa.eu/esco/occupation/068df7d1...,performance lighting designer,lighting designer and technician\nlighting tec...,Performance lighting designers develop a light...,analyse the artistic concept based on stage ac...,performance lighting designer lighting designe...,2316
115,http://data.europa.eu/esco/occupation/08984bec...,http://data.europa.eu/esco/occupation/08984bec...,specialised veterinarian,zoological medicine veterinarian\nemergency me...,Specialised veterinarians are professionals wi...,certify the performance of veterinary procedur...,specialised veterinarian zoological medicine v...,2710


## 2. Prepare PAD Occupation Data

### 2.01 Load PAD Occupation JSON Files

In [16]:
import json

# Path to PAD occupations JSON files
pad_occs_dir = project_root / "data" / "silver" / "occupations_skills"

# Find all P075941_*.json files
json_files = sorted(pad_occs_dir.glob("P075941_*_occupations.json"))

print(f"Found {len(json_files)} PAD occupation JSON files")
print(f"\nFirst 5 files:")
for f in json_files[:5]:
    print(f"  {f.name}")
if len(json_files) > 5:
    print(f"  ... and {len(json_files) - 5} more")

Found 16 PAD occupation JSON files

First 5 files:
  P075941_0_occupations.json
  P075941_10_occupations.json
  P075941_11_occupations.json
  P075941_12_occupations.json
  P075941_13_occupations.json
  ... and 11 more


### 2.02 Read and Concatenate JSON Files

In [17]:
# Read all JSON files and collect extractions
all_extractions = []

for json_file in json_files:
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    # Check if extractions exist and are not null
    if data.get('extractions') is not None:
        # Each extraction is a dictionary, add project_id and section_id to it
        for extraction in data['extractions']:
            extraction['project_id'] = data['project_id']
            extraction['section_id'] = data['section_id']
            all_extractions.append(extraction)

print(f"✓ Loaded {len(all_extractions):,} total occupation extractions")
print(f"  from {len(json_files)} JSON files")

# Convert to DataFrame
pad_occs_df = pd.DataFrame(all_extractions)

print(f"\n✓ Created DataFrame with {len(pad_occs_df):,} rows")
print(f"\nColumns: {list(pad_occs_df.columns)}")
print(f"\nFirst few rows:")
pad_occs_df.head()

✓ Loaded 246 total occupation extractions
  from 16 JSON files

✓ Created DataFrame with 246 rows

Columns: ['extraction_id', 'identified_occupation', 'industry', 'activity_description_in_pad', 'skills_needed_for_activity', 'source_material_quote', 'project_id', 'section_id']

First few rows:


Unnamed: 0,extraction_id,identified_occupation,industry,activity_description_in_pad,skills_needed_for_activity,source_material_quote,project_id,section_id
0,1,Transmission Line Construction Engineer,Construction of high-voltage transmission lines,the construction of various transmission lines...,"[construction of transmission lines, commissio...",the construction of various transmission lines...,P075941,0
1,2,Power Plant Rehabilitation Engineer,Rehabilitation of existing power plants,rehabilitation of existing power plants,[rehabilitation of existing power plants],the CAS calls for the rehabilitation f exis...,P075941,0
2,3,Hydropower Plant Construction Engineer,Construction of hydropower plants,construction of new power plants,[construction of new plants],the CAS calls for the rehabilitation f exis...,P075941,0
3,4,Rural Electrification Program Coordinator,Rural electrification program development,development of a rural electrification program...,[development of a rural electrification program],the CAS calls for the rehabilitation f exis...,P075941,0
4,5,Power System Planning Engineer,Regional power system planning and feasibility...,assessment of economic and engineering feasibi...,"[assessment of economic feasibility, assessmen...",The CBWS has prepared a Regional Power Develop...,P075941,0


### 2.03 Inspect PAD Occupations Dataset

In [18]:
# Display summary information
print("PAD Occupations Dataset Summary:")
print("=" * 80)
print(f"Total extractions: {len(pad_occs_df):,}")
print(f"Unique occupations: {pad_occs_df['identified_occupation'].nunique():,}")
print(f"Unique industries: {pad_occs_df['industry'].nunique():,}")
print(f"Unique project-section combinations: {pad_occs_df.groupby(['project_id', 'section_id']).ngroups:,}")

print(f"\nSample occupations:")
print(pad_occs_df['identified_occupation'].value_counts().head(10))

print(f"\nDataFrame info:")
pad_occs_df.info()

PAD Occupations Dataset Summary:
Total extractions: 246
Unique occupations: 210
Unique industries: 219
Unique project-section combinations: 16

Sample occupations:
identified_occupation
Transmission Line Construction Engineer                  5
Owner's Engineer                                         5
Procurement Specialist                                   5
Financial Management Specialist                          4
Environmental and Social Impact Assessment Specialist    3
Resettlement Action Plan Specialist                      3
Communications Specialist                                3
Resettlement Specialist                                  3
Monitoring and Evaluation Specialist                     3
Hydropower Plant Construction Engineer                   2
Name: count, dtype: int64

DataFrame info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 246 entries, 0 to 245
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------         

### 2.04 Prepare PAD Occupations for Export

In [20]:
# Create three-digit ID with leading zeros
pad_occs_df['pad_id'] = [f"{i:03d}" for i in range(len(pad_occs_df))]

# Create combined text column
def combine_pad_fields(row):
    """Combine PAD occupation fields into single text column"""
    parts = []
    
    # Add identified_occupation
    if pd.notna(row['identified_occupation']):
        parts.append(str(row['identified_occupation']))
    
    # Add industry
    if pd.notna(row['industry']):
        parts.append(str(row['industry']))
    
    # Add activity_description_in_pad
    if pd.notna(row['activity_description_in_pad']):
        parts.append(str(row['activity_description_in_pad']))
    
    # Add skills_needed_for_activity (remove list brackets and quotes)
    skills = row['skills_needed_for_activity']
    if skills is not None and skills is not pd.NA:
        # If it's a list, join with commas
        if isinstance(skills, list):
            skills_str = ', '.join(str(s) for s in skills)
        else:
            # If it's a string representation of a list, clean it up
            skills_str = str(skills).strip('[]').replace("'", "").replace('"', '')
        parts.append(skills_str)
    
    return ' '.join(parts)

# Create combined text column
pad_occs_df['combined_text'] = pad_occs_df.apply(combine_pad_fields, axis=1)

print(f"✓ Created pad_id and combined_text columns")
print(f"\nSample combined text (first 500 chars):")
print("=" * 80)
print(pad_occs_df['combined_text'].iloc[0][:500])
print("...")
print("=" * 80)
print(f"\nColumns: {list(pad_occs_df.columns)}")
print(f"\nSample rows with new columns:")
pad_occs_df[['pad_id', 'identified_occupation', 'combined_text']].head()

✓ Created pad_id and combined_text columns

Sample combined text (first 500 chars):
Transmission Line Construction Engineer Construction of high-voltage transmission lines the construction of various transmission lines underway, scheduled for commissioning by 2014 construction of transmission lines, commissioning of transmission lines
...

Columns: ['extraction_id', 'identified_occupation', 'industry', 'activity_description_in_pad', 'skills_needed_for_activity', 'source_material_quote', 'project_id', 'section_id', 'pad_id', 'combined_text']

Sample rows with new columns:


Unnamed: 0,pad_id,identified_occupation,combined_text
0,0,Transmission Line Construction Engineer,Transmission Line Construction Engineer Constr...
1,1,Power Plant Rehabilitation Engineer,Power Plant Rehabilitation Engineer Rehabilita...
2,2,Hydropower Plant Construction Engineer,Hydropower Plant Construction Engineer Constru...
3,3,Rural Electrification Program Coordinator,Rural Electrification Program Coordinator Rura...
4,4,Power System Planning Engineer,Power System Planning Engineer Regional power ...


### 2.05 Save PAD Occupations to CSV

In [21]:
# Create output directory
output_dir = project_root / "data" / "silver" / "occupation_skills_csv"
output_dir.mkdir(parents=True, exist_ok=True)

output_file = output_dir / "pad_occupations_prepared.csv"

# Save to CSV
pad_occs_df.to_csv(output_file, index=False)

print(f"✓ Saved to: {output_file}")
print(f"  File size: {output_file.stat().st_size / 1024:.2f} KB")
print(f"  Rows: {len(pad_occs_df):,}")
print(f"  Columns: {list(pad_occs_df.columns)}")

✓ Saved to: /Users/lauren/repos/PAD2Skills/data/silver/occupation_skills_csv/pad_occupations_prepared.csv
  File size: 201.22 KB
  Rows: 246
  Columns: ['extraction_id', 'identified_occupation', 'industry', 'activity_description_in_pad', 'skills_needed_for_activity', 'source_material_quote', 'project_id', 'section_id', 'pad_id', 'combined_text']


## 3. Occupation Retrieval

### 3.01 Load Prepared CSV Files

In [33]:
# Load ESCO occupations
esco_csv = project_root / "data" / "silver" / "esco_occupations_prepared.csv"
esco_df = pd.read_csv(esco_csv)

print(f"✓ Loaded ESCO occupations: {len(esco_df):,} rows")
print(f"  Columns: {list(esco_df.columns)}")

# Load PAD occupations
pad_csv = project_root / "data" / "silver" / "occupation_skills_csv" / "pad_occupations_prepared.csv"
pad_df = pd.read_csv(pad_csv)

print(f"\n✓ Loaded PAD occupations: {len(pad_df):,} rows")
print(f"  Columns: {list(pad_df.columns)}")

print(f"\nReady for embedding-based matching")
print(f"  ESCO: {len(esco_df):,} occupations to match against")
print(f"  PAD: {len(pad_df):,} queries to match")

✓ Loaded ESCO occupations: 3,037 rows
  Columns: ['esco_id', 'occupationUri', 'preferredLabel', 'combined_text']

✓ Loaded PAD occupations: 246 rows
  Columns: ['extraction_id', 'identified_occupation', 'industry', 'activity_description_in_pad', 'skills_needed_for_activity', 'source_material_quote', 'project_id', 'section_id', 'pad_id', 'combined_text']

Ready for embedding-based matching
  ESCO: 3,037 occupations to match against
  PAD: 246 queries to match


### 3.02 Load Sentence Transformer Model

In [35]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Load model - intfloat/e5-small-v2 is optimized for semantic search
model = SentenceTransformer("intfloat/e5-small-v2")

print(f"✓ Loaded model: intfloat/e5-small-v2")
print(f"  Max sequence length: {model.max_seq_length}")
print(f"  Embedding dimension: {model.get_sentence_embedding_dimension()}")

  from .autonotebook import tqdm as notebook_tqdm


✓ Loaded model: intfloat/e5-small-v2
  Max sequence length: 512
  Embedding dimension: 384


### 3.03 Encode ESCO Occupations (Passages)

In [36]:
# Prepare ESCO texts with "passage: " prefix for e5 model
esco_texts = ["passage: " + text for text in esco_df['combined_text'].tolist()]

print(f"Encoding {len(esco_texts):,} ESCO occupations...")
print(f"Sample passage (first 200 chars):")
print(esco_texts[0][:200])
print("...")

# Encode ESCO texts (normalized for cosine similarity via dot product)
E = model.encode(esco_texts, normalize_embeddings=True, batch_size=64, show_progress_bar=True)

print(f"\n✓ Encoded ESCO occupations")
print(f"  Embeddings shape: {E.shape}")
print(f"  Memory size: {E.nbytes / 1024 / 1024:.2f} MB")

Encoding 3,037 ESCO occupations...
Sample passage (first 200 chars):
passage: technical director director of technical arts
technical supervisor
head of technical
technical and operations director
technical manager
head of technical department Technical directors reali
...


Batches: 100%|██████████| 48/48 [04:29<00:00,  5.61s/it]


✓ Encoded ESCO occupations
  Embeddings shape: (3037, 384)
  Memory size: 4.45 MB





### 3.04 Encode PAD Occupations (Queries)

In [37]:
# Prepare PAD texts with "query: " prefix for e5 model
queries = ["query: " + text for text in pad_df['combined_text'].tolist()]

print(f"Encoding {len(queries):,} PAD occupation queries...")
print(f"Sample query (first 200 chars):")
print(queries[0][:200])
print("...")

# Encode PAD queries (normalized for cosine similarity via dot product)
Q = model.encode(queries, normalize_embeddings=True, batch_size=64, show_progress_bar=True)

print(f"\n✓ Encoded PAD occupations")
print(f"  Embeddings shape: {Q.shape}")
print(f"  Memory size: {Q.nbytes / 1024 / 1024:.2f} MB")

Encoding 246 PAD occupation queries...
Sample query (first 200 chars):
query: Transmission Line Construction Engineer Construction of high-voltage transmission lines the construction of various transmission lines underway, scheduled for commissioning by 2014 construction
...


Batches: 100%|██████████| 4/4 [00:05<00:00,  1.40s/it]


✓ Encoded PAD occupations
  Embeddings shape: (246, 384)
  Memory size: 0.36 MB





### 3.05 Compute Similarities and Get Top 10 Matches

In [38]:
# Compute cosine similarity scores (normalized embeddings -> dot product)
print("Computing similarity scores...")
scores = Q @ E.T

print(f"✓ Computed similarity matrix")
print(f"  Shape: {scores.shape} (PAD queries × ESCO passages)")
print(f"  Min score: {scores.min():.4f}")
print(f"  Max score: {scores.max():.4f}")
print(f"  Mean score: {scores.mean():.4f}")

# Get top 10 matches for each PAD occupation
print(f"\nFinding top 10 matches for each PAD occupation...")
topk_indices = np.argsort(-scores, axis=1)[:, :10]  # Top 10 indices per query
topk_scores = np.take_along_axis(scores, topk_indices, axis=1)  # Corresponding scores

print(f"✓ Found top 10 matches for all {len(pad_df):,} PAD occupations")
print(f"\nSample top 10 for first PAD occupation:")
print(f"  Indices: {topk_indices[0]}")
print(f"  Scores: {topk_scores[0]}")

Computing similarity scores...
✓ Computed similarity matrix
  Shape: (246, 3037) (PAD queries × ESCO passages)
  Min score: 0.6625
  Max score: 0.9111
  Mean score: 0.7576

Finding top 10 matches for each PAD occupation...
✓ Found top 10 matches for all 246 PAD occupations

Sample top 10 for first PAD occupation:
  Indices: [1387 2452  384 2871 1547 1101 1090 1749 1456 1470]
  Scores: [0.84681356 0.83814824 0.83773327 0.83725023 0.8342337  0.8309206
 0.82545346 0.8254266  0.8248798  0.82351637]


### 3.06 Create Results DataFrame with Top 10 Matches

In [39]:
# Start with PAD occupation data
results_df = pad_df.copy()

# Add top 10 ESCO matches as columns
for rank in range(10):
    # Get ESCO IDs for this rank
    esco_ids = [esco_df.iloc[idx]['esco_id'] for idx in topk_indices[:, rank]]
    results_df[f'match_{rank+1}_esco_id'] = esco_ids
    
    # Get ESCO occupation labels for this rank
    esco_labels = [esco_df.iloc[idx]['preferredLabel'] for idx in topk_indices[:, rank]]
    results_df[f'match_{rank+1}_occupation'] = esco_labels
    
    # Get similarity scores for this rank
    results_df[f'match_{rank+1}_score'] = topk_scores[:, rank]

print(f"✓ Created results DataFrame with top 10 matches")
print(f"  Shape: {results_df.shape}")
print(f"  Original PAD columns: {len(pad_df.columns)}")
print(f"  New match columns: {results_df.shape[1] - len(pad_df.columns)} (30 columns: ID, label, score × 10 ranks)")
print(f"\nNew columns added:")
print([col for col in results_df.columns if col.startswith('match_')][:6])
print("...")

print(f"\nSample result (first PAD occupation with top 3 matches):")
sample = results_df.iloc[0]
print(f"PAD: {sample['identified_occupation']}")
print(f"  Industry: {sample['industry']}")
for i in range(3):
    print(f"  Match {i+1}: {sample[f'match_{i+1}_occupation']} (score: {sample[f'match_{i+1}_score']:.4f})")

✓ Created results DataFrame with top 10 matches
  Shape: (246, 40)
  Original PAD columns: 10
  New match columns: 30 (30 columns: ID, label, score × 10 ranks)

New columns added:
['match_1_esco_id', 'match_1_occupation', 'match_1_score', 'match_2_esco_id', 'match_2_occupation', 'match_2_score']
...

Sample result (first PAD occupation with top 3 matches):
PAD: Transmission Line Construction Engineer
  Industry: Construction of high-voltage transmission lines
  Match 1: overhead line worker (score: 0.8468)
  Match 2: electricity distribution technician (score: 0.8381)
  Match 3: power lines supervisor (score: 0.8377)


### 3.07 Save Results to CSV

In [40]:
# Save to silver/esco_matching directory
output_dir = project_root / "data" / "silver" / "esco_matching"
output_dir.mkdir(parents=True, exist_ok=True)

# Use project_id for filename
project_id = results_df['project_id'].unique()[0]
output_file = output_dir / f"{project_id}_esco_matches.csv"
results_df.to_csv(output_file, index=False)

print(f"✓ Saved results to: {output_file}")
print(f"  File size: {output_file.stat().st_size / 1024:.2f} KB")
print(f"  Rows: {len(results_df):,}")
print(f"  Columns: {len(results_df.columns)}")
print(f"\nResults include:")
print(f"  - All original PAD occupation fields")
print(f"  - Top 10 ESCO matches with IDs, labels, and similarity scores")

✓ Saved results to: /Users/lauren/repos/PAD2Skills/data/silver/esco_matching/P075941_esco_matches.csv
  File size: 297.73 KB
  Rows: 246
  Columns: 40

Results include:
  - All original PAD occupation fields
  - Top 10 ESCO matches with IDs, labels, and similarity scores
