# Match PAD Occupations to ESCO

Use embedding model to match extracted PAD occupations to ESCO occupation taxonomy.

## 0. Setup

### 0.01 Import Required Libraries

In [153]:
import pandas as pd
from pathlib import Path

# Import our config
import sys
sys.path.append(str(Path.cwd().parent))
from src.config import load_config

### 0.02 Set Up Paths

In [154]:
project_root = Path.cwd().parent
config = load_config()

# ESCO data paths
esco_dir = project_root / "data" / "bronze" / "esco"
occupations_file = esco_dir / "occupations_en.csv"
skills_relations_file = esco_dir / "occupationSkillRelations_en.csv"

print(f"ESCO directory: {esco_dir}")
print(f"Occupations file exists: {occupations_file.exists()}")
print(f"Skills relations file exists: {skills_relations_file.exists()}")

ESCO directory: /Users/lauren/repos/PAD2Skills/data/bronze/esco
Occupations file exists: True
Skills relations file exists: True


## 1. Prepare ESCO Data

### 1.01 Read ESCO Occupations Data

In [155]:
# Read ESCO occupations
occ_df = pd.read_csv(occupations_file)

print(f"✓ Loaded {len(occ_df):,} ESCO occupations")
print(f"\nColumns: {list(occ_df.columns)}")
print(f"\nFirst few rows:")
occ_df.head()

✓ Loaded 3,043 ESCO occupations

Columns: ['conceptType', 'conceptUri', 'iscoGroup', 'preferredLabel', 'altLabels', 'hiddenLabels', 'status', 'modifiedDate', 'regulatedProfessionNote', 'scopeNote', 'definition', 'inScheme', 'description', 'code', 'naceCode']

First few rows:


Unnamed: 0,conceptType,conceptUri,iscoGroup,preferredLabel,altLabels,hiddenLabels,status,modifiedDate,regulatedProfessionNote,scopeNote,definition,inScheme,description,code,naceCode
0,Occupation,http://data.europa.eu/esco/occupation/00030d09...,2654,technical director,director of technical arts\ntechnical supervis...,,released,2024-01-25T11:28:50.295Z,http://data.europa.eu/esco/regulated-professio...,,,http://data.europa.eu/esco/concept-scheme/memb...,Technical directors realise the artistic visio...,2654.1.7,http://data.europa.eu/ux2/nace2.1/9031
1,Occupation,http://data.europa.eu/esco/occupation/000e93a3...,8121,metal drawing machine operator,wire drawer\nforming machine operative\ndraw m...,,released,2024-01-23T10:09:32.099Z,http://data.europa.eu/esco/regulated-professio...,,,http://data.europa.eu/esco/concept-scheme/memb...,Metal drawing machine operators set up and ope...,8121.4,http://data.europa.eu/ux2/nace2.1/242
2,Occupation,http://data.europa.eu/esco/occupation/0019b951...,7543,precision device inspector,precision device quality control supervisor\np...,,released,2024-01-25T15:00:12.188Z,http://data.europa.eu/esco/regulated-professio...,,,http://data.europa.eu/esco/concept-scheme/memb...,Precision device inspectors make sure precisio...,7543.10.3,http://data.europa.eu/ux2/nace2.1/2651
3,Occupation,http://data.europa.eu/esco/occupation/0022f466...,3155,air traffic safety technician,air traffic safety electronics hardware specia...,,released,2024-01-29T16:01:13.998Z,http://data.europa.eu/esco/regulated-professio...,,,http://data.europa.eu/esco/concept-scheme/memb...,Air traffic safety technicians provide technic...,3155.1,http://data.europa.eu/ux2/nace2.1/5223
4,Occupation,http://data.europa.eu/esco/occupation/002da35b...,2431,hospitality revenue manager,yield manager\nhospitality yields manager\nhos...,,released,2024-01-11T10:28:45.871Z,http://data.europa.eu/esco/regulated-professio...,,,http://data.europa.eu/esco/concept-scheme/memb...,Hospitality revenue managers maximise revenue ...,2431.9,"http://data.europa.eu/ux2/nace2.1/701,\nhttp:/..."


### 1.02 Read ESCO Skills Relations Data

In [156]:
# Read ESCO skills relations
skills_df = pd.read_csv(skills_relations_file)

print(f"✓ Loaded {len(skills_df):,} skill relations")
print(f"\nColumns: {list(skills_df.columns)}")
print(f"\nFirst few rows:")
skills_df.head()

✓ Loaded 126,051 skill relations

Columns: ['occupationUri', 'occupationLabel', 'relationType', 'skillType', 'skillUri', 'skillLabel']

First few rows:


Unnamed: 0,occupationUri,occupationLabel,relationType,skillType,skillUri,skillLabel
0,http://data.europa.eu/esco/occupation/00030d09...,technical director,essential,knowledge,http://data.europa.eu/esco/skill/fed5b267-73fa...,theatre techniques
1,http://data.europa.eu/esco/occupation/00030d09...,technical director,essential,skill/competence,http://data.europa.eu/esco/skill/05bc7677-5a64...,organise rehearsals
2,http://data.europa.eu/esco/occupation/00030d09...,technical director,essential,skill/competence,http://data.europa.eu/esco/skill/271a36a0-bc7a...,write risk assessment on performing arts produ...
3,http://data.europa.eu/esco/occupation/00030d09...,technical director,essential,skill/competence,http://data.europa.eu/esco/skill/47ed1d37-971b...,coordinate with creative departments
4,http://data.europa.eu/esco/occupation/00030d09...,technical director,essential,skill/competence,http://data.europa.eu/esco/skill/591dd514-735b...,adapt to artists' creative demands


### 1.03 Filter Skills Relations

In [157]:
# Filter for essential skills/competences only
skills_filtered = skills_df[
    (skills_df['relationType'] == 'essential') & 
    (skills_df['skillType'] == 'skill/competence')
].copy()

print(f"✓ Filtered to {len(skills_filtered):,} essential skill/competence relations")
print(f"  (from {len(skills_df):,} total relations)")
print(f"\nUnique occupations: {skills_filtered['occupationUri'].nunique():,}")
print(f"Unique skills: {skills_filtered['skillUri'].nunique():,}")
print(f"\nSample filtered relations:")
skills_filtered.head()

✓ Filtered to 51,155 essential skill/competence relations
  (from 126,051 total relations)

Unique occupations: 3,037
Unique skills: 8,752

Sample filtered relations:


Unnamed: 0,occupationUri,occupationLabel,relationType,skillType,skillUri,skillLabel
1,http://data.europa.eu/esco/occupation/00030d09...,technical director,essential,skill/competence,http://data.europa.eu/esco/skill/05bc7677-5a64...,organise rehearsals
2,http://data.europa.eu/esco/occupation/00030d09...,technical director,essential,skill/competence,http://data.europa.eu/esco/skill/271a36a0-bc7a...,write risk assessment on performing arts produ...
3,http://data.europa.eu/esco/occupation/00030d09...,technical director,essential,skill/competence,http://data.europa.eu/esco/skill/47ed1d37-971b...,coordinate with creative departments
4,http://data.europa.eu/esco/occupation/00030d09...,technical director,essential,skill/competence,http://data.europa.eu/esco/skill/591dd514-735b...,adapt to artists' creative demands
5,http://data.europa.eu/esco/occupation/00030d09...,technical director,essential,skill/competence,http://data.europa.eu/esco/skill/860be36a-d19b...,negotiate health and safety issues with third ...


### 1.04 Merge Skills onto Occupations

In [158]:
# Merge skills onto occupations
# occupationUri in skills -> conceptUri in occupations
merged_df = occ_df.merge(
    skills_filtered,
    right_on='occupationUri',
    left_on='conceptUri',
    how='left'
)

print(f"✓ Merged skills onto occupations: {len(merged_df):,} rows")
print(f"\nColumns after merge: {list(merged_df.columns)}")
print(f"\nSample merged data:")
merged_df.head()

✓ Merged skills onto occupations: 51,209 rows

Columns after merge: ['conceptType', 'conceptUri', 'iscoGroup', 'preferredLabel', 'altLabels', 'hiddenLabels', 'status', 'modifiedDate', 'regulatedProfessionNote', 'scopeNote', 'definition', 'inScheme', 'description', 'code', 'naceCode', 'occupationUri', 'occupationLabel', 'relationType', 'skillType', 'skillUri', 'skillLabel']

Sample merged data:


Unnamed: 0,conceptType,conceptUri,iscoGroup,preferredLabel,altLabels,hiddenLabels,status,modifiedDate,regulatedProfessionNote,scopeNote,...,inScheme,description,code,naceCode,occupationUri,occupationLabel,relationType,skillType,skillUri,skillLabel
0,Occupation,http://data.europa.eu/esco/occupation/00030d09...,2654,technical director,director of technical arts\ntechnical supervis...,,released,2024-01-25T11:28:50.295Z,http://data.europa.eu/esco/regulated-professio...,,...,http://data.europa.eu/esco/concept-scheme/memb...,Technical directors realise the artistic visio...,2654.1.7,http://data.europa.eu/ux2/nace2.1/9031,http://data.europa.eu/esco/occupation/00030d09...,technical director,essential,skill/competence,http://data.europa.eu/esco/skill/05bc7677-5a64...,organise rehearsals
1,Occupation,http://data.europa.eu/esco/occupation/00030d09...,2654,technical director,director of technical arts\ntechnical supervis...,,released,2024-01-25T11:28:50.295Z,http://data.europa.eu/esco/regulated-professio...,,...,http://data.europa.eu/esco/concept-scheme/memb...,Technical directors realise the artistic visio...,2654.1.7,http://data.europa.eu/ux2/nace2.1/9031,http://data.europa.eu/esco/occupation/00030d09...,technical director,essential,skill/competence,http://data.europa.eu/esco/skill/271a36a0-bc7a...,write risk assessment on performing arts produ...
2,Occupation,http://data.europa.eu/esco/occupation/00030d09...,2654,technical director,director of technical arts\ntechnical supervis...,,released,2024-01-25T11:28:50.295Z,http://data.europa.eu/esco/regulated-professio...,,...,http://data.europa.eu/esco/concept-scheme/memb...,Technical directors realise the artistic visio...,2654.1.7,http://data.europa.eu/ux2/nace2.1/9031,http://data.europa.eu/esco/occupation/00030d09...,technical director,essential,skill/competence,http://data.europa.eu/esco/skill/47ed1d37-971b...,coordinate with creative departments
3,Occupation,http://data.europa.eu/esco/occupation/00030d09...,2654,technical director,director of technical arts\ntechnical supervis...,,released,2024-01-25T11:28:50.295Z,http://data.europa.eu/esco/regulated-professio...,,...,http://data.europa.eu/esco/concept-scheme/memb...,Technical directors realise the artistic visio...,2654.1.7,http://data.europa.eu/ux2/nace2.1/9031,http://data.europa.eu/esco/occupation/00030d09...,technical director,essential,skill/competence,http://data.europa.eu/esco/skill/591dd514-735b...,adapt to artists' creative demands
4,Occupation,http://data.europa.eu/esco/occupation/00030d09...,2654,technical director,director of technical arts\ntechnical supervis...,,released,2024-01-25T11:28:50.295Z,http://data.europa.eu/esco/regulated-professio...,,...,http://data.europa.eu/esco/concept-scheme/memb...,Technical directors realise the artistic visio...,2654.1.7,http://data.europa.eu/ux2/nace2.1/9031,http://data.europa.eu/esco/occupation/00030d09...,technical director,essential,skill/competence,http://data.europa.eu/esco/skill/860be36a-d19b...,negotiate health and safety issues with third ...


### 1.05 Flatten Skills by Occupation

In [159]:
# Group by occupation and combine skills into comma-separated list
flattened_df = merged_df.groupby('occupationUri').agg({
    'conceptUri': 'first',  # Occupation URI from occupations table
    'preferredLabel': 'first',
    'altLabels': 'first',
    'description': 'first',
    'skillLabel': lambda x: ', '.join(x.dropna().astype(str))
}).reset_index()

# Rename skillLabel column to be clearer
flattened_df = flattened_df.rename(columns={'skillLabel': 'skills_list'})

print(f"✓ Flattened to {len(flattened_df):,} unique occupations")
print(f"\nColumns: {list(flattened_df.columns)}")
print(f"\nNote: occupationUri is kept as a column")
print(f"\nSample flattened data:")
flattened_df[['occupationUri', 'preferredLabel', 'skills_list']].head()

✓ Flattened to 3,037 unique occupations

Columns: ['occupationUri', 'conceptUri', 'preferredLabel', 'altLabels', 'description', 'skills_list']

Note: occupationUri is kept as a column

Sample flattened data:


Unnamed: 0,occupationUri,preferredLabel,skills_list
0,http://data.europa.eu/esco/occupation/00030d09...,technical director,"organise rehearsals, write risk assessment on ..."
1,http://data.europa.eu/esco/occupation/000e93a3...,metal drawing machine operator,"monitor gauge, remove inadequate workpieces, t..."
2,http://data.europa.eu/esco/occupation/0019b951...,precision device inspector,"monitor machine operations, read assembly draw..."
3,http://data.europa.eu/esco/occupation/0022f466...,air traffic safety technician,"implement airside safety procedures, install e..."
4,http://data.europa.eu/esco/occupation/002da35b...,hospitality revenue manager,"produce statistical financial records, comply ..."


### 1.06 Combine Fields into Single Text Column

In [160]:
# Combine preferredLabel, altLabels, description, and skills into single text column
# Prioritize most important info first to minimize impact of model truncation
def combine_fields(row):
    """Combine multiple fields into prioritized space-separated string"""
    parts = []
    
    # 1. Add preferredLabel (most important)
    if pd.notna(row['preferredLabel']):
        parts.append(str(row['preferredLabel']))
    
    # 2. Add first 5 altLabels
    first_alt_labels = []
    remaining_alt_labels = []
    if pd.notna(row['altLabels']):
        alt_labels_str = str(row['altLabels'])
        # Split by newline first, then by comma if no newlines
        if '\n' in alt_labels_str:
            alt_labels_list = [label.strip() for label in alt_labels_str.split('\n') if label.strip()]
        else:
            alt_labels_list = [label.strip() for label in alt_labels_str.split(',') if label.strip()]
        
        first_alt_labels = alt_labels_list[:5]
        remaining_alt_labels = alt_labels_list[5:]
        
        if first_alt_labels:
            parts.append(' '.join(first_alt_labels))
    
    # 3. Add description
    if pd.notna(row['description']):
        parts.append(str(row['description']))
    
    # 4. Add skills_list truncated to 1500 characters
    if pd.notna(row['skills_list']):
        skills_str = str(row['skills_list'])
        if len(skills_str) > 1500:
            skills_str = skills_str[:1500]
        parts.append(skills_str)
    
    # 5. Add remaining altLabels (6+) if they exist
    if remaining_alt_labels:
        parts.append(' '.join(remaining_alt_labels))
    
    return ' '.join(parts)

# Create combined text column
flattened_df['combined_text'] = flattened_df.apply(combine_fields, axis=1)

print(f"✓ Created combined_text column with prioritized fields")
print(f"\nSample combined text (first 500 chars):")
print("=" * 80)
print(flattened_df['combined_text'].iloc[0][:500])
print("...")
print("=" * 80)
print(f"\nFinal dataset shape: {flattened_df.shape}")
print(f"Columns: {list(flattened_df.columns)}")

✓ Created combined_text column with prioritized fields

Sample combined text (first 500 chars):
technical director director of technical arts technical supervisor head of technical technical and operations director technical manager Technical directors realise the artistic visions of the creators within technical constraints. They coordinate the operations of various production units, such as scene, wardrobe, sound and lighting, and make-up. They adapt the prototype and study the feasibility, implementation, operation and technical monitoring of the artistic project. They are also responsi
...

Final dataset shape: (3037, 7)
Columns: ['occupationUri', 'conceptUri', 'preferredLabel', 'altLabels', 'description', 'skills_list', 'combined_text']


### 1.07 Inspect Final Dataset

In [161]:
# Display summary statistics
print("Dataset Summary:")
print("=" * 80)
print(f"Total occupations: {len(flattened_df):,}")
print(f"\nCombined text length statistics:")
print(flattened_df['combined_text'].str.len().describe())

print(f"\nSample rows:")
flattened_df.head()

Dataset Summary:
Total occupations: 3,037

Combined text length statistics:
count    3037.000000
mean     1111.472176
std       433.540204
min       252.000000
25%       798.000000
50%      1016.000000
75%      1333.000000
max      3119.000000
Name: combined_text, dtype: float64

Sample rows:


Unnamed: 0,occupationUri,conceptUri,preferredLabel,altLabels,description,skills_list,combined_text
0,http://data.europa.eu/esco/occupation/00030d09...,http://data.europa.eu/esco/occupation/00030d09...,technical director,director of technical arts\ntechnical supervis...,Technical directors realise the artistic visio...,"organise rehearsals, write risk assessment on ...",technical director director of technical arts ...
1,http://data.europa.eu/esco/occupation/000e93a3...,http://data.europa.eu/esco/occupation/000e93a3...,metal drawing machine operator,wire drawer\nforming machine operative\ndraw m...,Metal drawing machine operators set up and ope...,"monitor gauge, remove inadequate workpieces, t...",metal drawing machine operator wire drawer for...
2,http://data.europa.eu/esco/occupation/0019b951...,http://data.europa.eu/esco/occupation/0019b951...,precision device inspector,precision device quality control supervisor\np...,Precision device inspectors make sure precisio...,"monitor machine operations, read assembly draw...",precision device inspector precision device qu...
3,http://data.europa.eu/esco/occupation/0022f466...,http://data.europa.eu/esco/occupation/0022f466...,air traffic safety technician,air traffic safety electronics hardware specia...,Air traffic safety technicians provide technic...,"implement airside safety procedures, install e...",air traffic safety technician air traffic safe...
4,http://data.europa.eu/esco/occupation/002da35b...,http://data.europa.eu/esco/occupation/002da35b...,hospitality revenue manager,yield manager\nhospitality yields manager\nhos...,Hospitality revenue managers maximise revenue ...,"produce statistical financial records, comply ...",hospitality revenue manager yield manager hosp...


### 1.08 Analyze Combined Text Character Lengths

In [162]:
# Calculate character lengths
flattened_df['text_length'] = flattened_df['combined_text'].str.len()

# Summary statistics
print("Combined Text Character Length Summary:")
print("=" * 80)
print(f"\nTotal occupations: {len(flattened_df):,}")
print(f"\nCharacter count statistics:")
print(flattened_df['text_length'].describe())

print(f"\nAdditional metrics:")
print(f"  Minimum length: {flattened_df['text_length'].min():,} characters")
print(f"  Maximum length: {flattened_df['text_length'].max():,} characters")
print(f"  Total characters: {flattened_df['text_length'].sum():,} characters")
print(f"  Average length: {flattened_df['text_length'].mean():.1f} characters")
print(f"  Median length: {flattened_df['text_length'].median():.1f} characters")

# Distribution insights
print(f"\nLength distribution:")
print(f"  < 500 chars: {(flattened_df['text_length'] < 500).sum():,} occupations ({(flattened_df['text_length'] < 500).sum() / len(flattened_df) * 100:.1f}%)")
print(f"  500-1000 chars: {((flattened_df['text_length'] >= 500) & (flattened_df['text_length'] < 1000)).sum():,} occupations")
print(f"  1000-2000 chars: {((flattened_df['text_length'] >= 1000) & (flattened_df['text_length'] < 2000)).sum():,} occupations")
print(f"  2000+ chars: {(flattened_df['text_length'] >= 2000).sum():,} occupations ({(flattened_df['text_length'] >= 2000).sum() / len(flattened_df) * 100:.1f}%)")


Combined Text Character Length Summary:

Total occupations: 3,037

Character count statistics:
count    3037.000000
mean     1111.472176
std       433.540204
min       252.000000
25%       798.000000
50%      1016.000000
75%      1333.000000
max      3119.000000
Name: text_length, dtype: float64

Additional metrics:
  Minimum length: 252 characters
  Maximum length: 3,119 characters
  Total characters: 3,375,541 characters
  Average length: 1111.5 characters
  Median length: 1016.0 characters

Length distribution:
  < 500 chars: 70 occupations (2.3%)
  500-1000 chars: 1,393 occupations
  1000-2000 chars: 1,416 occupations
  2000+ chars: 158 occupations (5.2%)


### 1.09 Prepare Final Dataset for Export

In [163]:
# Extract ESCO UUID from conceptUri (the official ESCO identifier)
flattened_df['esco_id'] = flattened_df['conceptUri'].apply(lambda uri: uri.split('/')[-1])

# Select only the columns we need (including description for JSON output)
export_df = flattened_df[['esco_id', 'conceptUri', 'preferredLabel', 'description', 'combined_text']].copy()

print(f"✓ Created export dataset with {len(export_df):,} rows")
print(f"  ESCO ID is the UUID from conceptUri (official ESCO identifier)")
print(f"\nColumns: {list(export_df.columns)}")
print(f"\nSample export data:")
export_df.head()

✓ Created export dataset with 3,037 rows
  ESCO ID is the UUID from conceptUri (official ESCO identifier)

Columns: ['esco_id', 'conceptUri', 'preferredLabel', 'description', 'combined_text']

Sample export data:


Unnamed: 0,esco_id,conceptUri,preferredLabel,description,combined_text
0,00030d09-2b3a-4efd-87cc-c4ea39d27c34,http://data.europa.eu/esco/occupation/00030d09...,technical director,Technical directors realise the artistic visio...,technical director director of technical arts ...
1,000e93a3-d956-4e45-aacb-f12c83fedf84,http://data.europa.eu/esco/occupation/000e93a3...,metal drawing machine operator,Metal drawing machine operators set up and ope...,metal drawing machine operator wire drawer for...
2,0019b951-c699-4191-8208-9822882d150c,http://data.europa.eu/esco/occupation/0019b951...,precision device inspector,Precision device inspectors make sure precisio...,precision device inspector precision device qu...
3,0022f466-426c-41a4-ac96-a235c945cf97,http://data.europa.eu/esco/occupation/0022f466...,air traffic safety technician,Air traffic safety technicians provide technic...,air traffic safety technician air traffic safe...
4,002da35b-7808-43f3-83bf-63596b8b351f,http://data.europa.eu/esco/occupation/002da35b...,hospitality revenue manager,Hospitality revenue managers maximise revenue ...,hospitality revenue manager yield manager hosp...


### 1.10 Save to CSV

In [164]:
# Save to silver directory
output_dir = project_root / "data" / "silver"
output_file = output_dir / "esco_occupations_prepared.csv"

export_df.to_csv(output_file, index=False)

print(f"✓ Saved to: {output_file}")
print(f"  File size: {output_file.stat().st_size / 1024:.2f} KB")
print(f"  Rows: {len(export_df):,}")
print(f"  Columns: {list(export_df.columns)}")

✓ Saved to: /Users/lauren/repos/PAD2Skills/data/silver/esco_occupations_prepared.csv
  File size: 4602.02 KB
  Rows: 3,037
  Columns: ['esco_id', 'conceptUri', 'preferredLabel', 'description', 'combined_text']


In [165]:
# Explore rows with long lengths
long_rows = flattened_df[flattened_df['text_length'] > 2000]
long_rows.head()

Unnamed: 0,occupationUri,conceptUri,preferredLabel,altLabels,description,skills_list,combined_text,text_length,esco_id
8,http://data.europa.eu/esco/occupation/006cc1f9...,http://data.europa.eu/esco/occupation/006cc1f9...,physiotherapist,respiratory therapist\nmasseuse\nrehabilitatio...,Physiotherapists are autonomous health profess...,"contribute to quality physiotherapy services, ...",physiotherapist respiratory therapist masseuse...,2611,006cc1f9-2841-41c3-991a-dc3f2f3bd533
13,http://data.europa.eu/esco/occupation/009d29de...,http://data.europa.eu/esco/occupation/009d29de...,rental service representative in other machine...,rental sales desk supervisor in other machiner...,Rental service representatives in other machin...,"guarantee customer satisfaction, perform multi...",rental service representative in other machine...,2145,009d29de-5872-43be-8d9b-abd27f8c99f1
49,http://data.europa.eu/esco/occupation/034cad59...,http://data.europa.eu/esco/occupation/034cad59...,special educational needs teacher secondary sc...,secondary school special education teacher\nsp...,Special educational needs teachers at secondar...,"guarantee students' safety, assign homework, m...",special educational needs teacher secondary sc...,2161,034cad59-e666-4770-a7ef-a337257f8072
94,http://data.europa.eu/esco/occupation/068df7d1...,http://data.europa.eu/esco/occupation/068df7d1...,performance lighting designer,lighting designer and technician\nlighting tec...,Performance lighting designers develop a light...,analyse the artistic concept based on stage ac...,performance lighting designer lighting designe...,2316,068df7d1-516f-4829-b339-294cb0cf6318
115,http://data.europa.eu/esco/occupation/08984bec...,http://data.europa.eu/esco/occupation/08984bec...,specialised veterinarian,zoological medicine veterinarian\nemergency me...,Specialised veterinarians are professionals wi...,certify the performance of veterinary procedur...,specialised veterinarian zoological medicine v...,2710,08984bec-31d8-4bb0-aa2f-d557761ff029


## 2. Prepare PAD Occupation Data

### 2.01 Load PAD Occupation JSON Files

In [166]:
import json

# Path to PAD occupations JSON files
pad_occs_dir = project_root / "data" / "silver" / "occupations_skills"

# Find all P075941_*.json files
json_files = sorted(pad_occs_dir.glob("P075941_*_occupations.json"))

print(f"Found {len(json_files)} PAD occupation JSON files")
print(f"\nFirst 5 files:")
for f in json_files[:5]:
    print(f"  {f.name}")
if len(json_files) > 5:
    print(f"  ... and {len(json_files) - 5} more")

Found 16 PAD occupation JSON files

First 5 files:
  P075941_0_occupations.json
  P075941_10_occupations.json
  P075941_11_occupations.json
  P075941_12_occupations.json
  P075941_13_occupations.json
  ... and 11 more


### 2.02 Read and Concatenate JSON Files

In [167]:
# Read all JSON files and collect extractions
all_extractions = []

for json_file in json_files:
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    # Check if extractions exist and are not null
    if data.get('extractions') is not None:
        # Each extraction is a dictionary, add project_id and section_id to it
        for extraction in data['extractions']:
            extraction['project_id'] = data['project_id']
            extraction['section_id'] = data['section_id']
            all_extractions.append(extraction)

print(f"✓ Loaded {len(all_extractions):,} total occupation extractions")
print(f"  from {len(json_files)} JSON files")

# Convert to DataFrame
pad_occs_df = pd.DataFrame(all_extractions)

print(f"\n✓ Created DataFrame with {len(pad_occs_df):,} rows")
print(f"\nColumns: {list(pad_occs_df.columns)}")
print(f"\nFirst few rows:")
pad_occs_df.head()

✓ Loaded 246 total occupation extractions
  from 16 JSON files

✓ Created DataFrame with 246 rows

Columns: ['extraction_id', 'identified_occupation', 'industry', 'activity_description_in_pad', 'skills_needed_for_activity', 'source_material_quote', 'project_id', 'section_id']

First few rows:


Unnamed: 0,extraction_id,identified_occupation,industry,activity_description_in_pad,skills_needed_for_activity,source_material_quote,project_id,section_id
0,1,Transmission Line Construction Engineer,Construction of high-voltage transmission lines,the construction of various transmission lines...,"[construction of transmission lines, commissio...",the construction of various transmission lines...,P075941,0
1,2,Power Plant Rehabilitation Engineer,Rehabilitation of existing power plants,rehabilitation of existing power plants,[rehabilitation of existing power plants],the CAS calls for the rehabilitation f exis...,P075941,0
2,3,Hydropower Plant Construction Engineer,Construction of hydropower plants,construction of new power plants,[construction of new plants],the CAS calls for the rehabilitation f exis...,P075941,0
3,4,Rural Electrification Program Coordinator,Rural electrification program development,development of a rural electrification program...,[development of a rural electrification program],the CAS calls for the rehabilitation f exis...,P075941,0
4,5,Power System Planning Engineer,Regional power system planning and feasibility...,assessment of economic and engineering feasibi...,"[assessment of economic feasibility, assessmen...",The CBWS has prepared a Regional Power Develop...,P075941,0


### 2.03 Inspect PAD Occupations Dataset

In [168]:
# Display summary information
print("PAD Occupations Dataset Summary:")
print("=" * 80)
print(f"Total extractions: {len(pad_occs_df):,}")
print(f"Unique occupations: {pad_occs_df['identified_occupation'].nunique():,}")
print(f"Unique industries: {pad_occs_df['industry'].nunique():,}")
print(f"Unique project-section combinations: {pad_occs_df.groupby(['project_id', 'section_id']).ngroups:,}")

print(f"\nSample occupations:")
print(pad_occs_df['identified_occupation'].value_counts().head(10))

print(f"\nDataFrame info:")
pad_occs_df.info()

PAD Occupations Dataset Summary:
Total extractions: 246
Unique occupations: 210
Unique industries: 219
Unique project-section combinations: 16

Sample occupations:
identified_occupation
Transmission Line Construction Engineer                  5
Owner's Engineer                                         5
Procurement Specialist                                   5
Financial Management Specialist                          4
Environmental and Social Impact Assessment Specialist    3
Resettlement Action Plan Specialist                      3
Communications Specialist                                3
Resettlement Specialist                                  3
Monitoring and Evaluation Specialist                     3
Hydropower Plant Construction Engineer                   2
Name: count, dtype: int64

DataFrame info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 246 entries, 0 to 245
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------         

### 2.04 Prepare PAD Occupations for Export

In [169]:
# Create three-digit ID with leading zeros
pad_occs_df['pad_id'] = [f"{i:03d}" for i in range(len(pad_occs_df))]

# Create combined text column
def combine_pad_fields(row):
    """Combine PAD occupation fields into single text column"""
    parts = []
    
    # Add identified_occupation
    if pd.notna(row['identified_occupation']):
        parts.append(str(row['identified_occupation']))
    
    # Add industry
    if pd.notna(row['industry']):
        parts.append(str(row['industry']))
    
    # Add activity_description_in_pad
    if pd.notna(row['activity_description_in_pad']):
        parts.append(str(row['activity_description_in_pad']))
    
    # Add skills_needed_for_activity (remove list brackets and quotes)
    skills = row['skills_needed_for_activity']
    if skills is not None and skills is not pd.NA:
        # If it's a list, join with commas
        if isinstance(skills, list):
            skills_str = ', '.join(str(s) for s in skills)
        else:
            # If it's a string representation of a list, clean it up
            skills_str = str(skills).strip('[]').replace("'", "").replace('"', '')
        parts.append(skills_str)
    
    return ' '.join(parts)

# Create combined text column
pad_occs_df['combined_text'] = pad_occs_df.apply(combine_pad_fields, axis=1)

print(f"✓ Created pad_id and combined_text columns")
print(f"\nSample combined text (first 500 chars):")
print("=" * 80)
print(pad_occs_df['combined_text'].iloc[0][:500])
print("...")
print("=" * 80)
print(f"\nColumns: {list(pad_occs_df.columns)}")
print(f"\nSample rows with new columns:")
pad_occs_df[['pad_id', 'identified_occupation', 'combined_text']].head()

✓ Created pad_id and combined_text columns

Sample combined text (first 500 chars):
Transmission Line Construction Engineer Construction of high-voltage transmission lines the construction of various transmission lines underway, scheduled for commissioning by 2014 construction of transmission lines, commissioning of transmission lines
...

Columns: ['extraction_id', 'identified_occupation', 'industry', 'activity_description_in_pad', 'skills_needed_for_activity', 'source_material_quote', 'project_id', 'section_id', 'pad_id', 'combined_text']

Sample rows with new columns:


Unnamed: 0,pad_id,identified_occupation,combined_text
0,0,Transmission Line Construction Engineer,Transmission Line Construction Engineer Constr...
1,1,Power Plant Rehabilitation Engineer,Power Plant Rehabilitation Engineer Rehabilita...
2,2,Hydropower Plant Construction Engineer,Hydropower Plant Construction Engineer Constru...
3,3,Rural Electrification Program Coordinator,Rural Electrification Program Coordinator Rura...
4,4,Power System Planning Engineer,Power System Planning Engineer Regional power ...


### 2.05 Save PAD Occupations to CSV

In [170]:
# Create output directory
output_dir = project_root / "data" / "silver" / "occupation_skills_csv"
output_dir.mkdir(parents=True, exist_ok=True)

# Use project_id for filename
project_id = pad_occs_df['project_id'].unique()[0]
output_file = output_dir / f"{project_id}_pad_occupations_prepared.csv"

# Save to CSV
pad_occs_df.to_csv(output_file, index=False)

print(f"✓ Saved to: {output_file}")
print(f"  File size: {output_file.stat().st_size / 1024:.2f} KB")
print(f"  Rows: {len(pad_occs_df):,}")
print(f"  Columns: {list(pad_occs_df.columns)}")

✓ Saved to: /Users/lauren/repos/PAD2Skills/data/silver/occupation_skills_csv/P075941_pad_occupations_prepared.csv
  File size: 201.22 KB
  Rows: 246
  Columns: ['extraction_id', 'identified_occupation', 'industry', 'activity_description_in_pad', 'skills_needed_for_activity', 'source_material_quote', 'project_id', 'section_id', 'pad_id', 'combined_text']


## 3. Occupation Retrieval

### 3.01 Load Prepared CSV Files

In [171]:
# Load ESCO occupations
esco_csv = project_root / "data" / "silver" / "esco_occupations_prepared.csv"
esco_df = pd.read_csv(esco_csv)

print(f"✓ Loaded ESCO occupations: {len(esco_df):,} rows")
print(f"  Columns: {list(esco_df.columns)}")

# Load PAD occupations (using project_id from previous section)
pad_csv = project_root / "data" / "silver" / "occupation_skills_csv" / f"{project_id}_pad_occupations_prepared.csv"
pad_df = pd.read_csv(pad_csv)

print(f"\n✓ Loaded PAD occupations: {len(pad_df):,} rows")
print(f"  Columns: {list(pad_df.columns)}")

print(f"\nReady for embedding-based matching")
print(f"  ESCO: {len(esco_df):,} occupations to match against")
print(f"  PAD: {len(pad_df):,} queries to match")

✓ Loaded ESCO occupations: 3,037 rows
  Columns: ['esco_id', 'conceptUri', 'preferredLabel', 'description', 'combined_text']

✓ Loaded PAD occupations: 246 rows
  Columns: ['extraction_id', 'identified_occupation', 'industry', 'activity_description_in_pad', 'skills_needed_for_activity', 'source_material_quote', 'project_id', 'section_id', 'pad_id', 'combined_text']

Ready for embedding-based matching
  ESCO: 3,037 occupations to match against
  PAD: 246 queries to match


### 3.02 Load Sentence Transformer Model

In [172]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Load model - intfloat/e5-small-v2 is optimized for semantic search
model = SentenceTransformer("intfloat/e5-small-v2")

print(f"✓ Loaded model: intfloat/e5-small-v2")
print(f"  Max sequence length: {model.max_seq_length}")
print(f"  Embedding dimension: {model.get_sentence_embedding_dimension()}")

✓ Loaded model: intfloat/e5-small-v2
  Max sequence length: 512
  Embedding dimension: 384


### 3.03 Encode ESCO Occupations (Passages)

In [173]:
# Set to True to force re-encoding even if cached embeddings exist
overwrite_embeddings = False

# Prepare ESCO texts with "passage: " prefix for e5 model
esco_texts = ["passage: " + text for text in esco_df['combined_text'].tolist()]

print(f"Encoding {len(esco_texts):,} ESCO occupations...")
print(f"Sample passage (first 200 chars):")
print(esco_texts[0][:200])
print("...")

# Check if cached embeddings exist
embeddings_dir = project_root / "data" / "silver" / "embeddings"
embeddings_dir.mkdir(parents=True, exist_ok=True)
esco_embeddings_file = embeddings_dir / "esco_embeddings.npy"

if esco_embeddings_file.exists() and not overwrite_embeddings:
    print(f"\n✓ Loading cached ESCO embeddings from: {esco_embeddings_file}")
    E = np.load(esco_embeddings_file)
    print(f"  Loaded embeddings shape: {E.shape}")
else:
    if overwrite_embeddings and esco_embeddings_file.exists():
        print(f"\n⚠ Overwriting existing embeddings (overwrite_embeddings=True)")
    else:
        print(f"\nNo cached embeddings found. Encoding...")
    
    # Encode ESCO texts (normalized for cosine similarity via dot product)
    E = model.encode(esco_texts, normalize_embeddings=True, batch_size=64, show_progress_bar=True)
    
    # Save embeddings for future use
    np.save(esco_embeddings_file, E)
    print(f"\n✓ Saved embeddings to: {esco_embeddings_file}")

print(f"\n✓ ESCO embeddings ready")
print(f"  Embeddings shape: {E.shape}")
print(f"  Memory size: {E.nbytes / 1024 / 1024:.2f} MB")

Encoding 3,037 ESCO occupations...
Sample passage (first 200 chars):
passage: technical director director of technical arts technical supervisor head of technical technical and operations director technical manager Technical directors realise the artistic visions of th
...

✓ Loading cached ESCO embeddings from: /Users/lauren/repos/PAD2Skills/data/silver/embeddings/esco_embeddings.npy
  Loaded embeddings shape: (3037, 384)

✓ ESCO embeddings ready
  Embeddings shape: (3037, 384)
  Memory size: 4.45 MB


### 3.04 Encode PAD Occupations (Queries)

In [174]:
# Prepare PAD texts with "query: " prefix for e5 model
queries = ["query: " + text for text in pad_df['combined_text'].tolist()]

print(f"Encoding {len(queries):,} PAD occupation queries...")
print(f"Sample query (first 200 chars):")
print(queries[0][:200])
print("...")

# Encode PAD queries (normalized for cosine similarity via dot product)
Q = model.encode(queries, normalize_embeddings=True, batch_size=64, show_progress_bar=True)

print(f"\n✓ Encoded PAD occupations")
print(f"  Embeddings shape: {Q.shape}")
print(f"  Memory size: {Q.nbytes / 1024 / 1024:.2f} MB")

Encoding 246 PAD occupation queries...
Sample query (first 200 chars):
query: Transmission Line Construction Engineer Construction of high-voltage transmission lines the construction of various transmission lines underway, scheduled for commissioning by 2014 construction
...


Batches: 100%|██████████| 4/4 [00:04<00:00,  1.22s/it]


✓ Encoded PAD occupations
  Embeddings shape: (246, 384)
  Memory size: 0.36 MB





### 3.05 Compute Similarities and Get Top 20 Matches

In [175]:
# Compute cosine similarity scores (normalized embeddings -> dot product)
print("Computing similarity scores...")
scores = Q @ E.T

print(f"✓ Computed similarity matrix")
print(f"  Shape: {scores.shape} (PAD queries × ESCO passages)")
print(f"  Min score: {scores.min():.4f}")
print(f"  Max score: {scores.max():.4f}")
print(f"  Mean score: {scores.mean():.4f}")

# Get top 20 matches for each PAD occupation
print(f"\nFinding top 20 matches for each PAD occupation...")
topk_indices = np.argsort(-scores, axis=1)[:, :20]  # Top 20 indices per query
topk_scores = np.take_along_axis(scores, topk_indices, axis=1)  # Corresponding scores

print(f"✓ Found top 20 matches for all {len(pad_df):,} PAD occupations")
print(f"\nSample top 20 for first PAD occupation:")
print(f"  Indices: {topk_indices[0]}")
print(f"  Scores: {topk_scores[0]}")

Computing similarity scores...
✓ Computed similarity matrix
  Shape: (246, 3037) (PAD queries × ESCO passages)
  Min score: 0.6625
  Max score: 0.9149
  Mean score: 0.7562

Finding top 20 matches for each PAD occupation...
✓ Found top 20 matches for all 246 PAD occupations

Sample top 20 for first PAD occupation:
  Indices: [1387 2871 1547  384 1101 2452 1749 1090 1470 1456 3011   44 1099  593
  156 2104  199 1534 1669 1540]
  Scores: [0.8453376  0.8375125  0.8342337  0.8341197  0.8319464  0.83081055
 0.8267323  0.8248322  0.82351637 0.82297474 0.82077885 0.8200836
 0.8196205  0.8193524  0.8190919  0.8179473  0.81779206 0.81755674
 0.8170142  0.8164041 ]


### 3.06 Create Results DataFrame with Top 20 Matches

In [176]:
# Start with PAD occupation data
results_df = pad_df.copy()

# Add top 20 ESCO matches as columns
for rank in range(20):
    # Get ESCO IDs (UUIDs) for this rank
    esco_ids = [esco_df.iloc[idx]['esco_id'] for idx in topk_indices[:, rank]]
    results_df[f'match_{rank+1}_esco_id'] = esco_ids
    
    # Get ESCO URIs for this rank
    esco_uris = [esco_df.iloc[idx]['conceptUri'] for idx in topk_indices[:, rank]]
    results_df[f'match_{rank+1}_uri'] = esco_uris
    
    # Get ESCO occupation labels for this rank
    esco_labels = [esco_df.iloc[idx]['preferredLabel'] for idx in topk_indices[:, rank]]
    results_df[f'match_{rank+1}_occupation'] = esco_labels
    
    # Get ESCO descriptions for this rank (description field, not combined_text)
    esco_descriptions = [esco_df.iloc[idx]['description'] for idx in topk_indices[:, rank]]
    results_df[f'match_{rank+1}_description'] = esco_descriptions
    
    # Get similarity scores for this rank
    results_df[f'match_{rank+1}_score'] = topk_scores[:, rank]

print(f"✓ Created results DataFrame with top 20 matches")
print(f"  Shape: {results_df.shape}")
print(f"  Original PAD columns: {len(pad_df.columns)}")
print(f"  New match columns: {results_df.shape[1] - len(pad_df.columns)} (100 columns: esco_id, URI, label, description, score × 20 ranks)")
print(f"  Note: esco_id is the official ESCO UUID extracted from conceptUri in Section 1")
print(f"\nNew columns added:")
print([col for col in results_df.columns if col.startswith('match_')][:10])
print("...")

print(f"\nSample result (first PAD occupation with top 3 matches):")
sample = results_df.iloc[0]
print(f"PAD: {sample['identified_occupation']}")
print(f"  Industry: {sample['industry']}")
for i in range(3):
    print(f"  Match {i+1}: {sample[f'match_{i+1}_occupation']} (esco_id: {sample[f'match_{i+1}_esco_id']}, score: {sample[f'match_{i+1}_score']:.4f})")

✓ Created results DataFrame with top 20 matches
  Shape: (246, 110)
  Original PAD columns: 10
  New match columns: 100 (100 columns: esco_id, URI, label, description, score × 20 ranks)
  Note: esco_id is the official ESCO UUID extracted from conceptUri in Section 1

New columns added:
['match_1_esco_id', 'match_1_uri', 'match_1_occupation', 'match_1_description', 'match_1_score', 'match_2_esco_id', 'match_2_uri', 'match_2_occupation', 'match_2_description', 'match_2_score']
...

Sample result (first PAD occupation with top 3 matches):
PAD: Transmission Line Construction Engineer
  Industry: Construction of high-voltage transmission lines
  Match 1: overhead line worker (esco_id: 7052fd94-f563-46a9-8e2d-cba6c20f3e71, score: 0.8453)
  Match 2: substation engineer (esco_id: f1b89616-5dfe-40d4-8b02-58a6982b1a01, score: 0.8375)
  Match 3: electrical transmission system operator (esco_id: 7cdeb653-8f3d-4921-832b-b95f9d700a86, score: 0.8342)


  results_df[f'match_{rank+1}_description'] = esco_descriptions
  results_df[f'match_{rank+1}_score'] = topk_scores[:, rank]


### 3.07 Save Results to CSV

In [177]:
# Save to silver/esco_matching directory
output_dir = project_root / "data" / "silver" / "esco_matching"
output_dir.mkdir(parents=True, exist_ok=True)

# Use project_id for filename
project_id = results_df['project_id'].unique()[0]
output_file = output_dir / f"{project_id}_esco_matches.csv"
results_df.to_csv(output_file, index=False)

print(f"✓ Saved results to: {output_file}")
print(f"  File size: {output_file.stat().st_size / 1024:.2f} KB")
print(f"  Rows: {len(results_df):,}")
print(f"  Columns: {len(results_df.columns)}")
print(f"\nResults include:")
print(f"  - All original PAD occupation fields")
print(f"  - Top 20 ESCO matches with IDs, labels, and similarity scores")

✓ Saved results to: /Users/lauren/repos/PAD2Skills/data/silver/esco_matching/P075941_esco_matches.csv
  File size: 2497.58 KB
  Rows: 246
  Columns: 110

Results include:
  - All original PAD occupation fields
  - Top 20 ESCO matches with IDs, labels, and similarity scores


In [178]:
# Display combined_text, matches, and scores .head()
results_df.loc[:, 'combined_text':].head()

Unnamed: 0,combined_text,match_1_esco_id,match_1_uri,match_1_occupation,match_1_description,match_1_score,match_2_esco_id,match_2_uri,match_2_occupation,match_2_description,...,match_19_esco_id,match_19_uri,match_19_occupation,match_19_description,match_19_score,match_20_esco_id,match_20_uri,match_20_occupation,match_20_description,match_20_score
0,Transmission Line Construction Engineer Constr...,7052fd94-f563-46a9-8e2d-cba6c20f3e71,http://data.europa.eu/esco/occupation/7052fd94...,overhead line worker,Overhead line workers construct and maintain p...,0.845338,f1b89616-5dfe-40d4-8b02-58a6982b1a01,http://data.europa.eu/esco/occupation/f1b89616...,substation engineer,Substation engineers design medium and high vo...,...,86ca306c-ab99-420a-9e2a-aa73c5c4de22,http://data.europa.eu/esco/occupation/86ca306c...,electrical engineer,Electrical engineers design and develop electr...,0.817014,7c2fbf7d-b934-4f62-8167-0ed90fb2a16f,http://data.europa.eu/esco/occupation/7c2fbf7d...,rolling stock engineer,Rolling stock engineers design and oversee the...,0.816404
1,Power Plant Rehabilitation Engineer Rehabilita...,58db3ac6-5217-4d46-8a4c-126598be1d13,http://data.europa.eu/esco/occupation/58db3ac6...,electric power generation engineer,Electric power generation engineers design and...,0.860811,72381086-cb6e-455e-a40b-ccb26550aab6,http://data.europa.eu/esco/occupation/72381086...,power production plant operator,Power production plant operators maintain and ...,...,0ec62c2e-30c9-434e-bf36-5277a0840450,http://data.europa.eu/esco/occupation/0ec62c2e...,refurbishing technician,Refurbishing technicians overhaul and refurbis...,0.837598,dc69f2ed-ebe9-43da-bf9d-31aac874d4c2,http://data.europa.eu/esco/occupation/dc69f2ed...,rehabilitation support worker,Rehabilitation support workers provide counsel...,0.837145
2,Hydropower Plant Construction Engineer Constru...,e12f08fb-4748-4388-9489-b647df60332a,http://data.europa.eu/esco/occupation/e12f08fb...,hydropower engineer,"Hydropower engineers research, design and plan...",0.880109,58db3ac6-5217-4d46-8a4c-126598be1d13,http://data.europa.eu/esco/occupation/58db3ac6...,electric power generation engineer,Electric power generation engineers design and...,...,f1b89616-5dfe-40d4-8b02-58a6982b1a01,http://data.europa.eu/esco/occupation/f1b89616...,substation engineer,Substation engineers design medium and high vo...,0.839408,ac1fc6a9-70d2-475c-8fa0-82ef83830968,http://data.europa.eu/esco/occupation/ac1fc6a9...,environmental engineer,Environmental engineers integrate environmenta...,0.83901
3,Rural Electrification Program Coordinator Rura...,46354077-416f-4440-a0c3-dddee9031c05,http://data.europa.eu/esco/occupation/46354077...,electricity and energy vocational teacher,Electricity and energy vocational teachers ins...,0.805861,9e1e8379-6377-4969-b141-0d2337218fc9,http://data.europa.eu/esco/occupation/9e1e8379...,economic development coordinator,Economic development coordinators outline and ...,...,22987b58-dbfa-4ecd-ae3f-82dd209255f6,http://data.europa.eu/esco/occupation/22987b58...,green ICT consultant,Green ICT consultants advise organisations on ...,0.779666,0e99c929-364f-4b0a-8a64-2aab42420f00,http://data.europa.eu/esco/occupation/0e99c929...,renewable energy engineer,Renewable energy engineers research alternativ...,0.778745
4,Power System Planning Engineer Regional power ...,58db3ac6-5217-4d46-8a4c-126598be1d13,http://data.europa.eu/esco/occupation/58db3ac6...,electric power generation engineer,Electric power generation engineers design and...,0.854631,1ff61522-8947-4c95-b589-cb0e0539a62b,http://data.europa.eu/esco/occupation/1ff61522...,energy systems engineer,Energy systems engineers supervise the energy ...,...,4d34a9ee-2653-4da0-bef7-f59294577224,http://data.europa.eu/esco/occupation/4d34a9ee...,electronics engineer,"Electronics engineers research, design, and de...",0.826369,ba490796-46f7-45b7-827b-db6882fca3e1,http://data.europa.eu/esco/occupation/ba490796...,onshore wind energy engineer,"Onshore wind energy engineers design, install ...",0.826193


## 4. Prepare JSON Output

### 4.01 Load Matching Results CSV

In [179]:
# Load the matching results CSV
matching_csv = project_root / "data" / "silver" / "esco_matching" / f"{project_id}_esco_matches.csv"
matches_df = pd.read_csv(matching_csv)

print(f"✓ Loaded matching results: {len(matches_df):,} rows")
print(f"  Columns: {len(matches_df.columns)}")
print(f"\nSample row:")
matches_df.head(1)

✓ Loaded matching results: 246 rows
  Columns: 110

Sample row:


Unnamed: 0,extraction_id,identified_occupation,industry,activity_description_in_pad,skills_needed_for_activity,source_material_quote,project_id,section_id,pad_id,combined_text,...,match_19_esco_id,match_19_uri,match_19_occupation,match_19_description,match_19_score,match_20_esco_id,match_20_uri,match_20_occupation,match_20_description,match_20_score
0,1,Transmission Line Construction Engineer,Construction of high-voltage transmission lines,the construction of various transmission lines...,"['construction of transmission lines', 'commis...",the construction of various transmission lines...,P075941,0,0,Transmission Line Construction Engineer Constr...,...,86ca306c-ab99-420a-9e2a-aa73c5c4de22,http://data.europa.eu/esco/occupation/86ca306c...,electrical engineer,Electrical engineers design and develop electr...,0.817014,7c2fbf7d-b934-4f62-8167-0ed90fb2a16f,http://data.europa.eu/esco/occupation/7c2fbf7d...,rolling stock engineer,Rolling stock engineers design and oversee the...,0.816404


### 4.02 Transform to JSON Format

In [180]:
# Transform each row into the specified JSON format
records = []

for idx, row in matches_df.iterrows():
    # Build esco_candidates array with top 15 matches
    esco_candidates = []
    for rank in range(1, 16):  # Top 15 matches
        # Use esco_id directly from Section 1 (no need to extract from URI)
        candidate = {
            "rank": rank,
            "esco_id": row[f'match_{rank}_esco_id'],  # UUID from Section 1
            "label": row[f'match_{rank}_occupation'],
            "description": row[f'match_{rank}_description'],
            "similarity_score": float(row[f'match_{rank}_score'])
        }
        esco_candidates.append(candidate)
    
    # Build the record
    record = {
        "record_id": row['pad_id'],
        "pad_occupation": row['identified_occupation'],
        "pad_quote": row['source_material_quote'] if pd.notna(row['source_material_quote']) else "",
        "esco_candidates": esco_candidates
    }
    records.append(record)

print(f"✓ Transformed {len(records):,} records to JSON format")
print(f"\nSample record (first with top 3 candidates):")
print("=" * 80)
sample = records[0].copy()
sample['esco_candidates'] = sample['esco_candidates'][:3]  # Show only first 3
print(json.dumps(sample, indent=2))
print("...")

✓ Transformed 246 records to JSON format

Sample record (first with top 3 candidates):
{
  "record_id": 0,
  "pad_occupation": "Transmission Line Construction Engineer",
  "pad_quote": "the construction of various transmission lines underway,  scheduled for commissioning by 2014.",
  "esco_candidates": [
    {
      "rank": 1,
      "esco_id": "7052fd94-f563-46a9-8e2d-cba6c20f3e71",
      "label": "overhead line worker",
      "description": "Overhead line workers construct and maintain power supply and control cables in overhead power lines. They also make and repair electrical cables connecting customers to the electricity network.",
      "similarity_score": 0.8453376
    },
    {
      "rank": 2,
      "esco_id": "f1b89616-5dfe-40d4-8b02-58a6982b1a01",
      "label": "substation engineer",
      "description": "Substation engineers design medium and high voltage substations used for the transmission, distribution, and generation of electrical energy. They develop methods for the ef

### 4.03 Save to JSON File

In [181]:
# Save to JSON file
json_output_dir = project_root / "data" / "silver" / "esco_matching_json"
json_output_dir.mkdir(parents=True, exist_ok=True)

json_output_file = json_output_dir / f"{project_id}_esco_matches.json"

with open(json_output_file, 'w', encoding='utf-8') as f:
    json.dump(records, f, indent=2, ensure_ascii=False)

print(f"✓ Saved JSON to: {json_output_file}")
print(f"  File size: {json_output_file.stat().st_size / 1024:.2f} KB")
print(f"  Records: {len(records):,}")
print(f"  Each record contains:")
print(f"    - PAD occupation details (record_id, occupation, quote)")
print(f"    - Top 15 ESCO candidates with URI, label, description, and similarity score")

✓ Saved JSON to: /Users/lauren/repos/PAD2Skills/data/silver/esco_matching_json/P075941_esco_matches.json
  File size: 2036.27 KB
  Records: 246
  Each record contains:
    - PAD occupation details (record_id, occupation, quote)
    - Top 15 ESCO candidates with URI, label, description, and similarity score
