# Notebook 02 — Data Preparation

This notebook cleans all datasets, engineers features, and assembles the master tables that feed into modeling:

1. **Clean** all O\*NET, BLS, LinkedIn, and course datasets
2. **Build Master Occupation Profile Table** — O\*NET backbone with skills, job zones, education, demand, and AI risk
3. **Process LinkedIn Postings** — extract demand signals and aggregate skill frequency
4. **Build Unified Course Catalogue** — merge edX + Udemy + Coursera
5. **Process CBC Pathways** — map Kenya education tracks to job zones
6. **Export** all outputs as parquet files (plus JSON lookup artifacts) for modeling

In [1]:
import pandas as pd
import numpy as np
import os
import sys
import re
import joblib
from pathlib import Path
from collections import Counter

sys.path.append(str(Path.cwd()))
from src.data_utils import DatasetSpec, load_dataset, standardize_columns
from src.data_cleaning import clean_and_audit
from src.Education_engineering import process_education, process_cbc_education
from src.skills_engineering import prepare_skills, fit_tfidf_skills

# Project directory layout, all relative to the notebook's working directory.
DATA_DIR = Path('DATA')
PROCESSED_DIR = Path('data/processed')
ARTIFACTS_DIR = Path('artifacts')
MODEL_DIR = Path('models')

# Create every output directory up front so later cells can write without checks.
for out_dir in (PROCESSED_DIR, ARTIFACTS_DIR, MODEL_DIR):
    out_dir.mkdir(parents=True, exist_ok=True)

# NOTE(review): this silences *all* warnings for the rest of the notebook.
import warnings
warnings.filterwarnings('ignore')
print(' Setup complete')

 Setup complete


## Step 1 — Load New Datasets

In [2]:
def _read_new(filename):
    """Read one raw file from DATA/new and standardise its column names.

    Files ending in ``.xlsx`` are read with ``pd.read_excel``; everything else
    is read with ``pd.read_csv``.
    """
    path = DATA_DIR / 'new' / filename
    reader = pd.read_excel if path.suffix == '.xlsx' else pd.read_csv
    return standardize_columns(reader(path))


# ── O*NET + BLS ───────────────────────────────────────────────────────
onet_occ = _read_new('Occupation Data.xlsx')
onet_skills = _read_new('Skills.xlsx')
onet_jz = _read_new('Job Zones.xlsx')
onet_edu = _read_new('Education, Training, and Experience.xlsx')
bls_df = _read_new('bls_employment_projections.csv')

# ── Courses ───────────────────────────────────────────────────────────
edx_df = _read_new('edx_courses.csv')
udemy_df = _read_new('udemy_courses.csv')
crs_df = _read_new('Coursera.csv')
rev_df = _read_new('reviews.csv')

# ── Kenya CBC ─────────────────────────────────────────────────────────
cbc_df = _read_new('cbc_pathways.csv')

# Shape summary, one line per dataset, in load order.
loaded_frames = {
    'O*NET Occupations': onet_occ,
    'O*NET Skills': onet_skills,
    'O*NET Job Zones': onet_jz,
    'O*NET Education': onet_edu,
    'BLS Projections': bls_df,
    'edX': edx_df,
    'Udemy': udemy_df,
    'Coursera': crs_df,
    'Reviews': rev_df,
    'CBC': cbc_df,
}
print('Datasets loaded:')
for label, frame in loaded_frames.items():
    print(f'  {label:<25}: {frame.shape}')

Datasets loaded:
  O*NET Occupations        : (1016, 3)
  O*NET Skills             : (62580, 15)
  O*NET Job Zones          : (923, 5)
  O*NET Education          : (37125, 15)
  BLS Projections          : (102, 12)
  edX                      : (975, 16)
  Udemy                    : (3678, 12)
  Coursera                 : (3404, 9)
  Reviews                  : (107018, 3)
  CBC                      : (535, 5)


## Step 2 — Clean Each Dataset

In [3]:
# O*NET cleaning — minimal, these are authoritative datasets
def clean_onet(df, name):
    """Lightly clean a reference dataset and report how many duplicates were dropped.

    Works on a copy, so the caller's frame is left untouched. Steps:
      1. normalise column names (strip, lower-case, spaces -> underscores);
      2. object columns: fill missing values with 'Unknown', cast to str, strip;
      3. numeric columns: fill missing values with the column median;
      4. drop fully duplicated rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean.
    name : str
        Label used only in the printed summary line.

    Returns
    -------
    pandas.DataFrame
        The cleaned, de-duplicated copy.
    """
    cleaned = df.copy()
    cleaned.columns = cleaned.columns.str.strip().str.lower().str.replace(' ', '_')

    # Text columns: make missing values explicit and trim stray whitespace.
    for text_col in cleaned.select_dtypes('object').columns:
        cleaned[text_col] = cleaned[text_col].fillna('Unknown').astype(str).str.strip()

    # Numeric columns: impute gaps with the column median.
    for num_col in cleaned.select_dtypes(include=np.number).columns:
        cleaned[num_col] = cleaned[num_col].fillna(cleaned[num_col].median())

    # Count duplicates before removing them so the summary can report the number.
    dupe_mask = cleaned.duplicated()
    n_dupes = dupe_mask.sum()
    cleaned = cleaned.loc[~dupe_mask]
    print(f'  {name}: {cleaned.shape} | dupes removed: {n_dupes}')
    return cleaned

# Clean the O*NET tables and the BLS projections in load order;
# each call prints its own one-line summary.
onet_occ_c, onet_skills_c, onet_jz_c, onet_edu_c, bls_c = (
    clean_onet(raw_frame, label)
    for raw_frame, label in [
        (onet_occ, 'O*NET Occupations'),
        (onet_skills, 'O*NET Skills'),
        (onet_jz, 'O*NET Job Zones'),
        (onet_edu, 'O*NET Education'),
        (bls_df, 'BLS Projections'),
    ]
)

  O*NET Occupations: (1016, 3) | dupes removed: 0
  O*NET Skills: (62580, 15) | dupes removed: 0
  O*NET Job Zones: (923, 5) | dupes removed: 0
  O*NET Education: (37125, 15) | dupes removed: 0
  BLS Projections: (102, 12) | dupes removed: 0


In [4]:
# Clean the course-related datasets; each call returns (cleaned frame, audit report).
edx_c, edx_report = clean_and_audit(edx_df, 'edX', dataset_file='edx_courses.csv')
udemy_c, udemy_report = clean_and_audit(udemy_df, 'Udemy', dataset_file='udemy_courses.csv')
crs_c, crs_report = clean_and_audit(crs_df, 'Coursera', dataset_file='Coursera.csv')
rev_c, rev_report = clean_and_audit(rev_df, 'Reviews', dataset_file='reviews.csv')

# Show the audit reports only after all four datasets have been cleaned.
audit_reports = {
    'edX': edx_report,
    'Udemy': udemy_report,
    'Coursera': crs_report,
    'Reviews': rev_report,
}
for platform_name, audit in audit_reports.items():
    print(f'=== {platform_name} ===')
    display(audit)

=== edX ===


Unnamed: 0,dataset,state,rows,nulls,dupes,outliers
0,edX,Raw,975,780,1,0
1,edX,Cleaned,974,0,0,0


=== Udemy ===


Unnamed: 0,dataset,state,rows,nulls,dupes,outliers
0,Udemy,Raw,3678,0,6,1533
1,Udemy,Cleaned,3672,0,0,1529


=== Coursera ===


Unnamed: 0,dataset,state,rows,nulls,dupes,outliers
0,Coursera,Raw,3404,0,0,783
1,Coursera,Cleaned,3404,0,0,783


=== Reviews ===


Unnamed: 0,dataset,state,rows,nulls,dupes,outliers
0,Reviews,Raw,107018,0,0,4720
1,Reviews,Cleaned,107018,0,0,4720


## Step 3 — LinkedIn Postings Processing

In [5]:
postings_path = DATA_DIR / 'new' / 'postings.csv'

if not postings_path.exists():
    # Optional input: downstream cells check `postings_df is None`.
    print(' postings.csv not found — skipping. Place file at DATA/new/postings.csv')
    postings_df = None
else:
    print('Loading LinkedIn postings (chunked for memory efficiency)...')

    CHUNK_SIZE = 50_000
    KEEP_COLS = ['title', 'company_name', 'location', 'work_type',
                 'formatted_experience_level', 'remote_allowed',
                 'med_salary', 'skills_abr', 'description']

    # Read the file piece by piece, keeping only the wanted columns of each piece.
    chunk_list = []
    reader = pd.read_csv(postings_path, low_memory=False, chunksize=CHUNK_SIZE)
    for part in reader:
        part.columns = part.columns.str.strip().str.lower().str.replace(' ', '_')
        # Columns listed in KEEP_COLS but absent from the file are skipped silently.
        present = [col for col in KEEP_COLS if col in part.columns]
        chunk_list.append(part[present])

    postings_df = pd.concat(chunk_list, ignore_index=True).drop_duplicates()
    print(f'LinkedIn postings loaded: {postings_df.shape}')
    print(f'Columns retained: {postings_df.columns.tolist()}')
    display(postings_df.head(3))

Loading LinkedIn postings (chunked for memory efficiency)...
LinkedIn postings loaded: (119320, 8)
Columns retained: ['title', 'company_name', 'location', 'work_type', 'formatted_experience_level', 'remote_allowed', 'med_salary', 'description']


Unnamed: 0,title,company_name,location,work_type,formatted_experience_level,remote_allowed,med_salary,description
0,Marketing Coordinator,Corcoran Sawyer Smith,"Princeton, NJ",FULL_TIME,,,,Job descriptionA leading real estate firm in N...
1,Mental Health Therapist/Counselor,,"Fort Collins, CO",FULL_TIME,,,,"At Aspen Therapy and Wellness , we are committ..."
2,Assitant Restaurant Manager,The National Exemplar,"Cincinnati, OH",FULL_TIME,,,,The National Exemplar is accepting application...


In [6]:
# Always define the frequency table as a Counter, so it exists (possibly empty)
# and has the same type whether or not postings / the skills column are available.
linkedin_skill_freq = Counter()

if postings_df is not None:
    # Build skill frequency from LinkedIn postings
    # skills_abr is a pipe-separated list of skill abbreviations
    skill_col = 'skills_abr' if 'skills_abr' in postings_df.columns else None

    if skill_col:
        # Split every non-null cell on '|' and count the non-empty, trimmed tokens.
        linkedin_skill_freq.update(
            token.strip()
            for cell in postings_df[skill_col].dropna()
            for token in str(cell).split('|')
            if token.strip()
        )
        print(f'Unique skills in LinkedIn postings : {len(linkedin_skill_freq)}')
        print('Top 20 in-demand skills:')
        for skill, count in linkedin_skill_freq.most_common(20):
            print(f'  {skill:<30} {count:>7,}')
    else:
        print('skills_abr column not found in postings')

skills_abr column not found in postings


In [7]:
if postings_df is not None:
    # Aggregate posting volume per job title (demand signal).
    if 'title' in postings_df.columns:
        title_counts = (
            postings_df['title']
            .str.lower().str.strip()
            .value_counts()
        )
    else:
        # Without a title column there is nothing to count. Fall back to an empty
        # table with the expected columns so the rest of this cell (and any later
        # lookup built from `posting_demand`) still runs instead of raising NameError.
        print('title column not found in postings — demand table will be empty')
        title_counts = pd.Series(dtype='int64', index=pd.Index([], dtype='object'))

    # Build the frame directly from the counts: this avoids relying on the column
    # names produced by value_counts().reset_index(), which differ across pandas versions.
    posting_demand = pd.DataFrame({
        'job_title': title_counts.index,
        'posting_volume': title_counts.to_numpy(),
    })

    # Ensure posting_volume is numeric (just in case)
    posting_demand['posting_volume'] = posting_demand['posting_volume'].astype(int)

    # Normalise to 0–1 for use as a demand weight (most-posted title = 1.0)
    posting_demand['posting_demand_norm'] = (
        posting_demand['posting_volume'] /
        posting_demand['posting_volume'].max()
    )
    print(f'Job titles from LinkedIn: {len(posting_demand):,}')
    display(posting_demand.head(10))

    # Save
    posting_demand.to_parquet(PROCESSED_DIR/'linkedin_demand.parquet', index=False)
    print('Saved: linkedin_demand.parquet')
else:
    posting_demand = None

Job titles from LinkedIn: 71,580


Unnamed: 0,job_title,posting_volume,posting_demand_norm
0,customer service representative,449,1.0
1,sales manager,438,0.975501
2,project manager,358,0.797327
3,assistant store manager,284,0.632517
4,administrative assistant,255,0.567929
5,senior accountant,242,0.538976
6,salesperson,234,0.521158
7,registered nurse,211,0.469933
8,staff accountant,202,0.449889
9,executive assistant,198,0.44098


Saved: linkedin_demand.parquet


## Step 4 — Build Master Occupation Profile Table

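Starting from the O\*NET skills table (pivoted to one row per occupation), join Job Zones, derive the minimum education level, attach BLS demand and automation-risk metrics via fuzzy title matching, and add the LinkedIn posting-demand signal.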

In [8]:
# ── 4a. Pivot O*NET skills to wide format (one row per occupation) ──
# Only rows whose Scale ID is 'LV' are kept.
is_level_score = onet_skills_c['scale_id'] == 'LV'
level_skills = onet_skills_c[is_level_score].copy()

# One row per (code, title); one column per skill element; cells hold the mean data_value.
skill_pivot = level_skills.pivot_table(
    index=['o*net-soc_code', 'title'],
    columns='element_name',
    values='data_value',
    aggfunc='mean',
).reset_index()

# Rename: the two index columns get fixed names, every skill column gets a
# 'skill_' prefix plus a lower-cased, underscore-separated element name.
element_names = skill_pivot.columns[2:]
skill_pivot.columns = ['onet_code', 'occupation'] + [
    'skill_' + element.lower().replace(' ', '_') for element in element_names
]

skill_feature_cols = [c for c in skill_pivot.columns if c.startswith("skill_")]
print(f'Skills pivot shape: {skill_pivot.shape}')
print(f'Skill feature columns: {skill_feature_cols}')
display(skill_pivot.head(3))

Skills pivot shape: (894, 37)
Skill feature columns: ['skill_active_learning', 'skill_active_listening', 'skill_complex_problem_solving', 'skill_coordination', 'skill_critical_thinking', 'skill_equipment_maintenance', 'skill_equipment_selection', 'skill_installation', 'skill_instructing', 'skill_judgment_and_decision_making', 'skill_learning_strategies', 'skill_management_of_financial_resources', 'skill_management_of_material_resources', 'skill_management_of_personnel_resources', 'skill_mathematics', 'skill_monitoring', 'skill_negotiation', 'skill_operation_and_control', 'skill_operations_analysis', 'skill_operations_monitoring', 'skill_persuasion', 'skill_programming', 'skill_quality_control_analysis', 'skill_reading_comprehension', 'skill_repairing', 'skill_science', 'skill_service_orientation', 'skill_social_perceptiveness', 'skill_speaking', 'skill_systems_analysis', 'skill_systems_evaluation', 'skill_technology_design', 'skill_time_management', 'skill_troubleshooting', 'skill_writ

Unnamed: 0,onet_code,occupation,skill_active_learning,skill_active_listening,skill_complex_problem_solving,skill_coordination,skill_critical_thinking,skill_equipment_maintenance,skill_equipment_selection,skill_installation,...,skill_science,skill_service_orientation,skill_social_perceptiveness,skill_speaking,skill_systems_analysis,skill_systems_evaluation,skill_technology_design,skill_time_management,skill_troubleshooting,skill_writing
0,11-1011.00,Chief Executives,4.5,4.75,4.88,4.88,4.75,0.0,0.25,0.0,...,0.75,3.38,4.25,4.75,5.12,5.0,0.88,4.75,0.5,4.38
1,11-1011.03,Chief Sustainability Officers,3.88,4.0,4.12,3.88,4.12,0.0,0.25,0.0,...,1.88,3.25,3.88,4.12,4.0,4.0,1.12,3.88,0.0,4.25
2,11-1021.00,General and Operations Managers,3.75,4.12,3.88,4.0,3.88,0.0,0.0,0.0,...,0.62,3.12,4.0,4.12,3.12,3.25,0.62,3.88,1.0,3.88


In [9]:
# ── 4b. Join Job Zones ────────────────────────────────────────────────
# Reduce the Job Zones table to the join key and the zone itself.
jz_clean = (
    onet_jz_c
    .rename(columns={'o*net-soc_code': 'onet_code'})
    [['onet_code', 'job_zone']]
)

# Left join keeps every occupation from the skills pivot, zone or not.
master = pd.merge(skill_pivot, jz_clean, how='left', on='onet_code')
print(f'After Job Zone join: {master.shape}')

After Job Zone join: (894, 38)


In [10]:
# ── 4c. Map Job Zone to user education level ────────────────────────
# Zones 1 and 2 share the same label; zones 3-5 each have their own.
JOB_ZONE_EDUCATION = {
    **dict.fromkeys((1, 2), 'High School'),
    3: 'Diploma/TVET',
    4: 'Bachelors',
    5: 'Masters/Professional',
}

# Occupations whose zone is missing or not in the table are labelled 'Unknown'.
education_labels = master['job_zone'].map(JOB_ZONE_EDUCATION)
master['min_education'] = education_labels.fillna('Unknown')
print('Education level mapping applied.')

education_counts = master['min_education'].value_counts().reset_index()
display(education_counts)

Education level mapping applied.


Unnamed: 0,min_education,count
0,High School,324
1,Bachelors,214
2,Diploma/TVET,205
3,Masters/Professional,151


In [11]:
# ── 4d. Join BLS demand + AI risk via fuzzy title match ─────────────
from difflib import get_close_matches

def fuzzy_match_bls(occ_title: str, bls_titles: list, cutoff=0.6):
    """Find the best BLS occupation match for an O*NET title.

    Matching is case-insensitive and uses ``difflib.get_close_matches``.

    Parameters
    ----------
    occ_title : str
        O*NET occupation title. Non-string or blank values (e.g. NaN/None)
        yield ``None`` instead of raising.
    bls_titles : list
        Candidate BLS titles. Non-string entries are ignored.
    cutoff : float, default 0.6
        Minimum similarity ratio (0-1) passed to ``get_close_matches``.

    Returns
    -------
    str or None
        The matching entry from ``bls_titles`` in its original casing, or
        ``None`` when nothing reaches the cutoff. If several titles differ
        only by case, the first one in ``bls_titles`` is returned.
    """
    if not isinstance(occ_title, str) or not occ_title.strip():
        return None

    # Lower-case the candidates once and remember the original spelling;
    # setdefault keeps the first occurrence, mirroring list.index().
    original_by_lower = {}
    for title in bls_titles:
        if isinstance(title, str):
            original_by_lower.setdefault(title.lower(), title)

    matches = get_close_matches(occ_title.lower(), list(original_by_lower),
                                n=1, cutoff=cutoff)
    return original_by_lower[matches[0]] if matches else None

bls_titles = bls_c['occupation'].tolist()

print('Fuzzy-matching O*NET occupations to BLS projections...')
# One lookup per occupation; unmatched occupations get None.
# The cutoff used here (0.55) is looser than the function default.
master['bls_match'] = [
    fuzzy_match_bls(title, bls_titles, cutoff=0.55)
    for title in master['occupation']
]

matched_count = master['bls_match'].notna().sum()
match_pct = matched_count / len(master) * 100
print(f'Matched {matched_count}/{len(master)} occupations to BLS ({match_pct:.1f}%)')

Fuzzy-matching O*NET occupations to BLS projections...
Matched 288/894 occupations to BLS (32.2%)


In [12]:
# Join BLS metrics onto matched occupations
BLS_METRIC_COLS = [
    'career_family', 'demand_level', 'automation_risk',
    'demand_score', 'growth_category', 'change_pct', 'median_wage_2022',
    'openings_annual', 'employment_2022', 'employment_2032',
]

bls_merge = (
    bls_c.rename(columns={'occupation': 'bls_match'})[['bls_match'] + BLS_METRIC_COLS]
    # One row per BLS title, so the left join below can never multiply master rows.
    # Keeping the first row is consistent with the fuzzy matcher, which resolves
    # to the first title in the list.
    .drop_duplicates(subset='bls_match')
)

# Drop columns produced by a previous run of this cell before merging, so that
# re-running it does not create *_x / *_y duplicates and then fail on lookup.
master = (
    master
    .drop(columns=BLS_METRIC_COLS + ['ai_replacement_risk'], errors='ignore')
    .merge(bls_merge, on='bls_match', how='left')
)

# Fill unmatched occupations with neutral defaults
master['demand_level']    = master['demand_level'].fillna('Unknown')
master['automation_risk'] = master['automation_risk'].fillna(master['automation_risk'].median())
master['demand_score']    = master['demand_score'].fillna(0.5)
master['growth_category'] = master['growth_category'].fillna('Average')

# AI replacement risk category from automation_risk score.
# include_lowest=True closes the first bin on the left ([0, 0.3]); without it a
# risk of exactly 0 falls outside every bin and becomes the string 'nan'.
master['ai_replacement_risk'] = pd.cut(
    master['automation_risk'],
    bins=[0, 0.3, 0.6, 1.0],
    labels=['Low', 'Medium', 'High'],
    right=True,
    include_lowest=True,
).astype(str)

print(f'Master occupation table shape: {master.shape}')
display(master[['occupation','career_family','min_education','demand_level',
                'ai_replacement_risk','automation_risk','demand_score']].head(10))

Master occupation table shape: (894, 51)


Unnamed: 0,occupation,career_family,min_education,demand_level,ai_replacement_risk,automation_risk,demand_score
0,Chief Executives,,Masters/Professional,Unknown,Low,0.04,0.5
1,Chief Sustainability Officers,,Masters/Professional,Unknown,Low,0.04,0.5
2,General and Operations Managers,Business And Finance,Bachelors,Medium,Low,0.16,0.313679
3,Advertising and Promotions Managers,Business And Finance,Bachelors,Medium,Low,0.16,0.313679
4,Marketing Managers,Business And Finance,Bachelors,High,Low,0.1,0.396226
5,Sales Managers,Business And Finance,Bachelors,Medium,Low,0.16,0.313679
6,Fundraising Managers,Business And Finance,Bachelors,Medium,Low,0.16,0.313679
7,Administrative Services Managers,,Diploma/TVET,Unknown,Low,0.04,0.5
8,Facilities Managers,Business And Finance,Diploma/TVET,Medium,Low,0.16,0.313679
9,Security Managers,Business And Finance,Bachelors,High,Low,0.1,0.396226


In [13]:
# ── 4e. Add LinkedIn posting demand signal where available ───────────
has_linkedin_signal = postings_df is not None and 'posting_demand' in dir()

if has_linkedin_signal:
    # job_title -> normalised posting volume
    posting_demand_lookup = (
        posting_demand.set_index('job_title')['posting_demand_norm'].to_dict()
    )
    # Exact match on the lower-cased occupation title; anything unmatched gets 0.
    occupation_keys = master['occupation'].str.lower()
    master['linkedin_demand_norm'] = occupation_keys.map(posting_demand_lookup).fillna(0)
    n_attached = (master['linkedin_demand_norm'] > 0).sum()
    print(f'LinkedIn demand signal attached to {n_attached} occupations')
else:
    master['linkedin_demand_norm'] = 0.0
    print('LinkedIn demand signal: using 0 (postings.csv not loaded)')

# Composite demand score: 70% BLS demand_score + 30% LinkedIn signal, kept within [0, 1].
bls_component = 0.7 * master['demand_score']
linkedin_component = 0.3 * master['linkedin_demand_norm']
master['composite_demand'] = (bls_component + linkedin_component).clip(lower=0, upper=1)

master.to_parquet(PROCESSED_DIR/'master_occupation_profiles.parquet', index=False)
print(f'\n Master occupation table saved: {master.shape}')
print(f'  Columns: {master.columns.tolist()}')

LinkedIn demand signal attached to 5 occupations

 Master occupation table saved: (894, 53)
  Columns: ['onet_code', 'occupation', 'skill_active_learning', 'skill_active_listening', 'skill_complex_problem_solving', 'skill_coordination', 'skill_critical_thinking', 'skill_equipment_maintenance', 'skill_equipment_selection', 'skill_installation', 'skill_instructing', 'skill_judgment_and_decision_making', 'skill_learning_strategies', 'skill_management_of_financial_resources', 'skill_management_of_material_resources', 'skill_management_of_personnel_resources', 'skill_mathematics', 'skill_monitoring', 'skill_negotiation', 'skill_operation_and_control', 'skill_operations_analysis', 'skill_operations_monitoring', 'skill_persuasion', 'skill_programming', 'skill_quality_control_analysis', 'skill_reading_comprehension', 'skill_repairing', 'skill_science', 'skill_service_orientation', 'skill_social_perceptiveness', 'skill_speaking', 'skill_systems_analysis', 'skill_systems_evaluation', 'skill_tech

## Step 5 — Build Unified Course Catalogue

In [14]:
def _course_column(frame, column, default):
    """Return ``frame[column]`` if it exists, else a constant Series on ``frame``'s index.

    Always returning a Series (never a bare scalar) keeps follow-up calls such as
    ``.astype(str)`` safe when a platform file lacks the column.
    """
    if column in frame.columns:
        return frame[column]
    return pd.Series(default, index=frame.index)


# ── Standardise edX ──────────────────────────────────────────────────
# edX has neither ratings nor review counts, so both stay NaN.
edx_std = pd.DataFrame({
    'course_title':   _course_column(edx_c, 'title', ''),
    'platform':       'edX',
    'institution':    _course_column(edx_c, 'institution', 'Unknown'),
    'subject':        _course_column(edx_c, 'subject', 'Unknown'),
    'level':          _course_column(edx_c, 'level', 'Unknown'),
    # Prefer the syllabus column; fall back to the summary column if it is absent.
    'skills_covered': (edx_c['course_syllabus'] if 'course_syllabus' in edx_c.columns
                       else _course_column(edx_c, 'summary', '')),
    'effort':         _course_column(edx_c, 'course_effort', ''),
    'price':          _course_column(edx_c, 'price', 'Free'),
    'url':            _course_column(edx_c, 'course_url', ''),
    'rating':         np.nan,
    'num_reviews':    np.nan,
})

# ── Standardise Udemy ─────────────────────────────────────────────────
# Udemy has no per-course rating here; the subject doubles as the skills text.
udemy_std = pd.DataFrame({
    'course_title':   _course_column(udemy_c, 'course_title', ''),
    'platform':       'Udemy',
    'institution':    'Udemy',
    'subject':        _course_column(udemy_c, 'subject', 'Unknown'),
    'level':          _course_column(udemy_c, 'level', 'Unknown'),
    'skills_covered': _course_column(udemy_c, 'subject', ''),
    'effort':         _course_column(udemy_c, 'content_duration', '').astype(str) + ' hrs',
    'price':          _course_column(udemy_c, 'price', 0).astype(str),
    'url':            _course_column(udemy_c, 'url', ''),
    'rating':         np.nan,
    'num_reviews':    _course_column(udemy_c, 'num_reviews', np.nan),
})

# ── Standardise Coursera ─────────────────────────────────────────────
crs_std = pd.DataFrame({
    'course_title':   _course_column(crs_c, 'title', ''),
    'platform':       'Coursera',
    'institution':    _course_column(crs_c, 'institution', 'Unknown'),
    'subject':        _course_column(crs_c, 'subject', 'Unknown'),
    'level':          _course_column(crs_c, 'level', 'Unknown'),
    'skills_covered': _course_column(crs_c, 'gained_skills', ''),
    'effort':         _course_column(crs_c, 'duration', ''),
    'price':          'Free audit',
    'url':            '',
    'rating':         _course_column(crs_c, 'rate', np.nan),
    'num_reviews':    _course_column(crs_c, 'reviews', np.nan),
})

# ── Combine ───────────────────────────────────────────────────────────
# Stack the three platforms, then discard rows with a missing or blank title.
course_catalogue = pd.concat([edx_std, udemy_std, crs_std], ignore_index=True)
course_catalogue = course_catalogue.dropna(subset=['course_title'])
course_catalogue = course_catalogue[course_catalogue['course_title'].str.strip() != '']
course_catalogue = course_catalogue.reset_index(drop=True)

print(f'Unified course catalogue: {course_catalogue.shape}')
display(course_catalogue.groupby('platform').size().reset_index(name='courses'))
display(course_catalogue.head(3))

Unified course catalogue: (8050, 11)


Unnamed: 0,platform,courses
0,Coursera,3404
1,Udemy,3672
2,edX,974


Unnamed: 0,course_title,platform,institution,subject,level,skills_covered,effort,price,url,rating,num_reviews
0,How to Learn Online,edX,edX,Education & Teacher Training,Introductory,Welcome - We start with opportunities to meet ...,2–3 hours per week,FREE-Add a Verified Certificate for $49 USD,https://www.edx.org/course/how-to-learn-online,,
1,Programming for Everybody (Getting Started wit...,edX,The University of Michigan,Computer Science,Introductory,Unknown,2–4 hours per week,FREE-Add a Verified Certificate for $49 USD,https://www.edx.org/course/programming-for-eve...,,
2,CS50's Introduction to Computer Science,edX,Harvard University,Computer Science,Introductory,Unknown,6–18 hours per week,FREE-Add a Verified Certificate for $90 USD,https://www.edx.org/course/cs50s-introduction-...,,


In [15]:
# Quality boost score
# Coursera supplies ratings and review counts, Udemy review counts only, edX neither.
DEFAULT_RATING = 3.5        # rating assumed when a course has none
MAX_RATING = 5.0            # divisor used to bring ratings to 0-1
NEUTRAL_POPULARITY = 0.3    # popularity assigned when no review count exists

course_catalogue['rating_norm'] = (
    course_catalogue['rating'].fillna(DEFAULT_RATING) / MAX_RATING
)

# Log-scaled popularity relative to the most-reviewed course.
max_reviews = course_catalogue['num_reviews'].max()
log_max_reviews = np.log1p(max_reviews)
if pd.notna(log_max_reviews) and log_max_reviews > 0:
    log_popularity = (
        np.log1p(course_catalogue['num_reviews'].fillna(0)) / log_max_reviews
    )
else:
    # No positive review count anywhere: dividing by log1p(0) == 0 would give NaN,
    # so courses with a recorded count get popularity 0 instead.
    log_popularity = pd.Series(0.0, index=course_catalogue.index)

course_catalogue['popularity_norm'] = np.where(
    course_catalogue['num_reviews'].notna(),
    log_popularity,
    NEUTRAL_POPULARITY,  # neutral for courses with no review data (e.g. edX)
)

# Weighted blend: 60% rating, 40% popularity, kept within [0, 1].
course_catalogue['quality_score'] = (
    0.6 * course_catalogue['rating_norm'] +
    0.4 * course_catalogue['popularity_norm']
).clip(0, 1)

course_catalogue.to_parquet(PROCESSED_DIR/'unified_courses.parquet', index=False)
print(f' Unified course catalogue saved: {course_catalogue.shape}')

 Unified course catalogue saved: (8050, 14)


## Step 6 — CBC Pathways Processing

In [16]:
cbc_clean, cbc_report = clean_and_audit(cbc_df, 'CBC_Pathways', dataset_file='cbc_pathways.csv')
display(cbc_report)

# Rename skills column if needed
if 'skills_required' in cbc_clean.columns:
    cbc_clean = cbc_clean.rename(columns={'skills_required': 'skills'})

# Apply CBC education mapping
cbc_clean = process_cbc_education(cbc_clean)

# Map CBC tracks to Job Zones
CBC_TO_JOB_ZONE = {
    'Applied Science':              4,
    'Pure Science':                 4,
    'Technical Studies':            3,
    'Languages & Literature':       4,
    'Humanities & Business Studies':4,
    'Arts':                         3,
    'Sports':                       2,
}
DEFAULT_CBC_JOB_ZONE = 3  # used when a track has no entry in CBC_TO_JOB_ZONE

if 'track' in cbc_clean.columns:
    # The track values in the data are upper-case (e.g. 'ARTS') while the mapping
    # keys are Title Case, so compare case-insensitively and ignore padding.
    # A case-sensitive lookup would miss every row and silently use the default.
    zone_by_track = {track.casefold(): zone for track, zone in CBC_TO_JOB_ZONE.items()}
    track_key = cbc_clean['track'].astype(str).str.strip().str.casefold()
    mapped_zone = track_key.map(zone_by_track)

    # Surface tracks that still have no mapping instead of hiding them behind the default.
    unmapped_tracks = sorted(cbc_clean.loc[mapped_zone.isna(), 'track'].astype(str).unique())
    if unmapped_tracks:
        print(f'Tracks without a job-zone mapping (defaulting to {DEFAULT_CBC_JOB_ZONE}): {unmapped_tracks}')

    cbc_clean['job_zone_min'] = mapped_zone.fillna(DEFAULT_CBC_JOB_ZONE).astype(int)

cbc_clean.to_parquet(PROCESSED_DIR/'cbc_pathways.parquet', index=False)

display(cbc_clean.head(5))

Unnamed: 0,dataset,state,rows,nulls,dupes,outliers
0,CBC_Pathways,Raw,535,0,0,0
1,CBC_Pathways,Cleaned,535,0,0,0


Unnamed: 0,pathway,track,subjects,program_code,pathway_id,education_level,job_zone_min
0,ARTS & SPORTS SCIENCE,ARTS,"Arabic,Fine Arts,Theatre & Film",AS1001,25211070-b138-4b82-813d-bbb7383a5c1d,Unknown,3
1,ARTS & SPORTS SCIENCE,ARTS,"Biology,Fine Arts,Theatre & Film",AS1002,1cf6be29-e82d-4e59-ac5f-10d4c3da7a22,Unknown,3
2,ARTS & SPORTS SCIENCE,ARTS,"Business Studies,Fine Arts,Theatre & Film",AS1003,84bc8e17-5fb0-4242-b23f-83141081fec4,Unknown,3
3,ARTS & SPORTS SCIENCE,ARTS,"Computer Studies,Fine Arts,Theatre & Film",AS1004,d5d0a13e-fc32-4c57-8e59-65fc1b49fb97,Unknown,3
4,ARTS & SPORTS SCIENCE,ARTS,"Christian Religious Education,Fine Arts,Theatr...",AS1005,21d1a140-fd7b-416b-822e-6846af18c3f3,Unknown,3


## Step 7 — Save All Artifacts

In [17]:
import joblib, json

def _dump_json(payload, filename, **dump_kwargs):
    """Write ``payload`` as indented JSON to ``ARTIFACTS_DIR/filename``."""
    with open(ARTIFACTS_DIR/filename, 'w') as handle:
        json.dump(payload, handle, indent=2, **dump_kwargs)


# Save master occupation profile
master.to_parquet(PROCESSED_DIR/'master_occupation_profiles.parquet', index=False)

# Save the list of BLS-backed career families for use in modeling
career_families = bls_c['career_family'].unique().tolist()
_dump_json(career_families, 'career_families.json')

# Save BLS career lookup (one record per occupation title);
# values that are not JSON-native are written via str().
bls_lookup = bls_c.set_index('occupation').to_dict(orient='index')
_dump_json(bls_lookup, 'bls_career_lookup.json', default=str)

# Save skill columns list (for model feature alignment)
skill_cols = [c for c in master.columns if c.startswith('skill_')]
_dump_json(skill_cols, 'onet_skill_columns.json')

# Summary of what was written and how large each output is.
summary_lines = [
    '  All artifacts saved:',
    f'  {PROCESSED_DIR}/master_occupation_profiles.parquet  — {master.shape}',
    f'  {PROCESSED_DIR}/unified_courses.parquet            — {course_catalogue.shape}',
    f'  {PROCESSED_DIR}/cbc_pathways.parquet               — {cbc_clean.shape}',
    f'  {ARTIFACTS_DIR}/career_families.json              — {len(career_families)} families',
    f'  {ARTIFACTS_DIR}/onet_skill_columns.json           — {len(skill_cols)} skill features',
]
for line in summary_lines:
    print(line)


  All artifacts saved:
  data\processed/master_occupation_profiles.parquet  — (894, 53)
  data\processed/unified_courses.parquet            — (8050, 14)
  data\processed/cbc_pathways.parquet               — (535, 7)
  artifacts/career_families.json              — 13 families
  artifacts/onet_skill_columns.json           — 35 skill features
