In [4]:
# ==========================================
# Setup & imports
# ==========================================

import os
import re
from io import StringIO

import pandas as pd
import pdfplumber
from tabula import read_pdf

In [6]:
# Output directory for the processed CSV files written by the cells below.
# The path is relative, so it resolves against the kernel's current working directory.
processed_data_folder = 'data/processed_data'
# Create the folder (and any missing parents); exist_ok=True keeps re-runs from raising.
os.makedirs(processed_data_folder, exist_ok=True)

Dataset groups

A. BLS (US occupation forecast):
- bls_table_1_2.csv → core occupations (growth %, education, wages)
- bls_table_1_3.csv → fastest-growing jobs
- bls_table_1_4.csv → jobs with the most numeric growth
- bls_table_1_10.csv → job separations and openings
- bls_table_1_11.csv → STEM vs. non-STEM overview

B. WEF (Global Future of Jobs):
- Emerging jobs: wef_fastest_growing_jobs_page_19.csv, wef_fastest_declining_jobs_page_19.csv
- Skills: wef_core_skills_page_35.csv, wef_genai_substitution_page_b3_1_page_44.csv
- Industry adoption: wef_ai_big_data_page_39.csv, wef_networks_cyber_page_39.csv, wef_tech_literacy_page_39.csv
- Training: wef_training_completion_page_46.csv

C. Kaggle (job market and skills):
- df_kaggle_tech_job_skills.csv → job postings and skills
- df_kaggle_ai_job.csv → AI job market (global)

In [9]:
# ==========================================
# Load Data
# ==========================================


# ==========================================
# load BLS dataset
# ==========================================

# we need table 1.2, 1.3, 1.4, 1.10, 1.11
# Keyword-based filter used to keep only tech-related rows in the job/occupation tables.
# A title qualifies when it mentions at least one tech keyword and no exclusion keyword.
tech_title_keywords = [
    "software", "developer", "engineer", "data", "computer", "cyber", "security",
    "it", "ai", "information", "network", "programmer", "scientist", "web", "ux", "technology"
]

# Substrings that disqualify a title even when a tech keyword is present
# (e.g. "civil engineers" contains "engineer").
non_tech_title_keyword = ['sales', 'civil', 'mechanical', 'financial']

def is_tech_occupation(t):
    """Return True when a job/occupation title looks tech-related.

    Parameters
    ----------
    t : Any
        The title. It is converted with ``str()`` and lower-cased, so
        non-string values (e.g. NaN) are accepted and simply fail to match.

    Returns
    -------
    bool
        True if the title contains none of ``non_tech_title_keyword`` and
        matches at least one entry of ``tech_title_keywords``.

    Notes
    -----
    Keywords longer than two characters are matched as substrings, so plurals
    and compounds such as "developers" or "cybersecurity" still match.
    Keywords of one or two characters ("it", "ai", "ux") are acronyms and must
    appear as a whole word; as bare substrings they would match unrelated
    titles such as "waiters" or "nutritionists".
    """
    title = str(t).lower()

    # Exclusions win over any tech keyword.
    if any(ex_key in title for ex_key in non_tech_title_keyword):
        return False

    # Alphanumeric tokens of the title, used for whole-word acronym matching.
    title_words = set(re.findall(r'[a-z0-9]+', title))

    return any(
        (kw in title_words) if len(kw) <= 2 else (kw in title)
        for kw in tech_title_keywords
    )
    
# Print the working directory so the relative data paths below can be checked.
print(os.getcwd())
bls_path = 'data/raw/bls_occupation.xlsx'
# The workbook is opened here only to list its sheets; the context manager closes
# the file handle afterwards instead of leaving it open for the life of the kernel.
# The individual tables are read later with pd.read_excel(bls_path, ...).
with pd.ExcelFile(bls_path) as bls_files:
    print(f'sheet names: {bls_files.sheet_names}')

# Table 1.2 - Occupational projections, 2023-2033, and worker characteristics, 2023
# skiprows=1 skips the first sheet row before the header (presumably a title row - confirm).
table_1_2_raw = pd.read_excel(bls_path, sheet_name='Table 1.2', skiprows=1)

# Normalise the headers step by step: lower-case, drop commas,
# drop bracketed markers such as "[1]", then turn spaces into underscores.
table_1_2_cols = table_1_2_raw.columns.str.lower()
table_1_2_cols = table_1_2_cols.str.replace(',', '', regex=True)
table_1_2_cols = table_1_2_cols.str.replace(r'\[.*\]', '', regex=True)
table_1_2_raw.columns = table_1_2_cols.str.replace(' ', '_')

# Keep only the rows whose occupation title passes the tech keyword filter.
table_1_2_is_tech = table_1_2_raw['2023_national_employment_matrix_title'].apply(is_tech_occupation)

# Drop the last two rows that remain after filtering.
# NOTE(review): presumably trailing footnote rows that pass the keyword filter;
# the count depends on which rows the filter keeps - confirm.
bls_table_1_2 = table_1_2_raw[table_1_2_is_tech].iloc[:-2]

bls_table_1_2.to_csv(os.path.join(processed_data_folder, 'bls_table_1_2.csv'), index=False)

/Users/jennifer-david/Documents/work/SpringBoard/projects/Capstone Projects/Tech career growth/notebooks
sheet names: ['Index', 'Table 1.1', 'Table 1.2', 'Table 1.3', 'Table 1.4', 'Table 1.5', 'Table 1.6', 'Table 1.7', 'Table 1.8', 'Table 1.9', 'Table 1.10', 'Table 1.11', 'Table 1.12']


In [11]:
# Table 1.3 - Fastest growing occupations, 2023 and projected 2033
# skiprows=1 skips the first sheet row before the header (presumably a title row - confirm).
table_1_3_raw = pd.read_excel(bls_path, sheet_name='Table 1.3', skiprows=1)

# Normalise the headers step by step: lower-case, drop commas,
# drop bracketed markers such as "[1]", then turn spaces into underscores.
table_1_3_cols = table_1_3_raw.columns.str.lower()
table_1_3_cols = table_1_3_cols.str.replace(',', '', regex=True)
table_1_3_cols = table_1_3_cols.str.replace(r'\[.*\]', '', regex=True)
table_1_3_raw.columns = table_1_3_cols.str.replace(' ', '_')

# Keep only the rows whose occupation title passes the tech keyword filter.
table_1_3_is_tech = table_1_3_raw['2023_national_employment_matrix_title'].apply(is_tech_occupation)

# Drop the last two rows that remain after filtering.
# NOTE(review): presumably trailing footnote rows that pass the keyword filter;
# the count depends on which rows the filter keeps - confirm.
bls_table_1_3 = table_1_3_raw[table_1_3_is_tech].iloc[:-2]

bls_table_1_3.to_csv(os.path.join(processed_data_folder, 'bls_table_1_3.csv'), index=False)

In [13]:
# Table 1.4 - Occupations with the most job growth, 2023 and projected 2033
# skiprows=1 skips the first sheet row before the header (presumably a title row - confirm).
table_1_4_raw = pd.read_excel(bls_path, sheet_name='Table 1.4', skiprows=1)

# Normalise the headers step by step: lower-case, drop commas,
# drop bracketed markers such as "[1]", then turn spaces into underscores.
table_1_4_cols = table_1_4_raw.columns.str.lower()
table_1_4_cols = table_1_4_cols.str.replace(',', '', regex=True)
table_1_4_cols = table_1_4_cols.str.replace(r'\[.*\]', '', regex=True)
table_1_4_raw.columns = table_1_4_cols.str.replace(' ', '_')

# Keep only the rows whose occupation title passes the tech keyword filter.
table_1_4_is_tech = table_1_4_raw['2023_national_employment_matrix_title'].apply(is_tech_occupation)

# Drop the last two rows that remain after filtering.
# NOTE(review): presumably trailing footnote rows that pass the keyword filter;
# the count depends on which rows the filter keeps - confirm.
bls_table_1_4 = table_1_4_raw[table_1_4_is_tech].iloc[:-2]

bls_table_1_4.to_csv(os.path.join(processed_data_folder, 'bls_table_1_4.csv'), index=False)

In [15]:
# Table 1.10 - job separations and openings
# (the previous header comment here repeated the Table 1.4 title).
# skiprows=1 skips the first sheet row before the header (presumably a title row - confirm).
table_1_10_raw = pd.read_excel(bls_path, sheet_name='Table 1.10', skiprows=1)

# Normalise the headers step by step: lower-case, drop commas,
# drop bracketed markers such as "[1]", then turn spaces into underscores.
table_1_10_cols = table_1_10_raw.columns.str.lower()
table_1_10_cols = table_1_10_cols.str.replace(',', '', regex=True)
table_1_10_cols = table_1_10_cols.str.replace(r'\[.*\]', '', regex=True)
table_1_10_raw.columns = table_1_10_cols.str.replace(' ', '_')

# Keep only the rows whose occupation title passes the tech keyword filter.
# Unlike Tables 1.2-1.4, no trailing rows are dropped for this table.
table_1_10_is_tech = table_1_10_raw['2023_national_employment_matrix_title'].apply(is_tech_occupation)
bls_table_1_10 = table_1_10_raw[table_1_10_is_tech]

bls_table_1_10.to_csv(os.path.join(processed_data_folder, 'bls_table_1_10.csv'), index=False)

In [17]:
# Table 1.11 - Employment in STEM occupations, 2023 and projected 2033
# skiprows=1 skips the first sheet row before the header (presumably a title row - confirm).
table_1_11_raw = pd.read_excel(bls_path, sheet_name='Table 1.11', skiprows=1)

# Normalise the headers step by step: lower-case, drop commas,
# drop bracketed markers such as "[1]", then turn spaces into underscores.
table_1_11_cols = table_1_11_raw.columns.str.lower()
table_1_11_cols = table_1_11_cols.str.replace(',', '', regex=True)
table_1_11_cols = table_1_11_cols.str.replace(r'\[.*\]', '', regex=True)
table_1_11_raw.columns = table_1_11_cols.str.replace(' ', '_')

# No keyword filter for this table; only the last four rows are removed.
# NOTE(review): presumably footnote rows at the bottom of the sheet - confirm.
bls_table_1_11 = table_1_11_raw.iloc[:-4]

bls_table_1_11.to_csv(os.path.join(processed_data_folder, 'bls_table_1_11.csv'), index=False)

In [19]:
# ==========================================
# load WEF dataset
# ==========================================

wef_path = 'data/raw/wef_future_2025.pdf'
# Extracting the tables straight from the WEF PDF with tabula / pdfplumber did not work,
# so the figures are transcribed by hand instead.
# 1st table - Figure 2.2 on page 19 "Fastest-growing and fastest-declining jobs, 2025-2030"
# Each record pairs a job title with its net growth percentage.
wef_growing_rows = [
    ('Big Data Specialists', 120),
    ('FinTech Engineers', 100),
    ('AI and Machine Learning Specialists', 80),
    ('Software and Applications Developers', 60),
    ('Security Management Specialists', 40),
    ('Data Warehousing Specialists', 40),
    ('Autonomous and Electric Vehicle Specialists', 40),
    ('UI and UX Designers', 20),
    ('Light Truck or Delivery Services Drivers', 20),
    ('Internet of Things Specialists', 20),
    ('Data Analysts and Scientists', 20),
    ('Environmental Engineers', 20),
    ('Information Security Analysts', 20),
    ('DevOps Engineers', 20),
    ('Renewable Energy Engineers', 20),
]

# Split the records back into the column-oriented mapping the DataFrame is built from.
growing_data = {
    'Job Title': [title for title, _ in wef_growing_rows],
    'Net Growth %': [pct for _, pct in wef_growing_rows],
}
df_wef_growing = pd.DataFrame(growing_data)
df_wef_growing.to_csv(os.path.join(processed_data_folder, 'wef_fastest_growing_jobs_page_19.csv'), index=False)

# Fastest-declining jobs, transcribed by hand as (job title, value) records.
# NOTE(review): confirm both the numbers and the "Millions" unit in the column
# label against the WEF report.
wef_declining_rows = [
    ('Administrative Assistants and Executive Secretaries', -42.0),
    ('Accounting, Bookkeeping and Payroll Clerks', -29.0),
    ('Data Entry Clerks', -40.0),
    ('Software Testers', -0.5),
    ('Material-Recording and Stock-Keeping Clerks', -15.0),
    ('Client Information and Customer Service Workers', -15.0),
    ('Security Guards', -10.0),
    ('Accountants and Auditors', -10.0),
    ('Cashiers and Ticket Clerks', -10.0),
    ('Graphic Designers', -10.0),
    ('Printing and Related Trades Workers', -10.0),
    ('Bank Tellers and Related Clerks', -10.0),
    ('Transportation Attendants and Conductors', -10.0),
    ('Building Caretakers, Cleaners, Housekeepers', -15.0),
]

# Split the records back into the column-oriented mapping the DataFrame is built from.
declining_data = {
    'Job Title': [title for title, _ in wef_declining_rows],
    'Net Job Destruction (Millions)': [value for _, value in wef_declining_rows],
}
df_wef_declining = pd.DataFrame(declining_data)
df_wef_declining.to_csv(os.path.join(processed_data_folder, 'wef_fastest_declining_jobs_page_19.csv'), index=False)

In [21]:
 # 2nd table - Figure 3.4 on page 37: Skills on the rise, 2025-2030 (Net Increase Percentages for Tech-Related Skills)
# Hand-transcribed (skill, net increase %) records.
# NOTE(review): the output file name says page 35 while the comment above cites
# Figure 3.4 on page 37 - confirm which figure these numbers come from.
wef_core_skill_rows = [
    ('AI and big data', 87),
    ('Networks and cybersecurity', 70),
    ('Technological literacy', 68),
    ('Programming', 27),
    ('Analytical thinking', 55),
    ('Creative thinking', 66),
    ('Resilience, flexibility and agility', 66),
    ('Curiosity and lifelong learning', 61),
    ('Leadership and social influence', 58),
    ('Systems thinking', 51),
    ('Environmental stewardship', 53),
    ('Design and user experience', 45),
]

# Split the records back into the column-oriented mapping the DataFrame is built from.
df_wef_core_skills_data = {
    'Skill': [skill for skill, _ in wef_core_skill_rows],
    'Net Increase (%)': [pct for _, pct in wef_core_skill_rows],
}

df_wef_core_skills_df = pd.DataFrame(df_wef_core_skills_data)

df_wef_core_skills_df.to_csv(os.path.join(processed_data_folder, 'wef_core_skills_page_35.csv'), index=False)

In [23]:
# 3rd table - Figure 3.5 on page 39: Top 10 industries for increasing skill requirements, 2025-2030
# Three hand-transcribed rankings (one per skill), each listed in rank order as
# (industry, % expecting increase) pairs.

def _ranked_industry_frame(rows):
    """Build a Rank / Industry / Percentage (%) frame from ordered (industry, pct) pairs.

    Rank is the 1-based position of each pair in ``rows``.
    """
    return pd.DataFrame({
        'Rank': list(range(1, len(rows) + 1)),
        'Industry': [industry for industry, _ in rows],
        'Percentage (%)': [pct for _, pct in rows],
    })

# AI and Big Data (Top 10 Industries, % Expecting Increase)
wef_ai_big_data = _ranked_industry_frame([
    ('Automotive and Aerospace', 100),
    ('Telecommunications', 100),
    ('Professional Services', 98),
    ('Information and Technology Services', 97),
    ('Insurance and Pensions Management', 97),
    ('Financial Services and Capital Markets', 95),
    ('Supply Chain and Transportation', 94),
    ('Medical and Healthcare Services', 92),
    ('Energy Technology and Utilities', 90),
    ('Government and Public Sector', 90),
])

# Technological Literacy (Top 10 Industries, % Expecting Increase)
wef_tech_literacy = _ranked_industry_frame([
    ('Automotive and Aerospace', 84),
    ('Financial Services and Capital Markets', 84),
    ('Medical and Healthcare Services', 81),
    ('Insurance and Pensions Management', 81),
    ('Supply Chain and Transportation', 77),
    ('Education and Training', 76),
    ('Oil and Gas', 76),
    ('Professional Services', 75),
    ('Advanced Manufacturing', 73),
    ('Production of Consumer Goods', 72),
])

# Networks and Cybersecurity (Top 10 Industries, % Expecting Increase)
wef_networks_cyber = _ranked_industry_frame([
    ('Financial Services and Capital Markets', 82),
    ('Insurance and Pensions Management', 81),
    ('Energy Technology and Utilities', 79),
    ('Medical and Healthcare Services', 78),
    ('Automotive and Aerospace', 78),
    ('Government and Public Sector', 78),
    ('Supply Chain and Transportation', 76),
    ('Telecommunications', 75),
    ('Advanced Manufacturing', 74),
    ('Information and Technology Services', 74),
])

# Write one CSV per skill ranking.
for industry_frame, csv_name in (
    (wef_ai_big_data, 'wef_ai_big_data_page_39.csv'),
    (wef_tech_literacy, 'wef_tech_literacy_page_39.csv'),
    (wef_networks_cyber, 'wef_networks_cyber_page_39.csv'),
):
    industry_frame.to_csv(os.path.join(processed_data_folder, csv_name), index=False)

In [25]:
# 4th table - Figure B3.1 on page 44: Current capacity for substitution by Generative AI, by skill group
# One hand-transcribed record per skill group, in the column order listed below.
# NOTE(review): several rows do not sum to 100 (e.g. 'Programming' sums to 117) -
# re-check the transcription against the figure.
substitution_columns = [
    'Skill Group',
    'Very Low Capacity (%)',
    'Low Capacity (%)',
    'Moderate Capacity (%)',
    'High Capacity (%)',
]
substitution_rows = [
    ('AI and big data', 0, 8, 78, 20),
    ('Programming', 0, 35, 45, 37),
    ('Technology literacy', 1, 58, 38, 3),
    ('Networks and cybersecurity', 2, 76, 23, 2),
    ('Design and user experience', 0, 63, 36, 0),
    ('Analytical thinking', 3, 83, 10, 2),
    ('Creative thinking', 4, 90, 4, 0),
    ('Curiosity and lifelong learning', 4, 45, 41, 3),
    ('Resilience, flexibility and agility', 40, 50, 8, 0),
]

# Transpose the records into the column-oriented mapping the DataFrame is built from.
wef_substitution_data = {
    column: list(values)
    for column, values in zip(substitution_columns, zip(*substitution_rows))
}

wef_substitution_df = pd.DataFrame(wef_substitution_data)

wef_substitution_df.to_csv(os.path.join(processed_data_folder, 'wef_genai_substitution_page_b3_1_page_44.csv'), index=False)

In [27]:
# 5th table - Figure 3.8 on page 46: Training completion as part of learning and development strategies, 2023 vs. 2025, by industry
# Hand-transcribed (industry, training completion %) records; only one value per industry is kept.
wef_training_rows = [
    ('Financial Services and Capital Markets', 50),
    ('Information and Technology Services', 47),
    ('Telecommunications', 52),
    ('Automotive and Aerospace', 46),
    ('Insurance and Pensions Management', 52),
    ('Infrastructure', 46),
    ('Supply Chain and Transportation', 58),
    ('Energy Technology and Utilities', 46),
    ('Medical and Healthcare Services', 50),
    ('Advanced Manufacturing', 46),
    ('Global Average', 40),
    ('Education and Training', 45),
    ('Oil and Gas', 46),
    ('Government and Public Sector', 38),
    ('Production of consumer goods', 47),
]

# Split the records back into the column-oriented mapping the DataFrame is built from.
wef_training_data = {
    'Industry': [industry for industry, _ in wef_training_rows],
    'Training Completion (%)': [pct for _, pct in wef_training_rows],
}

df_wef_training_data = pd.DataFrame(wef_training_data)

df_wef_training_data.to_csv(os.path.join(processed_data_folder, 'wef_training_completion_page_46.csv'), index=False)

In [29]:
# ==========================================
# load Kaggle dataset
# ==========================================

# Kaggle: Tech Job Listings with Skills and Demand (2024-25)
tech_job_skills_path = 'data/raw/jobs_dataset_processed.csv'
tech_job_skills_raw = pd.read_csv(tech_job_skills_path)

# Lower-case the headers and replace spaces with underscores.
tech_job_skills_raw.columns = tech_job_skills_raw.columns.str.lower().str.replace(' ', '_')

# Keep only the postings whose job_title passes the tech keyword filter.
tech_job_skills_is_tech = tech_job_skills_raw['job_title'].apply(is_tech_occupation)
df_kaggle_tech_job_skills = tech_job_skills_raw[tech_job_skills_is_tech]

df_kaggle_tech_job_skills.to_csv(os.path.join(processed_data_folder, 'df_kaggle_tech_job_skills.csv'), index=False)

In [31]:
# Kaggle: Global AI Job Market Trends & Salary Insights 2025
ai_job_path = 'data/raw/ai_job_dataset.csv'
ai_job_raw = pd.read_csv(ai_job_path)

# Lower-case the headers and replace spaces with underscores.
ai_job_raw.columns = ai_job_raw.columns.str.lower().str.replace(' ', '_')

# Keep only the rows whose job_title passes the tech keyword filter.
ai_job_is_tech = ai_job_raw['job_title'].apply(is_tech_occupation)
df_kaggle_ai_job = ai_job_raw[ai_job_is_tech]

df_kaggle_ai_job.to_csv(os.path.join(processed_data_folder, 'df_kaggle_ai_job.csv'), index=False)