In [None]:
# %% [markdown]
# # Strategic Workforce Analysis: AI Integration vs. Structural Risk (2010-2025)
# ## Task 1: Data Preprocessing & Outlier Handling

# %%
import pandas as pd
import numpy as np
import os

# --- Local Path Configuration ---
# The project root is taken to be the parent of the current working directory;
# both the raw input and the processed output live under <root>/data.
BASE_DIR = os.path.dirname(os.getcwd())
_DATA_DIR = os.path.join(BASE_DIR, 'data')
RAW_DATA_PATH = os.path.join(_DATA_DIR, 'raw', 'ai_impact_jobs_2010_2025.csv')
PROCESSED_DIR = os.path.join(_DATA_DIR, 'processed')

# %%
# 1. Load Dataset
# Fail fast when the raw file is missing: merely printing a message would let
# the following steps either crash with a NameError on `df` or silently reuse
# a stale `df` left over in the kernel from an earlier run.
if not os.path.exists(RAW_DATA_PATH):
    raise FileNotFoundError(f"Error: File not found at {RAW_DATA_PATH}")

df = pd.read_csv(RAW_DATA_PATH)
print(f"Raw Data Loaded from local. Shape: {df.shape}")

# 2. Basic Cleaning
# Missing text fields get an explicit placeholder instead of NaN.
for text_col, placeholder in (('ai_skills', 'Not Specified'), ('ai_keywords', 'None')):
    df[text_col] = df[text_col].fillna(placeholder)

# 3. IQR-based Outlier Removal for Salary
# Keep only rows whose salary lies inside the fences
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR] so the analysis focuses on mainstream market behavior.
salary = df['salary_usd']
Q1, Q3 = salary.quantile(0.25), salary.quantile(0.75)
IQR = Q3 - Q1
lower_limit, upper_limit = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# `between` is inclusive on both ends, so rows sitting exactly on a fence are kept.
df_cleaned = df[salary.between(lower_limit, upper_limit)].copy()

# %%
# 4. Save the Cleaned Dataset for subsequent tasks
# Later analysis branches read this file so they all start from the same data.
CLEANED_FILE_PATH = os.path.join(PROCESSED_DIR, 'ai_impact_jobs_cleaned.csv')

os.makedirs(PROCESSED_DIR, exist_ok=True)  # no error if the folder already exists
df_cleaned.to_csv(CLEANED_FILE_PATH, index=False)  # row index is not written to the file

print(f"Processed data saved to: {CLEANED_FILE_PATH}")

✅ Raw Data Loaded from local. Shape: (5000, 22)
Processed data saved to: /Users/miraekang/proyectos/eda/data/processed/ai_impact_jobs_cleaned.csv


In [None]:
# %% [markdown]
# ## Task 2: Establishing the Salary Baseline (Q1)
# **Goal:** Understand the global salary distribution and identify the characteristics of the core workforce.

# %%
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Load the cleaned data we just saved.
# Only empty fields are treated as missing here: pandas' default NA markers
# include the literal string 'None', which would turn the 'None' placeholder
# written into `ai_keywords` during preprocessing back into NaN on reload.
# `to_csv` writes missing values as empty fields, so na_values=[''] still
# restores genuine NaNs.
df_cleaned = pd.read_csv(CLEANED_FILE_PATH, keep_default_na=False, na_values=[''])

# %% [markdown]
# ### 1.1 Salary Distribution with Core Statistics
# We compare the Mean vs Median to check for remaining skewness.

# %%
# Headline salary figures for the executive summary, taken from describe();
# its '50%' entry is the median.
stats = df_cleaned['salary_usd'].describe()
summary_lines = [
    "Market Baseline Statistics (USD):",
    f"- Mean Salary: ${stats['mean']:,.0f}",
    f"- Median Salary: ${stats['50%']:,.0f}",
    f"- Market Range: ${stats['min']:,.0f} to ${stats['max']:,.0f}",
]
print("\n".join(summary_lines))

# Visualization: Distribution of Salary
histogram_options = dict(
    x="salary_usd",
    nbins=30,
    marginal="box",  # boxplot drawn above the histogram to show quartiles
    title="<b>Salary Distribution: The Global Workforce Baseline</b>",
    labels={'salary_usd': 'Annual Salary (USD)'},
    color_discrete_sequence=['#2ecc71'],
    template="plotly_white",
)
fig = px.histogram(df_cleaned, **histogram_options)

# Explicit axis titles; a single-series chart needs no legend.
fig.update_layout(
    showlegend=False,
    xaxis_title="Annual Salary (USD)",
    yaxis_title="Count of Job Postings",
)
fig.show()

# %% [markdown]
# ### 1.2 Salary by Seniority: Validating the Hierarchy
# To ensure our cleaned data makes sense, we check if salaries align with seniority levels.

# %%
# Box plot of salary per seniority level, with categories forced into
# career-ladder order rather than order of appearance in the data.
seniority_order = ["Intern", "Junior", "Mid", "Senior", "Lead", "Executive"]

fig2 = px.box(
    df_cleaned,
    x="seniority_level",
    y="salary_usd",
    color="seniority_level",
    category_orders={"seniority_level": seniority_order},
    title="<b>Salary Tiers by Seniority Level (Cleaned Data)</b>",
    labels={'salary_usd': 'Salary (USD)', 'seniority_level': 'Seniority Level'},
    template="simple_white",
).update_layout(showlegend=False)  # colors repeat the x-axis, so the legend is hidden
fig2.show()