In [None]:
# %% [markdown]
# # Strategic Workforce Analysis: AI Integration vs. Structural Risk (2010-2025)
# ## Task 1: Data Preprocessing & Outlier Handling

# %%
import pandas as pd
import numpy as np
import os

# --- Local Path Configuration ---
# The project root is the parent folder of the kernel's current working directory.
BASE_DIR = os.path.split(os.getcwd())[0]
# Raw input CSV and the folder for processed outputs, both under <root>/data.
RAW_DATA_PATH = os.path.join(
    os.path.join(BASE_DIR, 'data', 'raw'),
    'ai_impact_jobs_2010_2025.csv',
)
PROCESSED_DIR = os.path.join(BASE_DIR, 'data', 'processed')

# %%
# 1. Load Dataset
# Fail fast when the raw CSV is missing. Printing a message and carrying on
# would leave `df` undefined (or, worse, pointing at a stale frame from an
# earlier kernel run), so every later cell would break or silently use old data.
if not os.path.exists(RAW_DATA_PATH):
    raise FileNotFoundError(f"Error: File not found at {RAW_DATA_PATH}")

df = pd.read_csv(RAW_DATA_PATH)
print(f"Raw Data Loaded from local. Shape: {df.shape}")

# 2. NESTED IQR LOGIC
# We clean outliers within each (Region + Seniority) bucket to reveal true market rates.
def clean_market_anomalies(group, column='salary_usd', k=1.5, min_size=5):
    """Drop rows of ``group`` whose ``column`` value lies outside the IQR fences.

    Parameters
    ----------
    group : pandas.DataFrame
        One bucket of records; must contain ``column``.
    column : str, default 'salary_usd'
        Numeric column the outlier test is applied to.
    k : float, default 1.5
        Multiplier of the inter-quartile range used to place the fences.
    min_size : int, default 5
        Buckets with fewer rows than this are returned untouched, because
        quartiles computed from so few points are not meaningful.

    Returns
    -------
    pandas.DataFrame
        ``group`` itself when it has fewer than ``min_size`` rows; otherwise
        the rows whose value is within ``[Q1 - k*IQR, Q3 + k*IQR]`` (both ends
        inclusive). In that second case rows with a missing value in
        ``column`` fail the range test and are therefore not returned.
    """
    if len(group) < min_size:
        return group

    values = group[column]
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    fence = k * (q3 - q1)
    # `between` is inclusive on both ends by default, matching >= / <=.
    return group[values.between(q1 - fence, q3 + fence)]

# 3. Apply double-grouping for maximum precision
# Each (region, seniority_level) bucket is handed to the cleaner as a complete
# sub-frame by iterating over the GroupBy. `groupby(...).apply` is deliberately
# not used: recent pandas releases exclude the grouping columns from the frame
# passed to the callback, which silently removed `region` and `seniority_level`
# from the cleaned dataset (22 columns in, 20 columns out).
# As before, rows with a missing value in either key belong to no group and
# are not carried over.
GROUP_KEYS = ['region', 'seniority_level']
cleaned_buckets = [
    clean_market_anomalies(bucket)
    for _key, bucket in df.groupby(GROUP_KEYS)
]
# `pd.concat` rejects an empty list, so fall back to an empty frame with the
# original schema when there was nothing to group.
df_cleaned = (
    pd.concat(cleaned_buckets, ignore_index=True)
    if cleaned_buckets
    else df.iloc[0:0].copy()
)

# Outlier removal may only drop rows; the column set must survive unchanged.
assert list(df_cleaned.columns) == list(df.columns), "cleaning changed the column set"

# 5. Final Verification
print("Precision Cleaning Results:")
print(f"- Original Records: {len(df)}")
print(f"- Records after Nested IQR: {len(df_cleaned)}")
print(f"- Rows removed as anomalies: {len(df) - len(df_cleaned)}")

# %%
# 6. Save the Cleaned Dataset for subsequent tasks
# Writing a single cleaned CSV keeps every later analysis branch on the same input.
CLEANED_FILE_PATH = os.path.join(PROCESSED_DIR, 'ai_impact_jobs_cleaned.csv')

# Create the output folder if needed; no error when it already exists.
os.makedirs(PROCESSED_DIR, exist_ok=True)
df_cleaned.to_csv(CLEANED_FILE_PATH, index=False)

print(f"Grouped Cleaning Complete. Shape: {df_cleaned.shape}")

Raw Data Loaded from local. Shape: (5000, 22)
Precision Cleaning Results:
- Original Records: 5000
- Records after Nested IQR: 4938
- Rows removed as anomalies: 62
Grouped Cleaning Complete. Shape: (4938, 20)
