In [None]:
# %% [markdown]
# # Strategic Workforce Analysis: AI Integration vs. Structural Risk (2010-2025)
# ## Task 1: Data Preprocessing & Outlier Handling

# %%
import pandas as pd
import numpy as np
import os

# --- Local Path Configuration ---
# The project root is taken to be the parent of the current working directory,
# i.e. the notebook is presumably launched from a sub-folder of the project
# (TODO confirm the expected launch directory).
BASE_DIR = os.path.split(os.getcwd())[0]

# Output folder for cleaned data, and the raw CSV that is read as input.
PROCESSED_DIR = os.path.join(BASE_DIR, 'data', 'processed')
RAW_DATA_PATH = os.path.join(BASE_DIR, 'data', 'raw', 'ai_impact_jobs_2010_2025.csv')

# %%
# 1. Load Dataset
# Fail fast when the raw file is missing. Merely printing a message would let the
# notebook continue and either crash later with a NameError on `df` or, worse,
# silently reuse a stale `df` left over in the kernel from an earlier run.
if not os.path.exists(RAW_DATA_PATH):
    raise FileNotFoundError(f"Error: File not found at {RAW_DATA_PATH}")

df = pd.read_csv(RAW_DATA_PATH)
print(f"Raw Data Loaded from local. Shape: {df.shape}")

# 2. REFINED IQR LOGIC
def remove_group_outliers(group, column='salary_usd', iqr_multiplier=1.5):
    """Return the rows of `group` whose `column` value lies within the IQR fences.

    The fences are computed from `group` itself, so calling this once per
    sub-frame gives each sub-frame its own notion of "normal range".

    Parameters
    ----------
    group : pandas.DataFrame
        Frame containing `column`.
    column : str, default 'salary_usd'
        Name of the numeric column to screen for outliers.
    iqr_multiplier : float, default 1.5
        How many IQRs beyond Q1/Q3 a value may lie before it is dropped.

    Returns
    -------
    pandas.DataFrame
        The subset of `group` (original index and columns preserved) with
        Q1 - iqr_multiplier * IQR <= value <= Q3 + iqr_multiplier * IQR.
        Rows whose value is missing fail the comparison and are dropped too.
    """
    values = group[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1

    lower_limit = q1 - iqr_multiplier * iqr
    upper_limit = q3 + iqr_multiplier * iqr

    # `between` is inclusive on both ends by default, matching >= / <= fences.
    return group[values.between(lower_limit, upper_limit)]

# 3. Apply grouped outlier removal by seniority_level
# Iterate over the groups explicitly instead of using `groupby(...).apply(...)`.
# Newer pandas releases exclude the grouping column from the frames handed to
# `apply`, and with `group_keys=False` it is not restored, so `seniority_level`
# silently disappeared from the cleaned data (22 columns in, 21 columns out).
# Iterating over a groupby always yields complete sub-frames, in sorted key order.
seniority_groups = [
    remove_group_outliers(group) for _, group in df.groupby('seniority_level')
]

# `pd.concat` rejects an empty list, so fall back to an empty frame with the
# same columns when there is no group to process.
if seniority_groups:
    df_cleaned = pd.concat(seniority_groups).reset_index(drop=True)
else:
    df_cleaned = df.iloc[0:0].copy()

# %%
# 4. Save the Cleaned Dataset for subsequent tasks
# Writing one shared file keeps every later analysis branch on identical input.
CLEANED_FILE_PATH = os.path.join(PROCESSED_DIR, 'ai_impact_jobs_cleaned.csv')

# Create the output folder on first run; exist_ok makes re-runs a no-op.
os.makedirs(PROCESSED_DIR, exist_ok=True)

# index=False: the positional index is not written out as a column.
df_cleaned.to_csv(CLEANED_FILE_PATH, index=False)

print(f"Grouped Cleaning Complete. Shape: {df_cleaned.shape}")

Raw Data Loaded from local. Shape: (5000, 22)
Grouped Cleaning Complete. Shape: (4970, 21)
