In [None]:
# %% [markdown]
# # Strategic Workforce Analysis: AI Integration vs. Structural Risk (2010-2025)
# ## Task 1: Data Preprocessing & Outlier Handling

# %%
import pandas as pd
import numpy as np
import os

# --- Environment Detection for Data Path ---
# Logic to handle both Google Colab Drive and Local file system.
# Only a failed import of google.colab selects the local path. Any other
# failure (e.g. drive.mount raising, or a user interrupt) is allowed to
# propagate instead of being silently mistaken for "running locally".
try:
    from google.colab import drive
except ImportError:
    # google.colab could not be imported -> treat this as a local run
    DATA_PATH = '../data/raw/ai_impact_jobs_2010_2025.csv'
    print("Status: Running on Local Environment")
else:
    # Import succeeded: mount Drive and read the CSV from there
    drive.mount('/content/drive')
    DATA_PATH = '/content/drive/MyDrive/ai_impact_jobs_2010_2025.csv'
    print("Status: Running on Google Colab")

# %%
# 1. Load Dataset
# Read the raw CSV from the location chosen by the environment check (DATA_PATH).
df = pd.read_csv(DATA_PATH)
print(f"Dataset Loaded: {df.shape[0]} rows, {df.shape[1]} columns")

# 2. Basic Cleaning
# Replace missing values in the two text columns with explicit placeholders
# so that later grouping steps do not run into missing values.
df = df.assign(
    ai_skills=df['ai_skills'].fillna('Not Specified'),
    ai_keywords=df['ai_keywords'].fillna('None'),
)

# 3. IQR-based Outlier Removal for Salary
# We use IQR to focus our analysis on the mainstream market behavior.
# Rows are kept when salary_usd lies within
# [Q1 - IQR_MULTIPLIER * IQR, Q3 + IQR_MULTIPLIER * IQR], bounds included.
IQR_MULTIPLIER = 1.5  # width of the fences, in units of IQR

salary = df['salary_usd']
Q1 = salary.quantile(0.25)
Q3 = salary.quantile(0.75)
IQR = Q3 - Q1

lower_limit = Q1 - IQR_MULTIPLIER * IQR
upper_limit = Q3 + IQR_MULTIPLIER * IQR

# Filtering the data
# Comparisons against NaN are False, so rows with a missing salary are
# excluded from df_cleaned as well.
in_range = (salary >= lower_limit) & (salary <= upper_limit)
df_cleaned = df[in_range].copy()
# df_outliers deliberately keeps its original meaning: rows above the upper limit only.
df_outliers = df[salary > upper_limit].copy()

# Report every dropped row, not just the high-salary ones: rows below the
# lower limit and rows without a salary are removed from df_cleaned too.
rows_removed = len(df) - len(df_cleaned)
rows_below = int((salary < lower_limit).sum())
rows_missing = int(salary.isna().sum())

print("Data Preprocessing Complete:")
print(f"- Rows removed as outliers: {rows_removed}")
print(f"  (above upper limit: {len(df_outliers)}, below lower limit: {rows_below}, missing salary: {rows_missing})")
print(f"- Rows kept for core analysis: {len(df_cleaned)}")
print(f"- Salary range for analysis: ${df_cleaned['salary_usd'].min():,.0f} to ${df_cleaned['salary_usd'].max():,.0f}")

# %%
# 4. Save the Cleaned Dataset for subsequent tasks
# We save this to ensure consistency across all analysis branches
# NOTE(review): this path is relative to the working directory; on Colab it
# presumably resolves outside the mounted Drive, so the file may not persist
# between sessions — confirm the intended location.
PROCESSED_DATA_PATH = '../data/processed/ai_impact_jobs_cleaned.csv'

# to_csv does not create missing folders, so make sure the target directory exists.
os.makedirs(os.path.dirname(PROCESSED_DATA_PATH), exist_ok=True)

df_cleaned.to_csv(PROCESSED_DATA_PATH, index=False)

print(f"💾 Cleaned dataset saved to: {PROCESSED_DATA_PATH}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Status: Running on Google Colab
Dataset Loaded: 5000 rows, 22 columns
Data Preprocessing Complete:
- Rows removed as outliers: 30
- Rows kept for core analysis: 4970
- Salary range for analysis: $15,321 to $149,448
