# Machine Learning Analysis

This notebook performs a machine learning analysis of the employee dataset, covering data preprocessing, feature engineering, model training, evaluation, and a comparison of multiple algorithms.

This notebook runs the analysis script: `scripts/python/ml_analysis.py`

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import xgboost as xgb
import lightgbm as lgb
import pickle
import os
import warnings
warnings.filterwarnings('ignore')

# Set style
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)
%matplotlib inline

# Change to project root directory
notebook_dir = os.getcwd()
project_root = os.path.dirname(os.path.dirname(notebook_dir))
os.chdir(project_root)

# Create results directories
os.makedirs('results/models', exist_ok=True)
os.makedirs('results/plots', exist_ok=True)
os.makedirs('results/tables', exist_ok=True)

# Load cleaned dataset
df = pd.read_csv('data/processed/employees_cleaned.csv')

print("="*80)
print("MACHINE LEARNING ANALYSIS - EMPLOYEE DATASET")
print("="*80)
print(f'Working directory: {os.getcwd()}')
print(f'Dataset shape: {df.shape}')

## 1. Data Preprocessing for ML


In [None]:
# Create a copy for ML so the loaded frame `df` is left untouched
df_ml = df.copy()

# Feature engineering
# Parse Start_Date a single time and derive every date feature from it.
start_dates = pd.to_datetime(df_ml['Start_Date'])
# Tenure is measured against the current date, so Years_of_Service drifts
# between runs; the reference date is printed below to record what was used.
reference_date = pd.to_datetime('today')
df_ml['Start_Year'] = start_dates.dt.year
df_ml['Start_Month'] = start_dates.dt.month
df_ml['Years_of_Service'] = (reference_date - start_dates).dt.days / 365.25

# Handle missing values: missing categories become an explicit 'Unknown'
# level, and a missing management flag is treated as False.
df_ml['Gender'] = df_ml['Gender'].fillna('Unknown')
df_ml['Team'] = df_ml['Team'].fillna('Unknown')
df_ml['Senior_Management'] = df_ml['Senior_Management'].fillna(False)

# Encode categorical variables as integer codes. The fitted encoders are kept
# so the codes can be mapped back to labels later.
le_gender = LabelEncoder()
le_team = LabelEncoder()
df_ml['Gender_encoded'] = le_gender.fit_transform(df_ml['Gender'])
df_ml['Team_encoded'] = le_team.fit_transform(df_ml['Team'])
df_ml['Senior_Management_encoded'] = df_ml['Senior_Management'].astype(int)

# Select features for modeling
feature_columns = ['Gender_encoded', 'Team_encoded', 'Senior_Management_encoded', 
                   'Bonus_pct', 'Years_of_Service', 'Start_Year', 'Start_Month']

# Remove rows with missing target or features
df_ml = df_ml.dropna(subset=['Salary'] + feature_columns)

print(f"Reference date for Years_of_Service: {reference_date:%Y-%m-%d}")
print(f"Dataset shape for ML: {df_ml.shape}")
print(f"Features used: {feature_columns}")
df_ml[feature_columns + ['Salary']].head()


## 2. Salary Prediction


In [None]:
# Target vector and feature matrix for the salary regression
y_salary = df_ml['Salary']
X_salary = df_ml[feature_columns]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
salary_split = train_test_split(X_salary, y_salary, test_size=0.2, random_state=42)
X_train_salary, X_test_salary, y_train_salary, y_test_salary = salary_split

# Fit the scaler on the training rows only, then apply it to both partitions
scaler_salary = StandardScaler().fit(X_train_salary)
X_train_salary_scaled = scaler_salary.transform(X_train_salary)
X_test_salary_scaled = scaler_salary.transform(X_test_salary)

print(f"Training set size: {X_train_salary.shape[0]}")
print(f"Test set size: {X_test_salary.shape[0]}")


### 2.1 Random Forest (Most Appropriate for this Dataset)


In [None]:
# Random Forest - Best for mixed data types and feature importance
# Trained on the unscaled features of the training partition.
print("Random Forest Model")
rf_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1, max_depth=10)
rf_model.fit(X_train_salary, y_train_salary)
y_pred_rf = rf_model.predict(X_test_salary)

# Hold-out metrics on the test partition
rf_r2 = r2_score(y_test_salary, y_pred_rf)
rf_rmse = np.sqrt(mean_squared_error(y_test_salary, y_pred_rf))
rf_mae = mean_absolute_error(y_test_salary, y_pred_rf)

print(f"R² Score: {rf_r2:.4f}")
print(f"RMSE: ${rf_rmse:,.2f}")
print(f"MAE: ${rf_mae:,.2f}")

# Cross-validation: 5-fold R² computed on the training partition only
rf_cv_scores = cross_val_score(rf_model, X_train_salary, y_train_salary, cv=5, scoring='r2')
print(f"Cross-validation R²: {rf_cv_scores.mean():.4f} (+/- {rf_cv_scores.std() * 2:.4f})")

# Feature importance, most important first
rf_feature_importance = pd.DataFrame({
    'Feature': feature_columns,
    'Importance': rf_model.feature_importances_
}).sort_values('Importance', ascending=False)

print("\nFeature Importance:")
print(rf_feature_importance)

# Visualize feature importance
plt.figure(figsize=(10, 6))
plt.barh(rf_feature_importance['Feature'], rf_feature_importance['Importance'])
plt.xlabel('Importance')
plt.title('Random Forest Feature Importance')
plt.tight_layout()
plt.savefig('results/plots/rf_feature_importance.png', dpi=300, bbox_inches='tight')
plt.show()

# Save model. The context manager guarantees the file is flushed and closed
# even if pickling raises, instead of leaving the handle open.
with open('results/models/rf_salary_model.pkl', 'wb') as model_file:
    pickle.dump(rf_model, model_file)
print("\nModel saved to: results/models/rf_salary_model.pkl")


### 2.2 XGBoost


In [None]:
# XGBoost
print("XGBoost Model")

# Gradient-boosted trees: 100 estimators, depth 5, learning rate 0.1, fixed seed.
# Trained on the same unscaled training partition as the Random Forest.
xgb_model = xgb.XGBRegressor(
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    random_state=42,
)
xgb_model.fit(X_train_salary, y_train_salary)
y_pred_xgb = xgb_model.predict(X_test_salary)

# Hold-out metrics on the test partition
xgb_mae = mean_absolute_error(y_test_salary, y_pred_xgb)
xgb_rmse = np.sqrt(mean_squared_error(y_test_salary, y_pred_xgb))
xgb_r2 = r2_score(y_test_salary, y_pred_xgb)

print(f"R² Score: {xgb_r2:.4f}")
print(f"RMSE: ${xgb_rmse:,.2f}")
print(f"MAE: ${xgb_mae:,.2f}")

# Feature importance, most important first
xgb_feature_importance = pd.DataFrame(
    {'Feature': feature_columns, 'Importance': xgb_model.feature_importances_}
).sort_values(by='Importance', ascending=False)

print("\nFeature Importance:")
print(xgb_feature_importance)


### 2.3 Model Comparison


In [None]:
# Side-by-side table of the hold-out metrics for both models
model_results = pd.DataFrame({
    'Model': ['Random Forest', 'XGBoost'],
    'R² Score': [rf_r2, xgb_r2],
    'RMSE': [rf_rmse, xgb_rmse],
    'MAE': [rf_mae, xgb_mae]
})

print("Model Comparison:")
print(model_results)
model_results.to_csv('results/tables/model_comparison_salary.csv', index=False)

# Predicted-vs-actual scatter, one panel per model
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# End points of the "perfect prediction" diagonal, shared by both panels
salary_range = [y_test_salary.min(), y_test_salary.max()]

panels = [
    (axes[0], y_pred_rf, f'Random Forest Predictions (R² = {rf_r2:.4f})'),
    (axes[1], y_pred_xgb, f'XGBoost Predictions (R² = {xgb_r2:.4f})'),
]
for ax, predictions, panel_title in panels:
    ax.scatter(y_test_salary, predictions, alpha=0.5)
    ax.plot(salary_range, salary_range, 'r--', lw=2)
    ax.set(xlabel='Actual Salary', ylabel='Predicted Salary', title=panel_title)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('results/plots/model_predictions_salary.png', dpi=300, bbox_inches='tight')
plt.show()

# Closing banner with the output locations
print("\n" + "="*80)
print("ML ANALYSIS COMPLETED SUCCESSFULLY!")
print("="*80)
print(f"\nResults saved in:")
print("- Models: results/models/")
print("- Plots: results/plots/")
print("- Tables: results/tables/")
