# ML Models
---

## Libraries

In [49]:
import pandas as pd
import numpy as np

import warnings 
warnings.filterwarnings("ignore")

import re

import seaborn as sns
import matplotlib.pyplot as plt
sns.set_theme(palette="muted")

from sklearn.model_selection import train_test_split 
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from scipy.stats import bootstrap

---
## Load data

In [None]:
# Load the processed dataset. Forward slashes are used so the relative path
# resolves on Windows, Linux and macOS alike; a "\\"-separated path only
# works on Windows.
df = pd.read_csv('../datasets/2. Processed Dataset/result.csv')
# Drop the "id" column when the file has one; errors="ignore" makes this a
# no-op otherwise.
df = df.drop(columns=["id"], errors="ignore")

In [None]:
# Reuse the `df` built by the loading cell above, where the "id" column has
# already been dropped. Reading the CSV a second time here would discard that
# step and pass "id" (when the file has one) to the models as a feature.

# Split the dataset into features and target
X = df.drop(columns="Salary")
y = df["Salary"]

# Split data into training and test sets (25% held out for testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

# Instantiate and train the models; the mean-predicting dummy is the baseline
dummy_model = DummyRegressor(strategy="mean")
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

dummy_model.fit(X_train, y_train)
rf_model.fit(X_train, y_train)

# Predictions on the held-out test set
y_pred_dummy = dummy_model.predict(X_test)
y_pred_rf = rf_model.predict(X_test)

# Compute metrics
def compute_metrics(y_true, y_pred):
    """Score predictions against observed targets with three regression metrics.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values, aligned element-wise with ``y_true``.

    Returns:
        dict: Mapping with the keys ``"MAE"`` (mean absolute error),
        ``"RMSE"`` (square root of the mean squared error) and ``"R2"``
        (coefficient of determination).
    """
    mae = mean_absolute_error(y_true, y_pred)
    # RMSE is derived from the MSE so it is expressed in the target's own units.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    return dict(MAE=mae, RMSE=rmse, R2=r2)

# Score both models against the same held-out targets.
results_dummy, results_rf = (
    compute_metrics(y_test, predictions)
    for predictions in (y_pred_dummy, y_pred_rf)
)

In [48]:
# Bootstrap 95% confidence intervals for the Random Forest test metrics.
# The test set is resampled with replacement, the metrics are recomputed on
# every resample, and the 2.5th / 97.5th percentiles give the interval bounds.
boot_metrics = {"MAE": [], "RMSE": [], "R2": []}
num_bootstrap = 1000
bootstrap_seed = 42

# A dedicated, seeded generator makes the intervals identical on every re-run;
# drawing from the unseeded global np.random state gave different bounds each time.
rng = np.random.default_rng(bootstrap_seed)

n_test = len(y_test)
for _ in range(num_bootstrap):
    # Row positions sampled with replacement (upper bound is exclusive).
    indices = rng.integers(0, n_test, size=n_test)
    y_test_sample = y_test.iloc[indices]
    y_pred_sample = y_pred_rf[indices]
    metrics = compute_metrics(y_test_sample, y_pred_sample)
    for key in boot_metrics:
        boot_metrics[key].append(metrics[key])

ci_95 = {
    metric: (np.percentile(values, 2.5), np.percentile(values, 97.5))
    for metric, values in boot_metrics.items()
}

# Print results
print("Dummy Model:", results_dummy)
print("Random Forest Model:", results_rf)
print("95% Confidence Intervals:", ci_95)

Dummy Model: {'MAE': np.float64(40690.50882167612), 'RMSE': np.float64(48180.83218379831), 'R2': -0.00029651269527053437}
Random Forest Model: {'MAE': np.float64(7069.021739130435), 'RMSE': np.float64(11428.70504095033), 'R2': 0.9437173924934072}
95% Confidence Intervals: {'MAE': (np.float64(5389.592391304348), np.float64(8974.85054347826)), 'RMSE': (np.float64(7912.561196785485), np.float64(15138.136084376583)), 'R2': (np.float64(0.8939846585553445), np.float64(0.9726176853210466))}
