# ML Models
---

## Libraries

In [13]:
import pandas as pd
import numpy as np

import warnings 
warnings.filterwarnings("ignore")

import re

import seaborn as sns
import matplotlib.pyplot as plt
sns.set_theme(palette="muted")

from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import MinMaxScaler
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score
from xgboost import XGBRegressor


from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import validation_curve

import matplotlib.pyplot as plt


from scipy.stats import bootstrap

---
## Load data

In [14]:
# Directory holding the processed CSVs. Forward slashes are used because they
# resolve on Windows, Linux and macOS; backslash separators only work on Windows.
PROCESSED_DIR = "../datasets/2. Processed Dataset"

# Original dataset after ETL (the "id" column is dropped when present)
df = pd.read_csv(f"{PROCESSED_DIR}/result.csv")
df = df.drop(columns=["id"], errors="ignore")
# PCA dataset + one-hot encoded
df_pca = pd.read_csv(f"{PROCESSED_DIR}/pca_result.csv")

---
## Compute Metrics

In [15]:
def compute_metrics(y_true, y_pred):
    """Summarise regression error as display-ready strings.

    Parameters
    ----------
    y_true : observed target values; must expose ``.mean()``.
    y_pred : predicted target values, aligned with ``y_true``.

    Returns
    -------
    dict
        Keys ``"MAE"``, ``"RMSE"``, ``"%RMSE"`` and ``"R²"``; every value is a
        string formatted to two decimals. ``"%RMSE"`` is the RMSE expressed
        as a percentage of the mean of ``y_true``.
    """
    mae = mean_absolute_error(y_true, y_pred)
    rmse = root_mean_squared_error(y_true, y_pred)
    pct_rmse = (rmse / y_true.mean()) * 100
    r2 = r2_score(y_true, y_pred)

    labels = ("MAE", "RMSE", "%RMSE", "R²")
    values = (mae, rmse, pct_rmse, r2)
    return {label: f"{value:.2f}" for label, value in zip(labels, values)}

---
## First approach models training

In [16]:
# Features are the PCA / one-hot frame; the target is the Salary column of the ETL frame.
# NOTE(review): this assumes df_pca and df are row-aligned - confirm upstream.
X, y = df_pca, df["Salary"]

# Hold out 25% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0
)

# Min-max scale Salary; the scaler is fit on the training target only and then
# reused, unchanged, for the test target.
scaler_y = MinMaxScaler()
y_train_scaled = scaler_y.fit_transform(y_train.values.reshape(-1, 1)).reshape(-1)
y_test_scaled = scaler_y.transform(y_test.values.reshape(-1, 1)).reshape(-1)

# Baseline (untuned) models, keyed by the label used in the result tables.
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(random_state=0, n_estimators=100),
    "XGBoost": XGBRegressor(random_state=0, n_estimators=100, learning_rate=0.1),
    "Neural Network": MLPRegressor(
        random_state=0,
        hidden_layer_sizes=(64, 32),
        max_iter=500,
    ),
}


In [17]:
# Fit each baseline model on the scaled target, then score it on the test split
# after mapping its predictions back to the original Salary scale.
results = {}
for name in models:
    model = models[name]
    model.fit(X_train, y_train_scaled)

    y_pred_scaled = model.predict(X_test)
    y_pred = scaler_y.inverse_transform(y_pred_scaled.reshape(-1, 1)).reshape(-1)

    results[name] = compute_metrics(y_test, y_pred)

# Show the results as a DataFrame (one row per model)
df_results = pd.DataFrame(results).T
df_results

Unnamed: 0,MAE,RMSE,%RMSE,R²
Linear Regression,9580.0,13087.34,13.08,0.93
Random Forest,7737.37,11778.31,11.77,0.94
XGBoost,8030.99,12429.49,12.42,0.93
Neural Network,9782.02,12843.88,12.84,0.93


---
### Model Hyperparameter Tuning with Grid Search

In [18]:
# Search space per model, keyed by the same labels as `models` below.
# An empty grid means the model is evaluated with its current settings only.
param_grids = {
    "Linear Regression": dict(),
    "Random Forest": dict(
        n_estimators=[100, 200, 300],
        max_depth=[2, 4, 8, 10, 20, None],
        min_samples_split=[2, 5, 10, 20],
    ),
    "XGBoost": dict(
        n_estimators=[50, 100, 200],
        max_depth=[3, 6, 10],
        learning_rate=[0.01, 0.1, 0.2],
    ),
    "Neural Network": dict(
        hidden_layer_sizes=[(64,), (128, 64), (64, 32)],
        alpha=[0.0001, 0.001, 0.01],
        learning_rate_init=[0.001, 0.01, 0.1],
    ),
}

# Fresh, unfitted estimators to tune; the tuned hyperparameters are left at
# their defaults here because the grid search sets them.
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(random_state=0),
    "XGBoost": XGBRegressor(random_state=0),
    "Neural Network": MLPRegressor(random_state=0, max_iter=500),
}

In [19]:
# Tune each model with a 5-fold grid search on the training split (lowest RMSE
# wins) and keep the best estimator found for it.
best_models = {}
for name, model in models.items():
    if name in param_grids:
        search = GridSearchCV(model, param_grids[name], cv=5, scoring="neg_root_mean_squared_error", n_jobs=-1)
        search.fit(X_train, y_train_scaled)
        best_models[name] = search.best_estimator_
        print(f"✅ Best params for {name}: {search.best_params_}")
    else:
        # No grid registered for this model: fit it once with its current settings.
        model.fit(X_train, y_train_scaled)
        best_models[name] = model

# Compute and display model metrics.
# Start from an empty dict so the table holds tuned-model rows only and the cell
# does not depend on a `results` dict left behind by an earlier cell.
results = {}
for name, model in best_models.items():
    y_pred_scaled = model.predict(X_test)
    # Predictions are in the scaled target space; map them back to Salary units.
    y_pred = scaler_y.inverse_transform(y_pred_scaled.reshape(-1, 1)).ravel()
    results[name] = compute_metrics(y_test, y_pred)

df_results = pd.DataFrame(results).T
df_results

✅ Best params for Linear Regression: {}
✅ Best params for Random Forest: {'max_depth': 8, 'min_samples_split': 10, 'n_estimators': 300}
✅ Best params for XGBoost: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}
✅ Best params for Neural Network: {'alpha': 0.01, 'hidden_layer_sizes': (64, 32), 'learning_rate_init': 0.01}


Unnamed: 0,MAE,RMSE,%RMSE,R²
Linear Regression,9580.0,13087.34,13.08,0.93
Random Forest,8281.45,12637.87,12.63,0.93
XGBoost,8645.05,12102.86,12.1,0.94
Neural Network,6906.78,9880.52,9.87,0.96


---
## Overfitting Evaluation

### Compare Training vs. Test Performance

In [20]:
# Evaluating models
def evaluate_models(models, X_train, y_train, X_test, y_test, scaler_y):
    """Tabulate train-versus-test metrics for a set of already fitted models.

    Parameters
    ----------
    models : mapping of display name -> fitted estimator exposing ``predict``.
    X_train, X_test : feature sets passed to ``predict``.
    y_train, y_test : true targets passed to ``compute_metrics``.
    scaler_y : object whose ``inverse_transform`` is applied to the models'
        predictions before they are compared with the true targets.

    Returns
    -------
    pandas.DataFrame
        One row per model with a ``Model`` column followed by ``Train_*`` /
        ``Test_*`` columns for R², RMSE, MAE and %RMSE (values are the
        strings produced by ``compute_metrics``).
    """

    def _unscale(scaled_pred):
        # Predictions come back 1-D; the scaler expects a single column.
        return scaler_y.inverse_transform(scaled_pred.reshape(-1, 1)).ravel()

    rows = []
    for model_name, fitted in models.items():
        train_scores = compute_metrics(y_train, _unscale(fitted.predict(X_train)))
        test_scores = compute_metrics(y_test, _unscale(fitted.predict(X_test)))

        # Interleave train/test columns per metric so gaps are easy to read.
        row = {"Model": model_name}
        for metric in ("R²", "RMSE", "MAE", "%RMSE"):
            row[f"Train_{metric}"] = train_scores[metric]
            row[f"Test_{metric}"] = test_scores[metric]
        rows.append(row)

    return pd.DataFrame(rows)

# Score every tuned model on both splits so train/test gaps are visible side by side
df_results = evaluate_models(
    models=best_models,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    scaler_y=scaler_y,
)
df_results

Unnamed: 0,Model,Train_R²,Test_R²,Train_RMSE,Test_RMSE,Train_MAE,Test_MAE,Train_%RMSE,Test_%RMSE
0,Linear Regression,0.92,0.93,13623.02,13087.34,9650.31,9580.0,13.5,13.08
1,Random Forest,0.97,0.93,8432.13,12637.87,5748.04,8281.45,8.36,12.63
2,XGBoost,0.98,0.94,7306.06,12102.86,5296.12,8645.05,7.24,12.1
3,Neural Network,0.95,0.96,10943.24,9880.52,7611.28,6906.78,10.85,9.87


### Cross-Validation

In [21]:
# Scaled copy of the full target. A throwaway scaler is used on purpose: the
# shared scaler_y was fit on y_train and is what maps the trained models'
# predictions back to Salary units, so it must not be refit here.
y_scaled = MinMaxScaler().fit_transform(y.values.reshape(-1, 1)).ravel()

def cross_validate_models(models, X, y, cv=5):
    """Cross-validate each model and report the mean and spread of its R².

    Parameters
    ----------
    models : mapping of display name -> estimator accepted by ``cross_val_score``.
    X : feature set.
    y : unscaled target (Series or array-like); it is min-max scaled inside
        the function with a scaler private to this call.
    cv : cross-validation setting forwarded to ``cross_val_score``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Model``, ``CV_Mean_R²`` and ``CV_Std_R²`` (rounded to two
        decimals), one row per model.
    """
    results = []

    # Local scaler: no module-level scaler is touched by this function.
    y_scaled = MinMaxScaler().fit_transform(np.asarray(y).reshape(-1, 1)).ravel()

    for name, model in models.items():
        scores = cross_val_score(model, X, y_scaled, cv=cv, scoring='r2', n_jobs=-1)

        # Append results
        results.append({
            "Model": name,
            "CV_Mean_R²": round(scores.mean(), 2),
            "CV_Std_R²": round(scores.std(), 2)
        })

    # DataFrame
    df_cv_results = pd.DataFrame(results)
    return df_cv_results

# Apply this function to all trained models
df_cv_results = cross_validate_models(best_models, X, y, cv=5)
df_cv_results

Unnamed: 0,Model,CV_Mean_R²,CV_Std_R²
0,Linear Regression,0.9,0.05
1,Random Forest,0.89,0.07
2,XGBoost,0.9,0.05
3,Neural Network,0.92,0.03


### Conclusions
- Linear Regression
    - Train R²: 0.92 | Test R²: 0.93 - Train and Test R² are close, which suggests the model generalizes well.
    - RMSE values are also close → No large performance drop.
    - Cross-validation confirms stability (0.90 ± 0.05)
    
- Random Forest: 
    - Train R²: 0.97 | Test R²: 0.93 - Train R² is much greater than Test R², so the model is overfitting.
    - Train RMSE (8432.13) vs Test RMSE (12637.87) - The large gap also suggests overfitting.
    
- XGBoost
    - Train R²: 0.98 | Test R²: 0.94 and Train RMSE: 7306.06 | Test RMSE: 12102.86 - the model is overfitting.
    - Cross-validation is stable (0.90 ± 0.05)
    - Second-best model: slight overfitting but stable performance.

- Neural Network:
    - Train R²: 0.95 | Test R²: 0.96 and Train RMSE: 10943.24 | Test RMSE: 9880.52 show a balanced model with no major overfitting, despite it being a more complex model.
    - Cross-validation shows the highest mean R² (0.92) with the lowest standard deviation (±0.03), suggesting strong generalization to unseen data.
    - Best overall model

In [None]:
# Dictionary that collects the bootstrap metrics
boot_metrics = {"MAE": [], "RMSE": [], "R²": []}
num_bootstrap = 1000

# Seeded generator so the confidence intervals are reproducible across runs.
boot_rng = np.random.default_rng(0)

# The models were trained on a target scaled with a scaler fit on y_train only,
# so predictions are inverted with a scaler fit on exactly that split. A private
# scaler is built here so the result does not depend on whether the shared
# scaler_y has been refit elsewhere in the notebook.
boot_scaler = MinMaxScaler().fit(y_train.values.reshape(-1, 1))

# Predict once for the whole test set: a row's prediction does not depend on
# which other rows are in the batch, so each resample can simply index into it.
y_pred_all = boot_scaler.inverse_transform(
    best_models["Random Forest"].predict(X_test).reshape(-1, 1)
).ravel()

# Bootstrap loop
for _ in range(num_bootstrap):
    # Resample test rows with replacement (positional indices)
    indices = boot_rng.choice(len(y_test), size=len(y_test), replace=True)
    y_test_sample = y_test.iloc[indices]  # positional selection with iloc
    y_pred_sample = y_pred_all[indices]

    # Store metrics
    boot_metrics["MAE"].append(mean_absolute_error(y_test_sample, y_pred_sample))
    boot_metrics["RMSE"].append(root_mean_squared_error(y_test_sample, y_pred_sample))
    boot_metrics["R²"].append(r2_score(y_test_sample, y_pred_sample))

# Compute 95% percentile confidence intervals
ci_95 = {metric: (np.percentile(values, 2.5), np.percentile(values, 97.5)) for metric, values in boot_metrics.items()}

# Convert the results into a DataFrame (one row per metric)
df_ci_95 = pd.DataFrame(ci_95, index=["Lower Bound", "Upper Bound"]).T

# Show results
print(df_ci_95)