imports

In [1]:
# %% Imports -----------------------------------------------------------
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.impute import KNNImputer

from statsmodels.stats.outliers_influence import variance_inflation_factor

import sys
sys.path.insert(0, "../src")   # keep original relative module path
import SCORE_functions as sf     # project‑specific helpers stay the same

load data

In [5]:
# DATA_PATH = "../data/alldatatoML.xlsx"
# Locations of the two preprocessed workbooks (paths are relative to the
# notebook's working directory).
PATH_short = "../data/alldatatoML_preprocessed_short.xlsx"
PATH_long = "../data/alldatatoML_preprocessed_long.xlsx"

# Only the "short" workbook is read here; PATH_long is defined but not used
# in this cell.
dataset = pd.read_excel(io=PATH_short)

split data

In [6]:
# Group the columns by the name prefix they carry.
sco1_cols = list(filter(lambda col: col.startswith("sco1 - "), dataset.columns))
sco2_cols = list(filter(lambda col: col.startswith("sco2 - "), dataset.columns))
cat_cols = list(filter(lambda col: col.startswith("cat - "), dataset.columns))

# Predictors: every "cat - " column followed by every "sco1 - " column.
feature_cols = [*cat_cols, *sco1_cols]

X = dataset[feature_cols]
# Target: the last "sco2 - " column in the sheet's column order.
y = dataset[sco2_cols[-1]]

# Hold out 20 % of the rows for testing; the fixed seed keeps the split
# reproducible. Stratification is left disabled, as in the original call.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)  # , stratify=y)

long:

In [None]:
# ---------------------------------------------------------------------------
# 0. PRELIMINARIES – run once
# ---------------------------------------------------------------------------
# numpy and train_test_split are already imported by earlier cells; they are
# repeated here so this cell can be run on its own.
import numpy as np
import warnings
# NOTE(review): this installs a blanket "ignore" filter for every warning
# category in this process, so any warning raised while fitting is hidden
# as well.
warnings.filterwarnings("ignore")

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# ---------------------------------------------------------------------------
# 1. DEFINE A HELPER FOR METRICS
# ---------------------------------------------------------------------------
def evaluate(reg, X_test, y_test):
    """Print and return regression metrics for a fitted estimator.

    Parameters
    ----------
    reg : object
        Fitted estimator exposing ``predict``.
    X_test : array-like
        Features passed to ``reg.predict``.
    y_test : array-like
        True target values that the predictions are compared against.

    Returns
    -------
    dict
        ``{"mse": ..., "mae": ..., "rmse": ..., "r2": ...}``. Returning the
        values (in addition to printing them) lets callers build comparison
        tables programmatically instead of transcribing the printed output.
    """
    y_pred = reg.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    metrics = {
        "mse": mse,
        "mae": mean_absolute_error(y_test, y_pred),
        "rmse": np.sqrt(mse),  # RMSE is derived from the MSE computed above
        "r2": r2_score(y_test, y_pred),
    }

    print(f"  ▸ MSE : {metrics['mse']:.4f}")
    print(f"  ▸ MAE : {metrics['mae']:.4f}")
    print(f"  ▸ RMSE: {metrics['rmse']:.4f}")
    print(f"  ▸ R²  : {metrics['r2']:.4f}\n")

    return metrics

# ---------------------------------------------------------------------------
# 2. PUT ALL MODELS + PARAM GRIDS INTO ONE DICTIONARY
# ---------------------------------------------------------------------------
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
# (No Naive Bayes: scikit-learn's NB implementations are classification only.)

# Maps a display name to the estimator to tune ("estimator") and the
# hyper-parameter grid handed to GridSearchCV ("param_grid"). Parameters of a
# Pipeline step are addressed as "<step name>__<parameter>".
models_and_grids = {

    "Linear Regression": {
        "estimator": Pipeline([
            # with_mean=False: scale each feature to unit variance without
            # centering it.
            # NOTE(review): the original comment said this "avoids
            # cross-collinearity warnings"; scaling does not remove
            # collinearity between features -- confirm the intent.
            ("scaler", StandardScaler(with_mean=False)),
            ("reg", LinearRegression()),
        ]),
        "param_grid": {
            "reg__fit_intercept": [True, False],
            "reg__positive":      [False, True],
        },
    },

    "Random Forest Regressor": {
        "estimator": RandomForestRegressor(random_state=42),
        "param_grid": {
            "n_estimators":      [100, 300],
            "max_depth":         [None, 10, 20],
            "min_samples_split": [2, 5],
        },
    },

    "Support-Vector Regressor": {
        "estimator": Pipeline([
            ("scaler", StandardScaler()),
            ("reg", SVR()),
        ]),
        # One grid per kernel: gamma is a kernel coefficient for the
        # rbf/poly/sigmoid kernels and has no effect on the linear kernel, so
        # crossing it with "linear" only repeated identical fits.
        "param_grid": [
            {
                "reg__kernel":  ["rbf"],
                "reg__C":       [0.1, 1, 10],
                "reg__epsilon": [0.01, 0.1, 0.5],
                "reg__gamma":   ["scale", "auto"],
            },
            {
                "reg__kernel":  ["linear"],
                "reg__C":       [0.1, 1, 10],
                "reg__epsilon": [0.01, 0.1, 0.5],
            },
        ],
    },

    "K-NN Regressor": {
        "estimator": Pipeline([
            ("scaler", StandardScaler()),
            ("reg", KNeighborsRegressor()),
        ]),
        "param_grid": {
            "reg__n_neighbors": [3, 5, 11],
            "reg__weights":     ["uniform", "distance"],
            "reg__p":           [1, 2],          # 1 = Manhattan, 2 = Euclidean
        },
    },

    "Decision-Tree Regressor": {
        "estimator": DecisionTreeRegressor(random_state=42),
        "param_grid": {
            "max_depth":         [None, 5, 10, 20],
            "min_samples_split": [2, 5, 10],
            "min_samples_leaf":  [1, 2, 4],
        },
    },

    "Gradient-Boosting Regressor": {
        "estimator": GradientBoostingRegressor(random_state=42),
        "param_grid": {
            "n_estimators":   [100, 300],
            "learning_rate":  [0.05, 0.1],
            "max_depth":      [3, 5],
            "subsample":      [0.8, 1.0],
        },
    },
}

# ---------------------------------------------------------------------------
# 3. LOOP THROUGH, RUN GRID SEARCH, REPORT
# ---------------------------------------------------------------------------
for name, cfg in models_and_grids.items():
    # Everything for one model sits inside the try so that a failure in any
    # step is reported and the loop moves on to the next model.
    try:
        print(f"\n=== {name} ===")

        # 5-fold grid search on the training split. The scorer is the negated
        # MSE, so best_score_ is negated again before it is printed.
        search = GridSearchCV(
            estimator=cfg["estimator"],
            param_grid=cfg["param_grid"],
            scoring="neg_mean_squared_error",
            cv=5,
            n_jobs=-1,
        ).fit(X_train, y_train)

        print("Best params :", search.best_params_)
        print(f"Best CV MSE : {-search.best_score_:.4f}")

        # Score the best estimator found by the search on the held-out split.
        evaluate(search.best_estimator_, X_test, y_test)
    except Exception as e:
        print(f"{name} -- error: {e}\n")



=== Linear Regression ===
Best params : {'reg__fit_intercept': True, 'reg__positive': True}
Best CV MSE : 0.4650
  ▸ MSE : 0.5996
  ▸ MAE : 0.5226
  ▸ RMSE: 0.7743
  ▸ R²  : 0.4788


=== Random Forest Regressor ===
Best params : {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 100}
Best CV MSE : 0.3101
  ▸ MSE : 0.4994
  ▸ MAE : 0.4295
  ▸ RMSE: 0.7067
  ▸ R²  : 0.5658


=== Support-Vector Regressor ===
Best params : {'reg__C': 0.1, 'reg__epsilon': 0.1, 'reg__gamma': 'scale', 'reg__kernel': 'linear'}
Best CV MSE : 0.4313
  ▸ MSE : 0.4520
  ▸ MAE : 0.4662
  ▸ RMSE: 0.6723
  ▸ R²  : 0.6071


=== K-NN Regressor ===
Best params : {'reg__n_neighbors': 11, 'reg__p': 1, 'reg__weights': 'distance'}
Best CV MSE : 0.5537
  ▸ MSE : 0.6525
  ▸ MAE : 0.5839
  ▸ RMSE: 0.8078
  ▸ R²  : 0.4328


=== Decision-Tree Regressor ===
Best params : {'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 2}
Best CV MSE : 0.4147
  ▸ MSE : 0.6660
  ▸ MAE : 0.5072
  ▸ RMSE: 0.8161
  ▸ R²  : 0.4210


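| Rank | Model                       | R²        | Test RMSE | Gap\*      |
| ---- | --------------------------- | --------- | --------- | ---------- |
| 🥇   | **SVR (linear kernel)**     | **0.607** | 0.672     | **+0.021** |
| 🥈   | Random Forest Regressor     | 0.566     | 0.707     | +0.189     |
| 🥉   | Gradient Boosting Regressor | 0.565     | 0.707     | +0.191     |
| 4    | Linear Regression           | 0.479     | 0.774     | +0.135     |
| 5    | K-NN Regressor              | 0.433     | 0.808     | +0.099     |
| 6    | Decision Tree               | 0.421     | 0.816     | +0.251     |

\* Gap = test MSE − best CV MSE, taken from the output above (e.g. Random Forest: 0.4994 − 0.3101 ≈ 0.189; SVR: 0.4520 − 0.4313 ≈ 0.021). A larger gap means the model does worse on the held-out test set than in cross-validation.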


short

In [7]:
# ---------------------------------------------------------------------------
# 0. PRELIMINARIES – run once
# ---------------------------------------------------------------------------
# numpy and train_test_split are already imported by earlier cells; they are
# repeated here so this cell can be run on its own.
import numpy as np
import warnings
# NOTE(review): this installs a blanket "ignore" filter for every warning
# category in this process, so any warning raised while fitting is hidden
# as well.
warnings.filterwarnings("ignore")

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# ---------------------------------------------------------------------------
# 1. DEFINE A HELPER FOR METRICS
# ---------------------------------------------------------------------------
def evaluate(reg, X_test, y_test):
    """Print and return regression metrics for a fitted estimator.

    Parameters
    ----------
    reg : object
        Fitted estimator exposing ``predict``.
    X_test : array-like
        Features passed to ``reg.predict``.
    y_test : array-like
        True target values that the predictions are compared against.

    Returns
    -------
    dict
        ``{"mse": ..., "mae": ..., "rmse": ..., "r2": ...}``. Returning the
        values (in addition to printing them) lets callers build comparison
        tables programmatically instead of transcribing the printed output.
    """
    y_pred = reg.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    metrics = {
        "mse": mse,
        "mae": mean_absolute_error(y_test, y_pred),
        "rmse": np.sqrt(mse),  # RMSE is derived from the MSE computed above
        "r2": r2_score(y_test, y_pred),
    }

    print(f"  ▸ MSE : {metrics['mse']:.4f}")
    print(f"  ▸ MAE : {metrics['mae']:.4f}")
    print(f"  ▸ RMSE: {metrics['rmse']:.4f}")
    print(f"  ▸ R²  : {metrics['r2']:.4f}\n")

    return metrics

# ---------------------------------------------------------------------------
# 2. PUT ALL MODELS + PARAM GRIDS INTO ONE DICTIONARY
# ---------------------------------------------------------------------------
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
# (No Naive Bayes: scikit-learn's NB implementations are classification only.)

# Maps a display name to the estimator to tune ("estimator") and the
# hyper-parameter grid handed to GridSearchCV ("param_grid"). Parameters of a
# Pipeline step are addressed as "<step name>__<parameter>".
models_and_grids = {

    "Linear Regression": {
        "estimator": Pipeline([
            # with_mean=False: scale each feature to unit variance without
            # centering it.
            # NOTE(review): the original comment said this "avoids
            # cross-collinearity warnings"; scaling does not remove
            # collinearity between features -- confirm the intent.
            ("scaler", StandardScaler(with_mean=False)),
            ("reg", LinearRegression()),
        ]),
        "param_grid": {
            "reg__fit_intercept": [True, False],
            "reg__positive":      [False, True],
        },
    },

    "Random Forest Regressor": {
        "estimator": RandomForestRegressor(random_state=42),
        "param_grid": {
            "n_estimators":      [100, 300],
            "max_depth":         [None, 10, 20],
            "min_samples_split": [2, 5],
        },
    },

    "Support-Vector Regressor": {
        "estimator": Pipeline([
            ("scaler", StandardScaler()),
            ("reg", SVR()),
        ]),
        # One grid per kernel: gamma is a kernel coefficient for the
        # rbf/poly/sigmoid kernels and has no effect on the linear kernel, so
        # crossing it with "linear" only repeated identical fits.
        "param_grid": [
            {
                "reg__kernel":  ["rbf"],
                "reg__C":       [0.1, 1, 10],
                "reg__epsilon": [0.01, 0.1, 0.5],
                "reg__gamma":   ["scale", "auto"],
            },
            {
                "reg__kernel":  ["linear"],
                "reg__C":       [0.1, 1, 10],
                "reg__epsilon": [0.01, 0.1, 0.5],
            },
        ],
    },

    "K-NN Regressor": {
        "estimator": Pipeline([
            ("scaler", StandardScaler()),
            ("reg", KNeighborsRegressor()),
        ]),
        "param_grid": {
            "reg__n_neighbors": [3, 5, 11],
            "reg__weights":     ["uniform", "distance"],
            "reg__p":           [1, 2],          # 1 = Manhattan, 2 = Euclidean
        },
    },

    "Decision-Tree Regressor": {
        "estimator": DecisionTreeRegressor(random_state=42),
        "param_grid": {
            "max_depth":         [None, 5, 10, 20],
            "min_samples_split": [2, 5, 10],
            "min_samples_leaf":  [1, 2, 4],
        },
    },

    "Gradient-Boosting Regressor": {
        "estimator": GradientBoostingRegressor(random_state=42),
        "param_grid": {
            "n_estimators":   [100, 300],
            "learning_rate":  [0.05, 0.1],
            "max_depth":      [3, 5],
            "subsample":      [0.8, 1.0],
        },
    },
}

# ---------------------------------------------------------------------------
# 3. LOOP THROUGH, RUN GRID SEARCH, REPORT
# ---------------------------------------------------------------------------
for name, cfg in models_and_grids.items():
    # Everything for one model sits inside the try so that a failure in any
    # step is reported and the loop moves on to the next model.
    try:
        print(f"\n=== {name} ===")

        # 5-fold grid search on the training split. The scorer is the negated
        # MSE, so best_score_ is negated again before it is printed.
        search = GridSearchCV(
            estimator=cfg["estimator"],
            param_grid=cfg["param_grid"],
            scoring="neg_mean_squared_error",
            cv=5,
            n_jobs=-1,
        ).fit(X_train, y_train)

        print("Best params :", search.best_params_)
        print(f"Best CV MSE : {-search.best_score_:.4f}")

        # Score the best estimator found by the search on the held-out split.
        evaluate(search.best_estimator_, X_test, y_test)
    except Exception as e:
        print(f"{name} -- error: {e}\n")



=== Linear Regression ===
Best params : {'reg__fit_intercept': False, 'reg__positive': False}
Best CV MSE : 0.3708
  ▸ MSE : 0.4963
  ▸ MAE : 0.4796
  ▸ RMSE: 0.7045
  ▸ R²  : 0.5686


=== Random Forest Regressor ===
Best params : {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 300}
Best CV MSE : 0.3184
  ▸ MSE : 0.5057
  ▸ MAE : 0.4337
  ▸ RMSE: 0.7111
  ▸ R²  : 0.5604


=== Support-Vector Regressor ===
Best params : {'reg__C': 0.1, 'reg__epsilon': 0.1, 'reg__gamma': 'scale', 'reg__kernel': 'linear'}
Best CV MSE : 0.3673
  ▸ MSE : 0.5015
  ▸ MAE : 0.4860
  ▸ RMSE: 0.7082
  ▸ R²  : 0.5640


=== K-NN Regressor ===
Best params : {'reg__n_neighbors': 11, 'reg__p': 1, 'reg__weights': 'distance'}
Best CV MSE : 0.5539
  ▸ MSE : 0.6546
  ▸ MAE : 0.5855
  ▸ RMSE: 0.8091
  ▸ R²  : 0.4310


=== Decision-Tree Regressor ===
Best params : {'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 10}
Best CV MSE : 0.4286
  ▸ MSE : 0.7358
  ▸ MAE : 0.5280
  ▸ RMSE: 0.8578
  ▸ R²  : 0.3603

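| Rank | Model                       | R²         | Test RMSE | Gap\*       |
| ---- | --------------------------- | ---------- | --------- | ----------- |
| 🥇   | **Linear Regression**       | **0.5686** | 0.7045    | **+0.1339** |
| 🥈   | Support-Vector Regressor    | 0.5640     | 0.7082    | +0.1409     |
| 🥉   | Random Forest Regressor     | 0.5604     | 0.7111    | +0.1927     |
| 4    | Gradient Boosting Regressor | 0.5572     | 0.7137    | +0.2012     |
| 5    | K-NN Regressor              | 0.4310     | 0.8091    | +0.2552     |
| 6    | Decision Tree Regressor     | 0.3603     | 0.8578    | +0.4292     |

\* Gap is not defined in this notebook. NOTE(review): the values in this column do not equal test MSE − best CV MSE computed from the output above (e.g. Linear Regression: 0.4963 − 0.3708 = 0.1255; K-NN: 0.6546 − 0.5539 = 0.1007) — confirm the definition and recompute before relying on this column.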
