imports

In [1]:
# %% Imports -----------------------------------------------------------
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.impute import KNNImputer

from statsmodels.stats.outliers_influence import variance_inflation_factor

import sys
sys.path.insert(0, "../src")   # keep original relative module path
import SCORE_functions as sf     # project‑specific helpers stay the same

load data

In [3]:
# DATA_PATH = "../data/alldatatoML.xlsx"
PATH_long = "../data/alldatatoML_preprocessed_long.xlsx"
PATH_short = "../data/alldatatoML_preprocessed_short.xlsx"

dataset = pd.read_excel(PATH_long)

split data

In [4]:
sco1_cols = [c for c in dataset.columns if c.startswith("sco1 - ")]
sco2_cols = [c for c in dataset.columns if c.startswith("sco2 - ")]
cat_cols = [c for c in dataset.columns if c.startswith("cat - ")]

feature_cols = cat_cols + sco1_cols

X = dataset[feature_cols]
y = dataset[sco2_cols[-1]]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)#, stratify=y)

In [5]:
# Common utilities — run *once* before any model blocks
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings("ignore")

def evaluate(model, X_test, y_test):
    """Print MSE, MAE, RMSE and R² on X_test / y_test."""
    y_pred = model.predict(X_test)
    mse  = mean_squared_error(y_test, y_pred)
    mae  = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2   = r2_score(y_test, y_pred)

    print(f"  ▸ MSE : {mse:.4f}")
    print(f"  ▸ MAE : {mae:.4f}")
    print(f"  ▸ RMSE: {rmse:.4f}")
    print(f"  ▸ R²  : {r2:.4f}\n")


In [6]:
# --- Linear Regression ------------------------------------------------------
try:
    from sklearn.linear_model import LinearRegression

    lr = LinearRegression()
    lr_grid = {"fit_intercept": [True, False]}
    lr_search = GridSearchCV(
        lr,
        lr_grid,
        cv=5,
        scoring="neg_mean_squared_error",
        n_jobs=-1
    ).fit(X_train, y_train)

    print("Linear Regression")
    print(" Best params :", lr_search.best_params_)
    print(f" Best CV MSE : {-lr_search.best_score_:.4f}")
    evaluate(lr_search.best_estimator_, X_test, y_test)
except Exception as e:
    print("Linear Regression -- error:", e, "\n")


Linear Regression
 Best params : {'fit_intercept': True}
 Best CV MSE : 0.4977
  ▸ MSE : 0.4953
  ▸ MAE : 0.4881
  ▸ RMSE: 0.7038
  ▸ R²  : 0.5694



In [7]:
# --- Random Forest Regressor -------------------------------------------------
try:
    from sklearn.ensemble import RandomForestRegressor

    rf = RandomForestRegressor(random_state=42)
    rf_grid = {
        "n_estimators": [100, 300],
        "max_depth": [None, 10, 20],
        "min_samples_split": [2, 5]
    }
    rf_search = GridSearchCV(
        rf,
        rf_grid,
        cv=5,
        scoring="neg_mean_squared_error",
        n_jobs=-1
    ).fit(X_train, y_train)

    print("Random Forest Regressor")
    print(" Best params :", rf_search.best_params_)
    print(f" Best CV MSE : {-rf_search.best_score_:.4f}")
    evaluate(rf_search.best_estimator_, X_test, y_test)
except Exception as e:
    print("Random Forest Regressor -- error:", e, "\n")


Random Forest Regressor
 Best params : {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 100}
 Best CV MSE : 0.3101
  ▸ MSE : 0.4994
  ▸ MAE : 0.4295
  ▸ RMSE: 0.7067
  ▸ R²  : 0.5658



In [8]:
# --- Support-Vector Regressor -----------------------------------------------
try:
    from sklearn.svm import SVR

    svr = SVR()
    svr_grid = {
        "kernel": ["rbf", "poly", "linear"],
        "C": [0.1, 1, 10],
        "epsilon": [0.01, 0.1, 0.5],
        "gamma": ["scale", "auto"]
    }
    svr_search = GridSearchCV(
        svr,
        svr_grid,
        cv=5,
        scoring="neg_mean_squared_error",
        n_jobs=-1
    ).fit(X_train, y_train)

    print("Support-Vector Regressor")
    print(" Best params :", svr_search.best_params_)
    print(f" Best CV MSE : {-svr_search.best_score_:.4f}")
    evaluate(svr_search.best_estimator_, X_test, y_test)
except Exception as e:
    print("SVR -- error:", e, "\n")


Support-Vector Regressor
 Best params : {'C': 0.1, 'epsilon': 0.1, 'gamma': 'scale', 'kernel': 'linear'}
 Best CV MSE : 0.3615
  ▸ MSE : 0.4490
  ▸ MAE : 0.4673
  ▸ RMSE: 0.6700
  ▸ R²  : 0.6097



In [9]:
# --- K-Nearest-Neighbors Regressor ------------------------------------------
try:
    from sklearn.neighbors import KNeighborsRegressor

    knn = KNeighborsRegressor()
    knn_grid = {
        "n_neighbors": [3, 5, 11],
        "weights": ["uniform", "distance"],
        "p": [1, 2]                 # 1=Manhattan, 2=Euclidean
    }
    knn_search = GridSearchCV(
        knn,
        knn_grid,
        cv=5,
        scoring="neg_mean_squared_error",
        n_jobs=-1
    ).fit(X_train, y_train)

    print("K-NN Regressor")
    print(" Best params :", knn_search.best_params_)
    print(f" Best CV MSE : {-knn_search.best_score_:.4f}")
    evaluate(knn_search.best_estimator_, X_test, y_test)
except Exception as e:
    print("K-NN Regressor -- error:", e, "\n")


K-NN Regressor
 Best params : {'n_neighbors': 11, 'p': 1, 'weights': 'distance'}
 Best CV MSE : 0.5367
  ▸ MSE : 0.6045
  ▸ MAE : 0.5439
  ▸ RMSE: 0.7775
  ▸ R²  : 0.4745



In [10]:
# --- Decision-Tree Regressor -------------------------------------------------
try:
    from sklearn.tree import DecisionTreeRegressor

    dt = DecisionTreeRegressor(random_state=42)
    dt_grid = {
        "max_depth": [None, 5, 10, 20],
        "min_samples_split": [2, 5, 10]
    }
    dt_search = GridSearchCV(
        dt,
        dt_grid,
        cv=5,
        scoring="neg_mean_squared_error",
        n_jobs=-1
    ).fit(X_train, y_train)

    print("Decision-Tree Regressor")
    print(" Best params :", dt_search.best_params_)
    print(f" Best CV MSE : {-dt_search.best_score_:.4f}")
    evaluate(dt_search.best_estimator_, X_test, y_test)
except Exception as e:
    print("Decision-Tree Regressor -- error:", e, "\n")


Decision-Tree Regressor
 Best params : {'max_depth': 5, 'min_samples_split': 2}
 Best CV MSE : 0.4147
  ▸ MSE : 0.6660
  ▸ MAE : 0.5072
  ▸ RMSE: 0.8161
  ▸ R²  : 0.4210



In [11]:
# --- Gradient-Boosting Regressor --------------------------------------------
try:
    from sklearn.ensemble import GradientBoostingRegressor

    gb = GradientBoostingRegressor(random_state=42)
    gb_grid = {
        "n_estimators": [100, 300],
        "learning_rate": [0.05, 0.1],
        "max_depth": [3, 5],
        "subsample": [0.8, 1.0]
    }
    gb_search = GridSearchCV(
        gb,
        gb_grid,
        cv=5,
        scoring="neg_mean_squared_error",
        n_jobs=-1
    ).fit(X_train, y_train)

    print("Gradient-Boosting Regressor")
    print(" Best params :", gb_search.best_params_)
    print(f" Best CV MSE : {-gb_search.best_score_:.4f}")
    evaluate(gb_search.best_estimator_, X_test, y_test)
except Exception as e:
    print("Gradient-Boosting Regressor -- error:", e, "\n")


Gradient-Boosting Regressor
 Best params : {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100, 'subsample': 0.8}
 Best CV MSE : 0.3092
  ▸ MSE : 0.5003
  ▸ MAE : 0.4479
  ▸ RMSE: 0.7073
  ▸ R²  : 0.5651

