## Model Selection Regression

## Import libraries

In [1]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt
import pickle


## Load data

In [2]:
# Read the pickled data bundle from disk.
# NOTE: pickle can execute arbitrary code on load; only open trusted files.
with open('pca_data.pkl', 'rb') as data_file:
    pca_data = pickle.load(data_file)

# Pull out the two entries used by the rest of the notebook.
# presumably X_raw is the feature matrix and y the regression target — confirm
# against whatever produced 'pca_data.pkl'.
X_raw = pca_data['X_raw']
y = pca_data['y']

## Select models to train

In [3]:
# Registry of candidate regressors, keyed by the name used to select them.
# Seeds are fixed where the estimator accepts one so runs are repeatable.
available_models = {
    "LinearRegression": LinearRegression(),
    "Ridge": Ridge(alpha=1.0),
    "Lasso": Lasso(alpha=0.1),
    "RandomForest": RandomForestRegressor(random_state=42),
    "GradientBoosting": GradientBoostingRegressor(random_state=42),
    "SVR": SVR(),
}

## USER INPUT: Choose models (names must be keys of available_models)
selected_models = [
    "LinearRegression",
    "RandomForest",
    "Ridge",
    "Lasso",
    "GradientBoosting",
    "SVR",
]


## USER INPUT: PCA settings
# When True, a PCA step is inserted between scaling and the regressor.
use_pca = False
# Number of principal components to keep; only used when use_pca is True.
n_components = 5

## Training

In [None]:
# Cross-validate every selected model and remember the one with the lowest
# mean MSE.

# Initializing CV: shuffled K-fold with a fixed seed so the splits repeat
cv = KFold(n_splits=5, shuffle=True, random_state=42)
# Read the fold count back from the splitter so the log line below can never
# drift from the actual configuration.
n_folds = cv.get_n_splits()
cv_results = {}  # model name -> {"mean_mse", "std_mse", "pipeline"}
best_model = None
best_mean_mse = float("inf")

# The component count is only meaningful when PCA is part of the pipeline,
# so it is reported only in that case.
if use_pca:
    pca_description = f"PCA: {use_pca}, Components: {n_components}"
else:
    pca_description = f"PCA: {use_pca}"

# Training each of the selected models
for model_name in tqdm(selected_models, desc="Training Models"):
    if model_name not in available_models:
        print(f"Model {model_name} is not available.")
        continue

    # Initializing the model
    model = available_models[model_name]

    # Defining components for pipeline: scaler -> (optional PCA) -> regressor.
    # Keeping preprocessing inside the pipeline means it is fitted together
    # with the regressor on whatever data the pipeline is fitted on.
    steps = [("scaler", StandardScaler())]
    if use_pca:
        steps.append(("pca", PCA(n_components=n_components)))
    steps.append(("regressor", model))

    # Creating the pipeline
    pipeline = Pipeline(steps)

    # Performing CV
    print(f"\nTraining {model_name} with {n_folds}-fold Cross-Validation ({pca_description}):")
    # The scorer yields negated MSE (higher is better); flip the sign back.
    cv_scores = -cross_val_score(pipeline, X_raw, y, cv=cv, scoring='neg_mean_squared_error')

    # Storing the results. Note: cross_val_score fits clones, so the pipeline
    # object stored here is still unfitted.
    mean_mse = np.mean(cv_scores)
    std_mse = np.std(cv_scores)
    cv_results[model_name] = {
        "mean_mse": mean_mse,
        "std_mse": std_mse,
        "pipeline": pipeline
    }

    # Finding best model (plain comparison of means, not statistically proven)
    if mean_mse < best_mean_mse:
        best_mean_mse = mean_mse
        best_model = pipeline

    # Displaying results
    print(f"\n{model_name} Cross-Validation Results:")
    print(f"Mean MSE: {mean_mse:.2f}")
    print(f"Standard Deviation of MSE: {std_mse:.2f}")
    print("_" * 100)

## Save best model

In [None]:
# Persist the best pipeline found during cross-validation.
if best_model is None:
    # No model was trained (e.g. every selected name was unavailable).
    # Fail before writing a pickle that would only contain None.
    raise RuntimeError("No model was trained; there is no best model to save.")

# cross_val_score fits clones of the estimator, so best_model is still
# unfitted at this point. Refit it on the full dataset so the saved pipeline
# can be used for prediction right after loading.
best_model.fit(X_raw, y)

with open('best_model.pkl', 'wb') as f:
    pickle.dump(best_model, f)

print(f"\nBest Model: {best_model.named_steps['regressor'].__class__.__name__}")
print(f"Best Model Mean MSE: {best_mean_mse:.2f}")

## Cross-validation results

In [None]:
def plot_cv_results(cv_results):
    """Show a horizontal bar chart comparing the models' cross-validated MSE.

    Args:
        cv_results: Mapping of model name to a dict that contains at least
            the keys "mean_mse" and "std_mse".

    Each bar's length is the model's mean MSE and its error bar is the
    standard deviation. Bars follow the mapping's iteration order, with the
    first entry drawn at the top.
    """
    labels = list(cv_results.keys())
    means = [entry["mean_mse"] for entry in cv_results.values()]
    spreads = [entry["std_mse"] for entry in cv_results.values()]

    plt.figure(figsize=(10, 6))
    plt.barh(
        labels,
        means,
        xerr=spreads,
        alpha=0.7,
        color='skyblue',
        capsize=5,
    )
    plt.xlabel('Mean MSE (with Std)')
    plt.title('Model Comparison: Cross-Validated Mean Squared Error')
    # barh stacks bars bottom-up; flip the axis so the first model is on top.
    plt.gca().invert_yaxis()
    plt.grid(axis='x', linestyle='--', alpha=0.7)
    plt.show()


plot_cv_results(cv_results)