# Model Selection

In this exercise section, we will look at some of the methods used to select a model. These include:

- backward feature selection, which is a supervised methodology to identify the relevant features for a given task;
- regularization with the $\lambda$ parameter of Lasso;
- Hyperparameter tuning of XGBoost and MLP Neural Network.

In [55]:
import os
import pandas as pd
import numpy as np

In [56]:
# seed NumPy's global random number generator so that repeated runs draw the same numbers
np.random.seed(42)

In [57]:
# folder holding the input files; it is joined with the file name when the data is loaded
data_path = "data"

In [58]:
# load the cleaned dataset produced in the previous lesson;
# the first column of the sheet is used as the dataframe index
clean_data_file = os.path.join(data_path, "clean_data.xlsx")
station_df = pd.read_excel(clean_data_file, index_col=0)

In [None]:
# preview the first rows of the loaded dataframe
station_df.head()

## Backward Feature Selection

Let us try to identify a convenient subset of $k$ features for our regression task. In principle, for each $k \in \{ 1, ..., 9 \}$ we would have to train $\binom{9}{k}$ models, one for each possible subset of $k$ features, and compare all of them against each other. This is quite inefficient. Instead, we start from the complete set of features and iteratively remove the least relevant one. The relevance of a feature can be specified in different ways. For simple models like linear regression, the feature whose coefficient has the highest p-value is usually removed, and the procedure stops when no remaining feature has a p-value above a chosen threshold (e.g. 0.05). The implementation below uses a closely related criterion based on the AIC: at each step it removes the feature whose removal lowers the AIC the most, and it stops when no removal lowers the AIC any further.

In [60]:
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

In [61]:
# separate the target column from the predictors
y = station_df["DOC (mg/l)"].copy()
X = station_df.drop(columns=["DOC (mg/l)"]).copy()

# no shuffling: the first 80% of the rows are used for training, the last 20% for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False, test_size=0.2)

# min-max scale the training features and keep them as a dataframe
# with the original column names and row index
scaler = MinMaxScaler()
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train),
    index=X_train.index,
    columns=X.columns,
)

In [62]:
# Baseline: a linear regression (OLS) that uses all the features.
# An explicit constant column is added to the design matrix for the intercept.
X_train_scaled = sm.add_constant(X_train_scaled)
full_ols = sm.OLS(y_train, X_train_scaled)
initial_model = full_ols.fit()

In [63]:
import numpy as np
import statsmodels.api as sm

def backward_feature_selection(X_bfs, y_bfs):
    """Greedy backward feature elimination for OLS, driven by the AIC.

    Starting from all the columns of ``X_bfs``, each round refits an OLS
    model once per candidate feature with that feature left out, and then
    permanently drops the feature whose removal yields the lowest AIC. The
    search stops as soon as no single removal lowers the AIC, or when only
    one column is left.

    The baseline AIC is computed from a model fitted on ``X_bfs`` itself, so
    the function does not depend on any model fitted outside of it.

    Parameters
    ----------
    X_bfs : pandas.DataFrame
        Feature matrix. A column named ``"const"`` (the name used by
        ``sm.add_constant``) is treated as the intercept: it is always kept
        and is never a removal candidate.
    y_bfs : array-like
        Target values aligned with the rows of ``X_bfs``.

    Returns
    -------
    best_features : list
        Names of the retained columns of ``X_bfs``.
    best_model : statsmodels regression results
        OLS fit on the retained columns. When no removal improves the AIC,
        this is the model fitted on all the columns (never ``None``).
    """
    intercept_name = "const"  # column name produced by sm.add_constant

    def fit_ols(columns):
        # add_constant (default has_constant="skip") leaves the frame
        # unchanged when it already contains a constant column
        return sm.OLS(y_bfs, sm.add_constant(X_bfs[columns])).fit()

    features = list(X_bfs.columns)
    best_features = features.copy()

    # Baseline: the model with every feature, fitted on the data passed in
    best_model = fit_ols(features)
    best_aic = best_model.aic

    # while there are features to remove
    while len(features) > 1:
        # feature dropped in this round; stays None if nothing improves the AIC
        removed_feature = None

        for feature in features:
            # the intercept is not a feature to select: dropping it would only
            # make add_constant put it back and refit the very same model
            if feature == intercept_name:
                continue

            remaining_features = [f for f in features if f != feature]
            model = fit_ols(remaining_features)

            # keep the removal that gives the lowest AIC seen so far
            if model.aic < best_aic:
                best_aic = model.aic
                best_features = remaining_features
                best_model = model
                removed_feature = feature

        # If no feature was removed, stop the loop
        if removed_feature is None:
            print("No further improvement in AIC. Stopping.")
            break

        print(f"New best AIC: {best_aic} | Removed feature: {removed_feature}")
        # continue the search from the reduced feature set
        features = best_features.copy()

    return best_features, best_model


In [None]:
# run the backward elimination on the scaled training features and the training target
selected_features, model = backward_feature_selection(X_train_scaled, y_train)

In [None]:
# list the retained columns, leave a blank line, then print the summary of the final model
print(f"Selected features: {selected_features}", end="\n\n")
print(model.summary())

## Regularization

We are going to use Lasso regularization, which also acts as a model selection and feature selection technique: it adds an $L_1$-norm penalty on the coefficients to the loss function, shrinking the coefficients of less important features toward zero. We are going to fit the model for a range of values of the regularization term $\lambda$ and evaluate each fit on a held-out validation set. Note that if $\lambda = 0$, we go back to classic OLS linear regression.

In [66]:
from sklearn.linear_model import Lasso
import matplotlib.pyplot as plt

In [67]:
# features are every column except the target, which is kept apart
X = station_df.drop(columns=["DOC (mg/l)"]).copy()
y = station_df["DOC (mg/l)"].copy()

# no shuffling: the first 80% of the rows train the model, the last 20% validate it
X_train, X_valid, y_train, y_valid = train_test_split(X, y, shuffle=False, test_size=0.2)

# fit the min-max scaler on the training part only, then apply it to both parts
# and wrap the results back into dataframes with the original labels
scaler = MinMaxScaler().fit(X_train)
X_train_scaled, X_valid_scaled = (
    pd.DataFrame(scaler.transform(part), columns=X.columns, index=part.index)
    for part in (X_train, X_valid)
)

In [None]:
# show how the coefficients change with different lambdas (alphas in sklearn)
# together with the model performance

# The x-axis of the plot is logarithmic, so the alphas are spaced geometrically:
# every decade between 1e-5 and 1 gets the same number of points. A linearly
# spaced grid would place almost all of its points above 1e-3 and leave the
# small-alpha region of the plot nearly empty.
alphas = np.geomspace(1e-5, 1, 1000)
coeffs = []
errors = []

for alpha in alphas:
    model = Lasso(alpha=alpha)
    model.fit(X_train_scaled, y_train)
    coeffs.append(model.coef_)
    # performance is measured on the validation set, not on the training data
    errors.append(mean_squared_error(y_valid, model.predict(X_valid_scaled)))

# one row per alpha, one column per feature
coeffs = np.array(coeffs)

fig, ax1 = plt.subplots(figsize=(10, 6))

# one coefficient path per feature, labelled with the training column name
for feature_name, coefficient_path in zip(X_train_scaled.columns, coeffs.T):
    ax1.plot(alphas, coefficient_path, label=feature_name)

ax1.set_xscale("log")
ax1.set_xlabel("Alpha (log scale)")
ax1.set_ylabel("Coefficient Value")
ax1.set_title("Lasso Coefficients and Model Performance vs. Alpha")
ax1.legend(bbox_to_anchor=(1.05, 1), loc="upper left")
ax1.grid(True)

# the validation error is drawn on a second y-axis that shares the alpha axis
ax2 = ax1.twinx()
ax2.plot(alphas, errors, color="red", linestyle="dashed", label="MSE")
ax2.set_ylabel("MSE", color="red")
ax2.tick_params(axis="y", labelcolor="red")
ax2.legend(loc="lower right")

plt.show()


We can see that as alpha increases, the coefficients shrink to zero, leading to an overly simple model in which only the constant term is left, which is why the MSE becomes constant. When alpha is small, many coefficients retain their magnitude. When alpha reaches $10^{-3}$, the coefficients start to shrink, and around alpha = $10^{-2}$ a subset of the features has a coefficient close to zero, meaning that Lasso is performing feature selection. We can also see a slight decrease of the MSE, meaning that the feature selection performed by Lasso leads to a better representation of the relationship between the target and the feature variables.

## Hyperparameter Tuning

Hyperparameter tuning is the process of finding the optimal values for a model's hyperparameters, which are parameters that are not learned directly from the data but rather set before training. These hyperparameters control how the model learns and generalizes.

We are going to see two simple techniques, Grid Search (systematically evaluates predefined hyperparameter values) and Random Search (samples random combinations within a given range), but more sophisticated ones can also be used, such as [Bayesian Optimization](https://en.wikipedia.org/wiki/Bayesian_optimization#:~:text=Bayesian%20optimization%20is%20a%20sequential,expensive%2Dto%2Devaluate%20functions.) or frameworks like [Optuna](https://optuna.readthedocs.io/en/stable/).

The models we are going to tune are an MLP Neural Network and XGBoost. We use the scikit-learn package, which offers both Grid Search and Random Search. For further information, see [GridSearchCV](https://scikit-learn.org/dev/modules/generated/sklearn.model_selection.GridSearchCV.html#sklearn.model_selection.GridSearchCV) and [RandomizedSearchCV](https://scikit-learn.org/dev/modules/generated/sklearn.model_selection.RandomizedSearchCV.html#sklearn.model_selection.RandomizedSearchCV).

Hyperparameter tuning is usually performed with cross-validation in order to ensure that the selected hyperparameters generalize well on unseen data.

In [69]:
# import necessary libraries
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [70]:
# predictors (X) and target (y)
X = station_df.drop(columns=["DOC (mg/l)"]).copy()
y = station_df["DOC (mg/l)"].copy()

# unshuffled split: the first 80% of the rows for training, the last 20% for testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, shuffle=False, test_size=0.2
)

# the scaler learns the feature ranges from the training set only
scaler = MinMaxScaler()
scaler.fit(X_train)

# scale both sets and restore the dataframe structure (column names and row index)
X_train_scaled = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X.columns)

### Neural Network

Each neuron in the neural network is characterized by an activation function, which is a transformation applied to its input. Several activation functions are available. The most commonly used are:

- ReLU (Rectified Linear Unit)
- Tanh
- Sigmoid

In [None]:
# ReLU activation function: relu(x) = max(0, x)

x = np.linspace(-10, 10, 100)
# stored under its own name so that the target `y` defined in an earlier cell
# is not overwritten by plot data
relu = np.maximum(0, x)

plt.figure(figsize=(10, 6))
plt.plot(x, relu)
plt.title("ReLu Activation Function")
plt.grid(True)
plt.show()

In [None]:
# Tanh activation function

x = np.linspace(-10, 10, 100)
# stored under its own name so that the target `y` defined in an earlier cell
# is not overwritten by plot data
tanh = np.tanh(x)

plt.figure(figsize=(10, 6))
plt.plot(x, tanh)
plt.title("Tanh Activation Function")
plt.grid(True)
plt.show()

In [None]:
# Sigmoid activation function: sigmoid(x) = 1 / (1 + exp(-x))

x = np.linspace(-10, 10, 100)
# stored under its own name so that the target `y` defined in an earlier cell
# is not overwritten by plot data
sigmoid = 1 / (1 + np.exp(-x))

plt.figure(figsize=(10, 6))
plt.plot(x, sigmoid)
plt.title("Sigmoid Activation Function")
plt.grid(True)
plt.show()

In [28]:
# Search space for the MLP: each key is a hyperparameter and each list holds the
# candidate values to try. Since there are a lot of hyperparameters, only a few
# of them are explored here.
mlp_params = dict(
    hidden_layer_sizes=[(20,), (50,), (20, 20), (50, 50)],  # number of neurons in each layer
    activation=["relu", "logistic", "tanh"],  # activation function
    solver=["adam", "sgd"],  # optimization algorithm
    learning_rate=["constant", "adaptive"],  # learning rate schedule
    batch_size=[16, 32],  # size of minibatches
)

#### Grid Search

In [29]:
# base network whose hyperparameters are tuned
estimator = MLPRegressor(max_iter=1000)

# grid search: every combination of the values in mlp_params is evaluated with 5-fold CV
grid_search = GridSearchCV(
    estimator,
    mlp_params,
    scoring="neg_mean_squared_error",  # negated MSE: the highest score is the lowest MSE
    cv=5,
    verbose=3,  # change to higher values to see more information
)

In [None]:
# this could take a couple of minutes
grid_search.fit(X_train_scaled, y_train)

In [None]:
import pprint

# print the results of the best model found
# (the search scores with the negated MSE, so the sign is flipped back to report the MSE)
print("The best parameters are:")
pprint.pprint(grid_search.best_params_)
print()
print(f"The best score is: {-grid_search.best_score_.round(3)}")

#### Random Search

In [32]:
# base network whose hyperparameters are tuned
estimator = MLPRegressor(max_iter=1000)

# random search: only n_iter combinations drawn from mlp_params are evaluated with 5-fold CV
random_search = RandomizedSearchCV(
    estimator,
    mlp_params,
    n_iter=10,
    scoring="neg_mean_squared_error",  # negated MSE: the highest score is the lowest MSE
    cv=5,
    verbose=3,  # change to higher values to see more information
)

In [None]:
# run the cross-validated random search on the scaled training data
random_search.fit(X_train_scaled, y_train)

In [None]:
# print the results of the best model found
# (the search scores with the negated MSE, so the sign is flipped back to report the MSE)
print("The best parameters are:")
pprint.pprint(random_search.best_params_)
print()
print(f"The best score is: {-random_search.best_score_.round(3)}")

### XGBoost

In [35]:
# Search space for XGBoost (a tree-based model): candidate values for each hyperparameter
xgb_params = dict(
    n_estimators=[20, 50, 100],  # number of trees
    eta=[0.01, 0.1, 0.3, 0.5, 0.7],  # learning rate
    max_depth=[3, 5, 7],  # maximum depth of the trees
)

#### Grid Search

In [36]:
# base gradient-boosting regressor whose hyperparameters are tuned
estimator = XGBRegressor()

# grid search: every combination of the values in xgb_params is evaluated with 5-fold CV
grid_search = GridSearchCV(
    estimator,
    xgb_params,
    scoring="neg_mean_squared_error",  # negated MSE: the highest score is the lowest MSE
    cv=5,
    verbose=3,  # change to higher values to see more information
)

In [None]:
# run the cross-validated grid search on the scaled training data
grid_search.fit(X_train_scaled, y_train)

In [None]:
# print the results of the best model found
# (the search scores with the negated MSE, so the sign is flipped back to report the MSE)
print("The best parameters are:")
pprint.pprint(grid_search.best_params_)
print()
print(f"The best score is: {-grid_search.best_score_.round(3)}")

#### Random Search

In [39]:
# A fresh estimator is created here (as in the MLP random search) so that this
# cell does not depend on the `estimator` variable left over from the
# grid-search cell.
estimator = XGBRegressor()

# random search: only n_iter combinations drawn from xgb_params are evaluated with 5-fold CV
random_search = RandomizedSearchCV(
    n_iter=10,  # stated explicitly; same value as the scikit-learn default
    estimator=estimator,
    param_distributions=xgb_params,
    cv=5,
    scoring="neg_mean_squared_error", # we want to minimize the MSE
    verbose=3, # change to higher values to see more information
)

In [None]:
# run the cross-validated random search on the scaled training data
random_search.fit(X_train_scaled, y_train)

In [None]:
# print the results of the best model found
# (the search scores with the negated MSE, so the sign is flipped back to report the MSE)
print("The best parameters are:")
pprint.pprint(random_search.best_params_)
print()
print(f"The best score is: {-random_search.best_score_.round(3)}")

We can see that grid search evaluates every possible combination of the given parameter values, while random search evaluates only a subset of them. The trade-off is that grid search is guaranteed to find the combination with the smallest error among the candidates in the grid, but its computation time is much higher than that of random search, as it is basically a brute-force technique.