In [1]:
import numpy as np

# Load the data
X_train = np.load('/kaggle/input/ftml-project-regression/X_train.npy')
X_test = np.load('/kaggle/input/ftml-project-regression/X_test.npy')
y_train = np.load('/kaggle/input/ftml-project-regression/y_train.npy').ravel()
y_test = np.load('/kaggle/input/ftml-project-regression/y_test.npy').ravel()

# Inspect the data
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

# Check for NaN values
print("NaN values in X_train:", np.isnan(X_train).sum())
print("NaN values in X_test:", np.isnan(X_test).sum())
print("NaN values in y_train:", np.isnan(y_train).sum())
print("NaN values in y_test:", np.isnan(y_test).sum())

X_train shape: (200, 200)
X_test shape: (200, 200)
y_train shape: (200,)
y_test shape: (200,)
NaN values in X_train: 0
NaN values in X_test: 0
NaN values in y_train: 0
NaN values in y_test: 0


In this exercise we will test 3 different methods : LGBM, Lasso, and ensemble (leveraging the two methods). We will compare their f2 score on the test set.

In [2]:
from sklearn.preprocessing import StandardScaler

# Initialize the scaler
scaler = StandardScaler()

# Fit the scaler on the training data and transform both train and test data
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

LightGBM uses Bayesian optimization for efficient hyperparameter tuning. The solver employs gradient-based one-side sampling and exclusive feature bundling for faster training. Hyperparameters like num_leaves, max_depth, and learning_rate have been adjusted. Cross-validation with 5-folds is used to help the model generalize well.

In [3]:
from sklearn.feature_selection import RFE
from lightgbm import LGBMRegressor

# Initialize the model
base_model = LGBMRegressor(random_state=42, verbose=-1)

# Apply Recursive Feature Elimination
selector = RFE(base_model, n_features_to_select=150, step=1)
selector = selector.fit(X_train_scaled, y_train)

# Transform the data
X_train_selected = selector.transform(X_train_scaled)
X_test_selected = selector.transform(X_test_scaled)

In [9]:
from bayes_opt import BayesianOptimization
from lightgbm import LGBMRegressor
from sklearn.model_selection import cross_val_score
import numpy as np

# Define the function we want to optimize
def lgb_evaluate(num_leaves, max_depth, learning_rate, n_estimators, subsample, min_child_samples, reg_alpha, reg_lambda):
    params = {
        'num_leaves': int(num_leaves),
        'max_depth': int(max_depth),
        'learning_rate': learning_rate,
        'n_estimators': int(n_estimators),
        'subsample': subsample,
        'min_child_samples': int(min_child_samples),
        'reg_alpha': reg_alpha,
        'reg_lambda': reg_lambda,
        'random_state': 42,
        'verbose': -1
    }
    model = LGBMRegressor(**params)
    # Perform 5-fold cross-validation and return the mean score
    score = cross_val_score(model, X_train_selected, y_train, cv=5, scoring='r2').mean()
    return score

# Define the parameter space
param_bounds = {
    'num_leaves': (2, 256),
    'max_depth': (3, 20),
    'learning_rate': (0.01, 0.3),
    'n_estimators': (100, 1000),
    'subsample': (0.4, 1.0),
    'min_child_samples': (5, 100),
    'reg_alpha': (1e-3, 10.0),
    'reg_lambda': (1e-3, 10.0)
}

# Initialize the Bayesian Optimizer
optimizer = BayesianOptimization(f=lgb_evaluate, pbounds=param_bounds, random_state=42)

# Optimize the hyperparameters
optimizer.maximize(init_points=10, n_iter=200)

# Extract the best parameters
best_params = optimizer.max['params']
best_params['num_leaves'] = int(best_params['num_leaves'])
best_params['max_depth'] = int(best_params['max_depth'])
best_params['n_estimators'] = int(best_params['n_estimators'])
best_params['min_child_samples'] = int(best_params['min_child_samples'])

|   iter    |  target   | learni... | max_depth | min_ch... | n_esti... | num_le... | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------------------
| [0m1        [0m | [0m0.5195   [0m | [0m0.1186   [0m | [0m19.16    [0m | [0m74.54    [0m | [0m638.8    [0m | [0m41.63    [0m | [0m1.561    [0m | [0m0.5818   [0m | [0m0.9197   [0m |
| [0m2        [0m | [0m0.2395   [0m | [0m0.1843   [0m | [0m15.04    [0m | [0m6.956    [0m | [0m972.9    [0m | [0m213.4    [0m | [0m2.124    [0m | [0m1.819    [0m | [0m0.51     [0m |
| [95m3        [0m | [95m0.6484   [0m | [95m0.09823  [0m | [95m11.92    [0m | [95m46.03    [0m | [95m362.1    [0m | [95m157.4    [0m | [95m1.396    [0m | [95m2.922    [0m | [95m0.6198   [0m |
| [0m4        [0m | [0m0.5082   [0m | [0m0.1423   [0m | [0m16.35    [0m | [0m23.97    [0m | [0m562.8    [0m | [0m152.5    [0m 

In [10]:
from lightgbm import LGBMRegressor, early_stopping, log_evaluation
from sklearn.metrics import r2_score

# Train the model with the best parameters

best_model = LGBMRegressor(**best_params)
best_model.fit(
    X_train_selected, y_train,
    eval_set=[(X_test_selected, y_test)],
    eval_metric='r2',
    callbacks=[early_stopping(stopping_rounds=100)],
)

# Predict and evaluate
y_pred = best_model.predict(X_test_selected)
r2 = r2_score(y_test, y_pred)
print(f'R2 Score for LightGBM with Bayesian Optimization and Feature Selection: {r2}')

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000997 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13675
[LightGBM] [Info] Number of data points in the train set: 200, number of used features: 200
[LightGBM] [Info] Start training from score 4.906834
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[290]	valid_0's l2: 0.12785
R2 Score for LightGBM with Bayesian Optimization and Feature Selection: 0.8263351418820309


Lasso regression also uses Bayesian optimization to find the best alpha value. The solver is a coordinate descent algorithm. Hyperparameters such as alpha control the regularization strength. 5-fold cross-validation is used to validate performance and avoid overfitting.

In [11]:
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.model_selection import GridSearchCV, cross_val_score

# Feature selection using RFE
base_model = LinearRegression()
selector = RFE(base_model, n_features_to_select=200, step=1)
selector = selector.fit(X_train_scaled, y_train)
X_train_selected = selector.transform(X_train_scaled)
X_test_selected = selector.transform(X_test_scaled)

# Define the function we want to optimize
def lasso_evaluate(alpha):
    model = Lasso(alpha=alpha, max_iter=10000)
    score = cross_val_score(model, X_train_selected, y_train, cv=5, scoring='r2').mean()
    return score

# Define the parameter bounds
lasso_param_bounds = {
    'alpha': (1e-4, 1)
}


In [12]:
# Perform Bayesian Optimization for Lasso
lasso_optimizer = BayesianOptimization(
    f=lasso_evaluate,
    pbounds=lasso_param_bounds,
    random_state=42,
    verbose=2
)

lasso_optimizer.maximize(init_points=40, n_iter=100)

# Extract the best parameters
best_lasso_params = lasso_optimizer.max['params']


|   iter    |  target   |   alpha   |
-------------------------------------
| [0m1        [0m | [0m-0.09098 [0m | [0m0.3746   [0m |
| [0m2        [0m | [0m-0.1097  [0m | [0m0.9507   [0m |
| [0m3        [0m | [0m-0.1097  [0m | [0m0.732    [0m |
| [0m4        [0m | [0m-0.1097  [0m | [0m0.5987   [0m |
| [95m5        [0m | [95m0.5833   [0m | [95m0.1561   [0m |
| [95m6        [0m | [95m0.5834   [0m | [95m0.1561   [0m |
| [95m7        [0m | [95m0.8916   [0m | [95m0.05818  [0m |
| [0m8        [0m | [0m-0.1097  [0m | [0m0.8662   [0m |
| [0m9        [0m | [0m-0.1097  [0m | [0m0.6012   [0m |
| [0m10       [0m | [0m-0.1097  [0m | [0m0.7081   [0m |
| [95m11       [0m | [95m0.9248   [0m | [95m0.02068  [0m |
| [0m12       [0m | [0m-0.1097  [0m | [0m0.9699   [0m |
| [0m13       [0m | [0m-0.1097  [0m | [0m0.8325   [0m |
| [0m14       [0m | [0m0.3175   [0m | [0m0.2124   [0m |
| [0m15       [0m | [0m0.4649   [0m | [

In [13]:
# Train on the train set and evaluate the best Lasso model on the test set
best_lasso_model = Lasso(**best_lasso_params, max_iter=10000, random_state=42)
best_lasso_model.fit(X_train_selected, y_train)
y_pred_lasso = best_lasso_model.predict(X_test_selected)
r2_lasso = r2_score(y_test, y_pred_lasso)
print(f'R2 Score for Lasso Regression: {r2_lasso}')

R2 Score for Lasso Regression: 0.9230806362284361


The stacking model combines the predictions of Lasso and LightGBM. We used Cross-validation with 5-folds is used to ensure performance.

In [14]:
# Model Stacking
from sklearn.ensemble import StackingRegressor

stacking_model = StackingRegressor(
    estimators=[
        ('lasso', best_lasso_model),
        ('lgbm', best_model)
    ],
    final_estimator=LinearRegression(),
    cv=5
)

stacking_model.fit(X_train_scaled, y_train)
y_pred_stacking = stacking_model.predict(X_test_scaled)
r2_stacking = r2_score(y_test, y_pred_stacking)

# Evaluate the stacking model on the test set
print(f'R2 Score for Stacking Model: {r2_stacking}')

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002095 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13675
[LightGBM] [Info] Number of data points in the train set: 200, number of used features: 200
[LightGBM] [Info] Start training from score 4.906834
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005325 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11000
[LightGBM] [Info] Number of data points in the train set: 160, number of used features: 200
[LightGBM] [Info] Start training from score 4.831619
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000644 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11000
[LightGBM] [Info] Number of data points in the train se

The best performing model is the ensemble model which performs slighlty better than the lasso. 