In [21]:
import numpy as np

# Load the data
X_train = np.load('/kaggle/input/ftml-project-regression/X_train.npy')
X_test = np.load('/kaggle/input/ftml-project-regression/X_test.npy')
y_train = np.load('/kaggle/input/ftml-project-regression/y_train.npy').ravel()
y_test = np.load('/kaggle/input/ftml-project-regression/y_test.npy').ravel()

# Inspect the data
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

# Check for NaN values
print("NaN values in X_train:", np.isnan(X_train).sum())
print("NaN values in X_test:", np.isnan(X_test).sum())
print("NaN values in y_train:", np.isnan(y_train).sum())
print("NaN values in y_test:", np.isnan(y_test).sum())

X_train shape: (200, 200)
X_test shape: (200, 200)
y_train shape: (200,)
y_test shape: (200,)
NaN values in X_train: 0
NaN values in X_test: 0
NaN values in y_train: 0
NaN values in y_test: 0


In [22]:
from sklearn.preprocessing import StandardScaler

# Initialize the scaler
scaler = StandardScaler()

# Fit the scaler on the training data and transform both train and test data
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [24]:
import optuna
from optuna import Trial, visualization
from optuna.samplers import TPESampler
import xgboost as xgb
from sklearn.model_selection import cross_val_score

def objective(trial: Trial, X_train, y_train):
    param = {
        'tree_method': 'hist',  # Change from gpu_hist to hist
        'device': 'cuda',       # Add device parameter for GPU usage
        'lambda': trial.suggest_float('lambda', 1e-3, 10.0),
        'alpha': trial.suggest_float('alpha', 1e-3, 10.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 1.0),
        'subsample': trial.suggest_float('subsample', 0.4, 1.0),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'random_state': 42,
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10)
    }

    model = xgb.XGBRegressor(**param)
    score = cross_val_score(model, X_train, y_train, cv=5, scoring='r2').mean()
    return score

study = optuna.create_study(direction='maximize', sampler=TPESampler())
study.optimize(lambda trial: objective(trial, X_train_scaled, y_train), n_trials=30)

print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

best_params = study.best_trial.params


[I 2024-06-29 18:56:46,998] A new study created in memory with name: no-name-ba236278-031a-4a07-83cd-11c7edb98f5a
[I 2024-06-29 18:56:47,938] Trial 0 finished with value: 0.12180637907868216 and parameters: {'lambda': 1.7560157373165404, 'alpha': 0.7380226850939409, 'colsample_bytree': 0.49465977214736245, 'subsample': 0.8273261179374776, 'learning_rate': 0.2980200576941274, 'n_estimators': 130, 'max_depth': 6, 'min_child_weight': 3}. Best is trial 0 with value: 0.12180637907868216.
[I 2024-06-29 18:56:48,968] Trial 1 finished with value: 0.3450975657626422 and parameters: {'lambda': 5.444206161768323, 'alpha': 6.5790876617204574, 'colsample_bytree': 0.8691864026446074, 'subsample': 0.9360811699001079, 'learning_rate': 0.06567189033401297, 'n_estimators': 118, 'max_depth': 5, 'min_child_weight': 9}. Best is trial 1 with value: 0.3450975657626422.
[I 2024-06-29 18:56:51,094] Trial 2 finished with value: 0.3065462017367861 and parameters: {'lambda': 2.2923701385193302, 'alpha': 4.7173357

Number of finished trials: 30
Best trial: {'lambda': 6.910651209694412, 'alpha': 2.6737009843519686, 'colsample_bytree': 0.8043113653840722, 'subsample': 0.6166381281401319, 'learning_rate': 0.047292440672808106, 'n_estimators': 730, 'max_depth': 3, 'min_child_weight': 5}


In [25]:
from sklearn.metrics import r2_score

# Best XGBoost model with optimal parameters
best_xgb_model = xgb.XGBRegressor(**best_params, random_state=42)
best_xgb_model.fit(X_train_scaled, y_train)
y_pred_xgb = best_xgb_model.predict(X_test_scaled)
r2_xgb = r2_score(y_test, y_pred_xgb)

print(f'R2 Score for XGBoost with Optuna: {r2_xgb}')

R2 Score for XGBoost with Optuna: 0.6229927897144087


In [26]:
from sklearn.feature_selection import RFE
from lightgbm import LGBMRegressor

# Initialize the model
base_model = LGBMRegressor(random_state=42, verbose=-1)

# Apply Recursive Feature Elimination
selector = RFE(base_model, n_features_to_select=150, step=1)
selector = selector.fit(X_train_scaled, y_train)

# Transform the data
X_train_selected = selector.transform(X_train_scaled)
X_test_selected = selector.transform(X_test_scaled)

In [27]:
from bayes_opt import BayesianOptimization
from lightgbm import LGBMRegressor
from sklearn.model_selection import cross_val_score
import numpy as np

# Define the function we want to optimize
def lgb_evaluate(num_leaves, max_depth, learning_rate, n_estimators, subsample, min_child_samples, reg_alpha, reg_lambda):
    params = {
        'num_leaves': int(num_leaves),
        'max_depth': int(max_depth),
        'learning_rate': learning_rate,
        'n_estimators': int(n_estimators),
        'subsample': subsample,
        'min_child_samples': int(min_child_samples),
        'reg_alpha': reg_alpha,
        'reg_lambda': reg_lambda,
        'random_state': 42,
        'verbose': -1
    }
    model = LGBMRegressor(**params)
    # Perform 5-fold cross-validation and return the mean score
    score = cross_val_score(model, X_train_selected, y_train, cv=5, scoring='r2').mean()
    return score

# Define the parameter space
param_bounds = {
    'num_leaves': (2, 256),
    'max_depth': (3, 20),
    'learning_rate': (0.01, 0.3),
    'n_estimators': (100, 1000),
    'subsample': (0.4, 1.0),
    'min_child_samples': (5, 100),
    'reg_alpha': (1e-3, 10.0),
    'reg_lambda': (1e-3, 10.0)
}

# Initialize the Bayesian Optimizer
optimizer = BayesianOptimization(f=lgb_evaluate, pbounds=param_bounds, random_state=42)

# Optimize the hyperparameters
optimizer.maximize(init_points=10, n_iter=200)

# Extract the best parameters
best_params = optimizer.max['params']
best_params['num_leaves'] = int(best_params['num_leaves'])
best_params['max_depth'] = int(best_params['max_depth'])
best_params['n_estimators'] = int(best_params['n_estimators'])
best_params['min_child_samples'] = int(best_params['min_child_samples'])

|   iter    |  target   | learni... | max_depth | min_ch... | n_esti... | num_le... | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------------------
| [0m1        [0m | [0m0.5543   [0m | [0m0.1186   [0m | [0m19.16    [0m | [0m74.54    [0m | [0m638.8    [0m | [0m41.63    [0m | [0m1.561    [0m | [0m0.5818   [0m | [0m0.9197   [0m |
| [0m2        [0m | [0m0.3798   [0m | [0m0.1843   [0m | [0m15.04    [0m | [0m6.956    [0m | [0m972.9    [0m | [0m213.4    [0m | [0m2.124    [0m | [0m1.819    [0m | [0m0.51     [0m |
| [95m3        [0m | [95m0.6802   [0m | [95m0.09823  [0m | [95m11.92    [0m | [95m46.03    [0m | [95m362.1    [0m | [95m157.4    [0m | [95m1.396    [0m | [95m2.922    [0m | [95m0.6198   [0m |
| [0m4        [0m | [0m0.5489   [0m | [0m0.1423   [0m | [0m16.35    [0m | [0m23.97    [0m | [0m562.8    [0m | [0m152.5    [0m 

In [28]:
# Train the model with the best parameters
best_model = LGBMRegressor(**best_params)
best_model.fit(X_train_selected, y_train)

# Predict and evaluate
y_pred = best_model.predict(X_test_selected)
r2 = r2_score(y_test, y_pred)
print(f'R2 Score for LightGBM with Bayesian Optimization and Feature Selection: {r2}')

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000727 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10260
[LightGBM] [Info] Number of data points in the train set: 200, number of used features: 150
[LightGBM] [Info] Start training from score 4.906834
R2 Score for LightGBM with Bayesian Optimization and Feature Selection: 0.8252882203647978
