In [3]:
import pandas as pd
import numpy as np

In [5]:
dataset = pd.read_csv("preprocessed_data_with_boxcox.csv")
dataset.head()

Unnamed: 0,mac,linux,tag_Other,tag_2D,tag_Action RPG,tag_Action Roguelike,tag_Anime,tag_Atmospheric,tag_Automobile Sim,tag_Base-Building,...,genre_Strategy,genre_Other,supported_languages_count,full_audio_languages_count,review_category,review_count_category,log_achievements,log_ccu,log_owners_average,price_transformed
0,0,0,16,0,0,0,0,0,1,0,...,0,0,16,6,4,4,5.187386,9.029418,15.068274,9.980796
1,1,1,17,0,0,0,0,0,0,1,...,0,0,4,4,5,4,3.828641,9.253879,15.068274,6.377498
2,1,0,16,0,0,0,0,0,0,0,...,0,0,21,2,5,4,4.317488,7.176255,15.068274,1.828411
3,0,0,18,0,0,0,0,0,0,0,...,0,0,13,0,4,3,5.003946,9.319553,15.068274,0.0
4,0,0,16,0,0,0,0,0,0,0,...,0,0,14,8,4,4,3.78419,7.040536,14.220976,8.160536


In [7]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, explained_variance_score

In [8]:
def evaluate_model(y_true, y_pred, model_name):
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)
    evs = explained_variance_score(y_true, y_pred)
    print(f"{model_name} Evaluation Metrics:")
    print(f"MAE: {mae:.4f}")
    print(f"MSE: {mse:.4f}")
    print(f"RMSE: {rmse:.4f}")
    print(f"R2 Score: {r2:.4f}")
    print(f"Explained Variance Score: {evs:.4f}")

In [11]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import RobustScaler

In [13]:
# Define features and target
X = dataset.drop(columns=['price_transformed'])
y = dataset['price_transformed']

# Get the column names that were not dropped
remaining_columns = X.columns.to_list()
print("Remaining columns (features):", remaining_columns)

# Clip extreme values in the target variable (to handle outliers effectively)
lower_bound = y.quantile(0.01)
upper_bound = y.quantile(0.99)
y_clipped = y.clip(lower=lower_bound, upper=upper_bound)

print(y_clipped)

# Use RobustScaler to handle potential outliers in features
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X)

# Perform train-test split with stratification if needed
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y_clipped, test_size=0.2, random_state=42
)

Remaining columns (features): ['mac', 'linux', 'tag_Other', 'tag_2D', 'tag_Action RPG', 'tag_Action Roguelike', 'tag_Anime', 'tag_Atmospheric', 'tag_Automobile Sim', 'tag_Base-Building', 'tag_Battle Royale', 'tag_Building', 'tag_Card Game', 'tag_Character Customization', 'tag_Choices Matter', 'tag_City Builder', 'tag_Classic', 'tag_Co-op', 'tag_Colony Sim', 'tag_Comedy', 'tag_Crime', 'tag_Cute', 'tag_Cyberpunk', 'tag_Dark Fantasy', 'tag_Difficult', 'tag_Driving', 'tag_Exploration', 'tag_FPS', 'tag_Fantasy', 'tag_Female Protagonist', 'tag_Fighting', 'tag_First-Person', 'tag_Free to Play', 'tag_Funny', 'tag_Gore', 'tag_Grand Strategy', 'tag_Great Soundtrack', 'tag_Hack and Slash', 'tag_Historical', 'tag_Horror', 'tag_Isometric', 'tag_Loot', 'tag_Looter Shooter', 'tag_MMORPG', 'tag_Management', 'tag_Medieval', 'tag_Military', 'tag_Multiplayer', 'tag_Nudity', 'tag_Online Co-Op', 'tag_Open World', 'tag_Open World Survival Craft', 'tag_Physics', 'tag_Pixel Graphics', 'tag_Platformer', 'tag_P

In [39]:
import optuna
from sklearn.model_selection import cross_val_score

In [41]:
import catboost as cb

In [45]:
# Define optimization function
def objective(trial):
    params = {
        'iterations': trial.suggest_int('iterations', 300, 1000),
        'depth': trial.suggest_int('depth', 6, 12),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2, log=True),
        'subsample': trial.suggest_float('subsample', 0.7, 1.0),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.7, 1.0),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 10),
        'random_strength': trial.suggest_float('random_strength', 0.1, 1.0),
    }
    model = cb.CatBoostRegressor(
        **params, 
        objective='RMSE', 
        random_state=42, 
        verbose=0
    )
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
    return -np.mean(cv_scores)

# Run optimization
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=200)

# Best parameters
best_params = study.best_params
print("Best Parameters:", best_params)

# Train with best parameters
optimized_catboost_model = cb.CatBoostRegressor(
    **best_params,
    objective='RMSE',
    random_state=42,
    verbose=100
)
optimized_catboost_model.fit(X_train, y_train)

# Evaluate
y_pred = optimized_catboost_model.predict(X_test)
evaluate_model(y_test, y_pred, "Optimized CatBoost")

[I 2025-01-13 01:30:13,387] A new study created in memory with name: no-name-39803e90-df63-4c33-b016-8f5d303012b0
[I 2025-01-13 01:30:27,033] Trial 0 finished with value: 3.0382963062557438 and parameters: {'iterations': 559, 'depth': 8, 'learning_rate': 0.031722928659380735, 'subsample': 0.8809252082745994, 'colsample_bylevel': 0.970537211025855, 'l2_leaf_reg': 1.9180550781691783, 'random_strength': 0.4785750980329798}. Best is trial 0 with value: 3.0382963062557438.
[I 2025-01-13 01:31:00,176] Trial 1 finished with value: 3.1033977474881915 and parameters: {'iterations': 760, 'depth': 9, 'learning_rate': 0.014647194808834427, 'subsample': 0.7204128550148495, 'colsample_bylevel': 0.8708028130441742, 'l2_leaf_reg': 6.618374510076433, 'random_strength': 0.8585396335167547}. Best is trial 0 with value: 3.0382963062557438.
[I 2025-01-13 01:32:00,140] Trial 2 finished with value: 3.1618545407547045 and parameters: {'iterations': 803, 'depth': 10, 'learning_rate': 0.16201058037787794, 'subs

Best Parameters: {'iterations': 909, 'depth': 7, 'learning_rate': 0.024845540351614487, 'subsample': 0.9009412148411962, 'colsample_bylevel': 0.86554927634105, 'l2_leaf_reg': 2.0800095515814907, 'random_strength': 0.2373347577237853}
0:	learn: 3.0144711	total: 3.29ms	remaining: 2.99s
100:	learn: 1.5842399	total: 353ms	remaining: 2.83s
200:	learn: 1.3054064	total: 705ms	remaining: 2.48s
300:	learn: 1.1302107	total: 1.06s	remaining: 2.15s
400:	learn: 0.9871386	total: 1.44s	remaining: 1.82s
500:	learn: 0.8666961	total: 1.78s	remaining: 1.45s
600:	learn: 0.7642341	total: 2.14s	remaining: 1.1s
700:	learn: 0.6796689	total: 2.55s	remaining: 757ms
800:	learn: 0.6086938	total: 2.93s	remaining: 395ms
900:	learn: 0.5524172	total: 3.38s	remaining: 30ms
908:	learn: 0.5484418	total: 3.42s	remaining: 0us
Optimized CatBoost Evaluation Metrics:
MAE: 1.2090
MSE: 2.8986
RMSE: 1.7025
R2 Score: 0.6943
Explained Variance Score: 0.7027


In [47]:
import joblib

In [49]:
# Save the model
joblib.dump(optimized_catboost_model, 'optimized_catboost_model.pkl')

['optimized_catboost_model.pkl']

In [51]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel

In [111]:
# Define the optimization function for Optuna
def objective(trial):
    # Suggest hyperparameters
    n_estimators = trial.suggest_int('n_estimators', 100, 1000)
    max_depth = trial.suggest_int('max_depth', 10, 50)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)

    # Initialize the model with suggested parameters
    rf_regressor = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=42,
        n_jobs=-1
    )

    # Perform 5-fold cross-validation and return the negative mean squared error
    cv_scores = cross_val_score(rf_regressor, X_train, y_train, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
    return -np.mean(cv_scores)

# Create and run the study
study = optuna.create_study(direction='minimize', study_name="Random Forest Optimization")
study.optimize(objective, n_trials=200)

# Print the best hyperparameters
best_params = study.best_params
print("Best Hyperparameters:", best_params)

# Train the model with the best parameters
best_rf_regressor = RandomForestRegressor(
    **best_params,
    random_state=42,
    n_jobs=-1
)
best_rf_regressor.fit(X_train, y_train)

# Predict using the optimized Random Forest model
y_pred_rf_tuned = best_rf_regressor.predict(X_test)

# Evaluate the optimized model
evaluate_model(y_test, y_pred_rf_tuned, "Optimized Random Forest Regression Model")


[I 2025-01-13 17:27:40,083] A new study created in memory with name: Random Forest Optimization
[I 2025-01-13 17:27:45,697] Trial 0 finished with value: 3.247230507655916 and parameters: {'n_estimators': 436, 'max_depth': 21, 'min_samples_split': 3, 'min_samples_leaf': 3}. Best is trial 0 with value: 3.247230507655916.
[I 2025-01-13 17:27:50,004] Trial 1 finished with value: 3.4485872509357796 and parameters: {'n_estimators': 781, 'max_depth': 47, 'min_samples_split': 17, 'min_samples_leaf': 6}. Best is trial 0 with value: 3.247230507655916.
[I 2025-01-13 17:27:52,338] Trial 2 finished with value: 3.584581883055363 and parameters: {'n_estimators': 631, 'max_depth': 34, 'min_samples_split': 16, 'min_samples_leaf': 9}. Best is trial 0 with value: 3.247230507655916.
[I 2025-01-13 17:27:55,062] Trial 3 finished with value: 3.4000586203591125 and parameters: {'n_estimators': 594, 'max_depth': 23, 'min_samples_split': 12, 'min_samples_leaf': 6}. Best is trial 0 with value: 3.247230507655916.

Best Hyperparameters: {'n_estimators': 625, 'max_depth': 49, 'min_samples_split': 6, 'min_samples_leaf': 1}
Optimized Random Forest Regression Model Evaluation Metrics:
MAE: 1.1735
MSE: 2.8937
RMSE: 1.7011
R2 Score: 0.6948
Explained Variance Score: 0.7003


In [113]:
joblib.dump(best_rf_regressor, 'best_rf_regressor.pkl')

['best_rf_regressor.pkl']

In [57]:
from sklearn.ensemble import ExtraTreesRegressor

In [59]:
# Define the optimization function for Optuna
def objective(trial):
    # Suggest hyperparameters
    n_estimators = trial.suggest_int('n_estimators', 100, 1000)
    use_max_depth = trial.suggest_categorical('use_max_depth', [True, False])
    max_depth = trial.suggest_int('max_depth', 6, 50) if use_max_depth else None
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2', None])

    # Initialize the model with suggested parameters
    extra_trees = ExtraTreesRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        random_state=42,
        n_jobs=-1
    )

    # Perform 5-fold cross-validation and return the negative mean squared error
    cv_scores = cross_val_score(extra_trees, X_train, y_train, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
    return -np.mean(cv_scores)

# Create and run the study
study = optuna.create_study(direction='minimize', study_name="Extra Trees Optimization")
study.optimize(objective, n_trials=200)

# Print the best hyperparameters
best_params = study.best_params
print("Best Hyperparameters:", best_params)

# Handle 'max_depth' for None case
best_max_depth = best_params['max_depth'] if best_params.get('use_max_depth') else None

# Train the model with the best parameters
best_extra_trees = ExtraTreesRegressor(
    n_estimators=best_params['n_estimators'],
    max_depth=best_max_depth,
    min_samples_split=best_params['min_samples_split'],
    min_samples_leaf=best_params['min_samples_leaf'],
    max_features=best_params['max_features'],
    random_state=42,
    n_jobs=-1
)
best_extra_trees.fit(X_train, y_train)

# Predict and evaluate
y_pred_et = best_extra_trees.predict(X_test)
evaluate_model(y_test, y_pred_et, "Optimized Extra Trees Regressor")


[I 2025-01-13 02:58:21,306] A new study created in memory with name: Extra Trees Optimization
[I 2025-01-13 02:58:22,068] Trial 0 finished with value: 4.761065049458304 and parameters: {'n_estimators': 483, 'use_max_depth': True, 'max_depth': 16, 'min_samples_split': 18, 'min_samples_leaf': 10, 'max_features': 'sqrt'}. Best is trial 0 with value: 4.761065049458304.
[I 2025-01-13 02:58:23,464] Trial 1 finished with value: 4.144757963431202 and parameters: {'n_estimators': 852, 'use_max_depth': True, 'max_depth': 31, 'min_samples_split': 9, 'min_samples_leaf': 5, 'max_features': 'sqrt'}. Best is trial 1 with value: 4.144757963431202.
[I 2025-01-13 02:58:24,605] Trial 2 finished with value: 3.3127150299625567 and parameters: {'n_estimators': 257, 'use_max_depth': False, 'min_samples_split': 12, 'min_samples_leaf': 4, 'max_features': None}. Best is trial 2 with value: 3.3127150299625567.
[I 2025-01-13 02:58:25,522] Trial 3 finished with value: 3.979752423597244 and parameters: {'n_estimato

Best Hyperparameters: {'n_estimators': 970, 'use_max_depth': True, 'max_depth': 44, 'min_samples_split': 9, 'min_samples_leaf': 1, 'max_features': None}
Optimized Extra Trees Regressor Evaluation Metrics:
MAE: 1.0907
MSE: 2.7112
RMSE: 1.6466
R2 Score: 0.7140
Explained Variance Score: 0.7206


In [61]:
joblib.dump(best_extra_trees, 'best_extra_trees.pkl')

['best_extra_trees.pkl']

In [65]:
import lightgbm as lgb

In [67]:
# Define the optimization function for Optuna
def objective(trial):
    # Suggest hyperparameters for LightGBM
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 10.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 10.0),
    }
    
    # Initialize the LightGBM model
    lightgbm_model = lgb.LGBMRegressor(
        objective='regression', 
        random_state=42, 
        verbose=-1, 
        **params
    )
    
    # Perform 5-fold cross-validation and return the negative mean squared error
    cv_scores = cross_val_score(lightgbm_model, X_train, y_train, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
    return -np.mean(cv_scores)

# Create and run the Optuna study
study = optuna.create_study(direction='minimize', study_name="LightGBM Optimization")
study.optimize(objective, n_trials=200)

# Print the best hyperparameters
best_params = study.best_params
print("Best Hyperparameters:", best_params)

# Train the model with the best parameters
best_lightgbm_model = lgb.LGBMRegressor(
    objective='regression',
    random_state=42,
    verbose=-1,
    **best_params
)
best_lightgbm_model.fit(X_train, y_train)

# Predictions and evaluation
y_pred = best_lightgbm_model.predict(X_test)
evaluate_model(y_test, y_pred, "Optimized LightGBM")


[I 2025-01-13 03:12:48,881] A new study created in memory with name: LightGBM Optimization
[I 2025-01-13 03:12:54,364] Trial 0 finished with value: 3.60005085108926 and parameters: {'n_estimators': 488, 'max_depth': 15, 'learning_rate': 0.08622080815119619, 'subsample': 0.669043996267172, 'colsample_bytree': 0.8897044455696459, 'reg_alpha': 1.9949677169009006, 'reg_lambda': 8.442828759229927}. Best is trial 0 with value: 3.60005085108926.
[I 2025-01-13 03:12:57,129] Trial 1 finished with value: 3.466329931607123 and parameters: {'n_estimators': 283, 'max_depth': 15, 'learning_rate': 0.024496389826391147, 'subsample': 0.6821314957073155, 'colsample_bytree': 0.852431473403735, 'reg_alpha': 6.5311286654453, 'reg_lambda': 3.4969481335562067}. Best is trial 1 with value: 3.466329931607123.
[I 2025-01-13 03:12:57,294] Trial 2 finished with value: 3.605128092340388 and parameters: {'n_estimators': 123, 'max_depth': 5, 'learning_rate': 0.19460984114829058, 'subsample': 0.9685162281312353, 'col

Best Hyperparameters: {'n_estimators': 808, 'max_depth': 8, 'learning_rate': 0.01162576459096325, 'subsample': 0.7260723487286591, 'colsample_bytree': 0.7084987216974544, 'reg_alpha': 0.43451147136721047, 'reg_lambda': 0.2183806545644044}
Optimized LightGBM Evaluation Metrics:
MAE: 1.2726
MSE: 3.1612
RMSE: 1.7780
R2 Score: 0.6666
Explained Variance Score: 0.6697


In [69]:
joblib.dump(best_lightgbm_model, 'best_lightgbm_model.pkl')

['best_lightgbm_model.pkl']

In [115]:
# Load base models
best_extra_trees = joblib.load('best_extra_trees.pkl')
best_rf_regressor = joblib.load('best_rf_regressor.pkl')
optimized_catboost_model = joblib.load('optimized_catboost_model.pkl')
best_lightgbm_model = joblib.load('best_lightgbm_model.pkl')

In [117]:
from sklearn.ensemble import VotingRegressor
from scipy.optimize import minimize

In [119]:
# Define the objective function for Optuna
def objective(trial):
    # Suggest weights for each model
    weight_extra_trees = trial.suggest_float('weight_extra_trees', 0.0, 1.0)
    weight_random_forest = trial.suggest_float('weight_random_forest', 0.0, 1.0)
    weight_catboost = trial.suggest_float('weight_catboost', 0.0, 1.0)
    weight_lightgbm = trial.suggest_float('weight_lightgbm', 0.0, 1.0)
    
    # Combine weights and normalize to sum to 1
    weights = np.array([weight_extra_trees, weight_random_forest, weight_catboost, weight_lightgbm])
    if weights.sum() == 0:  # Avoid division by zero
        return float('inf')
    weights /= weights.sum()
    
    # Create a VotingRegressor with the suggested weights
    voting_regressor = VotingRegressor(
        estimators=[
            ('extra_trees', best_extra_trees),
            ('random_forest', best_rf_regressor),
            ('catboost', optimized_catboost_model),
            ('lightgbm', best_lightgbm_model)
        ],
        weights=weights,
        n_jobs=-1
    )
    
    # Fit the model
    voting_regressor.fit(X_train, y_train)
    
    # Predict and calculate MSE
    y_pred = voting_regressor.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    return mse

# Create and run the Optuna study
study = optuna.create_study(direction='minimize', study_name="Voting Regressor Optimization")
study.optimize(objective, n_trials=200)

# Get the best weights from the study
best_params = study.best_params
best_weights = np.array([
    best_params['weight_extra_trees'],
    best_params['weight_random_forest'],
    best_params['weight_catboost'],
    best_params['weight_lightgbm']
])
best_weights /= best_weights.sum()  # Normalize to ensure weights sum to 1

print("Best weights found:", best_weights)

# Create the final VotingRegressor with the optimized weights
optimized_voting_regressor = VotingRegressor(
    estimators=[
        ('extra_trees', best_extra_trees),
        ('random_forest', best_rf_regressor),
        ('catboost', optimized_catboost_model),
        ('lightgbm', best_lightgbm_model)
    ],
    weights=best_weights,
    n_jobs=-1
)

# Fit the optimized VotingRegressor
optimized_voting_regressor.fit(X_train, y_train)

# Predict and evaluate
y_pred_optimized = optimized_voting_regressor.predict(X_test)
evaluate_model(y_test, y_pred_optimized, "Optimized Weighted Voting Regressor")

[I 2025-01-13 17:42:50,074] A new study created in memory with name: Voting Regressor Optimization
[I 2025-01-13 17:42:57,592] Trial 0 finished with value: 2.7391643109578534 and parameters: {'weight_extra_trees': 0.5930463890537007, 'weight_random_forest': 0.6919466590052218, 'weight_catboost': 0.6592424128054633, 'weight_lightgbm': 0.9332341700867389}. Best is trial 0 with value: 2.7391643109578534.
[I 2025-01-13 17:43:03,354] Trial 1 finished with value: 2.725281031085051 and parameters: {'weight_extra_trees': 0.5659555627723802, 'weight_random_forest': 0.26773359960316445, 'weight_catboost': 0.22602134544087082, 'weight_lightgbm': 0.7252039001620829}. Best is trial 1 with value: 2.725281031085051.
[I 2025-01-13 17:43:10,238] Trial 2 finished with value: 2.6827030788930277 and parameters: {'weight_extra_trees': 0.9454136789494997, 'weight_random_forest': 0.7014850784531528, 'weight_catboost': 0.3516959899029638, 'weight_lightgbm': 0.18389307417464495}. Best is trial 2 with value: 2.

Best weights found: [0.6752127  0.01381851 0.11022799 0.2007408 ]
Optimized Weighted Voting Regressor Evaluation Metrics:
MAE: 1.1168
MSE: 2.6547
RMSE: 1.6293
R2 Score: 0.7200
Explained Variance Score: 0.7259


In [121]:
joblib.dump(optimized_voting_regressor, 'optimized_voting_regressor.pkl')

['optimized_voting_regressor.pkl']

In [135]:
from sklearn.ensemble import GradientBoostingRegressor

In [143]:
# Define the base models
base_models = [
    ('extra_trees', best_extra_trees),
    ('random_forest', best_rf_regressor),
    ('catboost', optimized_catboost_model),
    ('lightgbm', best_lightgbm_model)
]

# Get predictions from base models to form the initial dataset for boosting
X_train_boost = np.column_stack([model.predict(X_train) for _, model in base_models])
X_test_boost = np.column_stack([model.predict(X_test) for _, model in base_models])

# Objective function for Optuna to optimize GradientBoostingRegressor
def objective(trial):
    # Suggest hyperparameters for tuning
    n_estimators = trial.suggest_int("n_estimators", 50, 1000)
    learning_rate = trial.suggest_float("learning_rate", 0.001, 0.3, log=True)
    max_depth = trial.suggest_int("max_depth", 2, 15)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 20)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 20)
    subsample = trial.suggest_float("subsample", 0.5, 1.0)

    # Define the boosting model with suggested hyperparameters
    model = GradientBoostingRegressor(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        subsample=subsample,
        random_state=42
    )

    # Train the model on training data
    model.fit(X_train_boost, y_train)

    # Predict on validation set
    y_pred = model.predict(X_test_boost)

    # Compute the evaluation metric (Mean Squared Error in this case)
    mse = mean_squared_error(y_test, y_pred)
    return mse

# Create an Optuna study to minimize the objective function
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=200, show_progress_bar=True)

# Print the best parameters and the best score
print("Best Parameters:", study.best_params)
print("Best MSE:", study.best_value)

# Train the final model with the best hyperparameters
best_params = study.best_params
optimized_boosting_model = GradientBoostingRegressor(
    n_estimators=best_params["n_estimators"],
    learning_rate=best_params["learning_rate"],
    max_depth=best_params["max_depth"],
    min_samples_split=best_params["min_samples_split"],
    min_samples_leaf=best_params["min_samples_leaf"],
    subsample=best_params["subsample"],
    random_state=42
)

# Train the final optimized model
optimized_boosting_model.fit(X_train_boost, y_train)

# Predict using the optimized Boosting model
y_pred_optimized = optimized_boosting_model.predict(X_test_boost)

# Evaluate the optimized Boosting model
evaluate_model(y_test, y_pred_optimized, "Optimized Boosting Regressor")


[I 2025-01-13 23:10:33,648] A new study created in memory with name: no-name-597d370e-d81c-41ca-b3fc-fc0c69072fa5


  0%|          | 0/200 [00:00<?, ?it/s]

[I 2025-01-13 23:10:34,508] Trial 0 finished with value: 2.709227112363343 and parameters: {'n_estimators': 885, 'learning_rate': 0.03342677383259627, 'max_depth': 4, 'min_samples_split': 11, 'min_samples_leaf': 18, 'subsample': 0.5074750716761323}. Best is trial 0 with value: 2.709227112363343.
[I 2025-01-13 23:10:34,687] Trial 1 finished with value: 2.8231049875384646 and parameters: {'n_estimators': 136, 'learning_rate': 0.012528486273497213, 'max_depth': 4, 'min_samples_split': 10, 'min_samples_leaf': 15, 'subsample': 0.8726437867111416}. Best is trial 0 with value: 2.709227112363343.
[I 2025-01-13 23:10:34,852] Trial 2 finished with value: 7.158686138139062 and parameters: {'n_estimators': 140, 'learning_rate': 0.0015358057508576223, 'max_depth': 11, 'min_samples_split': 11, 'min_samples_leaf': 16, 'subsample': 0.5811597690926986}. Best is trial 0 with value: 2.709227112363343.
[I 2025-01-13 23:10:36,531] Trial 3 finished with value: 2.706418993823597 and parameters: {'n_estimator

In [145]:
joblib.dump(optimized_boosting_model, 'optimized_boosting_model.pkl')

['optimized_boosting_model.pkl']