This notebook runs Optuna hyperparameter searches on the Abalone dataset. It is not a showcase, just something I wanted to try.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import root_mean_squared_error
from sklearn.ensemble import RandomForestRegressor
import plotly.express as px
import plotly.figure_factory as ff
from IPython.display import Image
from xgboost import XGBRegressor
import optuna
# Load the raw Abalone table. The file has no header row, so the column
# names are supplied explicitly.
df = pd.read_csv('abalone.data', names=['Sex', 'Length', 'Diameter', 'Height',
                                        'Whole_weight', 'Shucked_weight',
                                        'Viscera_weight', 'Shell_weight', 'Rings'])

# Strict upper bounds used to trim outliers: a row is kept only when it is
# below every bound. (An extra `Shell_weight < 1` filter was dropped because
# the tighter 0.7 bound already implies it.)
outlier_bounds = {
    'Height': 0.4,
    'Viscera_weight': 0.6,
    'Shucked_weight': 1.3,
    'Whole_weight': 2.8,
    'Shell_weight': 0.7,
}
keep = np.ones(len(df), dtype=bool)
for column, upper in outlier_bounds.items():
    keep &= (df[column] < upper).to_numpy()

# Build the cleaned frame in one chain so every step returns a new object:
# no `inplace=True`, and no column assignment into a boolean-sliced frame
# (the pattern that raises pandas' SettingWithCopyWarning).
df = (
    df.loc[keep]
    .assign(
        # Target: age derived from the ring count (Rings + 1.5).
        Age=lambda frame: frame['Rings'] + 1.5,
        # Integer-encode the three sex categories.
        Sex=lambda frame: frame['Sex'].map({'M': 0, 'F': 1, 'I': 2}),
    )
    .drop(columns='Rings')  # Rings would leak the target, so it is removed
)

# Separate target and features, then hold out 20% of the rows for testing.
y = df['Age']
X = df.drop(columns='Age')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=62)



In [145]:
# Baseline model: standardise the features, then fit a linear regressor
# trained with stochastic gradient descent.
pipe = make_pipeline(StandardScaler(), SGDRegressor(random_state=62))
pipe.fit(X_train, y_train)

# 5-fold cross-validation on the training split. The scorer reports negated
# MSE, so the sign is flipped before taking the square root to get RMSE.
mse_scores = cross_val_score(
    estimator=pipe,
    X=X_train,
    y=y_train,
    cv=5,
    scoring='neg_mean_squared_error',
)
rmse_scores = np.sqrt(np.negative(mse_scores))
print("Default SGD RMSE (per fold):", rmse_scores)
print("Default SGD Average RMSE:", np.mean(rmse_scores))

# Score the pipeline fitted above on the held-out test split.
y_pred = pipe.predict(X_test)
test_rmse = root_mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f"\nDefault SGD Test Set RMSE: {test_rmse}")

Default SGD RMSE (per fold): [2.30041923 2.23278552 2.19069519 2.21638679 2.08783076]
Default SGD Average RMSE: 2.2056234978407123

Default SGD Test Set RMSE: 2.142379842849004


In [None]:
def objective(trial):
    """Optuna objective for XGBoost: mean 5-fold cross-validated RMSE.

    Samples one XGBRegressor configuration from ``trial``, scores it with
    5-fold cross-validation on ``X_train`` / ``y_train`` and returns the mean
    RMSE across folds (lower is better).
    """
    # Search space; the suggest calls run in the order written here.
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 100, 1000),
        max_depth=trial.suggest_int('max_depth', 2, 10),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3),
        subsample=trial.suggest_float('subsample', 0.5, 1.0),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.5, 1.0),
        reg_alpha=trial.suggest_float('reg_alpha', 0.0, 5.0),
        reg_lambda=trial.suggest_float('reg_lambda', 0.0, 5.0),
    )

    regressor = XGBRegressor(**search_space)
    neg_rmse_per_fold = cross_val_score(
        regressor, X_train, y_train,
        cv=5, scoring='neg_root_mean_squared_error',
    )
    # The scorer yields negated RMSE; flip the sign so the study minimises
    # a positive RMSE.
    mean_neg_rmse = neg_rmse_per_fold.mean()
    return -mean_neg_rmse


In [151]:
# Minimize RMSE. The sampler is seeded so the sequence of suggested
# hyperparameters, and therefore the reported best trial, can be reproduced
# on a fresh kernel. TPE is Optuna's default single-objective sampler, so
# only the seed changes.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=62),
)
study.optimize(objective, n_trials=100, show_progress_bar=True)

[I 2025-06-01 00:28:59,989] A new study created in memory with name: no-name-03219e65-aaf9-4a9c-bb28-59143a2d1d36


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2025-06-01 00:29:01,324] Trial 0 finished with value: 2.2310044400010183 and parameters: {'n_estimators': 349, 'max_depth': 4, 'learning_rate': 0.14095718028236953, 'subsample': 0.9458338636699134, 'colsample_bytree': 0.5253598201512037, 'reg_alpha': 4.711686750380074, 'reg_lambda': 4.548477262639667}. Best is trial 0 with value: 2.2310044400010183.
[I 2025-06-01 00:29:03,451] Trial 1 finished with value: 2.191483402465161 and parameters: {'n_estimators': 863, 'max_depth': 3, 'learning_rate': 0.03976291352197873, 'subsample': 0.9917709173758708, 'colsample_bytree': 0.7094917096174403, 'reg_alpha': 1.2917709857477033, 'reg_lambda': 1.4603671832652532}. Best is trial 1 with value: 2.191483402465161.
[I 2025-06-01 00:29:09,773] Trial 2 finished with value: 2.352649845883633 and parameters: {'n_estimators': 788, 'max_depth': 8, 'learning_rate': 0.11447309853825881, 'subsample': 0.7146443891104775, 'colsample_bytree': 0.798848050772377, 'reg_alpha': 1.86899209105392, 'reg_lambda': 1.6271

In [152]:
# Summarise the finished search: the best objective value and the
# hyperparameters that produced it.
print("Best RMSE:", study.best_value)
print("Best hyperparameters:", study.best_params)

# Refit on the whole training split with the winning hyperparameters
# (fit() returns the estimator itself, so the call can be chained).
best_model = XGBRegressor(**study.best_params).fit(X_train, y_train)

# Score the refitted model on the held-out test split.
y_pred = best_model.predict(X_test)
rmse = root_mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Test RMSE:", rmse)


Best RMSE: 2.156522527369947
Best hyperparameters: {'n_estimators': 294, 'max_depth': 5, 'learning_rate': 0.020880966006613363, 'subsample': 0.6592465685166781, 'colsample_bytree': 0.8299978770772269, 'reg_alpha': 3.677567259878882, 'reg_lambda': 4.342758611773636}
Test RMSE: 2.0236291569850104


In [161]:
def objective(trial):
    """Optuna objective for Random Forest: mean 5-fold cross-validated RMSE.

    Samples one RandomForestRegressor configuration from ``trial``, scores it
    with 5-fold cross-validation on ``X_train`` / ``y_train`` and returns the
    mean RMSE across folds (lower is better).
    """
    # Each hyperparameter is drawn in a fixed order, one per line.
    n_estimators = trial.suggest_int('n_estimators', 100, 1000)
    max_depth = trial.suggest_int('max_depth', 2, 30)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2'])
    bootstrap = trial.suggest_categorical('bootstrap', [True, False])

    # Fixed seed keeps each candidate forest repeatable; n_jobs=-1 asks for
    # all available cores.
    forest = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        bootstrap=bootstrap,
        random_state=42,
        n_jobs=-1,
    )
    neg_rmse_per_fold = cross_val_score(
        forest, X_train, y_train,
        cv=5, scoring='neg_root_mean_squared_error',
    )
    # The scorer yields negated RMSE; negate the mean to return a positive RMSE.
    return -neg_rmse_per_fold.mean()

In [162]:
# Minimize RMSE. The sampler is seeded so the sequence of suggested
# hyperparameters, and therefore the reported best trial, can be reproduced
# on a fresh kernel. TPE is Optuna's default single-objective sampler, so
# only the seed changes.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100, show_progress_bar=True)

[I 2025-06-01 00:39:36,136] A new study created in memory with name: no-name-1f46672f-659e-4ca9-8e94-8aff9b1c6e15


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2025-06-01 00:39:39,284] Trial 0 finished with value: 2.1856695124206444 and parameters: {'n_estimators': 591, 'max_depth': 20, 'min_samples_split': 6, 'min_samples_leaf': 4, 'max_features': 'log2', 'bootstrap': False}. Best is trial 0 with value: 2.1856695124206444.
[I 2025-06-01 00:39:43,091] Trial 1 finished with value: 2.288475979853067 and parameters: {'n_estimators': 804, 'max_depth': 5, 'min_samples_split': 3, 'min_samples_leaf': 5, 'max_features': 'log2', 'bootstrap': False}. Best is trial 0 with value: 2.1856695124206444.
[I 2025-06-01 00:39:44,519] Trial 2 finished with value: 2.161773565931125 and parameters: {'n_estimators': 168, 'max_depth': 23, 'min_samples_split': 9, 'min_samples_leaf': 2, 'max_features': 'log2', 'bootstrap': True}. Best is trial 2 with value: 2.161773565931125.
[I 2025-06-01 00:39:49,349] Trial 3 finished with value: 2.175649336265691 and parameters: {'n_estimators': 886, 'max_depth': 14, 'min_samples_split': 10, 'min_samples_leaf': 9, 'max_features'

In [163]:
print("Best RMSE:", study.best_value)
print("Best hyperparameters:", study.best_params)

# Train best model on full training set.
# study.best_params only holds the tuned values, so the fixed settings used
# while scoring trials (random_state=42, n_jobs=-1) are passed again here.
# Without the seed, the refitted forest would differ from the configuration
# that was cross-validated and the test RMSE would change from run to run.
best_model = RandomForestRegressor(**study.best_params, random_state=42, n_jobs=-1)
best_model.fit(X_train, y_train)

# Predict and evaluate on the held-out test split
y_pred = best_model.predict(X_test)
rmse = root_mean_squared_error(y_test, y_pred)
print("Test RMSE:", rmse)


Best RMSE: 2.14973578822285
Best hyperparameters: {'n_estimators': 160, 'max_depth': 23, 'min_samples_split': 4, 'min_samples_leaf': 5, 'max_features': 'log2', 'bootstrap': True}
Test RMSE: 2.0399268260806838


In [164]:
epsilon = 1e-8  # added to denominators to avoid division by zero


def add_engineered_features(frame, eps=epsilon):
    """Return a copy of ``frame`` with the engineered feature columns appended.

    The input must contain the columns ``Length``, ``Diameter``, ``Height``,
    ``Whole_weight``, ``Shucked_weight``, ``Viscera_weight`` and
    ``Shell_weight``. The input frame itself is not modified.

    Parameters
    ----------
    frame : pandas.DataFrame
        Feature table to augment.
    eps : float, optional
        Small constant added to every denominator to avoid division by zero.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the original columns followed by the ratio,
        geometric, interaction and log1p features.
    """
    out = frame.copy()

    # Ratios and proportions
    out['meat_ratio'] = out['Shucked_weight'] / (out['Whole_weight'] + eps)
    out['viscera_ratio'] = out['Viscera_weight'] / (out['Whole_weight'] + eps)
    out['shell_ratio'] = out['Shell_weight'] / (out['Whole_weight'] + eps)
    out['meat_to_shell_ratio'] = out['Shucked_weight'] / (out['Shell_weight'] + eps)
    out['viscera_to_meat'] = out['Viscera_weight'] / (out['Shucked_weight'] + eps)

    # Geometric approximations
    out['volume_approx'] = out['Length'] * out['Diameter'] * out['Height']
    out['cross_section'] = out['Length'] * out['Diameter']
    out['length_height_ratio'] = out['Length'] / (out['Height'] + eps)

    # Interactions
    out['weight_length_product'] = out['Whole_weight'] * out['Length']
    out['shell_density_proxy'] = out['Shell_weight'] / ((out['Length'] * out['Diameter']) + eps)

    # Log-transformed features for skewed distributions
    for col in ['Whole_weight', 'Shucked_weight', 'Viscera_weight', 'Shell_weight']:
        out['log_' + col] = np.log1p(out[col])

    return out


# Apply the same transformation to both splits through one code path so the
# train and test feature sets cannot drift apart. Every feature is row-wise,
# so nothing is fitted on the data and no test information leaks into training.
X_train = add_engineered_features(X_train)
X_test = add_engineered_features(X_test)


In [None]:
def objective(trial):
    """Optuna objective for XGBoost: mean 5-fold cross-validated RMSE.

    Uses the same XGBoost search space as the earlier XGBoost objective in
    this notebook; it is declared again here because the Random Forest cell
    rebound the name ``objective``. ``X_train`` is read at call time.
    """
    # Draw the hyperparameters one at a time, in a fixed order.
    search_space = {}
    search_space['n_estimators'] = trial.suggest_int('n_estimators', 100, 1000)
    search_space['max_depth'] = trial.suggest_int('max_depth', 2, 10)
    search_space['learning_rate'] = trial.suggest_float('learning_rate', 0.01, 0.3)
    search_space['subsample'] = trial.suggest_float('subsample', 0.5, 1.0)
    search_space['colsample_bytree'] = trial.suggest_float('colsample_bytree', 0.5, 1.0)
    search_space['reg_alpha'] = trial.suggest_float('reg_alpha', 0.0, 5.0)
    search_space['reg_lambda'] = trial.suggest_float('reg_lambda', 0.0, 5.0)

    booster = XGBRegressor(**search_space)
    fold_scores = cross_val_score(
        booster,
        X_train,
        y_train,
        scoring='neg_root_mean_squared_error',
        cv=5,
    )
    # Scores are negated RMSE values; return the positive mean RMSE so that
    # a minimising study prefers better models.
    return -fold_scores.mean()


In [166]:
# Minimize RMSE. The sampler is seeded so the sequence of suggested
# hyperparameters, and therefore the reported best trial, can be reproduced
# on a fresh kernel. TPE is Optuna's default single-objective sampler, so
# only the seed changes.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=62),
)
study.optimize(objective, n_trials=100, show_progress_bar=True)

[I 2025-06-01 00:46:58,211] A new study created in memory with name: no-name-f53f8186-f24b-48bf-be4a-8afdd0b98180


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2025-06-01 00:47:09,966] Trial 0 finished with value: 2.3039712662230256 and parameters: {'n_estimators': 975, 'max_depth': 8, 'learning_rate': 0.18074459117976083, 'subsample': 0.7583803888276217, 'colsample_bytree': 0.7060232855791175, 'reg_alpha': 1.0016238530532418, 'reg_lambda': 4.670831486831637}. Best is trial 0 with value: 2.3039712662230256.
[I 2025-06-01 00:47:11,020] Trial 1 finished with value: 2.345184858857522 and parameters: {'n_estimators': 173, 'max_depth': 4, 'learning_rate': 0.2909782808899335, 'subsample': 0.6005172578119264, 'colsample_bytree': 0.7324063441189974, 'reg_alpha': 2.635201667484746, 'reg_lambda': 3.2000877254377427}. Best is trial 0 with value: 2.3039712662230256.
[I 2025-06-01 00:47:12,690] Trial 2 finished with value: 2.118676028473606 and parameters: {'n_estimators': 256, 'max_depth': 4, 'learning_rate': 0.017365878835626167, 'subsample': 0.9989717247085973, 'colsample_bytree': 0.5337388862438857, 'reg_alpha': 0.28925162219150313, 'reg_lambda': 1

In [167]:
# Report the outcome of the search: best objective value and the
# hyperparameters of the trial that achieved it.
print("Best RMSE:", study.best_value)
print("Best hyperparameters:", study.best_params)

# Rebuild the winning configuration and train it on the full training split.
best_model = XGBRegressor(**study.best_params)
best_model.fit(X=X_train, y=y_train)

# Evaluate on the held-out test split.
y_pred = best_model.predict(X_test)
rmse = root_mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Test RMSE:", rmse)


Best RMSE: 2.099547992824006
Best hyperparameters: {'n_estimators': 136, 'max_depth': 3, 'learning_rate': 0.036403011958195716, 'subsample': 0.5192040332447466, 'colsample_bytree': 0.8364950115095385, 'reg_alpha': 2.5498983395946624, 'reg_lambda': 0.9294309543848766}
Test RMSE: 2.0147145840428466
