In [42]:
import sys
sys.path.append('../src')
from linearboost.linear_boost import LinearBoostClassifier

In [43]:
pip install xgboost lightgbm catboost

Note: you may need to restart the kernel to use updated packages.


In [50]:
from ucimlrepo import fetch_ucirepo 
from sklearn.preprocessing import LabelEncoder

# The Huberman's Survival's id on UCI Machine Learning Repository
dataset_id = 43

dataset = fetch_ucirepo(id=dataset_id) 
  
# data (as pandas dataframes) 
X = dataset.data.features.copy()
y = dataset.data.targets

label_encoder = LabelEncoder()

y = label_encoder.fit_transform(y.values.ravel())

In [51]:

import pandas as pd

# Identify categorical columns
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()

# Convert categorical columns to 'category' dtype
for col in categorical_cols:
    X[col] = X[col].astype('category')

# Handle missing values
# Fill numeric columns with median
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
for col in numeric_cols:
    X[col] = X[col].fillna(X[col].median())

# Fill categorical columns with mode
for col in categorical_cols:
    X[col] = X[col].fillna(X[col].mode()[0])


In [52]:
import warnings

warnings.filterwarnings("ignore", message=".*ignore_implicit_zeros.*")
warnings.filterwarnings("ignore", message=".*n_quantiles.*")
warnings.filterwarnings("ignore", category=RuntimeWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

**LinearBoost results:**

In [53]:
import optuna
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_score

def custom_loss(y_true, y_pred, weights):
    return np.mean(weights * (y_true - y_pred) ** 2)

df = X

# One-hot encoding
cat_features = list(df.select_dtypes(include=['object', 'category']).columns)
for col in cat_features:
    df_onehot = pd.get_dummies(df[col], prefix=col)
    df = df.drop(col, axis=1)
    df = pd.concat([df_onehot, df], axis=1)


def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 10, 500),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0, log=True),
        'algorithm': trial.suggest_categorical('algorithm', ['SAMME', 'SAMME.R']),
        'scaler': trial.suggest_categorical('scaler', [
            'minmax', 'robust', 'quantile-uniform', 'quantile-normal'
        ]),
        'kernel': trial.suggest_categorical('kernel', ['linear', 'rbf', 'poly', 'sigmoid']),
    }

    if params['kernel'] != 'linear':
        params['gamma'] = trial.suggest_float('gamma', 1e-3, 10.0, log=True)
    if params['kernel'] == 'poly':
        params['degree'] = trial.suggest_int('degree', 2, 5)
    if params['kernel'] in ['poly', 'sigmoid']:
        params['coef0'] = trial.suggest_float('coef0', 0.0, 1.0)
    
    # Using a custom loss function here
    #params['loss_function'] = custom_loss

    model = LinearBoostClassifier(**params)

    scores = cross_val_score(
        estimator=model,
        X=df,
        y=y,
        scoring='f1_weighted',
        cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=42),
    )

    return scores.mean()

# Create an Optuna study and optimize the objective function
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=200)

# Display the best trial's results
print('Best trial:')
trial = study.best_trial

print(f'F1 Score: {trial.value}')
print('Parameters: ')
for key, value in trial.params.items():
    print(f'{key}: {value}')

[I 2025-07-28 20:32:30,509] A new study created in memory with name: no-name-39689138-b2a1-447a-af63-be7733a7aeb8
[I 2025-07-28 20:32:30,764] Trial 0 finished with value: 0.7283291353857195 and parameters: {'n_estimators': 256, 'learning_rate': 0.11807746849928968, 'algorithm': 'SAMME', 'scaler': 'minmax', 'kernel': 'rbf', 'gamma': 0.10360685199357951}. Best is trial 0 with value: 0.7283291353857195.
[I 2025-07-28 20:32:32,767] Trial 1 finished with value: 0.7323671972329208 and parameters: {'n_estimators': 363, 'learning_rate': 0.013883181171194234, 'algorithm': 'SAMME', 'scaler': 'robust', 'kernel': 'rbf', 'gamma': 0.3980809182349502}. Best is trial 1 with value: 0.7323671972329208.
...[I 2025-07-28 20:34:42,444] Trial 199 finished with value: 0.7515000210035615 and parameters: {'n_estimators': 245, 'learning_rate': 0.06109405565734974, 'algorithm': 'SAMME', 'scaler': 'minmax', 'kernel': 'rbf', 'gamma': 0.08461411124525335}. Best is trial 167 with value: 0.7583125868901313.


Best trial:
F1 Score: 0.7583125868901313
Parameters: 
n_estimators: 384
learning_rate: 0.06667610599938725
algorithm: SAMME
scaler: robust
kernel: rbf
gamma: 0.002777056559327566


**XGBoost results:**

In [None]:

import xgboost as xgb

def objective(trial):
    params = {
        'objective': 'binary:logistic',
        'use_label_encoder': False,
        'n_estimators': trial.suggest_int('n_estimators', 20, 1000),
        'max_depth': trial.suggest_int('max_depth', 1, 20),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.7),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True),
        'enable_categorical': True,
        'eval_metric': 'logloss',
        'verbosity': 0
    }

    model = xgb.XGBClassifier(**params)

    scores = cross_val_score(
        estimator=model,
        X=X,
        y=y,
        scoring='f1_weighted',
        cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=42),
        n_jobs=-1
    )

    return scores.mean()


study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=200)

best_trial = study.best_trial

print('Best trial:')
print(f'F1 Score: {best_trial.value:.6f}')
print('Parameters:')
for k, v in best_trial.params.items():
    print(f'{k}: {v}')


[I 2025-07-28 20:34:58,357] A new study created in memory with name: no-name-fd2e9753-4f61-49f9-a22c-28b4c0177247
[I 2025-07-28 20:34:59,157] Trial 0 finished with value: 0.653658178784821 and parameters: {'n_estimators': 122, 'max_depth': 18, 'learning_rate': 0.6638400324564625, 'gamma': 1.3755330590290604e-05, 'min_child_weight': 6, 'subsample': 0.5447494156166959, 'colsample_bytree': 0.8879793066877644, 'reg_alpha': 1.0163427568069608e-07, 'reg_lambda': 2.39079916775675e-06}. Best is trial 0 with value: 0.653658178784821.
[I 2025-07-28 20:34:59,681] Trial 1 finished with value: 0.6770776702365993 and parameters: {'n_estimators': 317, 'max_depth': 20, 'learning_rate': 0.13249519981914618, 'gamma': 1.5953454630407348e-08, 'min_child_weight': 8, 'subsample': 0.9709077195029082, 'colsample_bytree': 0.9624785240081783, 'reg_alpha': 2.203094364625466e-07, 'reg_lambda': 1.4395372969777643e-08}. Best is trial 1 with value: 0.6770776702365993.
...[I 2025-07-28 20:35:10,555] Trial 199 finishe

Best trial:
F1 Score: 0.721809
Parameters:
n_estimators: 395
max_depth: 14
learning_rate: 0.5516744736853054
gamma: 0.06586236308160907
min_child_weight: 7
subsample: 0.934260221749137
colsample_bytree: 0.5657192375418623
reg_alpha: 0.0004839456623687217
reg_lambda: 0.024086595827121155


**LightGBM results:**

In [56]:
import lightgbm as lgb

def objective(trial):
    params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting_type': trial.suggest_categorical('boosting_type', ['gbdt', 'dart', 'goss']),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.1, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 20, 1000),
        'max_depth': trial.suggest_int('max_depth', 1, 20),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 100),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
        'min_split_gain': trial.suggest_float('min_split_gain', 1e-8, 1.0, log=True),
        'cat_smooth': trial.suggest_int('cat_smooth', 1, 100),
        'cat_l2': trial.suggest_float('cat_l2', 1e-8, 10.0, log=True),
        'verbosity': -1
    }

    model = lgb.LGBMClassifier(**params)

    scores = cross_val_score(
        estimator=model,
        X=X,
        y=y,
        scoring='f1_weighted',
        cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=42),
        n_jobs=-1
    )

    return scores.mean()

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=200)

best_trial = study.best_trial

print('Best trial:')
print(f'F1 Score: {best_trial.value:.6f}')
print('Parameters:')
for k, v in best_trial.params.items():
    print(f'{k}: {v}')



[I 2025-07-28 20:39:17,046] A new study created in memory with name: no-name-d640d2f2-b93d-4408-a7ea-9161e95491e3
[I 2025-07-28 20:39:17,550] Trial 0 finished with value: 0.6231313020966095 and parameters: {'boosting_type': 'goss', 'num_leaves': 157, 'learning_rate': 0.011930732887702704, 'n_estimators': 104, 'max_depth': 8, 'min_child_samples': 71, 'subsample': 0.9402086280771477, 'colsample_bytree': 0.7682647640164344, 'reg_alpha': 1.4241168044547617e-08, 'reg_lambda': 2.0676712941542342e-07, 'min_split_gain': 0.0033415036895292826, 'cat_smooth': 97, 'cat_l2': 0.6970465070605171}. Best is trial 0 with value: 0.6231313020966095.
[I 2025-07-28 20:39:32,638] Trial 1 finished with value: 0.6231313020966095 and parameters: {'boosting_type': 'dart', 'num_leaves': 175, 'learning_rate': 0.0023119406442152646, 'n_estimators': 824, 'max_depth': 8, 'min_child_samples': 45, 'subsample': 0.9676678537059415, 'colsample_bytree': 0.5552007172897914, 'reg_alpha': 2.556389565074754e-07, 'reg_lambda': 

Best trial:
F1 Score: 0.733833
Parameters:
boosting_type: dart
num_leaves: 115
learning_rate: 0.014925187890769775
n_estimators: 440
max_depth: 18
min_child_samples: 25
subsample: 0.8388698484023127
colsample_bytree: 0.871735744058394
reg_alpha: 0.0002339943750255717
reg_lambda: 0.008719224583360354
min_split_gain: 6.975191054445815e-05
cat_smooth: 52
cat_l2: 1.870771829368486e-07


**CatBoost results:**

In [57]:
from catboost import CatBoostClassifier

def objective(trial):
    params = {
        'iterations': trial.suggest_int('iterations', 50, 500),
        'depth': trial.suggest_int('depth', 1, 16),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.5, log=True),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-8, 10.0, log=True),
        'random_strength': trial.suggest_float('random_strength', 1e-8, 10.0, log=True),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.1, 10.0, log=True),
        'border_count': trial.suggest_int('border_count', 32, 255),
        'grow_policy': trial.suggest_categorical('grow_policy', ['SymmetricTree', 'Depthwise', 'Lossguide']),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 100),
        'rsm': trial.suggest_float('rsm', 0.1, 1.0),
        'loss_function': 'Logloss',
        'eval_metric': 'F1',
        'cat_features': categorical_cols,
        'verbose': 0
    }

    model = CatBoostClassifier(**params)

    scores = cross_val_score(
        estimator=model,
        X=X,
        y=y,
        scoring='f1_weighted',
        cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=42),
        n_jobs=-1
    )

    return scores.mean()

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=200)

best_trial = study.best_trial

print('Best trial:')
print(f'F1 Score: {best_trial.value:.6f}')
print('Parameters:')
for k, v in best_trial.params.items():
    print(f'{k}: {v}')

[I 2025-07-28 21:10:05,884] A new study created in memory with name: no-name-bf4ade11-fdd4-4e51-8e36-3ce62bb0ef30
[I 2025-07-28 21:10:06,304] Trial 0 finished with value: 0.644865039962623 and parameters: {'iterations': 326, 'depth': 10, 'learning_rate': 0.19634246718523193, 'l2_leaf_reg': 2.874223428781629e-05, 'random_strength': 0.0007132910532975053, 'bagging_temperature': 4.68236671244208, 'border_count': 140, 'grow_policy': 'Depthwise', 'min_data_in_leaf': 71, 'rsm': 0.851641906909135}. Best is trial 0 with value: 0.644865039962623.
[I 2025-07-28 21:10:06,587] Trial 1 finished with value: 0.6769947086440917 and parameters: {'iterations': 364, 'depth': 13, 'learning_rate': 0.04465629029593174, 'l2_leaf_reg': 1.3554273893773743e-08, 'random_strength': 0.4601972154575927, 'bagging_temperature': 1.856284401555177, 'border_count': 182, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 33, 'rsm': 0.27841739352654576}. Best is trial 1 with value: 0.6769947086440917.
...[I 2025-07-28 21:10:

Best trial:
F1 Score: 0.731945
Parameters:
iterations: 168
depth: 5
learning_rate: 0.00889070096045054
l2_leaf_reg: 3.173038372914875e-05
random_strength: 0.0004606096176348796
bagging_temperature: 0.9387985722566684
border_count: 56
grow_policy: Depthwise
min_data_in_leaf: 92
rsm: 0.7489477360324039
