In [12]:
from sklearn.datasets import make_classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier, ExtraTreesClassifier, GradientBoostingRegressor, RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from optuna.trial import Trial, FrozenTrial
from optuna.study import Study, StudyDirection
from optuna.visualization import  plot_optimization_history, plot_parallel_coordinate
from optuna import create_study
from optuna.samplers import TPESampler
from sklearn.model_selection import cross_val_score, StratifiedKFold, train_test_split, GridSearchCV

In [2]:
X, y = make_classification(n_samples=1000, n_features=20, n_informative=15, n_redundant=5, random_state=42)

X_train, X_test, y_train, y_test  = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [3]:
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

In [4]:
class EarlyStoppingCallback:
    """
    Early stopping callback for Optuna studies.
    Stops the study if there is no improvement in the best value for a specified number of trials (patience).
    """
    def __init__(self, patience: int, min_delta: float = 0.0):
        """
        Args:
            patience (int): Number of trials to wait for improvement before stopping.
            min_delta (float): Minimum change in the monitored value to qualify as an improvement.
        """
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.best_value = None

    def __call__(self, study: Study, trial: FrozenTrial):
        if self.best_value is None:
            self.best_value = study.best_value
            return

        if study.direction == StudyDirection.MINIMIZE:
            if study.best_value < self.best_value - self.min_delta:
                self.best_value = study.best_value
                self.counter = 0
            else:
                self.counter += 1
        else:
            if study.best_value > self.best_value + self.min_delta:
                self.best_value = study.best_value
                self.counter = 0
            else:
                self.counter += 1

        if self.counter >= self.patience:
            study.stop()
            print(f'Early stopping triggered after {self.counter} trials with no improvement.')

In [None]:
def objective(trial: Trial):

    model = ExtraTreesClassifier(
        n_estimators=trial.suggest_int('n_estimators', 100, 1000),
        criterion=trial.suggest_categorical('criterion', ['gini', 'entropy', 'log_loss']),
        max_depth=trial.suggest_int('max_depth', 5, 50),
        min_samples_split=trial.suggest_int('min_samples_split', 2, 32),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 32),
        min_weight_fraction_leaf=trial.suggest_float('min_weight_fraction_leaf', 0.0, 0.5, step=0.05),
        bootstrap=True,
        # bootstrap=trial.suggest_categorical('bootstrap', [True, False]),
        oob_score=trial.suggest_categorical('oob_score', [True, False]),
        ccp_alpha=trial.suggest_float('ccp_alpha', 0.0, 0.1, step=0.01),
        n_jobs=-1
    ) # 0.8610047172921425

    # model = LogisticRegression(
    #     C=trial.suggest_float('C', 0.01, 10.0, log=True),
    #     max_iter=trial.suggest_int('max_iter', 100, 1000),
    #     tol=trial.suggest_float('tol', 1e-5, 1e-1, log=True),
    #     solver=trial.suggest_categorical('solver', ['saga']),
    #     penalty=trial.suggest_categorical('penalty', ['l1', 'l2', 'elasticnet']),
    #     l1_ratio=trial.suggest_float('l1_ratio', 0.0, 1.0) if trial.suggest_categorical('penalty', ['l1', 'l2', 'elasticnet']) == 'elasticnet' else None,
    #     n_jobs=-1
    # ) # 0.8160286034537533

    # model = GradientBoostingClassifier(
    #     n_estimators=trial.suggest_int('n_estimators', 100, 1000),
    #     max_depth=trial.suggest_int('max_depth', 3, 20),
    #     learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3, step=0.01),
    #     subsample=trial.suggest_float('subsample', 0.5, 1.0, step=0.1)
    # ) # 0.9310178441914969

    # model = KNeighborsClassifier(
    #     n_neighbors=trial.suggest_int('n_neighbors', 1, 100),
    #     weights=trial.suggest_categorical('weights', ['uniform', 'distance']),
    #     leaf_size=trial.suggest_int('leaf_size', 10, 5000),
    #     p=trial.suggest_float('p', low=0.1, high=4, step=0.025),
    #     n_jobs=-1
    # ) # 0.9329958701216187

    score = cross_val_score(model,
                            X_train,
                            y_train,
                            n_jobs=-1,
                            cv=cv,
                            scoring='accuracy')

    accuracy = score.mean()
    return accuracy

In [None]:
study = create_study(direction='maximize', sampler=TPESampler())
study.optimize(objective, n_trials=200, n_jobs=-1, callbacks=[EarlyStoppingCallback(patience=5, min_delta=1e-4)])

In [None]:
study.best_params

In [None]:
study.best_value

In [None]:
plot_parallel_coordinate(study)

In [5]:
knn_model = KNeighborsClassifier(**{'n_neighbors': 12, 'weights': 'distance', 'leaf_size': 2325, 'p': 1.0250000000000001}) # 0.9280028531525537
gbc_model = GradientBoostingClassifier(**{'n_estimators': 497, 'max_depth': 6, 'learning_rate': 0.14, 'subsample': 0.6})   # 0.927016837196478
etree_model = ExtraTreesClassifier(**{'n_estimators': 996, 'criterion': 'log_loss', 'max_depth': 50, 'min_samples_split': 4,
 'min_samples_leaf': 9, 'min_weight_fraction_leaf': 0.0, 'oob_score': False, 'ccp_alpha': 0.0}) # 0.8669987352622083

In [14]:
models = [knn_model, gbc_model, etree_model]
for model in models:

    model.fit(X_train, y_train)

    score = cross_val_score(model,
                            X_test,
                            y_test,
                            n_jobs=-1,
                            cv=cv,
                            scoring='accuracy')
    accuracy = score.mean()
    print(f'{model.__class__.__name__} Cross Validation Score: {accuracy}')

KNeighborsClassifier Cross Validation Score: 0.8203678576812905
GradientBoostingClassifier Cross Validation Score: 0.8399668325041459
ExtraTreesClassifier Cross Validation Score: 0.7799638172772502


In [10]:
estimators=[('knn', knn_model), ('gbc', gbc_model), ('etree', etree_model)]
voting_clf = VotingClassifier(estimators=estimators, voting='soft')
voting_clf.fit(X_train, y_train)
cross_voting_score = cross_val_score(voting_clf, X_test, y_test, cv=cv, n_jobs=-1, scoring='accuracy').mean()
print(f'Voting Classifier Cross Validation Score: {cross_voting_score}')

Voting Classifier Cross Validation Score: 0.8450927182270466


In [13]:
# tuning
param_grid = {
    'voting': ['hard', 'soft'],
    'weights': [[1, 1, 1], [2, 1, 1], [1, 1, 2], [1, 2, 1]]
}

grid_search = GridSearchCV(estimator=voting_clf,
                           param_grid=param_grid,
                           cv=cv,
                           n_jobs=-1,
                           verbose=1,
                           scoring='accuracy',
                           return_train_score=False)

grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_model = grid_search.best_estimator_
best_score = grid_search.best_score_

print(f'Best Parameters: {best_params}')
print(f'Best Model: {best_model}')
print(f'Best Score: {best_score}') # Best Score: 0.9312560427285442

Best Parameters: {'voting': 'soft', 'weights': [2, 1, 1]}
Best Model: VotingClassifier(estimators=[('knn',
                              KNeighborsClassifier(leaf_size=2325,
                                                   n_neighbors=12,
                                                   p=1.0250000000000001,
                                                   weights='distance')),
                             ('gbc',
                              GradientBoostingClassifier(learning_rate=0.14,
                                                         max_depth=6,
                                                         n_estimators=497,
                                                         subsample=0.6)),
                             ('etree',
                              ExtraTreesClassifier(criterion='log_loss',
                                                   max_depth=50,
                                                   min_samples_leaf=9,
                                

In [None]:
best_model = RandomForestRegressor(
    **study.best_params,
    random_state=42,
    n_jobs=-1
)

best_model.fit(X_train, y_train)

# Train performance
y_train_pred = best_model.predict(X_train)
rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))

# Test performance
y_test_pred = best_model.predict(X_test)
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))

print(f"Train RMSE: {rmse_train:.3f}")
print(f"Test RMSE:  {rmse_test:.3f}")

# If rmse_train << rmse_test → likely overfitting.
# If both are high → likely underfitting.
# If both are reasonably close and low → model is well fit.

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.ensemble import ExtraTreesClassifier

# best params from your Optuna study (classification)
best_clf = ExtraTreesClassifier(
    **study.best_params,
    n_jobs=-1,
    random_state=42
)

# 1. Train on full training set
best_clf.fit(X_train, y_train)

# 2. Accuracy on training set
y_train_pred = best_clf.predict(X_train)
acc_train = accuracy_score(y_train, y_train_pred)

# 3. Accuracy on test set (hold-out set)
y_test_pred = best_clf.predict(X_test)
acc_test = accuracy_score(y_test, y_test_pred)

print(f"Train accuracy: {acc_train:.3f}")
print(f"Test  accuracy: {acc_test:.3f}")
print(f"Gap (train - test): {acc_train - acc_test:.3f}")

# 4. (optional) Cross-validation on training set with fixed best_clf
from sklearn.model_selection import cross_val_score, StratifiedKFold

cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
cv_scores = cross_val_score(best_clf, X_train, y_train, cv=cv, n_jobs=-1, scoring="accuracy")
print(f"CV mean accuracy (train side): {cv_scores.mean():.3f}, std: {cv_scores.std():.3f}")

# Heuristics:
# - If acc_train and acc_test are both high and close -> model is likely stable.
# - If acc_train >> acc_test -> overfitting.
# - If both are low -> underfitting.
# - If CV accuracy ~= test accuracy and does not improve with more trials/models -> stabilized.