Boosting example

In [1]:
import warnings

# Silence every warning category for the rest of the session so the
# printed results below are not interleaved with library warnings.
warnings.simplefilter('ignore')

In [2]:
from sklearn.ensemble import AdaBoostClassifier,BaggingClassifier
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
# --------- Classifiers ------------
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier


In [33]:
def train_and_evaluate_adaboost(base_estimator, n_estimators=10, test_size=0.2, random_state=42, cv_folds=5, bagging=False, n_bags=10, max_samples=1.0, max_features=1.0):
    """Train an AdaBoost classifier on the digits dataset and report accuracy.

    The digits dataset is split into train and test parts. An
    ``AdaBoostClassifier`` is built on top of ``base_estimator`` (optionally
    wrapped in a ``BaggingClassifier`` first), cross-validated on the training
    part, refitted on the whole training part and finally scored on the
    held-out test part. Both scores are printed.

    Args:
        base_estimator: Unfitted classifier used as the weak learner.
        n_estimators: Number of boosting rounds passed to AdaBoost.
        test_size: Test-set size forwarded to ``train_test_split``.
        random_state: Seed forwarded to the split, the bagging wrapper and
            AdaBoost.
        cv_folds: ``cv`` argument forwarded to ``cross_val_score``.
        bagging: When truthy, boost a ``BaggingClassifier`` built around
            ``base_estimator`` instead of ``base_estimator`` itself.
        n_bags: Number of estimators in the bagging wrapper (only used when
            ``bagging`` is truthy).
        max_samples: ``max_samples`` of the bagging wrapper.
        max_features: ``max_features`` of the bagging wrapper.

    Returns:
        The ``AdaBoostClassifier`` fitted on the training split.
    """
    # Load the handwritten-digits dataset (features and labels).
    X, y = load_digits(return_X_y=True)

    # Hold out a test set; everything else is used for CV and the final fit.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Decide once which estimator AdaBoost should boost.
    if bagging:
        weak_learner = BaggingClassifier(
            estimator=base_estimator,
            n_estimators=n_bags,
            max_samples=max_samples,
            max_features=max_features,
            random_state=random_state,
            n_jobs=-1,
        )
    else:
        weak_learner = base_estimator

    clf = AdaBoostClassifier(
        estimator=weak_learner,
        n_estimators=n_estimators,
        random_state=random_state,
    )

    # Cross-validated accuracy, computed on the training split only.
    cv_scores = cross_val_score(clf, X_train, y_train, cv=cv_folds, scoring='accuracy')

    # Fit on the entire training split, then score on the held-out test split.
    clf.fit(X_train, y_train)
    test_accuracy = accuracy_score(y_test, clf.predict(X_test))

    # The label always names the user-supplied estimator, prefixed with
    # "Bagged " when the bagging wrapper is in use.
    estimator_label = f"{'Bagged ' if bagging else ''}{type(base_estimator).__name__}"
    separator = "---------------------------------------------------------"

    print(separator)
    print(f"Cross-validation accuracy with {estimator_label} as base estimator: {cv_scores.mean():.2f} ± {cv_scores.std():.2f}")
    print(f"Test set accuracy with {estimator_label} as base estimator: {test_accuracy:.2f}")
    print(separator)

    return clf


## Non-bagged tests

In [34]:
# AdaBoost (8 rounds) over a decision tree capped at depth 12, 10-fold CV.
decision_tree_base = DecisionTreeClassifier(max_depth=12)
trained_clf_dt = train_and_evaluate_adaboost(
    base_estimator=decision_tree_base,
    n_estimators=8,
    cv_folds=10,
)

---------------------------------------------------------
Cross-validation accuracy with DecisionTreeClassifier as base estimator: 0.90 ± 0.05
Test set accuracy with DecisionTreeClassifier as base estimator: 0.94
---------------------------------------------------------


In [35]:
# AdaBoost (8 rounds) over a small random forest: 10 trees, depth capped at 6.
rf_base = RandomForestClassifier(
    n_estimators=10,
    max_depth=6,
    random_state=42,
)
trained_clf_randomforest = train_and_evaluate_adaboost(
    base_estimator=rf_base,
    n_estimators=8,
    cv_folds=10,
)

---------------------------------------------------------
Cross-validation accuracy with RandomForestClassifier as base estimator: 0.93 ± 0.02
Test set accuracy with RandomForestClassifier as base estimator: 0.96
---------------------------------------------------------


In [36]:
# logreg_base = LogisticRegression(solver='lbfgs', max_iter=500)
# trained_clf_logreg = train_and_evaluate_adaboost(logreg_base,
#                                               n_estimators=8,
#                                               cv_folds=10)

## Trying gradient boosting

In [37]:
# from sklearn.ensemble import GradientBoostingClassifier
# gradient_boosting_base = GradientBoostingClassifier(n_estimators=15,
#                                                     learning_rate=0.2,
#                                                     max_depth=1)
# trained_clf_gb = train_and_evaluate_adaboost(gradient_boosting_base,
#                                              n_estimators=8,
#                                              cv_folds=10)


## Different Decision tree

In [38]:
# gradient_boosting_base = GradientBoostingClassifier(n_estimators=15,
#                                                     learning_rate=0.2,
#                                                     max_depth=3)
# trained_clf_gb = train_and_evaluate_adaboost(gradient_boosting_base,
#                                              n_estimators=8,
#                                              cv_folds=10)

# Bagging and boosting

In [39]:
# Same depth-capped decision tree as before, but wrapped in an 8-member
# bagging ensemble before being boosted for 8 rounds.
decision_tree_base = DecisionTreeClassifier(max_depth=12)
trained_clf_dt = train_and_evaluate_adaboost(
    base_estimator=decision_tree_base,
    n_estimators=8,
    cv_folds=10,
    bagging=True,
    n_bags=8,
)

---------------------------------------------------------
Cross-validation accuracy with Bagged DecisionTreeClassifier as base estimator: 0.96 ± 0.02
Test set accuracy with Bagged DecisionTreeClassifier as base estimator: 0.97
---------------------------------------------------------


In [40]:
# Random forest (10 trees, depth capped at 6) wrapped in an 8-member bagging
# ensemble, then boosted for 8 rounds.
rf_base = RandomForestClassifier(
    n_estimators=10,
    max_depth=6,
    random_state=42,
)
trained_clf_randomforest = train_and_evaluate_adaboost(
    base_estimator=rf_base,
    n_estimators=8,
    cv_folds=10,
    bagging=True,
    n_bags=8,
)

---------------------------------------------------------
Cross-validation accuracy with Bagged RandomForestClassifier as base estimator: 0.97 ± 0.01
Test set accuracy with Bagged RandomForestClassifier as base estimator: 0.98
---------------------------------------------------------


## Gradient-boosted, adaptive-boosted, and bootstrap-aggregated model

In [55]:
# GradientBoostingClassifier is not among the classifiers imported at the top
# of the notebook, so import it here; otherwise running the notebook from top
# to bottom fails with NameError on the next statement.
from sklearn.ensemble import GradientBoostingClassifier

# Gradient-boosted trees (15 stages, depth 3) are bagged 5 times and the
# resulting ensemble is boosted by AdaBoost for 12 rounds.
gradient_boosting_base = GradientBoostingClassifier(
    n_estimators=15,
    learning_rate=0.1,
    max_depth=3,
)
trained_clf_gb = train_and_evaluate_adaboost(
    gradient_boosting_base,
    n_estimators=12,
    bagging=True,
    n_bags=5,
    cv_folds=10,
)

---------------------------------------------------------
Cross-validation accuracy with Bagged GradientBoostingClassifier as base estimator: 0.96 ± 0.02
Test set accuracy with Bagged GradientBoostingClassifier as base estimator: 0.97
---------------------------------------------------------
