In [4]:
import pandas as pd
from sklearn.datasets import load_breast_cancer

# Load the scikit-learn breast-cancer dataset once; later cells read
# `data.data`, `data.target` and `data.feature_names` from this object.
data = load_breast_cancer()

In [5]:
# Assemble features and label into a single frame in one expression, so
# re-running the cell always rebuilds `df` from `data` rather than mutating it.
df = pd.DataFrame(data.data, columns=data.feature_names).assign(target=data.target)
# Preview the first rows only instead of rendering the whole frame.
df.head()

In [8]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

In [9]:
def print_accuracy(y_pred, y_test):
    """Print the accuracy of ``y_pred`` measured against the labels ``y_test``.

    Note the argument order: predictions first, ground truth second.
    The score is only written to stdout; nothing is returned.
    """
    print(f"Test Accuracy: {accuracy_score(y_test, y_pred)}")

In [10]:
# Baseline: gradient boosting with default hyperparameters on an 80/20 hold-out split.
X = data.data
y = data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Seed the estimator as well as the split so the baseline score is the same on every run.
boost = GradientBoostingClassifier(random_state=42)
boost.fit(X_train, y_train)

pred = boost.predict(X_test)
print_accuracy(pred, y_test)

In [12]:
import matplotlib.pyplot as plt
import optuna

In [13]:
def func(trial, X_train, X_val, y_train, y_val, random_state=42):
    """Optuna objective: validation accuracy of a gradient-boosting classifier.

    Four integer hyperparameters are requested from ``trial`` via
    ``suggest_int`` (``n_estimators`` 2-10, ``max_depth`` 2-30,
    ``min_samples_split`` 2-30, ``min_samples_leaf`` 1-30). A
    ``GradientBoostingClassifier`` built with them is fitted on the training
    data and scored on the validation data.

    Args:
        trial: Object exposing ``suggest_int(name, low, high)``.
        X_train, y_train: Data the classifier is fitted on.
        X_val, y_val: Data the returned accuracy is computed on.
        random_state: Seed forwarded to the classifier. It is fixed by default
            so that differences between trials come from the hyperparameters
            rather than from the estimator's own randomness.

    Returns:
        The value of ``accuracy_score(y_val, predictions)``.
    """
    # Dict literals evaluate in order, so the suggestions are requested in the
    # same sequence as before.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 2, 10),
        'max_depth': trial.suggest_int('max_depth', 2, 30),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 30),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 30),
    }
    boost = GradientBoostingClassifier(random_state=random_state, **params)
    boost.fit(X_train, y_train)
    return accuracy_score(y_val, boost.predict(X_val))

In [14]:
def get_best_hyperparameters(X_train, X_val, y_train, y_val, n_trials=100, seed=42):
    """Run an Optuna search that maximises the value returned by ``func``.

    The objective is the module-level ``func``, looked up each time a trial
    runs, so the search tunes whichever ``func`` is defined at that moment.

    Args:
        X_train, y_train: Passed through to ``func`` as training data.
        X_val, y_val: Passed through to ``func`` as validation data.
        n_trials: Number of trials to run (100 by default, as before).
        seed: Seed for the TPE sampler so repeated runs propose the same
            trial sequence. Pass ``None`` for an unseeded sampler.

    Returns:
        ``(best_params, best_value)`` as reported by the study.
    """
    def objective(trial):
        return func(trial, X_train, X_val, y_train, y_val)

    sampler = optuna.samplers.TPESampler(seed=seed)
    study = optuna.create_study(direction='maximize', sampler=sampler)
    study.optimize(
        func=objective,
        n_trials=n_trials,
        show_progress_bar=True
    )
    return study.best_params, study.best_value


In [15]:
def get_split_data(test_size=0.2, val_size=0.2, random_state=42):
    """Load the breast-cancer dataset and split it into train/validation/test.

    The data is split twice with ``train_test_split``: first a test part is
    held out, then a validation part is taken from what remains.

    Args:
        test_size: ``test_size`` for the first split (default 0.2).
        val_size: ``test_size`` for the second split, applied to the data left
            after the test part is removed (default 0.2).
        random_state: Seed used for both splits (default 42).

    Returns:
        ``(X_test, X_train, X_val, y_test, y_train, y_val)`` - note that the
        test split comes first in the tuple.
    """
    X, y = load_breast_cancer(return_X_y=True)
    X_rest, X_test, y_rest, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    X_train, X_val, y_train, y_val = train_test_split(
        X_rest, y_rest, test_size=val_size, random_state=random_state
    )
    return X_test, X_train, X_val, y_test, y_train, y_val

In [16]:
def pretty_print_best(best_params, best_score):
    """Write the best hyperparameters and their score to stdout, one per line."""
    report = (
        f"Best parameters: {best_params}",
        f"Best score: {best_score}",
    )
    for line in report:
        print(line)

In [17]:
# Build the train/validation/test partitions, then search hyperparameters
# using only the train and validation parts.
X_test, X_train, X_val, y_test, y_train, y_val = get_split_data()
best_params, best_score = get_best_hyperparameters(
    X_train=X_train, X_val=X_val, y_train=y_train, y_val=y_val
)
print("~" * 40)  # separator between the search output and the summary
pretty_print_best(best_params=best_params, best_score=best_score)

In [18]:
def plot_accuracy_vs_n_estimators(
        X_test, X_train, X_val, y_test, y_train, y_val,
        max_depth,
        min_samples_split,
        min_samples_leaf,
        n_est_lst,
        random_state=42
):
    """Plot train and test accuracy of gradient boosting against ensemble size.

    For every value in ``n_est_lst`` a fresh ``GradientBoostingClassifier`` is
    fitted on the training data with the given tree hyperparameters, and its
    accuracy on the training and test data is recorded. Both curves are drawn
    on one figure, which is then shown.

    Args:
        X_test, y_test: Data for the "Test Accuracy" curve.
        X_train, y_train: Data the models are fitted on, also used for the
            "Train Accuracy" curve.
        X_val, y_val: Accepted so the full split tuple can be passed through;
            not used by this function.
        max_depth, min_samples_split, min_samples_leaf: Forwarded to every
            classifier.
        n_est_lst: Iterable of ``n_estimators`` values to evaluate.
        random_state: Seed forwarded to every classifier so the curves are
            reproducible between runs.

    Returns:
        None. The figure is displayed via ``plt.show()``.
    """
    # Materialise once so a one-shot iterable can feed both the loop and the x-axis.
    n_values = list(n_est_lst)
    train_accuracies = []
    test_accuracies = []

    for n_est in n_values:
        boost = GradientBoostingClassifier(
            n_estimators=n_est,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            random_state=random_state,
        )
        boost.fit(X_train, y_train)
        train_accuracies.append(accuracy_score(y_train, boost.predict(X_train)))
        test_accuracies.append(accuracy_score(y_test, boost.predict(X_test)))

    # Explicit Axes object instead of the implicit pyplot "current figure".
    fig, ax = plt.subplots()
    ax.plot(n_values, train_accuracies, label='Train Accuracy')
    ax.plot(n_values, test_accuracies, label='Test Accuracy')
    ax.set_xlabel('N_estimators')
    ax.set_ylabel('Accuracy')
    ax.set_title('Accuracy vs N_estimators')
    ax.legend()
    plt.show()

In [19]:
# Sweep even ensemble sizes from 2 to 98 while the tuned tree
# hyperparameters from `best_params` stay fixed.
n_est_lst = list(range(2, 100, 2))
plot_accuracy_vs_n_estimators(
    X_test, X_train, X_val, y_test, y_train, y_val,
    max_depth=best_params["max_depth"],
    min_samples_split=best_params["min_samples_split"],
    min_samples_leaf=best_params["min_samples_leaf"],
    n_est_lst=n_est_lst,
)

In [20]:
# Zoom in on small ensembles: every size from 2 to 29, same tuned
# hyperparameters from `best_params`.
n_est_lst = list(range(2, 30))
plot_accuracy_vs_n_estimators(
    X_test, X_train, X_val, y_test, y_train, y_val,
    max_depth=best_params["max_depth"],
    min_samples_split=best_params["min_samples_split"],
    min_samples_leaf=best_params["min_samples_leaf"],
    n_est_lst=n_est_lst,
)

In [21]:
def func(trial, X_train, X_val, y_train, y_val, random_state=42):
    """Optuna objective: validation accuracy of a random-forest classifier.

    This redefinition replaces the gradient-boosting objective of the same
    name from earlier in the notebook; from here on ``func`` refers to the
    random-forest version.

    Four integer hyperparameters are requested from ``trial`` via
    ``suggest_int`` (``n_estimators`` 2-10, ``max_depth`` 2-30,
    ``min_samples_split`` 2-30, ``min_samples_leaf`` 1-30). A
    ``RandomForestClassifier`` built with them is fitted on the training data
    and scored on the validation data.

    Args:
        trial: Object exposing ``suggest_int(name, low, high)``.
        X_train, y_train: Data the classifier is fitted on.
        X_val, y_val: Data the returned accuracy is computed on.
        random_state: Seed forwarded to the classifier. It is fixed by default
            so that two trials with the same hyperparameters get the same
            score and the search result can be reproduced.

    Returns:
        The value of ``accuracy_score(y_val, predictions)``.
    """
    # Dict literals evaluate in order, so the suggestions are requested in the
    # same sequence as before.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 2, 10),
        'max_depth': trial.suggest_int('max_depth', 2, 30),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 30),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 30),
    }
    random_forest = RandomForestClassifier(random_state=random_state, **params)
    random_forest.fit(X_train, y_train)
    return accuracy_score(y_val, random_forest.predict(X_val))

In [22]:
def get_best_hyperparameters(X_train, X_val, y_train, y_val, n_trials=100, seed=42):
    """Run an Optuna search that maximises the value returned by ``func``.

    NOTE: this cell repeats the ``get_best_hyperparameters`` definition from
    earlier in the notebook. The objective is the module-level ``func``,
    looked up each time a trial runs, so the search tunes whichever ``func``
    is defined at that moment.

    Args:
        X_train, y_train: Passed through to ``func`` as training data.
        X_val, y_val: Passed through to ``func`` as validation data.
        n_trials: Number of trials to run (100 by default, as before).
        seed: Seed for the TPE sampler so repeated runs propose the same
            trial sequence. Pass ``None`` for an unseeded sampler.

    Returns:
        ``(best_params, best_value)`` as reported by the study.
    """
    def objective(trial):
        return func(trial, X_train, X_val, y_train, y_val)

    sampler = optuna.samplers.TPESampler(seed=seed)
    study = optuna.create_study(direction='maximize', sampler=sampler)
    study.optimize(
        func=objective,
        n_trials=n_trials,
        show_progress_bar=True
    )
    return study.best_params, study.best_value


In [23]:
# Rebuild the partitions and rerun the search; `func` now refers to the
# objective defined most recently above.
X_test, X_train, X_val, y_test, y_train, y_val = get_split_data()
best_params, best_score = get_best_hyperparameters(
    X_train=X_train, X_val=X_val, y_train=y_train, y_val=y_val
)
print("~" * 40)  # separator between the search output and the summary
pretty_print_best(best_params=best_params, best_score=best_score)

In [26]:
def plot_accuracy_vs_n_estimators(
        X_test, X_train, X_val, y_test, y_train, y_val,
        max_depth,
        min_samples_split,
        min_samples_leaf,
        n_est_lst,
        random_state=42
):
    """Plot train and test accuracy of a random forest against ensemble size.

    This redefinition replaces the gradient-boosting plotting helper of the
    same name from earlier in the notebook.

    For every value in ``n_est_lst`` a fresh ``RandomForestClassifier`` is
    fitted on the training data with the given tree hyperparameters, and its
    accuracy on the training and test data is recorded. Both curves are drawn
    on one figure, which is then shown.

    Args:
        X_test, y_test: Data for the "Test Accuracy" curve.
        X_train, y_train: Data the models are fitted on, also used for the
            "Train Accuracy" curve.
        X_val, y_val: Accepted so the full split tuple can be passed through;
            not used by this function.
        max_depth, min_samples_split, min_samples_leaf: Forwarded to every
            classifier.
        n_est_lst: Iterable of ``n_estimators`` values to evaluate.
        random_state: Seed forwarded to every classifier so the curves are
            reproducible between runs.

    Returns:
        None. The figure is displayed via ``plt.show()``.
    """
    # Materialise once so a one-shot iterable can feed both the loop and the x-axis.
    n_values = list(n_est_lst)
    train_accuracies = []
    test_accuracies = []

    for n_est in n_values:
        random_forest = RandomForestClassifier(
            n_estimators=n_est,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            random_state=random_state,
        )
        random_forest.fit(X_train, y_train)
        train_accuracies.append(accuracy_score(y_train, random_forest.predict(X_train)))
        test_accuracies.append(accuracy_score(y_test, random_forest.predict(X_test)))

    # Explicit Axes object instead of the implicit pyplot "current figure".
    fig, ax = plt.subplots()
    ax.plot(n_values, train_accuracies, label='Train Accuracy')
    ax.plot(n_values, test_accuracies, label='Test Accuracy')
    ax.set_xlabel('N_estimators')
    ax.set_ylabel('Accuracy')
    ax.set_title('Accuracy vs N_estimators')
    ax.legend()
    plt.show()

In [27]:
# Every ensemble size from 2 to 29, with the tuned hyperparameters from
# `best_params` held fixed.
n_est_lst = list(range(2, 30))
plot_accuracy_vs_n_estimators(
    X_test, X_train, X_val, y_test, y_train, y_val,
    max_depth=best_params["max_depth"],
    min_samples_split=best_params["min_samples_split"],
    min_samples_leaf=best_params["min_samples_leaf"],
    n_est_lst=n_est_lst,
)

In [28]:
# Wider sweep: ensemble sizes 2, 6, 10, ... up to 198, same tuned
# hyperparameters from `best_params`.
n_est_lst = list(range(2, 200, 4))
plot_accuracy_vs_n_estimators(
    X_test, X_train, X_val, y_test, y_train, y_val,
    max_depth=best_params["max_depth"],
    min_samples_split=best_params["min_samples_split"],
    min_samples_leaf=best_params["min_samples_leaf"],
    n_est_lst=n_est_lst,
)