In [24]:
from sklearn.datasets import load_breast_cancer
import pandas as pd
# Load the breast-cancer dataset via sklearn.datasets; later cells read
# `data.data`, `data.feature_names` and `data.target` from this object.
data = load_breast_cancer()

In [25]:
# Build a labelled table for inspection: one column per entry in
# `data.feature_names`, plus the class label in a `target` column.
# `assign` produces the finished frame in a single expression instead of
# creating it and then mutating it in a second statement.
df = pd.DataFrame(data.data, columns=data.feature_names).assign(target=data.target)
# Preview only the first rows; displaying the whole frame bloats the cell output.
df.head()

In [29]:
from my_dt import DecisionTree
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [30]:
def print_accuracy(y_pred, y_test):
    """Print the accuracy of `y_pred` measured against the labels `y_test`.

    Note the parameter order: predictions come first, true labels second.
    Internally they are passed to `accuracy_score` as (y_test, y_pred).
    """
    print(f"Test Accuracy: {accuracy_score(y_test, y_pred)}")

In [31]:
# Baseline: fit a DecisionTree constructed with no arguments on an 80/20
# split and report accuracy on the held-out 20%.
X, y = data.data, data.target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

dt = DecisionTree()
dt.fit(X_train, y_train)
pred = dt.predict(X_test)

print_accuracy(y_pred=pred, y_test=y_test)

In [8]:
import matplotlib.pyplot as plt

In [13]:
import optuna

In [14]:
def func(trial, X_train, X_val, y_train, y_val):
    """Optuna objective: validation accuracy of one sampled DecisionTree.

    Asks `trial` for three integer hyperparameters, fits a DecisionTree with
    them on (X_train, y_train) and returns its accuracy on (X_val, y_val).

    Search ranges (inclusive bounds passed to `suggest_int`):
        max_depth:          2..30
        min_samples_split:  2..30
        min_samples_leaf:   1..30
    """
    # Dict entries are evaluated top to bottom, so the parameters are
    # requested from the trial in this order.
    sampled = {
        'max_depth': trial.suggest_int('max_depth', 2, 30),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 30),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 30),
    }
    tree = DecisionTree(**sampled)
    tree.fit(X_train, y_train)
    return accuracy_score(y_val, tree.predict(X_val))

In [15]:
def get_best_hyperparameters(X_train, X_val, y_train, y_val, n_trials=100, seed=42):
    """Search DecisionTree hyperparameters with Optuna, maximizing the objective `func`.

    Args:
        X_train, y_train: data forwarded to `func` for fitting each trial's tree.
        X_val, y_val: data forwarded to `func` for scoring each trial.
        n_trials: number of trials to run. Defaults to 100, the value that
            used to be hard-coded.
        seed: seed for the TPE sampler, so that re-running the notebook asks
            for the same sequence of suggestions. Pass None for an unseeded
            sampler (the previous behaviour).

    Returns:
        Tuple `(best_params, best_value)` taken from the finished study.
    """
    # An unseeded sampler yields different suggestions on every run, which
    # makes the reported "best" parameters impossible to reproduce.
    sampler = optuna.samplers.TPESampler(seed=seed)
    study = optuna.create_study(direction='maximize', sampler=sampler)
    study.optimize(
        func=lambda trial: func(trial, X_train, X_val, y_train, y_val),
        n_trials=n_trials,
        show_progress_bar=True
    )
    return study.best_params, study.best_value


In [16]:
def get_split_data(test_size=0.2, val_size=0.2, random_state=42):
    """Load the breast-cancer dataset and split it into train / validation / test.

    The split is done in two steps: first `test_size` of all samples is held
    out as the test set, then `val_size` of the *remaining* samples is held
    out as the validation set. `val_size` is therefore a fraction of the
    post-test-split training data, not of the whole dataset.

    Args:
        test_size: forwarded as `test_size` to the first `train_test_split`.
        val_size: forwarded as `test_size` to the second `train_test_split`.
        random_state: forwarded to both splits.

    The defaults (0.2, 0.2, 42) are the values that used to be hard-coded.

    Returns:
        `(X_test, X_train, X_val, y_test, y_train, y_val)`. Note the order:
        test comes first, then train, then validation.
    """
    data = load_breast_cancer()
    X, y = data.data, data.target
    X_train_full, X_test, y_train_full, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_full, y_train_full, test_size=val_size, random_state=random_state
    )
    return X_test, X_train, X_val, y_test, y_train, y_val

In [17]:
def pretty_print_best(best_params, best_score):
    """Print the best hyperparameters and the best score, one per line."""
    print(
        f"Best parameters: {best_params}",
        f"Best score: {best_score}",
        sep="\n",
    )

In [18]:
# Produce the train / validation / test split, then tune on train + validation.
# The test arrays are not passed to the hyperparameter search.
X_test, X_train, X_val, y_test, y_train, y_val = get_split_data()
best_params, best_score = get_best_hyperparameters(
    X_train=X_train,
    X_val=X_val,
    y_train=y_train,
    y_val=y_val,
)
print(40 * "~")
pretty_print_best(best_params=best_params, best_score=best_score)

In [21]:
def plot_accuracy_vs_depth(X_test, X_train, X_val, y_test, y_train, y_val, min_samples_split, min_samples_leaf,
                           depth_lst):
    """Plot train and test accuracy of a DecisionTree for each depth in `depth_lst`.

    For every depth a new DecisionTree is built with that `max_depth` and the
    given `min_samples_split` / `min_samples_leaf`, fitted on
    (X_train, y_train), and scored on both the training and the test data.
    The two accuracy curves are then drawn on one pyplot figure and shown.

    `X_val` and `y_val` are accepted but not used by this function.
    """
    scores = []  # one (train_accuracy, test_accuracy) pair per depth
    for max_depth in depth_lst:
        tree = DecisionTree(
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
        )
        tree.fit(X_train, y_train)

        # Predict on the training data first, then on the test data.
        train_acc = accuracy_score(y_train, tree.predict(X_train))
        test_acc = accuracy_score(y_test, tree.predict(X_test))
        scores.append((train_acc, test_acc))

    train_accuracies = [pair[0] for pair in scores]
    test_accuracies = [pair[1] for pair in scores]

    curves = (
        (train_accuracies, 'Train Accuracy'),
        (test_accuracies, 'Test Accuracy'),
    )
    for values, label in curves:
        plt.plot(depth_lst, values, label=label)

    plt.xlabel('Depth')
    plt.ylabel('Accuracy')
    plt.title('Accuracy vs Depth')
    plt.legend()
    plt.show()

In [22]:
# Depths to sweep: every depth from 1 to 10, then two deeper trees (15 and 25).
depth_lst = [*range(1, 11), 15, 25]

# Hold the tuned split/leaf thresholds fixed and vary only the depth.
plot_accuracy_vs_depth(
    X_test=X_test,
    X_train=X_train,
    X_val=X_val,
    y_test=y_test,
    y_train=y_train,
    y_val=y_val,
    min_samples_split=best_params['min_samples_split'],
    min_samples_leaf=best_params['min_samples_leaf'],
    depth_lst=depth_lst,
)