In [8]:
import pandas as pd
from sklearn.datasets import load_breast_cancer

# Load the breast cancer dataset shipped with scikit-learn. The cells below read
# its `.data`, `.feature_names` and `.target` attributes.
data = load_breast_cancer()

In [9]:
# One column per feature name, plus the class labels attached as `target`.
df = pd.DataFrame(data=data.data, columns=data.feature_names).assign(target=data.target)
df

In [10]:
# Column dtypes and non-null counts: a quick check for missing values.
df.info()

In [11]:
# Per-column summary statistics.
df.describe()
# Open question: are there outliers?

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [13]:
def pretty_print_best_params(best_params: dict, best_score: float) -> None:
    """Print the best score followed by the parameters that produced it.

    Args:
        best_params: Mapping of parameter name to its chosen value; each entry
            is printed on its own line, indented by two spaces.
        best_score: Score to report, formatted with four decimal places.
    """
    report = [f"Best Score: {best_score:.4f}", "Best Parameters:"]
    report.extend(f"  {name}: {setting}" for name, setting in best_params.items())
    print("\n".join(report))


def print_accuracy(y_pred, y_test) -> None:
    """Print the accuracy of the predictions against the true labels.

    Note the argument order: predictions first, ground-truth labels second.

    Args:
        y_pred: Predicted labels.
        y_test: True labels the predictions are scored against.
    """
    print(f"Test Accuracy: {accuracy_score(y_test, y_pred)}")

In [14]:
# Baseline: an unconstrained decision tree evaluated on an 80/20 hold-out split.
X = data.data
y = data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Seed the tree so the reported accuracy is reproducible across kernel restarts:
# scikit-learn permutes features at every split, so equally good splits can be
# chosen differently from run to run when random_state is left unset.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print_accuracy(y_pred, y_test)

In [15]:
import matplotlib.pyplot as plt

In [16]:
def plot_tree_depth_by_params(
        min_samples_split_lst,
        min_samples_leaf_lst,
        X_train,
        y_train,
        random_state=42
):
    """Plot the depth of a fitted decision tree for every hyperparameter pair.

    One tree is fitted per (min_samples_split, min_samples_leaf) combination,
    iterating over split values in the outer loop and leaf values in the inner
    loop. The resulting depths are drawn as a single line, with one x-tick per
    combination labelled "split:<value>;leaf:<value>".

    Args:
        min_samples_split_lst: Values to try for `min_samples_split`.
        min_samples_leaf_lst: Values to try for `min_samples_leaf`.
        X_train: Training features passed to `fit`.
        y_train: Training labels passed to `fit`.
        random_state: Seed given to every tree so the plotted depths are
            reproducible between runs. Pass None for unseeded trees.
    """
    depth_lst = []
    labels = []

    for min_samples_split in min_samples_split_lst:
        for min_samples_leaf in min_samples_leaf_lst:
            dt = DecisionTreeClassifier(
                min_samples_split=min_samples_split,
                min_samples_leaf=min_samples_leaf,
                random_state=random_state,
            )
            dt.fit(X_train, y_train)

            # Public accessor for the fitted tree's depth (instead of the
            # low-level `tree_` attribute).
            depth_lst.append(dt.get_depth())
            labels.append(
                f"split:{min_samples_split};leaf:{min_samples_leaf}"
            )

    plt.figure(figsize=(10, 6))
    plt.plot(depth_lst)
    plt.xticks(range(len(depth_lst)), labels, rotation=45, ha='right')
    plt.xlabel("Hyperparameters")
    plt.ylabel("Tree depth")
    plt.title("Tree depth by hyperparameters")
    plt.show()

In [17]:
# Coarse grid of split thresholds; a dense alternative is list(range(2, 31)).
min_samples_split_lst = [2, 3, 5, 10, 15, 25]
# Coarse grid of leaf sizes; a dense alternative is list(range(1, 31)).
min_samples_leaf_lst = [2, 3, 5, 10, 15, 25]
plot_tree_depth_by_params(min_samples_split_lst, min_samples_leaf_lst, X_train, y_train)


In [18]:
import optuna
import numpy as np

In [19]:
def func(trial, X_train: np.ndarray, X_val: np.ndarray, y_train: np.ndarray, y_val: np.ndarray) -> float:
    """Optuna objective: validation accuracy of a decision tree with sampled settings.

    Asks the trial for `max_depth` (2..30), `min_samples_split` (2..30) and
    `min_samples_leaf` (1..30), fits a tree seeded with random_state=42 on the
    training data and scores it on the validation data.

    Args:
        trial: Object providing `suggest_int` (an Optuna trial in this notebook).
        X_train: Training features.
        X_val: Validation features.
        y_train: Training labels.
        y_val: Validation labels.

    Returns:
        Accuracy of the fitted tree on the validation set.
    """
    # Dict literals evaluate in order, so the parameters are suggested in the
    # same sequence: max_depth, min_samples_split, min_samples_leaf.
    sampled = {
        'max_depth': trial.suggest_int('max_depth', 2, 30),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 30),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 30),
    }
    model = DecisionTreeClassifier(random_state=42, **sampled)
    model.fit(X_train, y_train)
    return accuracy_score(y_val, model.predict(X_val))

In [20]:
def get_best_hyperparameters(
        X_train: np.ndarray, X_val: np.ndarray, y_train: np.ndarray, y_val: np.ndarray,
        n_trials: int = 100, seed: int = 42
) -> tuple[dict, float]:
    """Search decision-tree hyperparameters with Optuna, maximizing validation accuracy.

    Args:
        X_train: Training features forwarded to the objective `func`.
        X_val: Validation features forwarded to the objective `func`.
        y_train: Training labels forwarded to the objective `func`.
        y_val: Validation labels forwarded to the objective `func`.
        n_trials: Number of trials to run.
        seed: Seed for the TPE sampler, so repeated runs explore the same
            sequence of candidates and report the same best parameters.

    Returns:
        A `(best_params, best_value)` tuple taken from the finished study.
    """
    # TPE is Optuna's default single-objective sampler; constructing it
    # explicitly is only needed to pass a seed.
    study = optuna.create_study(
        direction='maximize',
        sampler=optuna.samplers.TPESampler(seed=seed),
    )
    study.optimize(
        func=lambda trial: func(trial, X_train, X_val, y_train, y_val),
        n_trials=n_trials,
        show_progress_bar=True
    )
    return study.best_params, study.best_value


In [21]:
def get_split_data(test_size=0.2, val_size=0.2, random_state=42):
    """Load the breast cancer dataset and split it into train, validation and test parts.

    The data is split twice: first a test part is held out, then a validation
    part is carved out of what remains.

    Args:
        test_size: `test_size` for the first split (test part of the full dataset).
        val_size: `test_size` for the second split. It applies to the remaining
            training part, not to the full dataset.
        random_state: Seed used for both splits.

    Returns:
        A tuple in the order
        `(X_test, X_train, X_val, y_test, y_train, y_val)`.
    """
    data = load_breast_cancer()
    X, y = data.data, data.target
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=val_size, random_state=random_state
    )
    return X_test, X_train, X_val, y_test, y_train, y_val

In [22]:
# Re-split into train/validation/test (this rebinds X_train, X_test, y_train and
# y_test from the baseline cell), then tune against the validation part.
X_test, X_train, X_val, y_test, y_train, y_val = get_split_data()
best_params, best_score = get_best_hyperparameters(
    X_train=X_train, X_val=X_val, y_train=y_train, y_val=y_val
)
print("~" * 40)
pretty_print_best_params(best_params=best_params, best_score=best_score)

In [23]:
def plot_accuracy_vs_depth(X_test, X_train, y_test, y_train,
                           min_samples_split, min_samples_leaf, depth_lst):
    """Plot train and test accuracy of a decision tree as `max_depth` varies.

    For each depth a tree seeded with random_state=42 is fitted on the training
    data with the given `min_samples_split` / `min_samples_leaf`, then scored
    on both the training and the test data.

    Args:
        X_test: Test features.
        X_train: Training features.
        y_test: Test labels.
        y_train: Training labels.
        min_samples_split: Value passed unchanged to every tree.
        min_samples_leaf: Value passed unchanged to every tree.
        depth_lst: `max_depth` values to evaluate; also used as the x-axis.
    """
    def fit_and_score(depth):
        # Returns (train accuracy, test accuracy) for a tree capped at `depth`.
        tree = DecisionTreeClassifier(
            max_depth=depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            random_state=42,
        )
        tree.fit(X_train, y_train)
        return (
            accuracy_score(y_train, tree.predict(X_train)),
            accuracy_score(y_test, tree.predict(X_test)),
        )

    scores = [fit_and_score(depth) for depth in depth_lst]
    train_accuracies = [on_train for on_train, _ in scores]
    test_accuracies = [on_test for _, on_test in scores]

    curves = (
        (train_accuracies, 'Train Accuracy'),
        (test_accuracies, 'Test Accuracy'),
    )
    for values, label in curves:
        plt.plot(depth_lst, values, label=label)
    plt.xlabel('Depth')
    plt.ylabel('Accuracy')
    plt.title('Accuracy vs Depth')
    plt.legend()
    plt.show()

In [29]:
# NOTE(review): despite its name, this list holds the max_depth values to sweep
# (it is passed as `depth_lst`), and it rebinds the split grid defined for the
# depth plot earlier. A sparser alternative tried before:
# [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 25].
min_samples_split_lst = list(range(1, 12))
plot_accuracy_vs_depth(
    X_test,
    X_train,
    y_test,
    y_train,
    best_params['min_samples_split'],
    best_params['min_samples_leaf'],
    min_samples_split_lst,
)