In [7]:
import pandas as pd
from sklearn.datasets import load_breast_cancer

# Load the breast cancer dataset; the cells below read its `.data`,
# `.feature_names` and `.target` attributes.
data = load_breast_cancer()

In [9]:
# Feature matrix as a labelled frame, with the class label appended as the
# last column under the name "target".
df = pd.DataFrame(data.data, columns=data.feature_names).assign(target=data.target)
df

In [14]:
# Per-column overview of the frame (dtypes and non-null counts).
df.info()

In [15]:
# Summary statistics for every column of the frame.
df.describe()
# Open question: are there outliers?

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [16]:
def print_accuracy(y_pred, y_test):
    """Print the accuracy of predictions against the true labels.

    Note the argument order: predictions come first, true labels second.

    Args:
        y_pred: Predicted labels.
        y_test: True labels the predictions are compared with.
    """
    score = accuracy_score(y_true=y_test, y_pred=y_pred)
    print(f"Test Accuracy: {score}")

In [17]:
# Baseline: an untuned decision tree evaluated on a 20% hold-out split.
X = data.data
y = data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Seed the tree as well as the split: scikit-learn permutes candidate features
# at every split, so an unseeded tree can give a different accuracy on each
# run even with identical training data.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print_accuracy(y_pred, y_test)

In [29]:
import matplotlib.pyplot as plt

In [54]:
def plot_tree_depth_by_params(
        min_samples_split_lst,
        min_samples_leaf_lst,
        X_train,
        y_train,
        random_state=42,
):
    """Plot the depth of a fitted tree for every (split, leaf) combination.

    One ``DecisionTreeClassifier`` is fitted per pair taken from the two
    lists (outer loop over ``min_samples_split_lst``, inner loop over
    ``min_samples_leaf_lst``) and its depth is drawn in that order on a
    single line plot.

    Args:
        min_samples_split_lst: Values to try for ``min_samples_split``.
        min_samples_leaf_lst: Values to try for ``min_samples_leaf``.
        X_train: Training features every tree is fitted on.
        y_train: Training labels every tree is fitted on.
        random_state: Seed passed to every tree so that repeated runs draw
            the same curve. Defaults to 42.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    depth_lst = []
    labels = []

    for min_samples_split in min_samples_split_lst:
        for min_samples_leaf in min_samples_leaf_lst:
            dt = DecisionTreeClassifier(
                min_samples_split=min_samples_split,
                min_samples_leaf=min_samples_leaf,
                random_state=random_state,
            )
            dt.fit(X_train, y_train)

            # Public accessor for the fitted tree's depth.
            depth_lst.append(dt.get_depth())
            labels.append(
                f"split:{min_samples_split};leaf:{min_samples_leaf}"
            )

    plt.figure(figsize=(10, 6))
    plt.plot(depth_lst)
    # One tick per fitted tree, labelled with the pair that produced it.
    plt.xticks(range(len(depth_lst)), labels, rotation=45, ha='right')
    plt.xlabel("Hyperparameters")
    plt.ylabel("Tree depth")
    plt.title("Tree depth by hyperparameters")
    plt.show()

In [56]:
# Sparse grids for both hyperparameters; the dense alternatives would be
# list(range(2, 31)) for the split values and list(range(1, 31)) for the leaf
# values.
# The split grid is named after what it holds: these are min_samples_split
# values, not depths.
min_samples_split_lst = [2, 3, 5, 10, 15, 25]
min_samples_leaf_lst = [2, 3, 5, 10, 15, 25]
plot_tree_depth_by_params(
    min_samples_split_lst=min_samples_split_lst,
    min_samples_leaf_lst=min_samples_leaf_lst,
    X_train=X_train,
    y_train=y_train
)


In [57]:
import optuna

In [63]:
def func(trial, X_train, X_val, y_train, y_val):
    """Optuna objective: validation accuracy of a tree with suggested settings.

    Args:
        trial: Optuna trial used to suggest the three integer hyperparameters.
        X_train: Features the tree is fitted on.
        X_val: Features the fitted tree is scored on.
        y_train: Labels the tree is fitted on.
        y_val: Labels the predictions on ``X_val`` are compared with.

    Returns:
        Accuracy of the fitted tree on the validation data.
    """
    # Suggestions are requested in the same order as before:
    # max_depth, min_samples_split, min_samples_leaf.
    suggested = {
        'max_depth': trial.suggest_int('max_depth', 2, 30),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 30),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 30),
    }
    model = DecisionTreeClassifier(random_state=42, **suggested)
    model.fit(X_train, y_train)
    return accuracy_score(y_val, model.predict(X_val))

In [64]:
def get_best_hyperparameters(X_train, X_val, y_train, y_val, n_trials=100, seed=42):
    """Search for the tree hyperparameters that maximize validation accuracy.

    Runs an Optuna study whose objective is ``func`` bound to the given
    train/validation data.

    Args:
        X_train: Features each candidate tree is fitted on.
        X_val: Features each candidate tree is scored on.
        y_train: Labels each candidate tree is fitted on.
        y_val: Labels used for scoring on ``X_val``.
        n_trials: Number of trials to run. Defaults to 100.
        seed: Seed for the study's sampler so that repeated runs explore the
            same sequence of trials. Pass ``None`` for an unseeded search.
            Defaults to 42.

    Returns:
        Tuple ``(best_params, best_value)`` taken from the finished study.
    """
    # An unseeded sampler makes every run return different "best" parameters;
    # seeding it keeps the notebook reproducible end to end.
    sampler = optuna.samplers.TPESampler(seed=seed)
    study = optuna.create_study(direction='maximize', sampler=sampler)
    study.optimize(
        func=lambda trial: func(trial, X_train, X_val, y_train, y_val),
        n_trials=n_trials,
        show_progress_bar=True
    )
    return study.best_params, study.best_value


In [65]:
def get_split_data():
    """Load the breast cancer data and split it into train/validation/test.

    20% of the samples are held out as the test set; 20% of the remainder is
    then held out as the validation set. Both splits use ``random_state=42``.

    Returns:
        Tuple ``(X_test, X_train, X_val, y_test, y_train, y_val)``. Note that
        the test parts come first in each half of the tuple.
    """
    dataset = load_breast_cancer()
    features, labels = dataset.data, dataset.target

    # First cut: test set versus everything else.
    X_rest, X_test, y_rest, y_test = train_test_split(
        features, labels, test_size=0.2, random_state=42
    )
    # Second cut: validation set out of what is left.
    X_train, X_val, y_train, y_val = train_test_split(
        X_rest, y_rest, test_size=0.2, random_state=42
    )
    return X_test, X_train, X_val, y_test, y_train, y_val

In [66]:
def pretty_print_best(best_params, best_score):
    """Print the best hyperparameters and the best score, one per line.

    Args:
        best_params: Best hyperparameters to report.
        best_score: Score achieved with those hyperparameters.
    """
    report_lines = (
        f"Best parameters: {best_params}",
        f"Best score: {best_score}",
    )
    for line in report_lines:
        print(line)

In [67]:
# Re-split the data into train/validation/test; this rebinds the X_train,
# X_test, y_train and y_test names used by earlier cells.
(
    X_test, X_train, X_val,
    y_test, y_train, y_val,
) = get_split_data()

# Tune on the train/validation pair only; the test part is not passed in.
best_params, best_score = get_best_hyperparameters(
    X_train=X_train, X_val=X_val, y_train=y_train, y_val=y_val
)
print("~" * 40)
pretty_print_best(best_params=best_params, best_score=best_score)

In [78]:
def plot_accuracy_vs_depth(X_test, X_train, X_val, y_test, y_train, y_val, min_samples_split, min_samples_leaf,
                           depth_lst):
    """Plot train and test accuracy of a decision tree against ``max_depth``.

    For each depth a tree is fitted on the training data with the given
    ``min_samples_split`` / ``min_samples_leaf`` and ``random_state=42``, then
    scored on both the training and the test data.

    Args:
        X_test: Test features.
        X_train: Training features.
        X_val: Validation features; accepted but not used by this function.
        y_test: Test labels.
        y_train: Training labels.
        y_val: Validation labels; accepted but not used by this function.
        min_samples_split: ``min_samples_split`` for every fitted tree.
        min_samples_leaf: ``min_samples_leaf`` for every fitted tree.
        depth_lst: Depth values to evaluate; also used as the x-axis.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    train_scores, test_scores = [], []

    for max_depth in depth_lst:
        tree = DecisionTreeClassifier(
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            random_state=42,
        )
        tree.fit(X_train, y_train)

        train_scores.append(accuracy_score(y_train, tree.predict(X_train)))
        test_scores.append(accuracy_score(y_test, tree.predict(X_test)))

    # Train curve first, then test curve, matching the legend order.
    curves = (
        (train_scores, 'Train Accuracy'),
        (test_scores, 'Test Accuracy'),
    )
    for scores, curve_label in curves:
        plt.plot(depth_lst, scores, label=curve_label)
    plt.xlabel('Depth')
    plt.ylabel('Accuracy')
    plt.title('Accuracy vs Depth')
    plt.legend()
    plt.show()

In [80]:
# Depths 1 through 10 one by one, followed by two much deeper settings.
depth_lst = [*range(1, 11), 15, 25]

# Only the tuned min_samples_* values are reused here; max_depth is the
# quantity being swept, so the tuned max_depth is not passed.
plot_accuracy_vs_depth(
    X_test=X_test, X_train=X_train, X_val=X_val,
    y_test=y_test, y_train=y_train, y_val=y_val,
    min_samples_split=best_params['min_samples_split'],
    min_samples_leaf=best_params['min_samples_leaf'],
    depth_lst=depth_lst,
)