## Построение модели (подбор алгоритма и гиперпараметров)

In [1]:
import optuna
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

from src.config import TARGET_COLUMN

# Models
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier


In [2]:
# Read the preprocessed training set from disk.
df = pd.read_csv('./data/preprocessed/train.csv')

# Separate the label column from the remaining (feature) columns.
target = df[TARGET_COLUMN]
features = df.drop(columns=[TARGET_COLUMN])

Будем использовать кросс-валидацию как функцию ошибки при подборе гиперпараметров.

In [3]:
def cross_validation(classifier) -> float:
    """Return the summed 5-fold mean squared error of ``classifier``.

    The module-level ``features`` / ``target`` are split into 5 shuffled
    folds. For each fold the classifier is re-fitted on the training part
    and scored with ``mean_squared_error`` on the held-out part.

    Args:
        classifier: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
            It is fitted in place, once per fold.

    Returns:
        The sum (not the mean) of the per-fold mean squared errors.
        Lower is better.
    """
    # No random_state is set, so the fold assignment differs between calls.
    folds = KFold(n_splits=5, shuffle=True)

    error = 0.0
    for train, test in folds.split(features, target):
        # KFold yields *positional* indices, so rows must be selected with
        # .iloc; .loc is label-based and only coincides with positions while
        # the frame still has the default RangeIndex.
        classifier.fit(features.iloc[train], target.iloc[train])
        classifier_prediction = classifier.predict(features.iloc[test])
        error += mean_squared_error(target.iloc[test], classifier_prediction)
    return error

Я бы очень хотел написать функцию `objective` следующим образом:

```python
def objective(trial: optuna.trial.Trial):
    """Draft objective: sample an algorithm and only that algorithm's hyperparameters.

    The algorithm name is suggested first; the matching ``case`` branch then
    suggests the hyperparameters of that algorithm alone and builds the
    classifier. The value returned is ``cross_validation`` of that classifier.

    Raises:
        ValueError: If the suggested algorithm name matches none of the cases.
    """
    algorithm = trial.suggest_categorical('algorithm', ['SVC',
                                                        'DecisionTreeClassifier',
                                                        'KNeighborsClassifier',
                                                        'GaussianNB',
                                                        'AdaBoostClassifier'])
    match algorithm:
        case 'SVC':
            svc_c = trial.suggest_float('svc_c', 0.01, 10.0)
            kernel = trial.suggest_categorical('kernel', ['linear', 'poly', 'rbf', 'sigmoid'])
            # degree / gamma / coef0 are only suggested for the kernels listed
            # in the conditions below; otherwise the fixed fallback is used.
            degree = 3
            if kernel == 'poly':
                degree = trial.suggest_int('degree', 1, 5)
            gamma = 'scale'
            if kernel in ['rbf', 'poly', 'sigmoid']:
                gamma = trial.suggest_categorical('gamma', ['scale', 'auto'])
            coef0 = 0.0
            if kernel in ['poly', 'sigmoid']:
                coef0 = trial.suggest_float('coef0', -1, 1)
            shrinking = trial.suggest_categorical('shrinking', [False, True])
            probability = trial.suggest_categorical('probability', [False, True])
            classifier = SVC(C=svc_c,
                             kernel=kernel,
                             degree=degree,
                             gamma=gamma,
                             coef0=coef0,
                             shrinking=shrinking,
                             probability=probability)
        case 'DecisionTreeClassifier':
            criterion = trial.suggest_categorical('criterion', ['gini', 'entropy', 'log_loss'])
            splitter = trial.suggest_categorical('splitter', ['best', 'random'])
            # max_depth stays None unless the boolean switch is sampled as True.
            max_depth = None
            if trial.suggest_categorical('use max_depth', [False, True]):
                max_depth = trial.suggest_int('max_depth', 1, 100)
            classifier = DecisionTreeClassifier(criterion=criterion, splitter=splitter, max_depth=max_depth)
        case 'KNeighborsClassifier':
            n_neighbors = trial.suggest_int('n_neighbors', 1, 100)
            weights = trial.suggest_categorical('weights', ['uniform', 'distance'])
            metric = trial.suggest_categorical('metric', ['cityblock',
                                                          'cosine',
                                                          'euclidean',
                                                          'l1',
                                                          'l2',
                                                          'manhattan',
                                                          'nan_euclidean'])
            classifier = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights, metric=metric)
        case 'GaussianNB':
            # When enabled, one prior is sampled and the other is its complement,
            # so the two-element list sums to 1.
            priors = None
            if trial.suggest_categorical('use prior', [False, True]):
                prior = trial.suggest_float('prior', 0.0, 1.0)
                priors = [prior, 1.0 - prior]
            classifier = GaussianNB(priors=priors)
        case 'AdaBoostClassifier':
            n_estimators = trial.suggest_int('n_estimators', 1, 150)
            learning_rate = trial.suggest_float('learning_rate', 0.001, 5.0)
            classifier = AdaBoostClassifier(n_estimators=n_estimators, learning_rate=learning_rate)
        case _:
            raise ValueError(f'Unexpected algorithm: {algorithm}')
    return cross_validation(classifier=classifier)
```

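Однако в задании требуется вывести информацию о важности гиперпараметров, а она оценивается только по тем параметрам, которые запрашиваются в каждом испытании. Поэтому придётся вынести вызовы `suggest_` на верхний уровень, чтобы они не зависели от выбранного алгоритма.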

In [4]:
def objective(trial: optuna.trial.Trial, get_classifier: bool = False):
    """Sample one configuration from the flattened search space and evaluate it.

    The hyperparameters of every candidate algorithm are requested on each
    call, regardless of which algorithm was picked; only those belonging to
    the chosen algorithm are used to build the classifier.

    Args:
        trial: Object providing the ``suggest_*`` methods.
        get_classifier: When true, return the configured (unfitted)
            classifier instead of its cross-validation error.

    Returns:
        The classifier if ``get_classifier`` is true, otherwise the value of
        ``cross_validation`` for that classifier.

    Raises:
        ValueError: If the suggested algorithm name is not a supported one.
    """
    # NOTE: the order of the suggest_* calls below is kept fixed on purpose.
    algorithm = trial.suggest_categorical(
        'algorithm',
        ['SVC', 'DecisionTreeClassifier', 'KNeighborsClassifier', 'GaussianNB', 'AdaBoostClassifier'],
    )

    # --- SVC ---
    svc_kwargs = {'C': trial.suggest_float('svc_c', 0.01, 10.0)}
    svc_kernel = trial.suggest_categorical('kernel', ['linear', 'poly', 'rbf', 'sigmoid'])
    svc_kwargs['kernel'] = svc_kernel
    # Kernel-specific parameters are only sampled for the kernels listed in
    # each condition; otherwise a fixed fallback value is used.
    svc_kwargs['degree'] = trial.suggest_int('degree', 1, 5) if svc_kernel == 'poly' else 3
    svc_kwargs['gamma'] = (trial.suggest_categorical('gamma', ['scale', 'auto'])
                           if svc_kernel in ('rbf', 'poly', 'sigmoid') else 'scale')
    svc_kwargs['coef0'] = (trial.suggest_float('coef0', -1, 1)
                           if svc_kernel in ('poly', 'sigmoid') else 0.0)
    svc_kwargs['shrinking'] = trial.suggest_categorical('shrinking', [False, True])
    svc_kwargs['probability'] = trial.suggest_categorical('probability', [False, True])

    # --- DecisionTreeClassifier ---
    tree_kwargs = {
        'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy', 'log_loss']),
        'splitter': trial.suggest_categorical('splitter', ['best', 'random']),
        # The boolean switch is sampled first; the depth only when it is True.
        'max_depth': (trial.suggest_int('max_depth', 1, 100)
                      if trial.suggest_categorical('use max_depth', [False, True]) else None),
    }

    # --- KNeighborsClassifier ---
    knn_kwargs = {
        'n_neighbors': trial.suggest_int('n_neighbors', 1, 100),
        'weights': trial.suggest_categorical('weights', ['uniform', 'distance']),
        'metric': trial.suggest_categorical(
            'metric',
            ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan', 'nan_euclidean'],
        ),
    }

    # --- GaussianNB ---
    nb_priors = None
    if trial.suggest_categorical('use prior', [False, True]):
        first_prior = trial.suggest_float('prior', 0.0, 1.0)
        # The second prior is the complement, so the pair sums to 1.
        nb_priors = [first_prior, 1.0 - first_prior]

    # --- AdaBoostClassifier ---
    ada_kwargs = {
        'n_estimators': trial.suggest_int('n_estimators', 1, 150),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 5.0),
    }

    # Only the chosen algorithm is instantiated.
    if algorithm == 'SVC':
        classifier = SVC(**svc_kwargs)
    elif algorithm == 'DecisionTreeClassifier':
        classifier = DecisionTreeClassifier(**tree_kwargs)
    elif algorithm == 'KNeighborsClassifier':
        classifier = KNeighborsClassifier(**knn_kwargs)
    elif algorithm == 'GaussianNB':
        classifier = GaussianNB(priors=nb_priors)
    elif algorithm == 'AdaBoostClassifier':
        classifier = AdaBoostClassifier(**ada_kwargs)
    else:
        raise ValueError(f'Unexpected algorithm: {algorithm}')

    return classifier if get_classifier else cross_validation(classifier=classifier)

Первый запуск выполним с sampler'ом по умолчанию (в Optuna это TPESampler; ниже он условно называется BaseSampler), установим число итераций, равное 100, и посмотрим на результат оптимизации.

In [5]:
# Raise Optuna's log threshold to WARNING for the rest of the notebook.
optuna.logging.set_verbosity(optuna.logging.WARNING)

# No sampler (and no direction) is passed, so Optuna's defaults are used.
# NOTE(review): per the Optuna docs the default sampler is TPESampler and the
# default direction is "minimize"; "BaseSampler" here is only a label.
study_with_base_sampler = optuna.create_study(study_name="Study with BaseSampler")
study_with_base_sampler.optimize(objective, n_trials=100);

# Keep the best trial: it is reused later to rebuild the winning classifier.
best_trial_for_base_sampler = study_with_base_sampler.best_trial
print(f'Best score for BaseSampler: {best_trial_for_base_sampler.value}')
print(f'Best params for BaseSampler: {best_trial_for_base_sampler.params}')

Best score for BaseSampler: 0.11068943706514864
Best params for BaseSampler: {'algorithm': 'SVC', 'svc_c': 7.452279016078824, 'kernel': 'rbf', 'gamma': 'auto', 'shrinking': False, 'probability': False, 'criterion': 'entropy', 'splitter': 'random', 'use max_depth': True, 'max_depth': 24, 'n_neighbors': 28, 'weights': 'distance', 'metric': 'nan_euclidean', 'use prior': False, 'n_estimators': 134, 'learning_rate': 0.4046155642063821}


Отобразим в браузере график зависимости функции ошибки от номера итерации.

In [12]:
# Open the objective-value-per-trial plot of the first study in the browser.
optuna.visualization.plot_optimization_history(study_with_base_sampler).show(renderer='browser')

А также информацию о важности гиперпараметров.

In [13]:
# Open the hyperparameter-importance plot of the first study in the browser.
optuna.visualization.plot_param_importances(study_with_base_sampler).show(renderer='browser');

Для сравнения выполним запуск с использованием RandomSampler с тем же числом итераций.

In [8]:
# Second study: same objective and trial budget, but with a seeded RandomSampler.
# The seed fixes the sampled parameters only; cross_validation still shuffles
# its folds without a seed, so the scores are not reproducible.
study_with_random_sampler = optuna.create_study(sampler=optuna.samplers.RandomSampler(seed=17),
                                                study_name="Study with RandomSampler(seed=17)")
study_with_random_sampler.optimize(objective, n_trials=100);

# Keep the best trial: it is reused later to rebuild the winning classifier.
best_trial_for_random_sampler = study_with_random_sampler.best_trial
print(f'Best score for RandomSampler(seed=17): {best_trial_for_random_sampler.value}')
print(f'Best params for RandomSampler(seed=17): {best_trial_for_random_sampler.params}')

Best score for RandomSampler(seed=17): 0.10651132823395838
Best params for RandomSampler(seed=17): {'algorithm': 'SVC', 'svc_c': 8.571532161838777, 'kernel': 'rbf', 'gamma': 'auto', 'shrinking': False, 'probability': False, 'criterion': 'gini', 'splitter': 'best', 'use max_depth': False, 'n_neighbors': 92, 'weights': 'uniform', 'metric': 'l1', 'use prior': False, 'n_estimators': 41, 'learning_rate': 0.6289061699686381}


И снова построим необходимые графики.

In [14]:
# Same two plots (optimization history, parameter importances) for the RandomSampler study.
optuna.visualization.plot_optimization_history(study_with_random_sampler).show(renderer='browser')
optuna.visualization.plot_param_importances(study_with_random_sampler).show(renderer='browser');

Теперь сравним результаты двух лучших моделей на тестовой выборке.

In [10]:
# Project helper that scores a prediction frame.
# NOTE(review): presumably compares against held-out test labels - confirm in src.checker.
from src.checker import mean_squared_error_of

# Preprocessed test set; it is passed as-is to classifier.predict below.
test_df = pd.read_csv('./data/preprocessed/test.csv')

def predict_for_trial(trial: optuna.trial.Trial | optuna.trial.FrozenTrial) -> pd.DataFrame:
    """Rebuild the classifier described by ``trial`` and predict the test set.

    The classifier is obtained from ``objective(trial, get_classifier=True)``,
    fitted on the full training data (module-level ``features`` / ``target``)
    and applied to the module-level ``test_df``.

    Args:
        trial: Trial whose parameters define the classifier. The callers in
            this notebook pass ``study.best_trial``, which is a finished
            (frozen) trial rather than a live ``Trial``.

    Returns:
        A single-column frame named ``TARGET_COLUMN`` with the predictions.
    """
    best_classifier = objective(trial, get_classifier=True)
    best_classifier.fit(features, target)
    return pd.DataFrame(best_classifier.predict(test_df), columns=[TARGET_COLUMN])

# Score the best configuration found by the default-sampler study.
prediction_for_base_sampler = predict_for_trial(best_trial_for_base_sampler)
print(f'Final result of BaseSampler: {mean_squared_error_of(prediction_for_base_sampler)}')

# Score the best configuration found by the RandomSampler study.
prediction_for_random_sampler = predict_for_trial(best_trial_for_random_sampler)
print(f'Final result of RandomSampler(seed=17): {mean_squared_error_of(prediction_for_random_sampler)}')

Final result of BaseSampler: 0.049939098660170524
Final result of RandomSampler(seed=17): 0.05115712545676005


Вывод: приведём его только словами, так как подбор не является детерминированным и результаты от запуска к запуску могут отличаться. Для оценки результата можно сравнить его с baseline — в данном случае это предсказание одного и того же класса (только нули или только единицы) независимо от признаков.

In [11]:
import numpy as np

# Constant-prediction baselines: every test row gets class 0, then class 1.
# The number of rows comes from the loaded test set, not a hard-coded count,
# so the cell keeps working if the test file changes.
n_test_rows = len(test_df)
all_zeros = pd.DataFrame(np.zeros(n_test_rows), columns=[TARGET_COLUMN])
all_ones = pd.DataFrame(np.ones(n_test_rows), columns=[TARGET_COLUMN])

print(f'Base line with 0: {mean_squared_error_of(all_zeros)}')
print(f'Base line with 1: {mean_squared_error_of(all_ones)}')

Base line with 0: 0.09622411693057248
Base line with 1: 0.9037758830694276
