Reference: https://www.youtube.com/watch?v=HdlDYng8g9s

In [None]:
import lib._util.visualplot as vp
import lib._util.mlpipe as mlpipe

# Feature scaling
from lib._class.DFStandardScaler import DFStandardScaler
from lib._class.DFRobustScaler import DFRobustScaler
from lib._class.DFMinMaxScaler import DFMinMaxScaler

In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm

from xgboost import XGBClassifier

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC, NuSVC, SVC
from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.linear_model import LogisticRegression, PassiveAggressiveClassifier, Perceptron, RidgeClassifier, SGDClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.naive_bayes import BernoulliNB, CategoricalNB, ComplementNB, GaussianNB, MultinomialNB
from sklearn.dummy import DummyClassifier

from sklearn.gaussian_process.kernels import RBF
from sklearn.calibration import CalibratedClassifierCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, StackingClassifier

from sklearn import datasets
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold, cross_val_score
from sklearn.pipeline import make_pipeline, Pipeline

In [None]:
# Directory handed to the plotting helper (`out_path=`) by the box-plot cells below.
OUT_PATH_GRAPH = 'resources/output/graph/'

In [None]:
def load_data():
    """Load the iris dataset shipped with scikit-learn as pandas objects.

    Returns:
        tuple: ``(X, y)`` where ``X`` is a DataFrame whose columns are the
        dataset's feature names and ``y`` is a Series named ``'target'``.
    """
    iris = datasets.load_iris()

    features = pd.DataFrame(iris['data'], columns=iris['feature_names'])
    labels = pd.Series(iris['target'], name='target')

    return features, labels

In [None]:
# Feature matrix and label vector used by every cell below.
X, y = load_data()

# Display the dimensions of both objects as the cell output.
X.shape, y.shape

In [None]:
# Class distribution of the 'target' labels, produced by the project's value_count helper.
vp.value_count(y.to_frame(), 'target')

# Best Hyperparameter (GridSearchCV)

In [None]:
# Exhaustive search over every SVC combination below, scored by weighted F1
# under 10-fold stratified cross-validation.
svc_grid = {
    'C': [1, 10, 20, 30, 40, 50],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'gamma': ['scale', 'auto'],
}
search = GridSearchCV(
    SVC(random_state=0),
    svc_grid,
    scoring='f1_weighted',
    cv=StratifiedKFold(n_splits=10),
    n_jobs=-1,
    verbose=10,
)
search.fit(X, y)

In [None]:
# Tabulate every candidate and show the best mean test score first.
result_df = pd.DataFrame(search.cv_results_)
summary_columns = ['params', 'mean_test_score', 'rank_test_score']
result_df[summary_columns].sort_values(by='mean_test_score', ascending=False)

In [None]:
# Display the hyperparameter combination selected by the search above.
search.best_params_

In [None]:
# Refit an SVC with the selected hyperparameters; probability=True is needed
# so predict_proba is available for the evaluation helper.
svc = SVC(random_state=0, probability=True, **search.best_params_)
svc.fit(X, y)

# NOTE: predictions are made on the same X the model was fitted on.
y_pred = svc.predict(X)
y_prob = svc.predict_proba(X)
mlpipe.eval_classif(y, y_pred, y_prob=y_prob, multi_class='ovr')

# Best Hyperparameter (RandomizedSearchCV)

In [None]:
# Randomised counterpart of the grid search: draws n_iter=10 candidates from
# the same SVC options, with a fixed seed so the draw is repeatable.
svc_distributions = {
    'C': [1, 10, 20, 30, 40, 50],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'gamma': ['scale', 'auto'],
}
search = RandomizedSearchCV(
    SVC(random_state=0),
    svc_distributions,
    n_iter=10,
    scoring='f1_weighted',
    cv=StratifiedKFold(n_splits=10),
    n_jobs=-1,
    verbose=10,
    random_state=0,
)
search.fit(X, y)

In [None]:
# Tabulate the sampled candidates and show the best mean test score first.
result_df = pd.DataFrame(search.cv_results_)
summary_columns = ['params', 'mean_test_score', 'rank_test_score']
result_df[summary_columns].sort_values(by='mean_test_score', ascending=False)

In [None]:
# Display the hyperparameter combination selected by the randomised search above.
search.best_params_

In [None]:
# Refit an SVC with the hyperparameters picked by the randomised search;
# probability=True is needed so predict_proba is available below.
svc = SVC(random_state=0, probability=True, **search.best_params_)
svc.fit(X, y)

# NOTE: predictions are made on the same X the model was fitted on.
y_pred = svc.predict(X)
y_prob = svc.predict_proba(X)
mlpipe.eval_classif(y, y_pred, y_prob=y_prob, multi_class='ovr')

# Baseline Classifier

In [None]:
def get_estimators():
    """Build the candidate classifiers, each paired with a display name.

    Some classifiers are wrapped in a pipeline that applies the project's
    DataFrame scaler classes before the model; the rest are used bare.

    Returns:
        list[tuple[str, estimator]]: ``(name, estimator)`` pairs in a fixed
        order. The tuned-search cell looks up ``search_params`` by position,
        so the order here must stay aligned with that list.
    """
    kwargs1 = {'random_state': 0, 'n_jobs': -1}
    kwargs2 = {'n_jobs': -1}
    kwargs3 = {'random_state': 0}

    estimators = [
        make_pipeline(
            DFStandardScaler(),
            KNeighborsClassifier(**kwargs2)
        ),
        make_pipeline(
            DFStandardScaler(),
            OneVsRestClassifier(LinearSVC(**kwargs3), **kwargs2)
        ),
        make_pipeline(
            DFStandardScaler(),
            OneVsRestClassifier(NuSVC(**kwargs3), **kwargs2)
        ),
        make_pipeline(
            DFStandardScaler(),
            OneVsRestClassifier(SVC(**kwargs3), **kwargs2)
        ),
        XGBClassifier(**kwargs1),
        GradientBoostingClassifier(**kwargs3),
        ExtraTreesClassifier(**kwargs1),
        RandomForestClassifier(**kwargs1),
        DecisionTreeClassifier(**kwargs3),
        ExtraTreeClassifier(**kwargs3),
        make_pipeline(
            DFRobustScaler(),
            DFMinMaxScaler(),
            LogisticRegression(**kwargs1)
        ),
        make_pipeline(
            DFRobustScaler(),
            DFMinMaxScaler(),
            PassiveAggressiveClassifier(**kwargs1)
        ),
        make_pipeline(
            DFRobustScaler(),
            DFMinMaxScaler(),
            Perceptron(**kwargs1)
        ),
        make_pipeline(
            DFStandardScaler(),
            RidgeClassifier(**kwargs3)
        ),
        make_pipeline(
            DFRobustScaler(),
            DFMinMaxScaler(),
            SGDClassifier(**kwargs1)
        ),
        make_pipeline(
            DFStandardScaler(),
            LinearDiscriminantAnalysis()
        ),
        make_pipeline(
            DFStandardScaler(),
            QuadraticDiscriminantAnalysis()
        ),
        make_pipeline(
            DFRobustScaler(),
            DFMinMaxScaler(),
            MLPClassifier(**kwargs3)
        ),
        make_pipeline(
            DFStandardScaler(),
            GaussianProcessClassifier(**kwargs1)
        ),
        BernoulliNB(),
        CategoricalNB(),
        ComplementNB(),
        GaussianNB(),
        MultinomialNB(),
        DummyClassifier(**kwargs3),
    ]

    def model_name(model):
        """Return the class name of *model*, combined with its wrapped estimator's if any."""
        name = model.__class__.__name__
        if not hasattr(model, 'estimator'):
            return name

        # Wrapper estimators are labelled after the model they wrap.
        base_name = model.estimator.__class__.__name__
        suffix = 'OVR' if name == 'OneVsRestClassifier' else name
        return f'{base_name}_{suffix}'

    def final_model(candidate):
        """Return the last step of a pipeline, or *candidate* itself when it is not one."""
        # isinstance (rather than an exact type comparison) also accepts Pipeline subclasses.
        if isinstance(candidate, Pipeline):
            return candidate.steps[-1][1]
        return candidate

    return [(model_name(final_model(x)), x) for x in estimators]

In [None]:
def cross_validation(estimator, X, y):
    """Cross-validate *estimator* on ``(X, y)`` with macro-averaged F1.

    Uses a 10-split ``StratifiedKFold`` and runs on all available cores.

    Returns:
        The result of ``cross_val_score`` (the per-fold scores).
    """
    folds = StratifiedKFold(10)
    return cross_val_score(
        estimator, X, y,
        scoring='f1_macro',
        cv=folds,
        verbose=10,
        n_jobs=-1,
    )

In [None]:
# Cross-validate every untuned candidate and collect its per-fold scores.
eval_dict = {key: [] for key in ('model', 'cv_score')}

for name, estimator in tqdm(get_estimators()):
    eval_dict['model'].append(name)
    fold_scores = cross_validation(estimator, X, y)
    eval_dict['cv_score'].append(fold_scores)

In [None]:
# One row per model, with the mean and standard deviation of its fold scores.
eval_df = pd.DataFrame(eval_dict)
for column, reducer in (('cv_mean', np.mean), ('cv_std', np.std)):
    eval_df[column] = eval_df['cv_score'].apply(reducer)

# Long format for plotting: one small frame per model, one row per fold score.
eval_dfs = [
    pd.DataFrame({'model': model, 'cv_score': scores})
    for model, scores in zip(eval_df['model'], eval_df['cv_score'])
]

###### Box

In [None]:
# Box plot of the baseline fold scores, coloured by model.
baseline_scores = pd.concat(eval_dfs, axis=0)
baseline_box_kwargs = {
    'boxmean': 'sd',
    'boxpoints': False,
}
vp.box(
    baseline_scores,
    color='model',
    max_col=1,
    title='Classifier - Baseline',
    out_path=OUT_PATH_GRAPH,
    layout_kwargs={'showlegend': False},
    box_kwargs=baseline_box_kwargs,
)

# Tuned Classifier

In [None]:
def _with_prefix(prefix, grid):
    """Return a copy of *grid* with *prefix* prepended to every key (order kept)."""
    return {f'{prefix}{key}': values for key, values in grid.items()}


# Prefix that addresses the estimator wrapped by the OneVsRestClassifier pipeline step.
_OVR_PREFIX = 'onevsrestclassifier__estimator__'

# Shared parameter fragments, reused via ** unpacking when the per-model
# search spaces are assembled.
kneighbors_params = _with_prefix('kneighborsclassifier__', {
    'n_neighbors': [3, 5, 10, 25, 50, 100],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan'],
})
linearsvc_params = _with_prefix(_OVR_PREFIX, {
    'C': [.001, .01, .1, 1, 10, 100, 1000],
    'fit_intercept': [True, False],
    'max_iter': [1000],
})
nusvc_params = _with_prefix(_OVR_PREFIX, {
    'nu': np.linspace(.1, 1, 10),
    'probability': [True],
    'max_iter': [1000],
    'decision_function_shape': ['ovr'],
    'break_ties': [True, False],
})
svc_params = _with_prefix(_OVR_PREFIX, {
    'probability': [True],
    'max_iter': [1000],
    'decision_function_shape': ['ovr'],
    'break_ties': [True, False],
})
# Tree-based models are searched bare (no pipeline), so their keys carry no prefix.
tree_params = {
    'max_depth': [None, 3, 5, 7, 10, 15],
    'min_samples_split': np.linspace(.1, 1, 10),
    'min_samples_leaf': np.linspace(.1, .5, 5),
    'max_features': ['sqrt', 'log2', *np.linspace(.5, 1, 6)],
}
logistic_params = _with_prefix('logisticregression__', {
    'C': [.001, .01, .1, 1, 10, 100, 1000],
    'fit_intercept': [True, False],
    'max_iter': [1000],
    'multi_class': ['auto'],
})
sgd_params = _with_prefix('sgdclassifier__', {
    'loss': [
        'hinge', 'modified_huber', 'squared_hinge',
        'squared_loss', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive',
    ],
    'penalty': ['l1', 'l2', 'elasticnet'],
    'l1_ratio': np.linspace(.1, .9, 9),
    'fit_intercept': [True, False],
    'max_iter': [1000],
    'average': [True, False],
})
mlp_params = _with_prefix('mlpclassifier__', {
    'hidden_layer_sizes': [(100,), (32, 64, 128), (128, 64, 32)],
    'activation': ['relu'],
    'max_iter': [1000],
})
# Naive Bayes models are searched bare as well.
naivebayes_params = {
    'alpha': np.linspace(0, 1, 11),
    'fit_prior': [True, False],
}


# Per-model hyperparameter search spaces. Each entry is a list of dicts
# (alternative sub-spaces), and entries are looked up BY POSITION against the
# list returned by get_estimators(), so the order here must match it exactly.
# Pipeline-wrapped models use '<step>__<param>' keys; bare models use plain names.
search_params = [
    # KNeighborsClassifier
    [
        {
            **kneighbors_params,
            'kneighborsclassifier__algorithm': ['ball_tree', 'kd_tree'],
            'kneighborsclassifier__leaf_size': [3, 30, 300, 3000],
        },
        {
            **kneighbors_params,
            'kneighborsclassifier__algorithm': ['brute'],
        }
    ],
    # LinearSVC
    [
        {
            **linearsvc_params,
            'onevsrestclassifier__estimator__multi_class': ['ovr'],
            'onevsrestclassifier__estimator__penalty': ['l1', 'l2'],
            'onevsrestclassifier__estimator__loss': ['hinge', 'squared_hinge'],
            'onevsrestclassifier__estimator__dual': [True, False],
        },
        {
            **linearsvc_params,
            'onevsrestclassifier__estimator__multi_class': ['crammer_singer'],
        }
    ],
    # NuSVC
    [
        {
            **nusvc_params,
            # 'precomputed' dropped: it expects a square kernel matrix, not the raw features.
            'onevsrestclassifier__estimator__kernel': ['linear'],
        },
        {
            **nusvc_params,
            'onevsrestclassifier__estimator__kernel': ['rbf', 'sigmoid'],
            'onevsrestclassifier__estimator__gamma': [.001, .01, .1, 1, 10, 100, 1000, 'scale', 'auto'],
        },
        {
            **nusvc_params,
            'onevsrestclassifier__estimator__kernel': ['poly'],
            'onevsrestclassifier__estimator__gamma': [.001, .01, .1, 1, 10, 100, 1000, 'scale', 'auto'],
            # NOTE(review): `degree` is documented as an int; the fractional values likely fail - confirm.
            'onevsrestclassifier__estimator__degree': [.003, .03, .3, 3, 30, 300, 3000],
        }
    ],
    # SVC
    [
        {
            **svc_params,
            # 'precomputed' dropped: it expects a square kernel matrix, not the raw features.
            'onevsrestclassifier__estimator__kernel': ['linear'],
        },
        {
            **svc_params,
            'onevsrestclassifier__estimator__kernel': ['rbf', 'sigmoid'],
            'onevsrestclassifier__estimator__gamma': [.001, .01, .1, 1, 10, 100, 1000, 'scale', 'auto'],
        },
        {
            **svc_params,
            'onevsrestclassifier__estimator__kernel': ['poly'],
            'onevsrestclassifier__estimator__gamma': [.001, .01, .1, 1, 10, 100, 1000, 'scale', 'auto'],
            # NOTE(review): `degree` is documented as an int; the fractional values likely fail - confirm.
            'onevsrestclassifier__estimator__degree': [.003, .03, .3, 3, 30, 300, 3000],
        }
    ],
    # XGBClassifier
    [
        {
            'n_estimators': [25, 50, 100, 500, 1000],
            'max_depth': [None, 3, 5, 7, 10, 15],
            'learning_rate': [.1, .01],
            'objective':['reg:logistic'],
            'booster': ['gbtree', 'gblinear', 'dart'],
            'subsample': [.8],
            'colsample_bytree': np.linspace(.5, 1, 6),
            'reg_alpha': np.linspace(0, 1, 11),
            'reg_lambda': np.linspace(0, 1, 11),
        }
    ],
    # GradientBoostingClassifier
    [
        {
            **tree_params,
            'criterion': ['friedman_mse', 'mse', 'mae'],
            'n_estimators': [25, 50, 100, 500, 1000],
            'loss': ['deviance', 'exponential'],
            'learning_rate': [.1, .01],
            'subsample': np.linspace(.5, 1, 6),
        }
    ],
    # ExtraTreesClassifier
    [
        {
            **tree_params,
            'criterion': ['gini', 'entropy'],
            'n_estimators': [25, 50, 100, 500, 1000],
            # Out-of-bag scoring is only available when bootstrap=True.
            'oob_score': [False],
            'bootstrap': [False],
        },
        {
            **tree_params,
            'criterion': ['gini', 'entropy'],
            'n_estimators': [25, 50, 100, 500, 1000],
            'oob_score': [True, False],
            'bootstrap': [True],
            'max_samples': np.linspace(.5, 1, 6),
        }
    ],
    # RandomForestClassifier
    [
        {
            **tree_params,
            'criterion': ['gini', 'entropy'],
            'n_estimators': [25, 50, 100, 500, 1000],
            'bootstrap': [False],
        },
        {
            **tree_params,
            'criterion': ['gini', 'entropy'],
            'n_estimators': [25, 50, 100, 500, 1000],
            'bootstrap': [True],
            'max_samples': np.linspace(.5, 1, 6),
        }
    ],
    # DecisionTreeClassifier
    [
        {
            **tree_params,
            'criterion': ['gini', 'entropy'],
            'splitter': ['best'],
        }
    ],
    # ExtraTreeClassifier
    [
        {
            **tree_params,
            'criterion': ['gini', 'entropy'],
            'splitter': ['best'],
        }
    ],
    # LogisticRegression
    [
        {
            **logistic_params,
            # Only liblinear and saga support the l1 penalty.
            'logisticregression__solver': ['liblinear', 'saga'],
            'logisticregression__penalty': ['l1'],
        },
        {
            **logistic_params,
            'logisticregression__solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
            'logisticregression__penalty': ['l2'],
        },
        {
            **logistic_params,
            # Was 'liblinear ' (trailing space), which is not a valid solver name.
            'logisticregression__solver': ['liblinear'],
            'logisticregression__penalty': ['l2'],
            'logisticregression__dual': [True, False],
        },
        {
            **logistic_params,
            'logisticregression__solver': ['saga'],
            'logisticregression__penalty': ['elasticnet'],
            'logisticregression__l1_ratio': np.linspace(.1, .9, 9),
        }
    ],
    # PassiveAggressiveClassifier
    [
        {
            'passiveaggressiveclassifier__C': [.001, .01, .1, 1, 10, 100, 1000],
            'passiveaggressiveclassifier__fit_intercept': [True, False],
            'passiveaggressiveclassifier__max_iter': [1000],
            'passiveaggressiveclassifier__loss': ['hinge', 'squared_hinge'],
            'passiveaggressiveclassifier__average': [True, False],
        }
    ],
    # Perceptron
    [
        {
            'perceptron__penalty': [None, 'l1', 'l2', 'elasticnet'],
            'perceptron__fit_intercept': [True, False],
            'perceptron__max_iter': [1000],
        }
    ],
    # RidgeClassifier
    [
        {
            'ridgeclassifier__alpha': [.001, .01, .1, 1, 10, 100, 1000],
            'ridgeclassifier__fit_intercept': [True, False],
            'ridgeclassifier__max_iter': [1000],
            'ridgeclassifier__solver': ['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag','saga'],
        }
    ],
    # SGDClassifier
    [
        {
            **sgd_params,
            'sgdclassifier__learning_rate': ['optimal'],
        },
        {
            **sgd_params,
            'sgdclassifier__learning_rate': ['invscaling', 'adaptive'],
            'sgdclassifier__eta0': [.1],
        }
    ],
    # LinearDiscriminantAnalysis
    [
        {
            'lineardiscriminantanalysis__solver': ['svd'],
        },
        {
            'lineardiscriminantanalysis__solver': ['lsqr', 'eigen'],
            'lineardiscriminantanalysis__shrinkage': ['auto'] + list(np.linspace(0, 1, 11)),
        }
    ],
    # QuadraticDiscriminantAnalysis
    [
        {
            'quadraticdiscriminantanalysis__reg_param': np.linspace(0, 1, 11),
        }
    ],
    # MLPClassifier
    [
        {
            **mlp_params,
            'mlpclassifier__solver': ['lbfgs'],
        },
        {
            **mlp_params,
            'mlpclassifier__solver': ['adam'],
            'mlpclassifier__learning_rate_init': [.01, .001],
        },
        {
            **mlp_params,
            'mlpclassifier__solver': ['sgd'],
            'mlpclassifier__learning_rate': ['invscaling', 'adaptive'],
            'mlpclassifier__learning_rate_init': [.01, .001],
        }
    ],
    # GaussianProcessClassifier
    [
        {
            'gaussianprocessclassifier__kernel': [x * RBF(x) for x in [.001, .01, .1, 1, 10, 100, 1000]],
            'gaussianprocessclassifier__max_iter_predict': [1000],
            'gaussianprocessclassifier__multi_class': ['one_vs_rest', 'one_vs_one'],
        }
    ],
    # BernoulliNB
    [
        {
            **naivebayes_params,
        }
    ],
    # CategoricalNB
    [
        {
            **naivebayes_params,
        }
    ],
    # ComplementNB
    [
        {
            **naivebayes_params,
            'norm': [True, False],
        }
    ],
    # GaussianNB
    [
        {
            'var_smoothing': [1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7, 1e-8, 1e-9],
        }
    ],
    # MultinomialNB
    [
        {
            **naivebayes_params,
        }
    ],
    # DummyClassifier
    [
        {
            'strategy': ['stratified', 'prior', 'uniform'],
        }
    ]
]

In [None]:
# Tune every candidate with a randomised search over its own parameter space
# (matched by position in search_params), then cross-validate the refitted winner.
eval_dict = {key: [] for key in ('model', 'cv_score', 'best_param', 'best_estimator')}

for index, (name, estimator) in enumerate(get_estimators()):
    print(name)
    eval_dict['model'].append(name)

    search = RandomizedSearchCV(
        estimator,
        search_params[index],
        n_iter=100,
        scoring='f1_weighted',
        cv=StratifiedKFold(n_splits=10),
        refit=True,
        n_jobs=-1,
        verbose=10,
        random_state=0,
    )
    search.fit(X, y)

    tuned = search.best_estimator_
    eval_dict['best_param'].append(search.best_params_)
    eval_dict['best_estimator'].append(tuned)
    eval_dict['cv_score'].append(cross_validation(tuned, X, y))

In [None]:
# One row per tuned model, with the mean and standard deviation of its fold scores.
eval_df = pd.DataFrame(eval_dict)
for column, reducer in (('cv_mean', np.mean), ('cv_std', np.std)):
    eval_df[column] = eval_df['cv_score'].apply(reducer)

# Long format for plotting: one small frame per model, one row per fold score.
eval_dfs = [
    pd.DataFrame({'model': model, 'cv_score': scores})
    for model, scores in zip(eval_df['model'], eval_df['cv_score'])
]

###### Box

In [None]:
# Box plot of the tuned models' fold scores, coloured by model.
tuned_scores = pd.concat(eval_dfs, axis=0)
tuned_box_kwargs = {
    'boxmean': 'sd',
    'boxpoints': False,
}
vp.box(
    tuned_scores,
    color='model',
    max_col=1,
    title='Classifier - Tuned',
    out_path=OUT_PATH_GRAPH,
    layout_kwargs={'showlegend': False},
    box_kwargs=tuned_box_kwargs,
)