Reference: https://www.youtube.com/watch?v=HdlDYng8g9s

In [None]:
import lib._util.visualplot as vp
import lib._util.mlpipe as mlpipe

In [None]:
import pandas as pd
import numpy as np

from sklearn import datasets
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.svm import SVC

In [None]:
def load_data():
    """Load the iris dataset as pandas objects.

    Returns:
        tuple: ``(X, y)`` where ``X`` is a ``DataFrame`` built from the
        dataset's ``data`` entry with ``feature_names`` as column labels,
        and ``y`` is a ``Series`` named ``'target'`` built from its
        ``target`` entry.
    """
    iris = datasets.load_iris()

    features = pd.DataFrame(iris['data'], columns=iris['feature_names'])
    labels = pd.Series(iris['target'], name='target')
    return features, labels

In [None]:
# Feature frame and target series shared by every cell below.
X, y = load_data()

# Display the dimensions of both objects as a sanity check.
X.shape, y.shape

In [None]:
# Class distribution: the target is passed as a one-column frame to the
# project's value_count helper (presumably shows the count per class —
# confirm in lib._util.visualplot).
vp.value_count(y.to_frame(), 'target')

# Best Hyperparameter (GridSearchCV)

In [None]:
# Exhaustive search over all 6 x 4 x 2 = 48 combinations of C / kernel / gamma
# for an SVC, scored by weighted F1 under 10-fold stratified cross-validation.
search = GridSearchCV(
    SVC(random_state=0),
    dict(
        C=[1, 10, 20, 30, 40, 50],
        kernel=['linear', 'poly', 'rbf', 'sigmoid'],
        gamma=['scale', 'auto'],
    ),
    scoring='f1_weighted',
    cv=StratifiedKFold(n_splits=10),
    n_jobs=-1,
    verbose=10,
)
# Kept as a separate trailing expression so the notebook displays the fitted search.
search.fit(X, y)

In [None]:
# Tabulate every evaluated candidate, then show the key columns with the
# highest mean cross-validated score first.
result_df = pd.DataFrame(search.cv_results_)
result_df.sort_values(by='mean_test_score', ascending=False)[
    ['params', 'mean_test_score', 'rank_test_score']
]

In [None]:
# Parameter combination with the best mean cross-validated score in the search above.
search.best_params_

In [None]:
# Rebuild the SVC with the winning parameters; probability=True is required
# for the predict_proba call below.
svc = SVC(probability=True, random_state=0, **search.best_params_).fit(X, y)

# NOTE: the model is evaluated on the same data it was fitted on, so these
# figures are training-set metrics rather than an out-of-sample estimate.
mlpipe.eval_classif(
    y,
    svc.predict(X),
    y_prob=svc.predict_proba(X),
    multi_class='ovr',
)

# Best Hyperparameter (RandomizedSearchCV)

In [None]:
# Randomized search: sample 10 of the 48 C / kernel / gamma combinations
# (seeded for reproducibility) and score each by weighted F1 under 10-fold
# stratified cross-validation.
search = RandomizedSearchCV(
    SVC(random_state=0),
    dict(
        C=[1, 10, 20, 30, 40, 50],
        kernel=['linear', 'poly', 'rbf', 'sigmoid'],
        gamma=['scale', 'auto'],
    ),
    n_iter=10,
    random_state=0,
    scoring='f1_weighted',
    cv=StratifiedKFold(n_splits=10),
    n_jobs=-1,
    verbose=10,
)
# Kept as a separate trailing expression so the notebook displays the fitted search.
search.fit(X, y)

In [None]:
# Tabulate the sampled candidates, then show the key columns with the highest
# mean cross-validated score first.
result_df = pd.DataFrame(search.cv_results_)
result_df.sort_values(by='mean_test_score', ascending=False)[
    ['params', 'mean_test_score', 'rank_test_score']
]

In [None]:
# Best parameter combination among the randomly sampled candidates above.
search.best_params_

In [None]:
# Rebuild the SVC with the winning parameters; probability=True is required
# for the predict_proba call below.
svc = SVC(probability=True, random_state=0, **search.best_params_).fit(X, y)

# NOTE: the model is evaluated on the same data it was fitted on, so these
# figures are training-set metrics rather than an out-of-sample estimate.
mlpipe.eval_classif(
    y,
    svc.predict(X),
    y_prob=svc.predict_proba(X),
    multi_class='ovr',
)

# Best Model

In [None]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [None]:
# Candidate estimators, each paired with the hyperparameter values that the
# randomized search below samples from.
model_dict = {
    # Reference: https://medium.com/all-things-ai/in-depth-parameter-tuning-for-svc-758215394769
    'svc': {
        'model': SVC(random_state=0),
        'param_dict': {
            'C': np.logspace(-2, 2, 10),
            'kernel': ['poly', 'rbf', 'sigmoid'],
            'gamma': np.linspace(0.00001, 5, 10),
        }
    },
    # Reference: https://medium.com/all-things-ai/in-depth-parameter-tuning-for-random-forest-d67bb7e920d
    'random_forest': {
        'model': RandomForestClassifier(random_state=0, n_jobs=-1),
        'param_dict': {
            'n_estimators': [10, 50, 100, 200, 300],
            'max_depth': [None, 5, 10, 15, 20],
            'min_samples_split': np.linspace(.1, 1, 10),
            'min_samples_leaf': np.linspace(.1, .5, 5),
            'max_features': ['sqrt', 'log2'] + list(np.linspace(.5, 1, 5)),
        }
    },
    # Reference: https://towardsdatascience.com/logistic-regression-model-tuning-with-scikit-learn-part-1-425142e01af5
    'logistic_regression': {
        'model': LogisticRegression(max_iter=1_000, random_state=0, n_jobs=-1),
        'param_dict': {
            'C': np.logspace(-4, 4, 20),
            'penalty': ['l1', 'l2'],
            # The default 'lbfgs' solver rejects penalty='l1', so every 'l1'
            # candidate would fail to fit and be scored as NaN. 'saga' accepts
            # both penalties. Listing it in the search space also puts it into
            # best_params_, so the winning configuration can be rebuilt directly
            # from those parameters.
            # NOTE(review): saga may need feature scaling or a larger max_iter
            # to converge for large C — confirm on this data.
            'solver': ['saga'],
        }
    }
}

# Run one randomized search per candidate model and record its best
# cross-validated weighted-F1 score together with the winning parameters.
# Rows are appended in model_dict order: svc, random_forest, logistic_regression.
results = []
for model_name, spec in model_dict.items():
    print(f'Searching {model_name}:')

    search = RandomizedSearchCV(
        estimator=spec['model'],
        param_distributions=spec['param_dict'],
        scoring='f1_weighted',
        cv=StratifiedKFold(n_splits=10),
        n_jobs=-1,
        verbose=10,
        n_iter=100,
        random_state=0,
    )
    search.fit(X, y)

    results.append({
        'model': model_name,
        'best_score': search.best_score_,
        'best_params': search.best_params_,
    })

In [None]:
# One row per searched model, in the order the searches were run.
result_df = pd.DataFrame(results)
# sort_values returns a new frame for display only; result_df itself keeps its
# original row labels (0, 1, 2), which the cells below rely on.
result_df.sort_values(by='best_score', ascending=False)

In [None]:
# Row 0 holds the 'svc' search result because results were appended in
# model_dict order. probability=True is required for predict_proba below.
model = SVC(
    probability=True,
    random_state=0,
    **result_df.loc[0, 'best_params'],
).fit(X, y)

# NOTE: evaluated on the same data the model was fitted on (training-set metrics).
mlpipe.eval_classif(
    y,
    model.predict(X),
    y_prob=model.predict_proba(X),
    multi_class='ovr',
)

In [None]:
# Row 1 holds the 'random_forest' search result because results were appended
# in model_dict order.
model = RandomForestClassifier(
    random_state=0,
    n_jobs=-1,
    **result_df.loc[1, 'best_params'],
).fit(X, y)

# NOTE: evaluated on the same data the model was fitted on (training-set metrics).
mlpipe.eval_classif(
    y,
    model.predict(X),
    y_prob=model.predict_proba(X),
    multi_class='ovr',
)

In [None]:
# Row 2 holds the 'logistic_regression' search result because results were
# appended in model_dict order.
model = LogisticRegression(
    max_iter=1_000,
    random_state=0,
    n_jobs=-1,
    **result_df.loc[2, 'best_params'],
).fit(X, y)

# NOTE: evaluated on the same data the model was fitted on (training-set metrics).
mlpipe.eval_classif(
    y,
    model.predict(X),
    y_prob=model.predict_proba(X),
    multi_class='ovr',
)