In [6]:
import importlib as imp
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sbn
import sklearn as skl
from numpy.typing import NDArray
from sklearn import svm
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.exceptions import ConvergenceWarning
from sklearn.metrics import roc_auc_score, average_precision_score, log_loss
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score, cross_validate, StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.utils._testing import ignore_warnings

import baseline

In [7]:
# Load the red-wine quality table; the file is semicolon-separated.
raw = pd.read_csv('./wine+quality/winequality-red.csv', sep=';')
RANDOM_STATE = 0  # seed shared by the train/test split and every estimator below
column_names = list(raw.columns)
# The last column is the prediction target; every other column is a feature.
feature_names = column_names[:-1]
target_name = column_names[-1]
# Split features and labels in ONE call so both are partitioned by the same
# shuffled indices by construction, instead of relying on two independent
# calls happening to agree because they share a seed and an input length.
train_examples, test_examples, train_labels, test_labels = train_test_split(
    np.asarray(raw[feature_names]),
    np.asarray(raw[target_name]),
    test_size=0.2,
    random_state=RANDOM_STATE,
)
print(f"train examples: {train_examples.shape}")
print(f"test examples:  {test_examples.shape}")
print(f"train labels:   {train_labels.shape}")
print(f"test labels:    {test_labels.shape}")

train examples: (1279, 11)
test examples:  (320, 11)
train labels:   (1279,)
test labels:    (320,)


In [8]:
# Preparation for stratified k-fold cross-validation.
# Number of folds; read by grid_search_best_params when it builds its
# StratifiedKFold splitter.
NUMBER_OF_FOLDS = 5
# Disabled: manual pre-computation of the per-fold train/validation indices.
# Kept for reference only; nothing that is currently live reads these lists.
# imp.reload(baseline)
# splitter = StratifiedKFold(n_splits=NUMBER_OF_FOLDS)
# SKF_train_indices : list[NDArray] = []
# SKF_valid_indices : list[NDArray] = []
# for train, test in splitter.split(X=train_examples, y=train_labels):
#     SKF_train_indices.append(train)
#     SKF_valid_indices.append(test)

In [9]:
def grid_search_best_params(classifier, parameter_space, train_examples, train_labels):
    """Grid-search hyper-parameters for ``classifier`` behind a standard scaler.

    The classifier is wrapped in a two-step pipeline (``'scaler'`` then
    ``'model'``), so keys in ``parameter_space`` must carry the ``model__``
    prefix to reach the classifier's own parameters. Candidates are ranked by
    accuracy under stratified k-fold cross-validation using the module-level
    ``NUMBER_OF_FOLDS``. ``ConvergenceWarning``s raised while fitting are
    silenced.

    Parameters
    ----------
    classifier : estimator
        scikit-learn classifier placed in the pipeline's ``'model'`` step.
    parameter_space : dict or list of dict
        Grid handed to ``GridSearchCV`` as ``param_grid``.
    train_examples : array-like
        Feature matrix the search is run on.
    train_labels : array-like
        Class labels aligned with ``train_examples``.

    Returns
    -------
    tuple
        ``(best_params_, best_estimator_)`` of the fitted ``GridSearchCV``.
    """
    pipe = Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('model', classifier),
    ])
    search = GridSearchCV(
        estimator=pipe,
        param_grid=parameter_space,
        # Hand over the splitter itself rather than a one-shot ``.split(...)``
        # generator: GridSearchCV calls ``split`` on the data given to ``fit``,
        # which produces the same folds without a consumable iterator.
        cv=StratifiedKFold(n_splits=NUMBER_OF_FOLDS),
        scoring='accuracy',
    )
    # Public stdlib replacement for the private
    # ``sklearn.utils._testing.ignore_warnings`` decorator.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', category=ConvergenceWarning)
        search.fit(X=train_examples, y=train_labels)
    return search.best_params_, search.best_estimator_

In [10]:
# Support-vector classifier: search over the regularisation strength C and the
# kernel. The 'model__' prefix addresses the pipeline's 'model' step.
svc_parameter_space = {
    'model__C': [0.01, 0.1, 1, 10, 100],
    'model__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
}
svc_best_params, svc_best_estimator = grid_search_best_params(
    train_examples=train_examples,
    train_labels=train_labels,
    parameter_space=svc_parameter_space,
    classifier=svm.SVC(
        random_state=RANDOM_STATE,
        decision_function_shape='ovo',
        probability=True,
    ),
)

In [11]:
# Multi-layer perceptron: search over the L2 penalty (alpha), the hidden-layer
# activation and the optimiser. The 'model__' prefix addresses the pipeline's
# 'model' step.
mlp_parameter_space = {
    'model__alpha': [1e-4, 1e-3, 1e-2],
    'model__activation': ['logistic', 'tanh', 'relu'],
    'model__solver': ['sgd', 'adam'],
}
mlp_best_params, mlp_best_estimator = grid_search_best_params(
    train_examples=train_examples,
    train_labels=train_labels,
    parameter_space=mlp_parameter_space,
    classifier=MLPClassifier(random_state=RANDOM_STATE, max_iter=1000),
)

In [12]:
# Random forest: search over the number of trees and the split criterion.
# The 'model__' prefix addresses the pipeline's 'model' step.
rf_parameter_space = {
        'model__n_estimators': [50, 100, 150],
        # 'log_loss' is intentionally not listed: for RandomForestClassifier it
        # selects the same Shannon-information-gain criterion as 'entropy', so
        # including both only refits identical forests (and 'log_loss' is
        # rejected by scikit-learn releases that predate it).
        'model__criterion': ['gini', 'entropy'],
        }
rf_best_params, rf_best_estimator = grid_search_best_params(
    classifier=RandomForestClassifier(random_state=RANDOM_STATE),
    parameter_space=rf_parameter_space,
    train_examples=train_examples,
    train_labels=train_labels)

In [13]:
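# NOTE: this whole cell is disabled. The manual cross-validation loop below
# indexes SKF_train_indices / SKF_valid_indices, which are only built by the
# (also commented-out) fold-preparation code earlier in the notebook, so the
# two must be re-enabled together.
# def get_results_storage(number_of_folds):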
#     return {
#         'ROC AUC': {
#             'function': lambda prediction, score, label:
#                 roc_auc_score(y_true=label, y_score=score, multi_class='ovr'),
#             'each round': np.zeros(shape=number_of_folds),
#             'cv value': []
#         },
#         'Average Precision' : {
#             'function': lambda prediction, score, label:
#                 average_precision_score(y_true=label, y_score=score),
#             'each round': np.zeros(shape=number_of_folds),
#             'cv value': []
#         },
#         'Cross Entropy' : {
#             'function': lambda prediction, score, label:
#                 log_loss(y_true=label, y_pred=score),
#             'each round': np.zeros(shape=number_of_folds),
#             'cv value': []
#         },
#         'Accuracy' : {
#             'each round': np.zeros(shape=number_of_folds),
#             'cv value': []
#         },
#     }
# def do_cross_validation(model):
#     results = get_results_storage(NUMBER_OF_FOLDS)
#     # clf = svm.SVC(kernel='rbf', C=0.01, probability=True, decision_function_shape='ovo', random_state=RANDOM_STATE)
#     # clf = MLPClassifier(hidden_layer_sizes=(100, 20), activation='relu', random_state=RANDOM_STATE)
#     for i in range(NUMBER_OF_FOLDS):
#         classifier = skl.base.clone(model)
#         pipe = make_pipeline(StandardScaler(), classifier)
#         pipe.fit(train_examples[SKF_train_indices[i]], train_labels[SKF_train_indices[i]])
#         accuracy = pipe.score(train_examples[SKF_valid_indices[i]], train_labels[SKF_valid_indices[i]])
#         score = pipe.predict_proba(train_examples[SKF_valid_indices[i]])
#         prediction = pipe.predict(train_examples[SKF_valid_indices[i]])
#         # print(train_labels[SKF_valid_indices[i]].shape)
#         # print(score.shape)
#         for metric in results.keys():
#             if metric == 'Accuracy':
#                 results[metric]['each round'][i] = accuracy
#             else:
#                 results[metric]['each round'][i] = results[metric]['function'](
#                     label=train_labels[SKF_valid_indices[i]], score=score, prediction=prediction)
#     for metric in results.keys():
#         results[metric]['cv value'] = np.mean(results[metric]['each round'])
#     return results
# do_cross_validation(
#     model=svm.SVC(kernel='linear', probability=True, decision_function_shape='ovo', random_state=RANDOM_STATE),
#     )