#### **Cross Validation**

---
Compares several classifiers with cross-validation to select the best model and the best hyper-parameters to apply.

<b>Decision Tree:</b> 98,30%<br>
<b>Random Forest:</b> 98,70%<br>
<b>KNN:</b> 98,00%<br>
<b>Logistic Regression:</b> 94,85%<br>
<b>SVM:</b> 98,30%<br>
<b>Neural Network:</b> 99,64%<br>

In [4]:
# General imports
from sklearn.model_selection import GridSearchCV

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

import seaborn as sns

##### Credit Dataset

###### Load Dataset

In [5]:
import numpy as np
import pickle
# Load the four credit arrays (train/test features and labels) stored in the pickle.
# The original note describes this as a necessary step to standardize categories.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
with open('../examples/credit.pkl', 'rb') as credit_file:
    loaded_splits = pickle.load(credit_file)
x_credit_trainning, y_credit_trainning, x_credit_test, y_credit_test = loaded_splits

In [None]:
# Display the shapes of the training features and the training labels.
x_credit_trainning.shape, y_credit_trainning.shape

In [None]:
# Display the shapes of the test features and the test labels.
x_credit_test.shape, y_credit_test.shape

In [6]:
# Stack the training and test feature rows into a single array, so that
# grid search and cross-validation can build their own splits from all rows.
x_credit = np.concatenate([x_credit_trainning, x_credit_test])
x_credit.shape

(2000, 3)

In [7]:
# Join the training and test labels in the same order used for the features,
# so each label still lines up with its feature row.
y_credit = np.concatenate([y_credit_trainning, y_credit_test])
y_credit.shape

(2000,)

In [None]:
# Display the combined label array.
y_credit

###### Find the best hyperparameters

In [None]:
# Decision Tree - 98,3%
# Exhaustive search over the split criterion, the splitter strategy and the
# minimum sample counts for splits and leaves. No `cv` argument is passed, so
# GridSearchCV uses its default cross-validation splitting.
params = dict(
    criterion=['gini', 'entropy'],
    splitter=['best', 'random'],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 5, 10],
)
grid_search = GridSearchCV(DecisionTreeClassifier(), params)
grid_search.fit(x_credit, y_credit)
best_params, best_result = grid_search.best_params_, grid_search.best_score_
# Recorded result: {'criterion': 'entropy', 'min_samples_leaf': 1, 'min_samples_split': 5, 'splitter': 'best'}
print(best_params)
print(best_result)

In [None]:
# Random Forest - 98,7%
# Exhaustive search over the split criterion, the number of trees and the
# minimum sample counts for splits and leaves.
params = dict(
    criterion=['gini', 'entropy'],
    n_estimators=[10, 40, 100, 150],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 5, 10],
)
grid_search = GridSearchCV(RandomForestClassifier(), params)
grid_search.fit(x_credit, y_credit)
best_params, best_result = grid_search.best_params_, grid_search.best_score_
# Recorded result: {'criterion': 'entropy', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 150}
print(best_params)
print(best_result)

In [None]:
# KNN (Instances) - 98,0%
# Exhaustive search over the number of neighbours and the Minkowski power
# parameter `p` (1 or 2).
params = dict(
    n_neighbors=[3, 5, 10, 20],
    p=[1, 2],
)
grid_search = GridSearchCV(KNeighborsClassifier(), params)
grid_search.fit(x_credit, y_credit)
best_params, best_result = grid_search.best_params_, grid_search.best_score_
# Recorded result: {'n_neighbors': 20, 'p': 1}
print(best_params)
print(best_result)

In [None]:
# Logistic Regression - 94,85%
# Exhaustive search over the stopping tolerance, the `C` regularization
# parameter and the optimization solver.
params = dict(
    tol=[1e-4, 1e-5, 1e-6],
    C=[1.0, 1.5, 2.0],
    solver=['lbfgs', 'sag', 'saga'],
)
grid_search = GridSearchCV(LogisticRegression(), params)
grid_search.fit(x_credit, y_credit)
best_params, best_result = grid_search.best_params_, grid_search.best_score_
# Recorded result: {'C': 1.0, 'solver': 'lbfgs', 'tol': 0.0001}
print(best_params)
print(best_result)

In [None]:
# SVM - 98,30%
# Exhaustive search over the stopping tolerance, the `C` regularization
# parameter and the kernel type.
params = dict(
    tol=[1e-3, 1e-4, 1e-5],
    C=[1.0, 1.5, 2.0],
    kernel=['rbf', 'linear', 'poly', 'sigmoid'],
)
grid_search = GridSearchCV(SVC(), params)
grid_search.fit(x_credit, y_credit)
best_params, best_result = grid_search.best_params_, grid_search.best_score_
# Recorded result: {'C': 1.5, 'kernel': 'rbf', 'tol': 0.001}
print(best_params)
print(best_result)

In [None]:
# Neural network - 99,64%
# Exhaustive search over the hidden-layer activation, the weight optimizer and
# the mini-batch size. The activation names must be spelled exactly as
# MLPClassifier expects them ('relu', 'logistic', 'tanh'); a misspelled name
# makes every candidate that uses it fail instead of being evaluated.
params = {'activation': ['relu', 'logistic', 'tanh'],
          'solver': ['adam', 'sgd'],
          'batch_size': [10, 56]}
grid_search = GridSearchCV(estimator=MLPClassifier(), param_grid=params)
grid_search.fit(x_credit, y_credit)
best_params = grid_search.best_params_
print(best_params) # {'activation': 'relu', 'batch_size': 10, 'solver': 'adam'}
best_result = grid_search.best_score_
print(best_result)

###### Get score

In [8]:
from sklearn.model_selection import cross_val_score, KFold

# Mean accuracy of each repetition, one list per algorithm.
# These names are read by the analysis cells that follow.
result_decision_tree = []
result_random_forest = []
result_knn = []
result_logistic_regression = []
result_svm = []
result_neural = []

# Repeat a shuffled 10-fold cross-validation with a different seed per
# repetition and store each algorithm's mean accuracy for that repetition.
for i in range(3):
    kfold = KFold(n_splits=10, shuffle=True, random_state=i)

    # Classifiers configured with the best parameters reported by the grid searches.
    tree = DecisionTreeClassifier(criterion='entropy', min_samples_leaf=1, min_samples_split=5, splitter='best')
    forest = RandomForestClassifier(criterion='entropy', min_samples_leaf=1, min_samples_split=5, n_estimators=150)
    # KNN uses its tuned values ({'n_neighbors': 20, 'p': 1}) like every other
    # model, instead of the library defaults.
    knn = KNeighborsClassifier(n_neighbors=20, p=1)
    regression = LogisticRegression(C=1.0, solver='lbfgs', tol=0.0001)
    svm = SVC(C=1.5, kernel='rbf', tol=0.001)
    neural = MLPClassifier(activation='relu', batch_size=10, solver='adam')

    # Score every model with the same KFold configuration, in a fixed order.
    evaluations = (
        (tree, result_decision_tree),
        (forest, result_random_forest),
        (knn, result_knn),
        (regression, result_logistic_regression),
        (svm, result_svm),
        (neural, result_neural),
    )
    for model, accuracies in evaluations:
        scores = cross_val_score(model, x_credit, y_credit, cv=kfold)
        accuracies.append(scores.mean())

In [None]:
import pandas as pd
# Gather the per-repetition mean accuracies in a single table:
# one column per algorithm, one row per repetition.
results = pd.DataFrame(dict(
    Tree=result_decision_tree,
    Forest=result_random_forest,
    Knn=result_knn,
    Regression=result_logistic_regression,
    SVM=result_svm,
    Neural=result_neural,
))
results

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each algorithm's accuracies.
results.describe()

In [None]:
# Variance of the accuracies for each algorithm.
results.var()

In [None]:
# Coefficient of variation (in %) per algorithm:
# the standard deviation expressed relative to the mean.
results.std().div(results.mean()).mul(100)

###### Check distribution

In [None]:
# Shapiro-Wilk test: checks whether each sample of accuracies comes from a
# normal distribution.
from scipy.stats import shapiro
# A p-value below the chosen alpha (0.05 here) indicates a non-normal distribution.
tuple(shapiro(sample) for sample in (
    result_decision_tree,
    result_random_forest,
    result_knn,
    result_logistic_regression,
    result_svm,
    result_neural,
))

In [None]:
# Kernel density estimate of the decision tree's mean accuracies.
sns.displot(result_decision_tree, kind='kde')

In [None]:
# Kernel density estimate of the random forest's mean accuracies.
sns.displot(result_random_forest, kind='kde')

In [None]:
# Kernel density estimate of the KNN mean accuracies.
sns.displot(result_knn, kind='kde')

In [None]:
# Kernel density estimate of the logistic regression's mean accuracies.
sns.displot(result_logistic_regression, kind='kde')

In [None]:
# Kernel density estimate of the SVM mean accuracies.
sns.displot(result_svm, kind='kde')

In [None]:
# Kernel density estimate of the neural network's mean accuracies.
sns.displot(result_neural, kind='kde')

###### Hypothesis testing

In [None]:
# One-way ANOVA: checks whether there is a statistically significant
# difference between the groups of results (one group per algorithm).
from scipy.stats import f_oneway
_, p = f_oneway(result_decision_tree, result_random_forest, result_knn, result_logistic_regression, result_svm, result_neural)

# The null hypothesis is that all groups share the same mean. A p-value above
# alpha only means the test failed to reject it; it does not reject the
# alternative hypothesis.
alpha = 0.05
if p <= alpha:
  print('Null hypothesis rejected: the results are different.')
else:
  print('Null hypothesis not rejected: no significant difference between the results.')

# Long-format table with one row per (algorithm, mean accuracy) pair.
# The labels are derived from the length of each result list, so they always
# line up with the accuracies, whatever the number of repetitions is.
results_by_algorithm = {
    'Tree': result_decision_tree,
    'Forest': result_random_forest,
    'Knn': result_knn,
    'Regression': result_logistic_regression,
    'SVM': result_svm,
    'Neural': result_neural,
}
results_algorithm = {
    'accuracy': np.concatenate(list(results_by_algorithm.values())),
    'algorithm': [
        name
        for name, accuracies in results_by_algorithm.items()
        for _ in accuracies
    ],
}
results_df = pd.DataFrame(results_algorithm)
results_df

In [None]:
from statsmodels.stats.multicomp import MultiComparison
# Tukey HSD test: compares the algorithms pair by pair.
accuracy_values = results_df['accuracy']
algorithm_labels = results_df['algorithm']
compare_algorithms = MultiComparison(accuracy_values, algorithm_labels)
statistics_test = compare_algorithms.tukeyhsd()
# Pairs that are not rejected show no statistically significant difference
# between the two groups; for rejected pairs, compare the algorithms'
# performance through results.mean().
print(statistics_test)

In [None]:
# Plot each algorithm's interval from the Tukey test on a shared axis.
# The group positioned furthest along the axis has the best performance.
statistics_test.plot_simultaneous()