In [1]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

In [2]:
import pickle

with open('../../assets/credit.pkl', 'rb') as file:
  X_credit_train, y_credit_train, X_credit_test, y_credit_test = pickle.load(file)

In [4]:
X_credit_train.shape, y_credit_train.shape

((1500, 3), (1500,))

In [5]:
X_credit_test.shape, y_credit_test.shape

((500, 3), (500,))

In [6]:
import numpy as np

In [8]:
X_credit = np.concatenate((X_credit_train, X_credit_test), axis=0)
X_credit.shape

(2000, 3)

In [9]:
X_credit

array([[-1.3754462 ,  0.50630999,  0.10980934],
       [ 1.45826409, -1.64894017, -1.21501497],
       [-0.79356829,  0.22531104, -0.43370226],
       ...,
       [ 1.37445674, -1.05746369, -1.12564819],
       [-1.57087737, -0.6348826 , -0.36981671],
       [-1.03572293, -0.93978209,  0.04244312]])

In [11]:
y_credit = np.concatenate((y_credit_train, y_credit_test))
y_credit.shape

(2000,)

In [12]:
y_credit

array([0, 0, 0, ..., 0, 1, 1])

## Árvore de decisão

In [14]:
parameters = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 5, 10]
}

In [15]:
grid_search = GridSearchCV(estimator=DecisionTreeClassifier(), param_grid=parameters)
grid_search.fit(X_credit, y_credit)

best_parameters = grid_search.best_params_
best_score = grid_search.best_score_

best_parameters, best_score

({'criterion': 'gini',
  'min_samples_leaf': 1,
  'min_samples_split': 5,
  'splitter': 'best'},
 0.9835)

## Random forest

In [16]:
parameters = {
    'criterion': ['gini', 'entropy'],
    'n_estimators': [10, 40, 100, 150],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 5, 10]
}

In [17]:
grid_search = GridSearchCV(estimator=RandomForestClassifier(), param_grid=parameters)
grid_search.fit(X_credit, y_credit)

best_parameters = grid_search.best_params_
best_score = grid_search.best_score_

best_parameters, best_score

({'criterion': 'gini',
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'n_estimators': 40},
 0.986)

## kNN

In [18]:
parameters = {
    "n_neighbors": [3, 5, 10, 20],
    "p": [1, 2]
}

In [19]:
grid_search = GridSearchCV(estimator=KNeighborsClassifier(), param_grid=parameters)
grid_search.fit(X_credit, y_credit)

best_parameters = grid_search.best_params_
best_score = grid_search.best_score_

best_parameters, best_score

({'n_neighbors': 20, 'p': 1}, 0.9800000000000001)

## Regressão logística

In [21]:
parameters = {
    "tol": [0.0001, 0.00001, 0.000001],
    "C": [1.0, 1.5, 2.0],
    "solver": ["lbfgs", "sag", "saga"]
}

In [22]:
grid_search = GridSearchCV(estimator=LogisticRegression(), param_grid=parameters)
grid_search.fit(X_credit, y_credit)

best_parameters = grid_search.best_params_
best_score = grid_search.best_score_

best_parameters, best_score

({'C': 1.0, 'solver': 'lbfgs', 'tol': 0.0001}, 0.9484999999999999)

## SVM

In [25]:
parameters = {
    "tol": [0.001, 0.0001, 0.00001],
    "C": [1.0, 1.5, 2.0],
    "kernel": ["rbf", "linear", "poly", "sigmoid"]
}

In [26]:
grid_search = GridSearchCV(estimator=SVC(), param_grid=parameters)
grid_search.fit(X_credit, y_credit)

best_parameters = grid_search.best_params_
best_score = grid_search.best_score_

best_parameters, best_score

({'C': 1.5, 'kernel': 'rbf', 'tol': 0.001}, 0.9829999999999999)

## Redes neurais

In [27]:
parameters = {
    'activation': ['relu', 'logistic', 'tahn'],
    'solver': ['adam', 'sgd'],
    'batch_size': [10, 56]
}

In [28]:
grid_search = GridSearchCV(estimator=MLPClassifier(), param_grid=parameters)
grid_search.fit(X_credit, y_credit)

best_parameters = grid_search.best_params_
best_score = grid_search.best_score_

best_parameters, best_score

20 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "/home/gabriel/Projects/machine-learning-e-data-science-com-python/.venv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/gabriel/Projects/machine-learning-e-data-science-com-python/.venv/lib/python3.10/site-packages/sklearn/base.py", line 1144, in wrapper
    estimator._validate_params()
  File "/home/gabriel/Projects/machine-learning-e-data-science-com-python/.venv/lib/python3.10/site-packages/sklearn/base.py", line 637, in _validate_params
    validate_parame

({'activation': 'relu', 'batch_size': 56, 'solver': 'adam'},
 0.9964999999999999)

## Validação cruzada

In [29]:
from sklearn.model_selection import cross_val_score, KFold

In [36]:
results_tree = []
results_random_forest = []
results_knn = []
results_logistica = []
results_svm = []
results_rede_neural = []

for i in range(30):
    kfold = KFold(n_splits=10, shuffle=True, random_state=i)

    tree = DecisionTreeClassifier(
        criterion="entropy",
        min_samples_leaf=1,
        min_samples_split=5,
        splitter="best"
    )

    scores = cross_val_score(tree, X_credit, y_credit, cv=kfold)
    results_tree.append(scores.mean())

    random_forest = RandomForestClassifier(
        criterion='entropy', min_samples_leaf=1, min_samples_split=5, n_estimators=10)
    scores = cross_val_score(random_forest, X_credit, y_credit, cv=kfold)
    results_random_forest.append(scores.mean())

    knn = KNeighborsClassifier()
    scores = cross_val_score(knn, X_credit, y_credit, cv=kfold)
    results_knn.append(scores.mean())

    logistica = LogisticRegression(C=1.0, solver='lbfgs', tol=0.0001)
    scores = cross_val_score(logistica, X_credit, y_credit, cv=kfold)
    results_logistica.append(scores.mean())

    svm = SVC(kernel='rbf', C=2.0)
    scores = cross_val_score(svm, X_credit, y_credit, cv=kfold)
    results_svm.append(scores.mean())

    rede_neural = MLPClassifier(
        activation='relu', batch_size=56, solver='adam')
    scores = cross_val_score(rede_neural, X_credit, y_credit, cv=kfold)
    results_rede_neural.append(scores.mean())



In [37]:
import pandas as pd

In [38]:
results = pd.DataFrame({
    'Arvore': results_tree,
    'Random forest': results_random_forest,
    'KNN': results_knn,
    'Logistica': results_logistica,
    'SVM': results_svm,
    'Rede neural': results_rede_neural
})
results

Unnamed: 0,Arvore,Random forest,KNN,Logistica,SVM,Rede neural
0,0.986,0.9855,0.9815,0.9475,0.9845,0.9975
1,0.985,0.987,0.98,0.9465,0.984,0.998
2,0.9905,0.9865,0.9795,0.947,0.9865,0.9975
3,0.9875,0.9835,0.978,0.946,0.985,0.9965
4,0.988,0.986,0.982,0.9465,0.985,0.998
5,0.989,0.9865,0.978,0.9465,0.9845,0.997
6,0.988,0.984,0.9805,0.947,0.986,0.9965
7,0.9875,0.984,0.98,0.948,0.985,0.997
8,0.986,0.984,0.9795,0.9465,0.984,0.997
9,0.987,0.9845,0.982,0.9465,0.9845,0.997
