<a href="https://colab.research.google.com/github/GiovaniMicheloni/mlearning-w-python/blob/main/GridSearchCV.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Utilizando SearchGridCV para achar os melhores parametros para a base de credito

In [2]:
!pip -q install plotly

In [3]:
!pip -q  install yellowbrick

In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
basecredito = pd.read_csv('/content/credit_data.csv')

In [8]:
basecredito.describe()

Unnamed: 0,clientid,income,age,loan,default
count,2000.0,2000.0,1997.0,2000.0,2000.0
mean,1000.5,45331.600018,40.807559,4444.369695,0.1415
std,577.494589,14326.327119,13.624469,3045.410024,0.348624
min,1.0,20014.48947,-52.42328,1.37763,0.0
25%,500.75,32796.459717,28.990415,1939.708847,0.0
50%,1000.5,45789.117313,41.317159,3974.719419,0.0
75%,1500.25,57791.281668,52.58704,6432.410625,0.0
max,2000.0,69995.685578,63.971796,13766.051239,1.0


In [9]:
basecredito.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   clientid  2000 non-null   int64  
 1   income    2000 non-null   float64
 2   age       1997 non-null   float64
 3   loan      2000 non-null   float64
 4   default   2000 non-null   int64  
dtypes: float64(3), int64(2)
memory usage: 78.3 KB


In [10]:
basecredito[basecredito['age']<0]

Unnamed: 0,clientid,income,age,loan,default
15,16,50501.726689,-28.218361,3977.287432,0
21,22,32197.620701,-52.42328,4244.057136,0
26,27,63287.038908,-36.496976,9595.286289,0


In [11]:
basecredito[basecredito['age']>0].mean()

Unnamed: 0,0
clientid,1003.431795
income,45328.856915
age,40.9277
loan,4443.240892
default,0.141926


In [12]:
basecredito.loc[basecredito['age']<0, 'age'] = 40.92

In [13]:
basecredito[basecredito['age']<0]

Unnamed: 0,clientid,income,age,loan,default


In [14]:
basecredito.isnull().sum()

Unnamed: 0,0
clientid,0
income,0
age,3
loan,0
default,0


In [15]:
basecredito.loc[pd.isnull(basecredito['age'])]

Unnamed: 0,clientid,income,age,loan,default
28,29,59417.805406,,2082.625938,0
30,31,48528.852796,,6155.78467,0
31,32,23526.302555,,2862.010139,0


In [16]:
basecredito['age']= basecredito['age'].fillna(basecredito['age'].mean())

In [17]:
xcredito = basecredito.iloc[:,1:4].values
xcredito

array([[6.61559251e+04, 5.90170151e+01, 8.10653213e+03],
       [3.44151540e+04, 4.81171531e+01, 6.56474502e+03],
       [5.73171701e+04, 6.31080495e+01, 8.02095330e+03],
       ...,
       [4.43114493e+04, 2.80171669e+01, 5.52278669e+03],
       [4.37560566e+04, 6.39717958e+01, 1.62272260e+03],
       [6.94365796e+04, 5.61526170e+01, 7.37883360e+03]])

In [18]:
ycredito = basecredito.iloc[:,4].values
ycredito

array([0, 0, 0, ..., 1, 0, 0])

In [19]:
from sklearn.preprocessing import StandardScaler
standardscaler = StandardScaler()
xcredito = standardscaler.fit_transform(xcredito)
xcredito

array([[ 1.45393393,  1.36538093,  1.20281942],
       [-0.76217555,  0.5426602 ,  0.69642695],
       [ 0.83682073,  1.67417189,  1.17471147],
       ...,
       [-0.07122592, -0.97448519,  0.35420081],
       [-0.11000289,  1.73936739, -0.92675625],
       [ 1.682986  ,  1.14917639,  0.96381038]])

In [26]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

In [32]:
parametros_decisiontree = {'criterion': ['gini','entropy'],
              'splitter': ['best','random'],
              'min_samples_split':[2,5,10],
              'min_samples_leaf':[1,5,10]
              }

In [33]:
gridsearch = GridSearchCV(estimator=DecisionTreeClassifier(),param_grid=parametros_decisiontree)
gridsearch.fit(xcredito,ycredito)
melhores_parametros = gridsearch.best_params_
melhor_resultado = gridsearch.best_score_
print(melhores_parametros)
print(melhor_resultado)

{'criterion': 'gini', 'min_samples_leaf': 1, 'min_samples_split': 5, 'splitter': 'best'}
0.9845


In [34]:
parametros_randomforest = {'criterion': ['gini','entropy'],
              'n_estimators': [10,40,100,150],
              'min_samples_split':[2,5,10],
              'min_samples_leaf':[1,5,10]
              }

In [36]:
gridsearch = GridSearchCV(estimator=RandomForestClassifier(),param_grid=parametros_randomforest)
gridsearch.fit(xcredito,ycredito)
melhores_parametros = gridsearch.best_params_
melhor_resultado = gridsearch.best_score_
print(melhores_parametros)
print(melhor_resultado)

{'criterion': 'gini', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 10}
0.9890000000000001


In [37]:
parametros_knn = {'n_neighbors':[1,5,10,20],
                  'p':[1,2]}

In [38]:
gridsearch = GridSearchCV(estimator=KNeighborsClassifier(),param_grid=parametros_knn)
gridsearch.fit(xcredito,ycredito)
melhores_parametros = gridsearch.best_params_
melhor_resultado = gridsearch.best_score_
print(melhores_parametros)
print(melhor_resultado)

{'n_neighbors': 5, 'p': 2}
0.9810000000000001


In [40]:
parametros_svm = {'tol':[0.001,0.0001,0.00001],
                  'C':[1.0,1.5,2.0],
                  'kernel':['rbf','linear','poly','sigmoid']}

In [41]:
gridsearch = GridSearchCV(estimator=SVC(),param_grid=parametros_svm)
gridsearch.fit(xcredito,ycredito)
melhores_parametros = gridsearch.best_params_
melhor_resultado = gridsearch.best_score_
print(melhores_parametros)
print(melhor_resultado)

{'C': 1.5, 'kernel': 'rbf', 'tol': 0.001}
0.9835


In [42]:
 parametros_mlp = {'activation':['relu','logistic','tahn'],
                   'solver':['adam','sgd'],
                   'batch_size':[10,56]}

In [43]:
gridsearch = GridSearchCV(estimator=MLPClassifier(),param_grid=parametros_mlp)
gridsearch.fit(xcredito,ycredito)
melhores_parametros = gridsearch.best_params_
melhor_resultado = gridsearch.best_score_
print(melhores_parametros)
print(melhor_resultado)

20 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 1382, in wrapper
    estimator._validate_params()
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.11/dist-packages/sklearn/utils/_param_validation.py", line 98, in validate_parameter_constraints
    raise InvalidParameterError(
sklea

{'activation': 'relu', 'batch_size': 56, 'solver': 'adam'}
0.9970000000000001


