## Exercício: Validação Cruzada

In [1]:
# Bibliotecas 
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_validate

In [2]:
# Load the credit dataset from the sibling "Amostragem" folder and preview its first rows
ds = pd.read_csv('../Amostragem/credit_data.csv')
ds.head()

Unnamed: 0,i#clientid,income,age,loan,c#default
0,1,66155.925095,59.017015,8106.532131,0
1,2,34415.153966,48.117153,6564.745018,0
2,3,57317.170063,63.108049,8020.953296,0
3,4,42709.534201,45.751972,6103.64226,0
4,5,66952.688845,18.584336,8770.099235,1


In [3]:
# Remove every row that contains a missing value.
# The result is rebound to `ds` instead of using inplace=True: the outcome is the
# same, but the frame shown by the previous cell is not mutated behind its back.
ds = ds.dropna()
ds.shape

(1997, 5)

In [4]:
# Build the predictor matrix and the target vector by column position.
# Per the head() preview above: column 0 (i#clientid) is left out, columns 1-3
# (income, age, loan) are the predictors and column 4 (c#default) is the class.
X = ds.iloc[:, 1:4].to_numpy()
y = ds.iloc[:, 4].to_numpy()
X

array([[6.61559251e+04, 5.90170151e+01, 8.10653213e+03],
       [3.44151540e+04, 4.81171531e+01, 6.56474502e+03],
       [5.73171701e+04, 6.31080495e+01, 8.02095330e+03],
       ...,
       [4.43114493e+04, 2.80171669e+01, 5.52278669e+03],
       [4.37560566e+04, 6.39717958e+01, 1.62272260e+03],
       [6.94365796e+04, 5.61526170e+01, 7.37883360e+03]])

In [5]:
# Repeated hold-out evaluation: 15 independent 70/30 train/test splits,
# each one seeded with the loop counter so the run is reproducible.
# (Despite the variable name, these are plain hold-out scores, not k-fold CV.)
resultado_cross_validation = []
for seed in range(15):
    # A different random_state per iteration gives a different partition each time
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=seed, test_size=0.3
    )
    # Fit a linear SVM on the training part ...
    clf = svm.SVC(C=1, kernel='linear')
    clf.fit(X_train, y_train)
    # ... and record its accuracy on the held-out part
    acuracia = clf.score(X_test, y_test)
    resultado_cross_validation.append(acuracia)

In [6]:
# Show the highest accuracy obtained among the 15 hold-out runs
print(max(resultado_cross_validation))

0.9566666666666667


In [7]:
# 5-fold cross-validation of a linear SVM; `score` holds one accuracy per fold
clf = svm.SVC(C=1, kernel='linear', random_state=42)
score = cross_val_score(estimator=clf, X=X, y=y, cv=5)
score

array([0.945     , 0.9475    , 0.96240602, 0.92982456, 0.94235589])

In [8]:
# Mean accuracy across the folds
np.mean(score)

0.9454172932330828

In [9]:
# Mode of the fold accuracies.
# NOTE(review): the output below reports count=1, i.e. no value repeats, so the
# "mode" here is not an informative statistic for these scores.
stats.mode(score, keepdims=True)

ModeResult(mode=array([0.92982456]), count=array([1]))

In [10]:
# Median of the fold accuracies
np.median(score)

0.945

In [11]:
# Standard deviation of the fold accuracies (NumPy default, ddof=0)
np.std(score)

0.01044637685537856

In [12]:
# Variance of the fold accuracies (NumPy default, ddof=0).
# suppress=True asks NumPy to print arrays in fixed-point instead of scientific notation.
np.set_printoptions(suppress=True)
score.var()

0.00010912678940458884

In [13]:
# Coefficient of variation of the fold accuracies, expressed as a percentage
100 * stats.variation(score)

1.1049487808346143

## Resolução do exercício com o Tutor

In [14]:
# Bibliotecas
from sklearn.model_selection import KFold
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [15]:
# Repeated 10-fold cross-validation: 30 repetitions, each with a differently
# seeded shuffle. For every repetition the mean fold accuracy of three
# classifiers (Gaussian Naive Bayes, logistic regression, random forest) is stored.
resultados_naive_bayes_cv = []
resultados_logistica_cv = []
resultados_forest_cv = []
for i in range(30):
    # One seeded KFold per repetition, shared by the three models so that
    # they are all scored on the same partition of the data
    kf = KFold(n_splits=10, shuffle=True, random_state=i)

    naive_bayes = GaussianNB()
    scores = cross_val_score(naive_bayes, X, y, cv=kf)
    resultados_naive_bayes_cv.append(scores.mean())

    logistica = LogisticRegression()
    scores = cross_val_score(logistica, X, y, cv=kf)
    resultados_logistica_cv.append(scores.mean())

    # The forest is itself stochastic (bootstrap samples / feature subsets);
    # seeding it makes this column reproducible on Restart & Run All, like the others
    random_forest = RandomForestClassifier(random_state=i)
    scores = cross_val_score(random_forest, X, y, cv=kf)
    resultados_forest_cv.append(scores.mean())


In [16]:
# Best mean CV accuracy of each model, printed in the order:
# Naive Bayes, logistic regression, random forest
for resultados in (
    resultados_naive_bayes_cv,
    resultados_logistica_cv,
    resultados_forest_cv,
):
    print(max(resultados))

0.9263844221105527
0.9213618090452261
0.9899924623115577


In [17]:
# Coefficient of variation (in %) of the 30 mean accuracies of each model,
# in the order: Naive Bayes, logistic regression, random forest
tuple(
    stats.variation(resultados) * 100
    for resultados in (
        resultados_naive_bayes_cv,
        resultados_logistica_cv,
        resultados_forest_cv,
    )
)

(0.08641071566366061, 0.38801026116292653, 0.10898800664820639)