# 39. Pré-processamento dos dados: variáveis categóricas

In [97]:
# Pré-processamento
import numpy as np
import pandas as pd

In [98]:
df = pd.read_csv('Dataset/heart_tratado.csv', sep=',', encoding='utf-8')

In [99]:
df.head(10)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289.0,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180.0,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283.0,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214.0,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195.0,0,Normal,122,N,0.0,Up,0
5,39,M,NAP,120,339.0,0,Normal,170,N,0.0,Up,0
6,45,F,ATA,130,237.0,0,Normal,170,N,0.0,Up,0
7,54,M,ATA,110,208.0,0,Normal,142,N,0.0,Up,0
8,37,M,ASY,140,207.0,0,Normal,130,Y,1.5,Flat,1
9,48,F,ATA,120,284.0,0,Normal,120,N,0.0,Up,0


In [100]:
df.shape

(917, 12)

In [101]:
# Transformando as variáveis categóricas nominais em variáveis categóricas ordinais
df2 = pd.DataFrame.copy(df)

In [102]:
df2['Sex'].replace({'M': 1, 'F': 0}, inplace=True)
df2['ChestPainType'].replace({'TA':0, 'ATA':1, 'NAP':2, 'ASY':3}, inplace=True)
df2['RestingECG'].replace({'Normal':0, 'ST':1, 'LVH':2}, inplace=True)
df2['ExerciseAngina'].replace({'N':0, 'Y':1}, inplace=True)
df2['ST_Slope'].replace({'Up':0, 'Flat':1 ,'Down':2}, inplace=True)

In [103]:
df2.head(10)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,1,1,140,289.0,0,0,172,0,0.0,0,0
1,49,0,2,160,180.0,0,0,156,0,1.0,1,1
2,37,1,1,130,283.0,0,1,98,0,0.0,0,0
3,48,0,3,138,214.0,0,0,108,1,1.5,1,1
4,54,1,2,150,195.0,0,0,122,0,0.0,0,0
5,39,1,2,120,339.0,0,0,170,0,0.0,0,0
6,45,0,1,130,237.0,0,0,170,0,0.0,0,0
7,54,1,1,110,208.0,0,0,142,0,0.0,0,0
8,37,1,3,140,207.0,0,0,130,1,1.5,1,1
9,48,0,1,120,284.0,0,0,120,0,0.0,0,0


In [104]:
df2.dtypes

Age                 int64
Sex                 int64
ChestPainType       int64
RestingBP           int64
Cholesterol       float64
FastingBS           int64
RestingECG          int64
MaxHR               int64
ExerciseAngina      int64
Oldpeak           float64
ST_Slope            int64
HeartDisease        int64
dtype: object

In [105]:
df2.shape

(917, 12)

# 40. Pré-processamento: escalonamento e separação de variáveis

In [106]:
df2.head(10)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,1,1,140,289.0,0,0,172,0,0.0,0,0
1,49,0,2,160,180.0,0,0,156,0,1.0,1,1
2,37,1,1,130,283.0,0,1,98,0,0.0,0,0
3,48,0,3,138,214.0,0,0,108,1,1.5,1,1
4,54,1,2,150,195.0,0,0,122,0,0.0,0,0
5,39,1,2,120,339.0,0,0,170,0,0.0,0,0
6,45,0,1,130,237.0,0,0,170,0,0.0,0,0
7,54,1,1,110,208.0,0,0,142,0,0.0,0,0
8,37,1,3,140,207.0,0,0,130,1,1.5,1,1
9,48,0,1,120,284.0,0,0,120,0,0.0,0,0


In [107]:
previsores = df2.iloc[:, 0:11].values

In [108]:
previsores

array([[40. ,  1. ,  1. , ...,  0. ,  0. ,  0. ],
       [49. ,  0. ,  2. , ...,  0. ,  1. ,  1. ],
       [37. ,  1. ,  1. , ...,  0. ,  0. ,  0. ],
       ...,
       [57. ,  1. ,  3. , ...,  1. ,  1.2,  1. ],
       [57. ,  0. ,  1. , ...,  0. ,  0. ,  1. ],
       [38. ,  1. ,  2. , ...,  0. ,  0. ,  0. ]])

In [109]:
previsores.shape

(917, 11)

In [110]:
alvo = df2.iloc[:, 11].values

In [111]:
alvo

array([0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0,
       1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1,
       1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,

In [112]:
alvo.shape

(917,)

In [113]:
# Análise das escalas dos atributos (Escalonamento)
df2.describe()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
count,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0
mean,53.509269,0.789531,2.251908,132.540894,244.635389,0.23337,0.604144,136.789531,0.40458,0.886696,0.63795,0.55289
std,9.437636,0.407864,0.931502,17.999749,53.347125,0.423206,0.806161,25.467129,0.491078,1.06696,0.60727,0.497466
min,28.0,0.0,0.0,80.0,85.0,0.0,0.0,60.0,0.0,-2.6,0.0,0.0
25%,47.0,1.0,2.0,120.0,214.0,0.0,0.0,120.0,0.0,0.0,0.0,0.0
50%,54.0,1.0,3.0,130.0,244.635389,0.0,0.0,138.0,0.0,0.6,1.0,1.0
75%,60.0,1.0,3.0,140.0,267.0,0.0,1.0,156.0,1.0,1.5,1.0,1.0
max,77.0,1.0,3.0,200.0,603.0,1.0,2.0,202.0,1.0,6.2,2.0,1.0


In [114]:
# Padronização (utiliza a média e o desvio padrão como referência)
from sklearn.preprocessing import StandardScaler
previsores_esc = StandardScaler().fit_transform(previsores)
previsores_esc

array([[-1.43220634,  0.51630861, -1.34470119, ..., -0.82431012,
        -0.83150225, -1.05109458],
       [-0.47805725, -1.9368261 , -0.27058012, ..., -0.82431012,
         0.10625149,  0.59651863],
       [-1.75025603,  0.51630861, -1.34470119, ..., -0.82431012,
        -0.83150225, -1.05109458],
       ...,
       [ 0.37007527,  0.51630861,  0.80354095, ...,  1.21313565,
         0.29380223,  0.59651863],
       [ 0.37007527, -1.9368261 , -1.34470119, ..., -0.82431012,
        -0.83150225,  0.59651863],
       [-1.64423947,  0.51630861, -0.27058012, ..., -0.82431012,
        -0.83150225, -1.05109458]])

In [115]:
previsoresdf = pd.DataFrame(previsores_esc)
previsoresdf

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,-1.432206,0.516309,-1.344701,0.414627,0.832075,-0.551733,-0.749818,1.383339,-0.824310,-0.831502,-1.051095
1,-0.478057,-1.936826,-0.270580,1.526360,-1.212261,-0.551733,-0.749818,0.754736,-0.824310,0.106251,0.596519
2,-1.750256,0.516309,-1.344701,-0.141240,0.719543,-0.551733,0.491306,-1.523953,-0.824310,-0.831502,-1.051095
3,-0.584074,-1.936826,0.803541,0.303453,-0.574578,-0.551733,-0.749818,-1.131075,1.213136,0.575128,0.596519
4,0.052026,0.516309,-0.270580,0.970493,-0.930931,-0.551733,-0.749818,-0.581047,-0.824310,-0.831502,-1.051095
...,...,...,...,...,...,...,...,...,...,...,...
912,-0.902124,0.516309,-2.418822,-1.252973,0.363191,-0.551733,-0.749818,-0.188170,-0.824310,0.293802,0.596519
913,1.536257,0.516309,0.803541,0.636973,-0.968441,1.812470,-0.749818,0.165420,-0.824310,2.356860,0.596519
914,0.370075,0.516309,0.803541,-0.141240,-2.131275,-0.551733,-0.749818,-0.856061,1.213136,0.293802,0.596519
915,0.370075,-1.936826,-1.344701,-0.141240,-0.161960,-0.551733,1.732430,1.461915,-0.824310,-0.831502,0.596519


In [116]:
previsoresdf.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
count,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0
mean,1.859654e-16,-7.748558000000001e-17,1.046055e-16,7.767929e-16,-1.86934e-16,4.649135e-17,0.0,-5.114048e-16,-1.046055e-16,7.748558000000001e-17,-3.8742790000000005e-17
std,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546
min,-2.704405,-1.936826,-2.418822,-2.920572,-2.994023,-0.5517333,-0.749818,-3.016886,-0.8243101,-3.269662,-1.051095
25%,-0.6900904,0.5163086,-0.2705801,-0.6971063,-0.5745784,-0.5517333,-0.749818,-0.6596226,-0.8243101,-0.8315022,-1.051095
50%,0.05202558,0.5163086,0.803541,-0.1412398,0.0,-0.5517333,-0.749818,0.04755658,-0.8243101,-0.26885,0.5965186
75%,0.688125,0.5163086,0.803541,0.4146267,0.4194568,-0.5517333,0.491306,0.7547357,1.213136,0.5751284,0.5965186
max,2.490407,0.5163086,0.803541,3.749826,6.721265,1.81247,1.73243,2.561971,1.213136,4.982571,2.244132


# 41. Pré-processamento: LabelEncoder e OnehotEncoder

In [117]:
# Transformação de variáveis categóricas em numéricas automaticamente
from sklearn.preprocessing import LabelEncoder

In [118]:
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289.0,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180.0,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283.0,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214.0,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195.0,0,Normal,122,N,0.0,Up,0


In [119]:
previsores2 = df.iloc[:, 0:11].values
previsores2

array([[40, 'M', 'ATA', ..., 'N', 0.0, 'Up'],
       [49, 'F', 'NAP', ..., 'N', 1.0, 'Flat'],
       [37, 'M', 'ATA', ..., 'N', 0.0, 'Up'],
       ...,
       [57, 'M', 'ASY', ..., 'Y', 1.2, 'Flat'],
       [57, 'F', 'ATA', ..., 'N', 0.0, 'Flat'],
       [38, 'M', 'NAP', ..., 'N', 0.0, 'Up']], dtype=object)

In [120]:
previsores2[:, 1] = LabelEncoder().fit_transform(previsores2[:, 1])
previsores2[:, 2] = LabelEncoder().fit_transform(previsores2[:, 2])
previsores2[:, 6] = LabelEncoder().fit_transform(previsores2[:, 6])
previsores2[:, 8] = LabelEncoder().fit_transform(previsores2[:, 8])
previsores2[:, 10] = LabelEncoder().fit_transform(previsores2[:, 10])
previsores2

array([[40, 1, 1, ..., 0, 0.0, 2],
       [49, 0, 2, ..., 0, 1.0, 1],
       [37, 1, 1, ..., 0, 0.0, 2],
       ...,
       [57, 1, 0, ..., 1, 1.2, 1],
       [57, 0, 1, ..., 0, 0.0, 1],
       [38, 1, 2, ..., 0, 0.0, 2]], dtype=object)

In [121]:
previsores2.shape

(917, 11)

In [122]:
# OneHotEncoder: Criação de variaveis Dummy (transformação de variáveis numéricas ordinais em binárias)
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [123]:
ColumnTransformer?

In [124]:
previsores3 = ColumnTransformer(transformers=[('OneHot', OneHotEncoder(), [1, 2, 6, 8, 10])], remainder='passthrough').fit_transform(previsores2)
previsores3

array([[0.0, 1.0, 0.0, ..., 0, 172, 0.0],
       [1.0, 0.0, 0.0, ..., 0, 156, 1.0],
       [0.0, 1.0, 0.0, ..., 0, 98, 0.0],
       ...,
       [0.0, 1.0, 1.0, ..., 0, 115, 1.2],
       [1.0, 0.0, 0.0, ..., 0, 174, 0.0],
       [0.0, 1.0, 0.0, ..., 0, 173, 0.0]], dtype=object)

In [125]:
previsores3.shape

(917, 20)

In [126]:
previsores3df = pd.DataFrame(previsores3)
previsores3df.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,40,140,289.0,0,172,0.0
1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,49,160,180.0,0,156,1.0
2,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,37,130,283.0,0,98,0.0
3,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,48,138,214.0,0,108,1.5
4,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,54,150,195.0,0,122,0.0
5,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,39,120,339.0,0,170,0.0
6,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,45,130,237.0,0,170,0.0
7,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,54,110,208.0,0,142,0.0
8,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,37,140,207.0,0,130,1.5
9,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,48,120,284.0,0,120,0.0


In [127]:
df.head(10)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289.0,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180.0,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283.0,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214.0,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195.0,0,Normal,122,N,0.0,Up,0
5,39,M,NAP,120,339.0,0,Normal,170,N,0.0,Up,0
6,45,F,ATA,130,237.0,0,Normal,170,N,0.0,Up,0
7,54,M,ATA,110,208.0,0,Normal,142,N,0.0,Up,0
8,37,M,ASY,140,207.0,0,Normal,130,Y,1.5,Flat,1
9,48,F,ATA,120,284.0,0,Normal,120,N,0.0,Up,0


In [128]:
# Escalonamento
from sklearn.preprocessing import StandardScaler

In [129]:
previsores3_esc = StandardScaler().fit_transform(previsores3)
previsores3_esc

array([[-0.51630861,  0.51630861, -1.08542493, ..., -0.55173333,
         1.38333943, -0.83150225],
       [ 1.9368261 , -1.9368261 , -1.08542493, ..., -0.55173333,
         0.75473573,  0.10625149],
       [-0.51630861,  0.51630861, -1.08542493, ..., -0.55173333,
        -1.52395266, -0.83150225],
       ...,
       [-0.51630861,  0.51630861,  0.92129817, ..., -0.55173333,
        -0.85606123,  0.29380223],
       [ 1.9368261 , -1.9368261 , -1.08542493, ..., -0.55173333,
         1.46191489, -0.83150225],
       [-0.51630861,  0.51630861, -1.08542493, ..., -0.55173333,
         1.42262716, -0.83150225]])

In [130]:
previsores3df = pd.DataFrame(previsores3_esc)
previsores3df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,-0.516309,0.516309,-1.085425,2.073784,-0.531524,-0.229810,-0.507826,0.815013,-0.490781,0.824310,-0.824310,-0.271607,-1.001091,1.149573,-1.432206,0.414627,0.832075,-0.551733,1.383339,-0.831502
1,1.936826,-1.936826,-1.085425,-0.482210,1.881384,-0.229810,-0.507826,0.815013,-0.490781,0.824310,-0.824310,-0.271607,0.998910,-0.869888,-0.478057,1.526360,-1.212261,-0.551733,0.754736,0.106251
2,-0.516309,0.516309,-1.085425,2.073784,-0.531524,-0.229810,-0.507826,-1.226974,2.037569,0.824310,-0.824310,-0.271607,-1.001091,1.149573,-1.750256,-0.141240,0.719543,-0.551733,-1.523953,-0.831502
3,1.936826,-1.936826,0.921298,-0.482210,-0.531524,-0.229810,-0.507826,0.815013,-0.490781,-1.213136,1.213136,-0.271607,0.998910,-0.869888,-0.584074,0.303453,-0.574578,-0.551733,-1.131075,0.575128
4,-0.516309,0.516309,-1.085425,-0.482210,1.881384,-0.229810,-0.507826,0.815013,-0.490781,0.824310,-0.824310,-0.271607,-1.001091,1.149573,0.052026,0.970493,-0.930931,-0.551733,-0.581047,-0.831502
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
912,-0.516309,0.516309,-1.085425,-0.482210,-0.531524,4.351412,-0.507826,0.815013,-0.490781,0.824310,-0.824310,-0.271607,0.998910,-0.869888,-0.902124,-1.252973,0.363191,-0.551733,-0.188170,0.293802
913,-0.516309,0.516309,0.921298,-0.482210,-0.531524,-0.229810,-0.507826,0.815013,-0.490781,0.824310,-0.824310,-0.271607,0.998910,-0.869888,1.536257,0.636973,-0.968441,1.812470,0.165420,2.356860
914,-0.516309,0.516309,0.921298,-0.482210,-0.531524,-0.229810,-0.507826,0.815013,-0.490781,-1.213136,1.213136,-0.271607,0.998910,-0.869888,0.370075,-0.141240,-2.131275,-0.551733,-0.856061,0.293802
915,1.936826,-1.936826,-1.085425,2.073784,-0.531524,-0.229810,1.969177,-1.226974,-0.490781,0.824310,-0.824310,-0.271607,0.998910,-0.869888,0.370075,-0.141240,-0.161960,-0.551733,1.461915,-0.831502


In [131]:
previsores3df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
count,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0
mean,1.084798e-16,-1.472226e-16,1.937139e-17,-3.8742790000000005e-17,3.8742790000000005e-17,6.973702000000001e-17,0.0,-9.298269e-17,1.549712e-17,-4.2617070000000006e-17,4.2617070000000006e-17,8.523413e-17,0.0,-3.8742790000000005e-17,1.859654e-16,7.884157e-16,3.014189e-15,-1.549712e-17,-5.114048e-16,-1.859654e-16
std,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546
min,-0.5163086,-1.936826,-1.085425,-0.4822104,-0.5315237,-0.2298105,-0.507826,-1.226974,-0.490781,-1.213136,-0.8243101,-0.2716072,-1.001091,-0.8698879,-2.704405,-2.920572,-2.994023,-0.5517333,-3.016886,-3.269662
25%,-0.5163086,0.5163086,-1.085425,-0.4822104,-0.5315237,-0.2298105,-0.507826,-1.226974,-0.490781,-1.213136,-0.8243101,-0.2716072,-1.001091,-0.8698879,-0.6900904,-0.6971063,-0.5745784,-0.5517333,-0.6596226,-0.8315022
50%,-0.5163086,0.5163086,0.9212982,-0.4822104,-0.5315237,-0.2298105,-0.507826,0.8150134,-0.490781,0.8243101,-0.8243101,-0.2716072,0.99891,-0.8698879,0.05202558,-0.1412398,3.19836e-15,-0.5517333,0.04755658,-0.26885
75%,-0.5163086,0.5163086,0.9212982,-0.4822104,-0.5315237,-0.2298105,-0.507826,0.8150134,-0.490781,0.8243101,1.213136,-0.2716072,0.99891,1.149573,0.688125,0.4146267,0.4194568,-0.5517333,0.7547357,0.5751284
max,1.936826,0.5163086,0.9212982,2.073784,1.881384,4.351412,1.969177,0.8150134,2.037569,0.8243101,1.213136,3.681787,0.99891,1.149573,2.490407,3.749826,6.721265,1.81247,2.561971,4.982571


In [ ]:
# Resumo do Pré-Processamento:
# alvo = variável que se pretende atingir (tem ou não tem doença cardiaca);
# previsores = conjunto de variáveis previsoras com as variáveis categóricas transformadas em numéricas manualmente, sem escalonar;
# previsores_esc = previsores escalonada;
# previsores2 = conjunto de variáveis previsoras com as variáveis categóricas transformadas em numéricas automaticamente pelo labelencoder;
# previsores3 = previsores2 aplicada onehotencoder (variáveis dummy);
# previsores3_esc = previsores3 escalonada.

# 42. Pré-processamento: Redução de dimensionalidade

In [134]:
# Análise dos componentes principais (Seleção de características vs Extração de características / aprendizagem não supervisionada)
from sklearn.decomposition import PCA
pca = PCA(n_components=3, svd_solver='auto')

In [135]:
previsores_pca = pca.fit_transform(previsores2)
previsores_pca.shape

(917, 3)

In [136]:
previsores_pca # componentes principais

array([[  44.01218603,  -36.15137592,   10.64727631],
       [ -63.9918455 ,  -13.93849233,   31.69212503],
       [  38.53844277,   33.91712222,  -12.48837374],
       ...,
       [-113.3467467 ,   23.48595247,   -2.51064362],
       [  -9.1141306 ,  -35.90083188,    4.81814643],
       [ -70.01342608,  -35.68741686,   12.10984845]])

In [137]:
pca.explained_variance_ratio_ # razão das variáveis explicativas

array([0.72847929, 0.17183534, 0.08122019])

In [138]:
pca.explained_variance_ratio_.sum() # Soma das variáveis explicativas

0.9815348236774056

In [139]:
# Kernel PCA (Redução de dimensionalidade com aprendizagem não supervisionada)
from sklearn.decomposition import KernelPCA

In [140]:
kpca = KernelPCA(n_components=4, kernel='rbf')

In [141]:
previsores_kernel = kpca.fit_transform(previsores2)
previsores_kernel.shape

(917, 4)

In [142]:
previsores_kernel

array([[-0.00249772, -0.00290225, -0.0027372 , -0.00191636],
       [-0.00249877, -0.00290359, -0.00273854, -0.00191743],
       [-0.00249785, -0.00290241, -0.00273736, -0.00191649],
       ...,
       [-0.00249774, -0.00290227, -0.00273723, -0.00191638],
       [-0.00249793, -0.00290252, -0.00273748, -0.00191659],
       [-0.00249772, -0.00290225, -0.0027372 , -0.00191637]])

In [143]:
# Análise do Discriminante Linear (LDA - Aprendizagem Supervisionada)
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [144]:
lda = LinearDiscriminantAnalysis(n_components=1)

In [145]:
previsores_lda = lda.fit_transform(previsores2, alvo)
previsores_lda.shape

(917, 1)

In [146]:
previsores_lda

array([[ 1.47297727],
       [ 1.41360735],
       [ 0.96705302],
       [-0.53331037],
       [ 1.45783427],
       [ 1.78559221],
       [ 2.28906835],
       [ 1.26844095],
       [-1.03044735],
       [ 1.8033219 ],
       [ 2.56103248],
       [-1.08833702],
       [ 1.4298088 ],
       [-1.02172609],
       [ 2.51881925],
       [ 0.75582909],
       [ 0.27447783],
       [ 2.33841952],
       [-0.50562225],
       [-0.03155735],
       [ 2.89530718],
       [ 0.23402029],
       [ 2.26833719],
       [-0.97019565],
       [ 1.70998121],
       [ 2.0730337 ],
       [-1.62722174],
       [ 0.96486408],
       [ 1.52502126],
       [ 1.33460621],
       [ 0.03948873],
       [ 1.44459749],
       [-0.63936919],
       [-0.34808635],
       [ 2.23945383],
       [ 1.48957548],
       [-2.4746182 ],
       [ 2.13691975],
       [ 1.54252555],
       [-0.30741381],
       [ 1.8748176 ],
       [ 0.26749988],
       [ 1.51713745],
       [ 1.79186716],
       [-1.00348547],
       [-0

In [147]:
lda.explained_variance_ratio_

array([1.])

In [148]:
lda.explained_variance_ratio_.sum()

1.0

# 43. Pré-processamento: Salvamento de variáveis

In [150]:
import pickle

In [151]:
arq1 = open('Dataset/heart.pkl', 'wb') # Criando o arquivo

In [152]:
pickle.dump(alvo, arq1) # Salvando a variável no arquivo

In [153]:
arq1.close() # Fechando o arquivo

In [154]:
arq1 = open('Dataset/heart.pkl', 'rb') # Abrindo o arquivo

In [155]:
alvo = pickle.load(arq1) # Lendo o arquivo

In [156]:
alvo

array([0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0,
       1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1,
       1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,

In [157]:
arq1.close() # Fechando o arquivo

# 44. Separação entre treino e teste

In [159]:
# Base de treino e teste
from sklearn.model_selection import train_test_split

In [310]:
x_treino, x_teste, y_treino, y_teste = train_test_split(previsores3_esc, alvo, test_size=0.3, random_state=0)

In [311]:
x_teste.shape

(276, 20)

In [312]:
y_treino.shape

(641,)

In [313]:
y_teste.shape

(276,)

# 45. Naive Bayes : Teoria
"""
P(A|B) = [P(B|A) * P(A)] / P(B)
P(A|B)*P(B) = P(A^B) = P(B^A)

Aplicações:
> Filtros de spam.
> Diagnostico médico.
> Classificação de imagens.
> Análise de crédito.
> Separação de documentos.
> Previsão de falhas.

Vantagens:
> Rápido e de fácil entendimento.
> Pouco esforço computacional.
> Bom desemprenho com muitos dados.
> Boas previsões com poucos dados.

Desvantagens:
> Considera atributos independentes.
> Atribuição de um valor nulo de probabilidade quando uma classe contida no conjunto de teste não se apresenta no conjunto de treino.
"""

# 46. Naive Bayes no Python
"https://scikit-learn.org/stable/modules/naive_bayes.html"

In [166]:
# Naive Bayes
from sklearn.naive_bayes import GaussianNB

In [167]:
# Treinamento do algoritmo
naive = GaussianNB()
naive.fit(x_treino, y_treino)

In [168]:
# Avaliação do algoritmo
previsoes_naive = naive.predict(x_teste)
previsoes_naive

array([1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1,
       1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0,
       1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1,
       1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0], dtype=int64)

In [169]:
y_teste

array([1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1,
       1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1,
       1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1,
       0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0,
       0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0,
       1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1], dtype=int64)

In [170]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [171]:
accuracy_score(y_teste, previsoes_naive)

0.8442028985507246

In [172]:
print(f'Acurácia: {accuracy_score(y_teste, previsoes_naive)*100:.2f}%')

Acurácia: 84.42%


In [173]:
confusion_matrix(y_teste, previsoes_naive)

array([[102,  19],
       [ 24, 131]], dtype=int64)

In [174]:
print(classification_report(y_teste, previsoes_naive))

              precision    recall  f1-score   support

           0       0.81      0.84      0.83       121
           1       0.87      0.85      0.86       155

    accuracy                           0.84       276
   macro avg       0.84      0.84      0.84       276
weighted avg       0.85      0.84      0.84       276


In [175]:
# Análise dos dados de treino
previsoes_treino = naive.predict(x_treino)
previsoes_treino

array([0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0,
       1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0,
       1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1,
       0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0,
       1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0,
       1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1,
       1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0,
       1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1,

In [176]:
accuracy_score(y_treino, previsoes_treino)

0.8361934477379095

In [177]:
confusion_matrix(y_treino, previsoes_treino)

array([[243,  46],
       [ 59, 293]], dtype=int64)

In [178]:
print(classification_report(y_treino, previsoes_treino))

              precision    recall  f1-score   support

           0       0.80      0.84      0.82       289
           1       0.86      0.83      0.85       352

    accuracy                           0.84       641
   macro avg       0.83      0.84      0.84       641
weighted avg       0.84      0.84      0.84       641


In [179]:
# Validação Cruzada
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [180]:
# Separando os dados em folds
kfold = KFold(n_splits=30, shuffle=True, random_state=5)

In [181]:
# Criando o modelo
modelo = GaussianNB()
resultado = cross_val_score(modelo, previsores3_esc, alvo, cv=kfold)
resultado

array([0.87096774, 0.87096774, 0.90322581, 0.80645161, 0.77419355,
       0.90322581, 0.70967742, 0.87096774, 0.77419355, 0.87096774,
       0.90322581, 0.77419355, 0.93548387, 0.90322581, 0.87096774,
       0.87096774, 0.87096774, 0.93333333, 0.93333333, 0.9       ,
       0.76666667, 0.83333333, 0.8       , 0.7       , 0.83333333,
       0.93333333, 0.86666667, 0.9       , 0.86666667, 0.8       ])

In [182]:
# Usando a média e o desvio padrão
print(f'Acurácia Média: {resultado.mean()*100:.2f}%')

Acurácia Média: 85.17%


# 47. Máquina de Vetor de Suporte (SVM): Teoria
"""
Equação:
w-> * x-> +b >= 0
Onde:
w = Vetor hiperplano;
x = vetor pertencente ao hiperplano;
b = deslocamento em relação à origem;

Aplicações:
> Classificação e Regressão.
> Categorização de textos.
> Reconhecimento de imagem.
> Detecção facial.
> Detecção e anomalias.
> Reconhecimento de letras manuscritas.

Constante de Penalização (custo)
> Hiperparâmetro C: controla a tolerância a erros.
    - Quanto maior o valor de C, maior a probabilidade de overfitting e o tempo.
    - Quanto menor o valor de C, maior o tempo de treinamento e maior a probabilidade de underfitting.
    
> Ajuste do hiperparâmetro Gama para otimização.

Vantagens: 
> Não é influenciado por dados discrepantes.
> Solução de problemas lineares e não lineares.
> Muito efetivo para datasets grandes.
> Consegue aprender com características não pertencentes aos dados.

Desvantagens:
> Difícil interpretação teórica devido à matemática complexa.
> Difícil visualização gráfica.
> É lento comparado a outros algoritmos.
"""

# 48. Máquina de Vetor de Suporte (SVM): Aplicação

In [186]:
# Máquinas de Vetores de Suporte (SVM)
"https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html"

'https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html'

In [315]:
from sklearn.svm import SVC

In [354]:
svm = SVC(kernel = 'rbf', random_state=1, C=2)
svm.fit(x_treino, y_treino)

In [355]:
previsoes_svm = svm.predict(x_teste)
previsoes_svm

array([1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1,
       1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1,
       1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1,
       1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0,
       1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1,
       1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1], dtype=int64)

In [356]:
y_teste

array([1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1,
       1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1,
       1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1,
       0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0,
       0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0,
       1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1], dtype=int64)

In [357]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [358]:
accuracy_score(y_teste, previsoes_svm)

0.8623188405797102

In [359]:
print(f'Acurácia: {accuracy_score(y_teste, previsoes_svm)*100:.2f}%')

Acurácia: 86.23%


In [360]:
confusion_matrix(y_teste, previsoes_svm)

array([[ 99,  22],
       [ 16, 139]], dtype=int64)

In [361]:
print(classification_report(y_teste, previsoes_svm))

              precision    recall  f1-score   support

           0       0.86      0.82      0.84       121
           1       0.86      0.90      0.88       155

    accuracy                           0.86       276
   macro avg       0.86      0.86      0.86       276
weighted avg       0.86      0.86      0.86       276


In [362]:
# Análise de dados de treino
previsoes_treino = svm.predict(x_treino)
previsoes_treino

array([1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1,
       1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0,
       1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1,
       0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1,
       0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0,
       1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0,
       1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1,
       1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1,

In [363]:
accuracy_score(y_treino, previsoes_treino)

0.9282371294851794

In [364]:
confusion_matrix(y_treino, previsoes_treino)

array([[257,  32],
       [ 14, 338]], dtype=int64)

In [365]:
# Validaçõa Cruzada
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [367]:
# Separando os dados em folds
kfold = KFold(n_splits=30, shuffle=True, random_state=5)

In [370]:
# Criando o modelo
modelo = SVC(kernel = 'rbf', random_state=1, C=2)
resultado = cross_val_score(modelo, previsores3_esc, alvo, cv=kfold)
resultado

array([0.96774194, 0.83870968, 0.87096774, 0.90322581, 0.83870968,
       0.87096774, 0.70967742, 0.87096774, 0.80645161, 0.80645161,
       0.87096774, 0.80645161, 0.96774194, 0.90322581, 0.80645161,
       0.83870968, 0.80645161, 0.93333333, 0.86666667, 0.93333333,
       0.83333333, 0.86666667, 0.86666667, 0.63333333, 0.83333333,
       0.93333333, 0.9       , 0.86666667, 0.9       , 0.86666667])

In [371]:
# Usando a média e o desvio padrão
print(f'Acurácia Média: {resultado.mean()*100:.2f}%')

Acurácia Média: 85.72%
