# 39. Pré-processamento dos dados: variáveis categóricas

In [1]:
# Pré-processamento
import numpy as np
import pandas as pd
df = pd.read_csv('Dataset/heart_tratado.csv', sep=',', encoding='utf-8')

In [2]:
df.head(10)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289.0,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180.0,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283.0,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214.0,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195.0,0,Normal,122,N,0.0,Up,0
5,39,M,NAP,120,339.0,0,Normal,170,N,0.0,Up,0
6,45,F,ATA,130,237.0,0,Normal,170,N,0.0,Up,0
7,54,M,ATA,110,208.0,0,Normal,142,N,0.0,Up,0
8,37,M,ASY,140,207.0,0,Normal,130,Y,1.5,Flat,1
9,48,F,ATA,120,284.0,0,Normal,120,N,0.0,Up,0


In [3]:
df.shape

(917, 12)

In [4]:
# Transformando as variáveis categóricas nominais em variáveis categóricas ordinais
df2 = pd.DataFrame.copy(df)

In [5]:
df2['Sex'].replace({'M': 1, 'F': 0}, inplace=True)
df2['ChestPainType'].replace({'TA':0, 'ATA':1, 'NAP':2, 'ASY':3}, inplace=True)
df2['RestingECG'].replace({'Normal':0, 'ST':1, 'LVH':2}, inplace=True)
df2['ExerciseAngina'].replace({'N':0, 'Y':1}, inplace=True)
df2['ST_Slope'].replace({'Up':0, 'Flat':1 ,'Down':2}, inplace=True)

In [6]:
df2.head(10)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,1,1,140,289.0,0,0,172,0,0.0,0,0
1,49,0,2,160,180.0,0,0,156,0,1.0,1,1
2,37,1,1,130,283.0,0,1,98,0,0.0,0,0
3,48,0,3,138,214.0,0,0,108,1,1.5,1,1
4,54,1,2,150,195.0,0,0,122,0,0.0,0,0
5,39,1,2,120,339.0,0,0,170,0,0.0,0,0
6,45,0,1,130,237.0,0,0,170,0,0.0,0,0
7,54,1,1,110,208.0,0,0,142,0,0.0,0,0
8,37,1,3,140,207.0,0,0,130,1,1.5,1,1
9,48,0,1,120,284.0,0,0,120,0,0.0,0,0


In [7]:
df2.dtypes

Age                 int64
Sex                 int64
ChestPainType       int64
RestingBP           int64
Cholesterol       float64
FastingBS           int64
RestingECG          int64
MaxHR               int64
ExerciseAngina      int64
Oldpeak           float64
ST_Slope            int64
HeartDisease        int64
dtype: object

In [8]:
df2.shape

(917, 12)

# 40. Pré-processamento: escalonamento e separação de variáveis

In [9]:
df2.head(10)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,1,1,140,289.0,0,0,172,0,0.0,0,0
1,49,0,2,160,180.0,0,0,156,0,1.0,1,1
2,37,1,1,130,283.0,0,1,98,0,0.0,0,0
3,48,0,3,138,214.0,0,0,108,1,1.5,1,1
4,54,1,2,150,195.0,0,0,122,0,0.0,0,0
5,39,1,2,120,339.0,0,0,170,0,0.0,0,0
6,45,0,1,130,237.0,0,0,170,0,0.0,0,0
7,54,1,1,110,208.0,0,0,142,0,0.0,0,0
8,37,1,3,140,207.0,0,0,130,1,1.5,1,1
9,48,0,1,120,284.0,0,0,120,0,0.0,0,0


In [10]:
previsores = df2.iloc[:, 0:11].values

In [11]:
previsores

array([[40. ,  1. ,  1. , ...,  0. ,  0. ,  0. ],
       [49. ,  0. ,  2. , ...,  0. ,  1. ,  1. ],
       [37. ,  1. ,  1. , ...,  0. ,  0. ,  0. ],
       ...,
       [57. ,  1. ,  3. , ...,  1. ,  1.2,  1. ],
       [57. ,  0. ,  1. , ...,  0. ,  0. ,  1. ],
       [38. ,  1. ,  2. , ...,  0. ,  0. ,  0. ]])

In [12]:
previsores.shape

(917, 11)

In [13]:
alvo = df2.iloc[:, 11].values

In [14]:
alvo

array([0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0,
       1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1,
       1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,

In [15]:
alvo.shape

(917,)

In [16]:
# Análise das escalas dos atributos (Escalonamento)
df2.describe()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
count,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0
mean,53.509269,0.789531,2.251908,132.540894,244.635389,0.23337,0.604144,136.789531,0.40458,0.886696,0.63795,0.55289
std,9.437636,0.407864,0.931502,17.999749,53.347125,0.423206,0.806161,25.467129,0.491078,1.06696,0.60727,0.497466
min,28.0,0.0,0.0,80.0,85.0,0.0,0.0,60.0,0.0,-2.6,0.0,0.0
25%,47.0,1.0,2.0,120.0,214.0,0.0,0.0,120.0,0.0,0.0,0.0,0.0
50%,54.0,1.0,3.0,130.0,244.635389,0.0,0.0,138.0,0.0,0.6,1.0,1.0
75%,60.0,1.0,3.0,140.0,267.0,0.0,1.0,156.0,1.0,1.5,1.0,1.0
max,77.0,1.0,3.0,200.0,603.0,1.0,2.0,202.0,1.0,6.2,2.0,1.0


In [17]:
# Padronização (utiliza a média e o desvio padrão como referência)
from sklearn.preprocessing import StandardScaler
previsores_esc = StandardScaler().fit_transform(previsores)
previsores_esc

array([[-1.43220634,  0.51630861, -1.34470119, ..., -0.82431012,
        -0.83150225, -1.05109458],
       [-0.47805725, -1.9368261 , -0.27058012, ..., -0.82431012,
         0.10625149,  0.59651863],
       [-1.75025603,  0.51630861, -1.34470119, ..., -0.82431012,
        -0.83150225, -1.05109458],
       ...,
       [ 0.37007527,  0.51630861,  0.80354095, ...,  1.21313565,
         0.29380223,  0.59651863],
       [ 0.37007527, -1.9368261 , -1.34470119, ..., -0.82431012,
        -0.83150225,  0.59651863],
       [-1.64423947,  0.51630861, -0.27058012, ..., -0.82431012,
        -0.83150225, -1.05109458]])

In [18]:
previsoresdf = pd.DataFrame(previsores_esc)
previsoresdf

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,-1.432206,0.516309,-1.344701,0.414627,0.832075,-0.551733,-0.749818,1.383339,-0.824310,-0.831502,-1.051095
1,-0.478057,-1.936826,-0.270580,1.526360,-1.212261,-0.551733,-0.749818,0.754736,-0.824310,0.106251,0.596519
2,-1.750256,0.516309,-1.344701,-0.141240,0.719543,-0.551733,0.491306,-1.523953,-0.824310,-0.831502,-1.051095
3,-0.584074,-1.936826,0.803541,0.303453,-0.574578,-0.551733,-0.749818,-1.131075,1.213136,0.575128,0.596519
4,0.052026,0.516309,-0.270580,0.970493,-0.930931,-0.551733,-0.749818,-0.581047,-0.824310,-0.831502,-1.051095
...,...,...,...,...,...,...,...,...,...,...,...
912,-0.902124,0.516309,-2.418822,-1.252973,0.363191,-0.551733,-0.749818,-0.188170,-0.824310,0.293802,0.596519
913,1.536257,0.516309,0.803541,0.636973,-0.968441,1.812470,-0.749818,0.165420,-0.824310,2.356860,0.596519
914,0.370075,0.516309,0.803541,-0.141240,-2.131275,-0.551733,-0.749818,-0.856061,1.213136,0.293802,0.596519
915,0.370075,-1.936826,-1.344701,-0.141240,-0.161960,-0.551733,1.732430,1.461915,-0.824310,-0.831502,0.596519


In [19]:
previsoresdf.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
count,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0
mean,1.859654e-16,-7.748558000000001e-17,1.046055e-16,7.767929e-16,-1.86934e-16,4.649135e-17,0.0,-5.114048e-16,-1.046055e-16,7.748558000000001e-17,-3.8742790000000005e-17
std,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546
min,-2.704405,-1.936826,-2.418822,-2.920572,-2.994023,-0.5517333,-0.749818,-3.016886,-0.8243101,-3.269662,-1.051095
25%,-0.6900904,0.5163086,-0.2705801,-0.6971063,-0.5745784,-0.5517333,-0.749818,-0.6596226,-0.8243101,-0.8315022,-1.051095
50%,0.05202558,0.5163086,0.803541,-0.1412398,0.0,-0.5517333,-0.749818,0.04755658,-0.8243101,-0.26885,0.5965186
75%,0.688125,0.5163086,0.803541,0.4146267,0.4194568,-0.5517333,0.491306,0.7547357,1.213136,0.5751284,0.5965186
max,2.490407,0.5163086,0.803541,3.749826,6.721265,1.81247,1.73243,2.561971,1.213136,4.982571,2.244132


# 41. Pré-processamento: LabelEncoder e OnehotEncoder

In [20]:
# Transformação de variáveis categóricas em numéricas automaticamente
from sklearn.preprocessing import LabelEncoder

In [21]:
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289.0,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180.0,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283.0,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214.0,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195.0,0,Normal,122,N,0.0,Up,0


In [22]:
previsores2 = df.iloc[:, 0:11].values
previsores2

array([[40, 'M', 'ATA', ..., 'N', 0.0, 'Up'],
       [49, 'F', 'NAP', ..., 'N', 1.0, 'Flat'],
       [37, 'M', 'ATA', ..., 'N', 0.0, 'Up'],
       ...,
       [57, 'M', 'ASY', ..., 'Y', 1.2, 'Flat'],
       [57, 'F', 'ATA', ..., 'N', 0.0, 'Flat'],
       [38, 'M', 'NAP', ..., 'N', 0.0, 'Up']], dtype=object)

In [23]:
previsores2[:, 1] = LabelEncoder().fit_transform(previsores2[:, 1])
previsores2[:, 2] = LabelEncoder().fit_transform(previsores2[:, 2])
previsores2[:, 6] = LabelEncoder().fit_transform(previsores2[:, 6])
previsores2[:, 8] = LabelEncoder().fit_transform(previsores2[:, 8])
previsores2[:, 10] = LabelEncoder().fit_transform(previsores2[:, 10])
previsores2

array([[40, 1, 1, ..., 0, 0.0, 2],
       [49, 0, 2, ..., 0, 1.0, 1],
       [37, 1, 1, ..., 0, 0.0, 2],
       ...,
       [57, 1, 0, ..., 1, 1.2, 1],
       [57, 0, 1, ..., 0, 0.0, 1],
       [38, 1, 2, ..., 0, 0.0, 2]], dtype=object)

In [24]:
previsores2.shape

(917, 11)

In [25]:
# OneHotEncoder: Criação de variaveis Dummy (transformação de variáveis numéricas ordinais em binárias)
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [26]:
ColumnTransformer?

In [27]:
previsores3 = ColumnTransformer(transformers=[('OneHot', OneHotEncoder(), [1, 2, 6, 8, 10])], remainder='passthrough').fit_transform(previsores2)
previsores3

array([[0.0, 1.0, 0.0, ..., 0, 172, 0.0],
       [1.0, 0.0, 0.0, ..., 0, 156, 1.0],
       [0.0, 1.0, 0.0, ..., 0, 98, 0.0],
       ...,
       [0.0, 1.0, 1.0, ..., 0, 115, 1.2],
       [1.0, 0.0, 0.0, ..., 0, 174, 0.0],
       [0.0, 1.0, 0.0, ..., 0, 173, 0.0]], dtype=object)

In [28]:
previsores3.shape

(917, 20)

In [29]:
previsores3df = pd.DataFrame(previsores3)
previsores3df.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,40,140,289.0,0,172,0.0
1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,49,160,180.0,0,156,1.0
2,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,37,130,283.0,0,98,0.0
3,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,48,138,214.0,0,108,1.5
4,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,54,150,195.0,0,122,0.0
5,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,39,120,339.0,0,170,0.0
6,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,45,130,237.0,0,170,0.0
7,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,54,110,208.0,0,142,0.0
8,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,37,140,207.0,0,130,1.5
9,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,48,120,284.0,0,120,0.0


In [30]:
df.head(10)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289.0,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180.0,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283.0,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214.0,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195.0,0,Normal,122,N,0.0,Up,0
5,39,M,NAP,120,339.0,0,Normal,170,N,0.0,Up,0
6,45,F,ATA,130,237.0,0,Normal,170,N,0.0,Up,0
7,54,M,ATA,110,208.0,0,Normal,142,N,0.0,Up,0
8,37,M,ASY,140,207.0,0,Normal,130,Y,1.5,Flat,1
9,48,F,ATA,120,284.0,0,Normal,120,N,0.0,Up,0


In [31]:
# Escalonamento
from sklearn.preprocessing import StandardScaler

In [32]:
previsores3_esc = StandardScaler().fit_transform(previsores3)
previsores3_esc

array([[-0.51630861,  0.51630861, -1.08542493, ..., -0.55173333,
         1.38333943, -0.83150225],
       [ 1.9368261 , -1.9368261 , -1.08542493, ..., -0.55173333,
         0.75473573,  0.10625149],
       [-0.51630861,  0.51630861, -1.08542493, ..., -0.55173333,
        -1.52395266, -0.83150225],
       ...,
       [-0.51630861,  0.51630861,  0.92129817, ..., -0.55173333,
        -0.85606123,  0.29380223],
       [ 1.9368261 , -1.9368261 , -1.08542493, ..., -0.55173333,
         1.46191489, -0.83150225],
       [-0.51630861,  0.51630861, -1.08542493, ..., -0.55173333,
         1.42262716, -0.83150225]])

In [33]:
previsores3df = pd.DataFrame(previsores3_esc)
previsores3df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,-0.516309,0.516309,-1.085425,2.073784,-0.531524,-0.229810,-0.507826,0.815013,-0.490781,0.824310,-0.824310,-0.271607,-1.001091,1.149573,-1.432206,0.414627,0.832075,-0.551733,1.383339,-0.831502
1,1.936826,-1.936826,-1.085425,-0.482210,1.881384,-0.229810,-0.507826,0.815013,-0.490781,0.824310,-0.824310,-0.271607,0.998910,-0.869888,-0.478057,1.526360,-1.212261,-0.551733,0.754736,0.106251
2,-0.516309,0.516309,-1.085425,2.073784,-0.531524,-0.229810,-0.507826,-1.226974,2.037569,0.824310,-0.824310,-0.271607,-1.001091,1.149573,-1.750256,-0.141240,0.719543,-0.551733,-1.523953,-0.831502
3,1.936826,-1.936826,0.921298,-0.482210,-0.531524,-0.229810,-0.507826,0.815013,-0.490781,-1.213136,1.213136,-0.271607,0.998910,-0.869888,-0.584074,0.303453,-0.574578,-0.551733,-1.131075,0.575128
4,-0.516309,0.516309,-1.085425,-0.482210,1.881384,-0.229810,-0.507826,0.815013,-0.490781,0.824310,-0.824310,-0.271607,-1.001091,1.149573,0.052026,0.970493,-0.930931,-0.551733,-0.581047,-0.831502
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
912,-0.516309,0.516309,-1.085425,-0.482210,-0.531524,4.351412,-0.507826,0.815013,-0.490781,0.824310,-0.824310,-0.271607,0.998910,-0.869888,-0.902124,-1.252973,0.363191,-0.551733,-0.188170,0.293802
913,-0.516309,0.516309,0.921298,-0.482210,-0.531524,-0.229810,-0.507826,0.815013,-0.490781,0.824310,-0.824310,-0.271607,0.998910,-0.869888,1.536257,0.636973,-0.968441,1.812470,0.165420,2.356860
914,-0.516309,0.516309,0.921298,-0.482210,-0.531524,-0.229810,-0.507826,0.815013,-0.490781,-1.213136,1.213136,-0.271607,0.998910,-0.869888,0.370075,-0.141240,-2.131275,-0.551733,-0.856061,0.293802
915,1.936826,-1.936826,-1.085425,2.073784,-0.531524,-0.229810,1.969177,-1.226974,-0.490781,0.824310,-0.824310,-0.271607,0.998910,-0.869888,0.370075,-0.141240,-0.161960,-0.551733,1.461915,-0.831502


In [34]:
previsores3df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
count,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0
mean,1.084798e-16,-1.472226e-16,1.937139e-17,-3.8742790000000005e-17,3.8742790000000005e-17,6.973702000000001e-17,0.0,-9.298269e-17,1.549712e-17,-4.2617070000000006e-17,4.2617070000000006e-17,8.523413e-17,0.0,-3.8742790000000005e-17,1.859654e-16,7.884157e-16,3.014189e-15,-1.549712e-17,-5.114048e-16,-1.859654e-16
std,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546
min,-0.5163086,-1.936826,-1.085425,-0.4822104,-0.5315237,-0.2298105,-0.507826,-1.226974,-0.490781,-1.213136,-0.8243101,-0.2716072,-1.001091,-0.8698879,-2.704405,-2.920572,-2.994023,-0.5517333,-3.016886,-3.269662
25%,-0.5163086,0.5163086,-1.085425,-0.4822104,-0.5315237,-0.2298105,-0.507826,-1.226974,-0.490781,-1.213136,-0.8243101,-0.2716072,-1.001091,-0.8698879,-0.6900904,-0.6971063,-0.5745784,-0.5517333,-0.6596226,-0.8315022
50%,-0.5163086,0.5163086,0.9212982,-0.4822104,-0.5315237,-0.2298105,-0.507826,0.8150134,-0.490781,0.8243101,-0.8243101,-0.2716072,0.99891,-0.8698879,0.05202558,-0.1412398,3.19836e-15,-0.5517333,0.04755658,-0.26885
75%,-0.5163086,0.5163086,0.9212982,-0.4822104,-0.5315237,-0.2298105,-0.507826,0.8150134,-0.490781,0.8243101,1.213136,-0.2716072,0.99891,1.149573,0.688125,0.4146267,0.4194568,-0.5517333,0.7547357,0.5751284
max,1.936826,0.5163086,0.9212982,2.073784,1.881384,4.351412,1.969177,0.8150134,2.037569,0.8243101,1.213136,3.681787,0.99891,1.149573,2.490407,3.749826,6.721265,1.81247,2.561971,4.982571


In [35]:
# Resumo do Pré-Processamento:
# alvo = variável que se pretende atingir (tem ou não tem doença cardiaca);
# previsores = conjunto de variáveis previsoras com as variáveis categóricas transformadas em numéricas manualmente, sem escalonar;
# previsores_esc = previsores escalonada;
# previsores2 = conjunto de variáveis previsoras com as variáveis categóricas transformadas em numéricas automaticamente pelo labelencoder;
# previsores3 = previsores2 aplicada onehotencoder (variáveis dummy);
# previsores3_esc = previsores3 escalonada.

# 42. Pré-processamento: Redução de dimensionalidade

In [36]:
# Análise dos componentes principais (Seleção de características vs Extração de características / aprendizagem não supervisionada)
from sklearn.decomposition import PCA
pca = PCA(n_components=3, svd_solver='auto')

In [37]:
previsores_pca = pca.fit_transform(previsores2)
previsores_pca.shape

(917, 3)

In [38]:
previsores_pca # componentes principais

array([[  44.01218603,  -36.15137592,   10.64727631],
       [ -63.9918455 ,  -13.93849233,   31.69212503],
       [  38.53844277,   33.91712222,  -12.48837374],
       ...,
       [-113.3467467 ,   23.48595247,   -2.51064362],
       [  -9.1141306 ,  -35.90083188,    4.81814643],
       [ -70.01342608,  -35.68741686,   12.10984845]])

In [39]:
pca.explained_variance_ratio_ # razão das variáveis explicativas

array([0.72847929, 0.17183534, 0.08122019])

In [40]:
pca.explained_variance_ratio_.sum() # Soma das variáveis explicativas

0.9815348236774052

In [41]:
# Kernel PCA (Redução de dimensionalidade com aprendizagem não supervisionada)
from sklearn.decomposition import KernelPCA

In [42]:
kpca = KernelPCA(n_components=4, kernel='rbf')

In [43]:
previsores_kernel = kpca.fit_transform(previsores2)
previsores_kernel.shape

(917, 4)

In [44]:
previsores_kernel

array([[-0.00249772, -0.00290225, -0.0027372 , -0.00191636],
       [-0.00249877, -0.00290359, -0.00273854, -0.00191743],
       [-0.00249785, -0.00290241, -0.00273736, -0.00191649],
       ...,
       [-0.00249774, -0.00290227, -0.00273723, -0.00191638],
       [-0.00249793, -0.00290252, -0.00273748, -0.00191659],
       [-0.00249772, -0.00290225, -0.0027372 , -0.00191637]])

In [45]:
# Análise do Discriminante Linear (LDA - Aprendizagem Supervisionada)
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [46]:
lda = LinearDiscriminantAnalysis(n_components=1)

In [47]:
previsores_lda = lda.fit_transform(previsores2, alvo)
previsores_lda.shape

(917, 1)

In [48]:
previsores_lda

array([[ 1.47297727],
       [ 1.41360735],
       [ 0.96705302],
       [-0.53331037],
       [ 1.45783427],
       [ 1.78559221],
       [ 2.28906835],
       [ 1.26844095],
       [-1.03044735],
       [ 1.8033219 ],
       [ 2.56103248],
       [-1.08833702],
       [ 1.4298088 ],
       [-1.02172609],
       [ 2.51881925],
       [ 0.75582909],
       [ 0.27447783],
       [ 2.33841952],
       [-0.50562225],
       [-0.03155735],
       [ 2.89530718],
       [ 0.23402029],
       [ 2.26833719],
       [-0.97019565],
       [ 1.70998121],
       [ 2.0730337 ],
       [-1.62722174],
       [ 0.96486408],
       [ 1.52502126],
       [ 1.33460621],
       [ 0.03948873],
       [ 1.44459749],
       [-0.63936919],
       [-0.34808635],
       [ 2.23945383],
       [ 1.48957548],
       [-2.4746182 ],
       [ 2.13691975],
       [ 1.54252555],
       [-0.30741381],
       [ 1.8748176 ],
       [ 0.26749988],
       [ 1.51713745],
       [ 1.79186716],
       [-1.00348547],
       [-0

In [49]:
lda.explained_variance_ratio_

array([1.])

In [50]:
lda.explained_variance_ratio_.sum()

1.0

# 43. Pré-processamento: Salvamento de variáveis

In [51]:
import pickle

In [52]:
arq1 = open('Dataset/heart.pkl', 'wb') # Criando o arquivo

In [53]:
pickle.dump(alvo, arq1) # Salvando a variável no arquivo

In [54]:
arq1.close() # Fechando o arquivo

In [55]:
arq1 = open('Dataset/heart.pkl', 'rb') # Abrindo o arquivo

In [56]:
alvo = pickle.load(arq1) # Lendo o arquivo

In [57]:
alvo

array([0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0,
       1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1,
       1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,

In [58]:
arq1.close() # Fechando o arquivo

# 44. Separação entre treino e teste

In [193]:
# Base de treino e teste
from sklearn.model_selection import train_test_split

In [None]:
x_treino, x_teste, y_treino, y_teste = train_test_split(previsores, alvo, test_size=0.3, random_state=0)

In [199]:
x_treino.shape

(641, 11)

In [200]:
x_teste.shape

(276, 11)

In [201]:
y_treino.shape

(641,)

In [202]:
y_teste.shape

(276,)

# 45. Naive Bayes : Teoria
"""
P(A|B) = [P(B|A) * P(A)] / P(B)
P(A|B)*P(B) = P(A^B) = P(B^A)

Aplicações:
> Filtros de spam.
> Diagnostico médico.
> Classificação de imagens.
> Análise de crédito.
> Separação de documentos.
> Previsão de falhas.

Vantagens:
> Rápido e de fácil entendimento.
> Pouco esforço computacional.
> Bom desemprenho com muitos dados.
> Boas previsões com poucos dados.

Desvantagens:
> Considera atributos independentes.
> Atribuição de um valor nulo de probabilidade quando uma classe contida no conjunto de teste não se apresenta no conjunto de treino.
"""

# 46. Naive Bayes no Python
"https://scikit-learn.org/stable/modules/naive_bayes.html"

In [64]:
# Naive Bayes
from sklearn.naive_bayes import GaussianNB

In [65]:
# Treinamento do algoritmo
naive = GaussianNB()
naive.fit(x_treino, y_treino)

In [66]:
# Avaliação do algoritmo
previsoes_naive = naive.predict(x_teste)
previsoes_naive

array([1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1,
       1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0,
       1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1,
       1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0], dtype=int64)

In [67]:
y_teste

array([1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1,
       1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1,
       1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1,
       0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0,
       0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0,
       1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1], dtype=int64)

In [68]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [69]:
accuracy_score(y_teste, previsoes_naive)

0.8442028985507246

In [70]:
print(f'Acurácia: {accuracy_score(y_teste, previsoes_naive)*100:.2f}%')

Acurácia: 84.42%


In [71]:
confusion_matrix(y_teste, previsoes_naive)

array([[102,  19],
       [ 24, 131]], dtype=int64)

In [72]:
print(classification_report(y_teste, previsoes_naive))

              precision    recall  f1-score   support

           0       0.81      0.84      0.83       121
           1       0.87      0.85      0.86       155

    accuracy                           0.84       276
   macro avg       0.84      0.84      0.84       276
weighted avg       0.85      0.84      0.84       276


In [73]:
# Análise dos dados de treino
previsoes_treino = naive.predict(x_treino)
previsoes_treino

array([0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0,
       1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0,
       1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1,
       0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0,
       1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0,
       1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1,
       1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0,
       1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1,

In [74]:
accuracy_score(y_treino, previsoes_treino)

0.8361934477379095

In [75]:
confusion_matrix(y_treino, previsoes_treino)

array([[243,  46],
       [ 59, 293]], dtype=int64)

In [76]:
print(classification_report(y_treino, previsoes_treino))

              precision    recall  f1-score   support

           0       0.80      0.84      0.82       289
           1       0.86      0.83      0.85       352

    accuracy                           0.84       641
   macro avg       0.83      0.84      0.84       641
weighted avg       0.84      0.84      0.84       641


In [77]:
# Validação Cruzada
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [78]:
# Separando os dados em folds
kfold = KFold(n_splits=30, shuffle=True, random_state=5)

In [79]:
# Criando o modelo
modelo = GaussianNB()
resultado = cross_val_score(modelo, previsores3_esc, alvo, cv=kfold)
resultado

array([0.87096774, 0.87096774, 0.90322581, 0.80645161, 0.77419355,
       0.90322581, 0.70967742, 0.87096774, 0.77419355, 0.87096774,
       0.90322581, 0.77419355, 0.93548387, 0.90322581, 0.87096774,
       0.87096774, 0.87096774, 0.93333333, 0.93333333, 0.9       ,
       0.76666667, 0.83333333, 0.8       , 0.7       , 0.83333333,
       0.93333333, 0.86666667, 0.9       , 0.86666667, 0.8       ])

In [80]:
# Usando a média e o desvio padrão
print(f'Acurácia Média: {resultado.mean()*100:.2f}%')

Acurácia Média: 85.17%


# 47. Máquina de Vetor de Suporte (SVM): Teoria
"""
Equação:
w-> * x-> +b >= 0
Onde:
w = Vetor hiperplano;
x = vetor pertencente ao hiperplano;
b = deslocamento em relação à origem;

Aplicações:
> Classificação e Regressão.
> Categorização de textos.
> Reconhecimento de imagem.
> Detecção facial.
> Detecção e anomalias.
> Reconhecimento de letras manuscritas.

Constante de Penalização (custo)
> Hiperparâmetro C: controla a tolerância a erros.
    - Quanto maior o valor de C, maior a probabilidade de overfitting e o tempo.
    - Quanto menor o valor de C, maior o tempo de treinamento e maior a probabilidade de underfitting.
    
> Ajuste do hiperparâmetro Gama para otimização.

Vantagens: 
> Não é influenciado por dados discrepantes.
> Solução de problemas lineares e não lineares.
> Muito efetivo para datasets grandes.
> Consegue aprender com características não pertencentes aos dados.

Desvantagens:
> Difícil interpretação teórica devido à matemática complexa.
> Difícil visualização gráfica.
> É lento comparado a outros algoritmos.
"""

# 48. Máquina de Vetor de Suporte (SVM): Aplicação

In [81]:
# Máquinas de Vetores de Suporte (SVM)
"https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html"

'https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html'

In [82]:
from sklearn.svm import SVC

In [83]:
svm = SVC(kernel = 'rbf', random_state=1, C=2)
svm.fit(x_treino, y_treino)

In [84]:
previsoes_svm = svm.predict(x_teste)
previsoes_svm

array([0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1,
       1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1,
       1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1,
       1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0,
       1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0,
       1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1,
       1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0,
       1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0], dtype=int64)

In [85]:
y_teste

array([1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1,
       1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1,
       1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1,
       0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0,
       0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0,
       1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1], dtype=int64)

In [86]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [87]:
accuracy_score(y_teste, previsoes_svm)

0.6702898550724637

In [88]:
print(f'Acurácia: {accuracy_score(y_teste, previsoes_svm)*100:.2f}%')

Acurácia: 67.03%


In [89]:
confusion_matrix(y_teste, previsoes_svm)

array([[ 68,  53],
       [ 38, 117]], dtype=int64)

In [90]:
print(classification_report(y_teste, previsoes_svm))

              precision    recall  f1-score   support

           0       0.64      0.56      0.60       121
           1       0.69      0.75      0.72       155

    accuracy                           0.67       276
   macro avg       0.66      0.66      0.66       276
weighted avg       0.67      0.67      0.67       276


In [91]:
# Análise de dados de treino
previsoes_treino = svm.predict(x_treino)
previsoes_treino

array([1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0,
       1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1,
       1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1,
       0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0,
       1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1,
       1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0,
       1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1,

In [92]:
accuracy_score(y_treino, previsoes_treino)

0.7129485179407177

In [93]:
confusion_matrix(y_treino, previsoes_treino)

array([[181, 108],
       [ 76, 276]], dtype=int64)

In [94]:
# Validaçõa Cruzada
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [95]:
# Separando os dados em folds
kfold = KFold(n_splits=30, shuffle=True, random_state=5)

In [96]:
# Criando o modelo
modelo = SVC(kernel = 'rbf', random_state=1, C=2)
resultado = cross_val_score(modelo, previsores3_esc, alvo, cv=kfold)
resultado

array([0.96774194, 0.83870968, 0.87096774, 0.90322581, 0.83870968,
       0.87096774, 0.70967742, 0.87096774, 0.80645161, 0.80645161,
       0.87096774, 0.80645161, 0.96774194, 0.90322581, 0.80645161,
       0.83870968, 0.80645161, 0.93333333, 0.86666667, 0.93333333,
       0.83333333, 0.86666667, 0.86666667, 0.63333333, 0.83333333,
       0.93333333, 0.9       , 0.86666667, 0.9       , 0.86666667])

In [97]:
# Usando a média e o desvio padrão
print(f'Acurácia Média: {resultado.mean()*100:.2f}%')

Acurácia Média: 85.72%


# 49. Regressão Logística: Aplicação

In [98]:
"""
Regressão Logística
> Algoritmo de classificação em aprendizagem supervisionada.
> Utiliza conceitos de regressão linear em seu modelo matemático
> Equação: p = 1/(1+e^-z) onde z=(b0+b1*x1+b2*x2+...+bn*xn)
p = probabilidade de pertencer a determinada classe;
e = número de euler;
b0 = intercepto;
bn = coeficientes;
xn = variáveis dependentes;

Aplicações:
> Está presente em diversas aplicações em várias áreas de estudo como na economia, bioestatística, psicometria, medicina, ciências sociais...

Vantagens:
> Fácil implementação;
> Teoria consolidada;
> Excelente desempenho;
> Indica o valor de probabilidade para cada instância:
"""

'\nRegressão Logística\n> Algoritmo de classificação em aprendizagem supervisionada.\n> Utiliza conceitos de regressão linear em seu modelo matemático\n> Equação: p = 1/(1+e^-z) onde z=(b0+b1*x1+b2*x2+...+bn*xn)\np = probabilidade de pertencer a determinada classe;\ne = número de euler;\nb0 = intercepto;\nbn = coeficientes;\nxn = variáveis dependentes;\n\nAplicações:\n> Está presente em diversas aplicações em várias áreas de estudo como na economia, bioestatística, psicometria, medicina, ciências sociais...\n\nVantagens:\n> Fácil implementação;\n> Teoria consolidada;\n> Excelente desempenho;\n> Indica o valor de probabilidade para cada instância:\n'

# 50. Regressão Logística no Python

In [99]:
"https: scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html"

'https: scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html'

In [100]:
from sklearn.linear_model import LogisticRegression

In [101]:
logistica = LogisticRegression(random_state=1, max_iter=500, penalty="l2", tol=0.0001, C=1, solver="lbfgs") 
logistica.fit(x_treino, y_treino)

In [102]:
logistica.intercept_

array([-4.70949359])

In [103]:
logistica.coef_

array([[ 0.01310081,  1.27109795,  0.66432244,  0.00337222,  0.00432106,
         0.92045964,  0.08879068, -0.01148407,  1.02230033,  0.24998987,
         1.64635353]])

In [104]:
previsoes_logistica = logistica.predict(x_teste)
previsoes_logistica

array([1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1,
       1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1,
       1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1,
       1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0,
       1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1,
       1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0,
       1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1], dtype=int64)

In [105]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [106]:
print(f'''Acurácia: {accuracy_score(y_teste, previsoes_logistica)*100:.2f}%''')

Acurácia: 85.51%


In [107]:
confusion_matrix(y_teste, previsoes_logistica)

array([[ 99,  22],
       [ 18, 137]], dtype=int64)

In [108]:
print(classification_report(y_teste, previsoes_logistica))

              precision    recall  f1-score   support

           0       0.85      0.82      0.83       121
           1       0.86      0.88      0.87       155

    accuracy                           0.86       276
   macro avg       0.85      0.85      0.85       276
weighted avg       0.85      0.86      0.85       276


In [109]:
# Análise dados de treino
previsoes_treino = logistica.predict(x_treino)
previsoes_treino

array([1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1,
       1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0,
       1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1,
       0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0,
       1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0,
       1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0,
       1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1,
       1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0,
       1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1,

In [110]:
accuracy_score(y_treino, previsoes_treino)

0.8502340093603744

In [111]:
confusion_matrix(y_treino, previsoes_treino)

array([[239,  50],
       [ 46, 306]], dtype=int64)

In [112]:
# Validação Cruzada
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [113]:
# Separando os dados em folds
kfold = KFold(n_splits=30, shuffle=True, random_state=5)

In [114]:
# Criando o modelo
modelo = LogisticRegression(random_state=1, max_iter=500, penalty="l2", tol=0.0001, C=1, solver="lbfgs") 
resultado = cross_val_score(modelo, previsores3_esc, alvo, cv=kfold)
resultado

array([0.93548387, 0.83870968, 0.90322581, 0.77419355, 0.87096774,
       0.90322581, 0.70967742, 0.87096774, 0.83870968, 0.87096774,
       0.87096774, 0.80645161, 0.96774194, 0.83870968, 0.80645161,
       0.90322581, 0.80645161, 0.96666667, 0.96666667, 0.86666667,
       0.83333333, 0.86666667, 0.86666667, 0.7       , 0.83333333,
       0.93333333, 0.9       , 0.83333333, 0.83333333, 0.83333333])

In [115]:
# Usando a média e o desvio padrão
print(f'Acurácia Média: {resultado.mean()*100:.2f}%')

Acurácia Média: 85.83%


# 51. KNN (Aprendizagem Baseada em Instâncias)

In [116]:
# Aprendizagem baseada em instâncias (KNN)
"""
> Algoritmo de classificação em aprendizagem supervisionada.
> KNN é a sigla de K Nearest Neighbors (K vizinhos mais próximos).
> O Knn realiza classificação de instâncias (dados) em classes.
> Não possuí um modelo matemático, apenas classifica uma instância através de cálculos de distâncias

Vantagens:
> Fácil implementação;
> Fácil entendimento;
> Excelente desempenho em situações de dados com características complexas;
> Poucos parâmetros para ajustar;

Desvantagens:
> Alto custo computacional;
> Parâmetro k é ajustado na tentativa e erro;
"""

'\n> Algoritmo de classificação em aprendizagem supervisionada.\n> KNN é a sigla de K Nearest Neighbors (K vizinhos mais próximos).\n> O Knn realiza classificação de instâncias (dados) em classes.\n> Não possuí um modelo matemático, apenas classifica uma instância através de cálculos de distâncias\n\nVantagens:\n> Fácil implementação;\n> Fácil entendimento;\n> Excelente desempenho em situações de dados com características complexas;\n> Poucos parâmetros para ajustar;\n\nDesvantagens:\n> Alto custo computacional;\n> Parâmetro k é ajustado na tentativa e erro;\n'

# 52. KNN no Python
'https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html'

In [117]:
from sklearn.neighbors import KNeighborsClassifier

In [118]:
knn = KNeighborsClassifier(n_neighbors=7, metric='minkowski', p=1)
knn.fit(x_treino, y_treino)

'https://scikit-learn.org/stable/modules/generated/sklearn.metrics.DistanceMetric.html'

In [119]:
previsoes_knn = knn.predict(x_teste)
previsoes_knn

array([1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1,
       0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1,
       1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0,
       1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0,
       1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0], dtype=int64)

In [120]:
y_teste

array([1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1,
       1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1,
       1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1,
       0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0,
       0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0,
       1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1], dtype=int64)

In [121]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [122]:
print(f'''Acurácia: {accuracy_score(y_teste, previsoes_knn)*100:.2f}%''')

Acurácia: 69.20%


In [123]:
confusion_matrix(y_teste, previsoes_knn)

array([[ 74,  47],
       [ 38, 117]], dtype=int64)

In [124]:
print(classification_report(y_teste, previsoes_knn))

              precision    recall  f1-score   support

           0       0.66      0.61      0.64       121
           1       0.71      0.75      0.73       155

    accuracy                           0.69       276
   macro avg       0.69      0.68      0.68       276
weighted avg       0.69      0.69      0.69       276


In [125]:
# Análise dados de treino
previsoes_treino = knn.predict(x_treino)
previsoes_treino

array([1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1,
       0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1,
       1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0,
       1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1,

In [126]:
accuracy_score(y_treino, previsoes_treino)

0.7987519500780031

In [127]:
confusion_matrix(y_treino, previsoes_treino)

array([[210,  79],
       [ 50, 302]], dtype=int64)

In [128]:
# Validação Cruzada
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [129]:
# Separando os dados em folds
kfold = KFold(n_splits=30, shuffle=True, random_state=5)

In [130]:
# Criando o modelo
modelo = KNeighborsClassifier(n_neighbors=7, metric='minkowski', p=1)
resultado = cross_val_score(modelo, previsores3_esc, alvo, cv=kfold)
resultado

array([0.96774194, 0.80645161, 0.83870968, 0.83870968, 0.83870968,
       0.87096774, 0.67741935, 0.87096774, 0.80645161, 0.90322581,
       0.83870968, 0.77419355, 0.96774194, 0.90322581, 0.87096774,
       0.87096774, 0.83870968, 0.9       , 0.93333333, 0.96666667,
       0.8       , 0.83333333, 0.86666667, 0.73333333, 0.8       ,
       0.93333333, 0.9       , 0.9       , 0.86666667, 0.83333333])

In [131]:
# Usando a média e o desvio padrão
print(f'Acurácia Média: {resultado.mean()*100:.2f}%')

Acurácia Média: 85.84%


# 53. Árvore de Decisão: Teoria

In [132]:
# Decision Tree
"""
> Aplicado em problemas de aprendizagem supervisionada tanto de classificação (mais utilizado) como de regressão.
> Seleciona a ordem que os atributos irão aparecer na árvore, sempre de cima para baixo, conforme sua importância para a predição, assim como determina a separação dos ramos da árvore.
> Para determinar o nível de importância de um atributo, denominado de ganho de informação, utiliza-se de várias métricas, sendo que as mais e aplicadas são a entropia (medida da falta de homogeneidade) e o índice de Gini (medida do grau de heterogeneidade).
> Cálculo da Entropia (E):
    E(S) = -∑ p(xi)log2p(xi), onde 
        p(xi) = probabilidade de ocorrer os atributos nos dados.
        n = número de classes que pode ser atingida.
> Cálculo do índice de Gini (Gi):
    Gi(S) = 1 - ∑ p(xi)^2, onde 
        p(xi) = probabilidade de ocorrer os atributos nos dados.
        n = número de classes que pode ser atingida.
> Cálculo do ganho de informação (G):
    G(S, A) = E(S) - ∑ p(a)*P(i|a) * log2(P(i|a)), onde
        p(a) = probabilidade de ocorrer o atributo a nos dados.
        p(i|a) = probabilidade de i ocorrer, dado que a já tenha ocorrido.
        n = número de classes que pode ser atingida.
        
Podagem das Árvores
> Objetiva a probabilidade de overfitting
> Processo de podagem:
    - percorre a árvore em profundidade.
    - para cada nó de decisão calcula o erro no nó e a soma dos erros nos nós descendentes.
    - se o erro do nó é menor ou igual à soma dos erros dos nós descendentes, então o nó é transformado em folha.
    
Vantagens:
> Fácil entendimento e interpretação.
> Normalmente não necessitam de preparações sofisticadas nos dados (label Encoder e OneHotEncoder).
> Trabalha com valores faltantes, variáveis categóricas e numéricas.
> Atua com dados não linearmente separáveis.

Desvantagens:
> Sujeito a problemas de overfitting.
> Os modelos são instáveis (possuem alta variância).
> Não garante a construção da melhor estrutura para os dados de treino em questão (Necessita treinar várias árvores distintas).
"""

'\n> Aplicado em problemas de aprendizagem supervisionada tanto de classificação (mais utilizado) como de regressão.\n> Seleciona a ordem que os atributos irão aparecer na árvore, sempre de cima para baixo, conforme sua importância para a predição, assim como determina a separação dos ramos da árvore.\n> Para determinar o nível de importância de um atributo, denominado de ganho de informação, utiliza-se de várias métricas, sendo que as mais e aplicadas são a entropia (medida da falta de homogeneidade) e o índice de Gini (medida do grau de heterogeneidade).\n> Cálculo da Entropia (E):\n    E(S) = -∑ p(xi)log2p(xi), onde \n        p(xi) = probabilidade de ocorrer os atributos nos dados.\n        n = número de classes que pode ser atingida.\n> Cálculo do índice de Gini (Gi):\n    Gi(S) = 1 - ∑ p(xi)^2, onde \n        p(xi) = probabilidade de ocorrer os atributos nos dados.\n        n = número de classes que pode ser atingida.\n> Cálculo do ganho de informação (G):\n    G(S, A) = E(S) - ∑ 

# 54. Árvore de Decisão no Python

In [133]:
# Árvore de Decisão
"""
https://scikit-learn.org/stable/modules/tree.html
"""

'\nhttps://scikit-learn.org/stable/modules/tree.html\n'

In [134]:
from sklearn.tree import DecisionTreeClassifier

In [135]:
arvore = DecisionTreeClassifier(criterion='entropy', random_state=0, max_depth=3)
arvore.fit(x_treino, y_treino)

In [136]:
previsoes_arvore = arvore.predict(x_teste)
previsoes_arvore

array([1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1,
       0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0,
       0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0,
       1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1,
       1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0,
       1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1], dtype=int64)

In [137]:
y_teste

array([1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1,
       1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1,
       1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1,
       0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0,
       0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0,
       1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1], dtype=int64)

In [138]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [139]:
print(f'''Acurácia: {accuracy_score(y_teste, previsoes_arvore)*100:.2f}%''')

Acurácia: 83.70%


In [140]:
confusion_matrix(y_teste, previsoes_arvore)

array([[105,  16],
       [ 29, 126]], dtype=int64)

In [141]:
print(classification_report(y_teste, previsoes_arvore))

              precision    recall  f1-score   support

           0       0.78      0.87      0.82       121
           1       0.89      0.81      0.85       155

    accuracy                           0.84       276
   macro avg       0.84      0.84      0.84       276
weighted avg       0.84      0.84      0.84       276


In [142]:
# Análise dados de treino
previsoes_treino = arvore.predict(x_treino)
previsoes_treino

array([1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1,
       1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0,
       1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1,
       0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0,
       1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1,
       1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1,
       1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1,

In [143]:
accuracy_score(y_treino, previsoes_treino)

0.8517940717628705

In [144]:
confusion_matrix(y_treino, previsoes_treino)

array([[249,  40],
       [ 55, 297]], dtype=int64)

In [145]:
# Validação Cruzada
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [146]:
# Separando os dados em folds
kfold = KFold(n_splits=30, shuffle=True, random_state=5)

In [147]:
# Criando o modelo
modelo = DecisionTreeClassifier(criterion='gini', random_state=0, max_depth=3)
resultado = cross_val_score(modelo, previsores3_esc, alvo, cv=kfold)
resultado

array([0.90322581, 0.80645161, 0.90322581, 0.87096774, 0.83870968,
       0.83870968, 0.67741935, 0.77419355, 0.80645161, 0.80645161,
       0.77419355, 0.87096774, 0.93548387, 0.80645161, 0.90322581,
       0.87096774, 0.80645161, 0.93333333, 0.8       , 0.8       ,
       0.73333333, 0.86666667, 0.8       , 0.76666667, 0.83333333,
       0.9       , 0.86666667, 0.86666667, 0.73333333, 0.86666667])

In [148]:
# Usando a média e o desvio padrão
print(f'Acurácia Média: {resultado.mean()*100:.2f}%')

Acurácia Média: 83.20%


# 55. Random Forest: teoria

In [149]:
# Random Forest (Floresta Aleatória)
"""
> Criação aleatória de várias árvores de decisão;
> Utiliza o método Ensemble (construção de vários modelos para obter um resultado único);
> É mais robusto, complexo e normalmente propicia resultados melhores, mas possui maior custo computacional;
> Em problemas de classificação o resultado que mais aparece será o escolhido (moda), já em regressão será a média;

Vantagens:
> Resultados bastante precisos;
> Normalmente não necessitam de preparações sofisticadas nos dados (label Encoder e OneHotEncoder);
> Trabalha com valores faltantes, variáveis categóricas e numéricas;
> Pouca probabilidade de ocorrência de overfitting;

Desvantagens:
> Velocidade de processamento relativamente baixa;
> Difícil interpretação de como chegou no resultado;
"""

'\n> Criação aleatória de várias árvores de decisão;\n> Utiliza o método Ensemble (construção de vários modelos para obter um resultado único);\n> É mais robusto, complexo e normalmente propicia resultados melhores, mas possui maior custo computacional;\n> Em problemas de classificação o resultado que mais aparece será o escolhido (moda), já em regressão será a média;\n\nVantagens:\n> Resultados bastante precisos;\n> Normalmente não necessitam de preparações sofisticadas nos dados (label Encoder e OneHotEncoder);\n> Trabalha com valores faltantes, variáveis categóricas e numéricas;\n> Pouca probabilidade de ocorrência de overfitting;\n\nDesvantagens:\n> Velocidade de processamento relativamente baixa;\n> Difícil interpretação de como chegou no resultado;\n'

# 56. Random Forest no Python

In [150]:
# Random Forest
"""https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html"""

'https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html'

In [151]:
from sklearn.ensemble import RandomForestClassifier

In [152]:
random_forest = RandomForestClassifier(n_estimators=150, criterion='entropy', random_state=0, max_depth=4)
random_forest.fit(x_treino, y_treino)

In [153]:
previsoes_random_forest = random_forest.predict(x_teste)
previsoes_random_forest

array([1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1,
       1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1,
       1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1,
       0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1,
       1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0,
       1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1,
       1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1], dtype=int64)

In [154]:
y_teste

array([1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1,
       1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1,
       1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1,
       0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0,
       0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0,
       1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1], dtype=int64)

In [155]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [156]:
print(f'''Acurácia: {accuracy_score(y_teste, previsoes_random_forest)*100:.2f}%''')

Acurácia: 86.23%


In [157]:
confusion_matrix(y_teste, previsoes_random_forest)

array([[102,  19],
       [ 19, 136]], dtype=int64)

In [158]:
print(classification_report(y_teste, previsoes_random_forest))

              precision    recall  f1-score   support

           0       0.84      0.84      0.84       121
           1       0.88      0.88      0.88       155

    accuracy                           0.86       276
   macro avg       0.86      0.86      0.86       276
weighted avg       0.86      0.86      0.86       276


In [159]:
# Análise dados de treino
previsoes_treino = random_forest.predict(x_treino)
previsoes_treino

array([1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1,
       1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0,
       1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1,
       0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0,
       1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1,
       1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1,

In [160]:
accuracy_score(y_treino, previsoes_treino)

0.890795631825273

In [161]:
confusion_matrix(y_treino, previsoes_treino)

array([[241,  48],
       [ 22, 330]], dtype=int64)

In [162]:
# Validação Cruzada
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [163]:
# Separando os dados em folds
kfold = KFold(n_splits=30, shuffle=True, random_state=5)

In [164]:
# Criando o modelo
modelo = RandomForestClassifier(n_estimators=150, criterion='entropy', random_state=0, max_depth=4)
resultado = cross_val_score(modelo, previsores, alvo, cv=kfold)
resultado

array([0.93548387, 0.90322581, 0.90322581, 0.83870968, 0.90322581,
       0.83870968, 0.74193548, 0.80645161, 0.83870968, 0.80645161,
       0.83870968, 0.83870968, 0.96774194, 0.87096774, 0.87096774,
       0.90322581, 0.80645161, 0.96666667, 0.9       , 0.83333333,
       0.76666667, 0.86666667, 0.83333333, 0.66666667, 0.86666667,
       0.93333333, 0.86666667, 0.86666667, 0.83333333, 0.86666667])

In [165]:
# Usando a média e o desvio padrão
print(f'Acurácia Média: {resultado.mean()*100:.2f}%')

Acurácia Média: 85.60%


# 57. XGBoost: Teoria

In [166]:
# XGBOOST (eXtreme Gradient Boosting)
"""
> Algoritmo poderoso baseado em árvores de decisão;
> É uma evolução do algoritmo Gradient Boosting, que por sua vez é uma evolução do Random Forest;
> Apresenta aderência a grande variedade de aplicações;
> Roda em várias linguagens de programação, nos principais sistemas operacionais e em nuvem;
"""

'\n> Algoritmo poderoso baseado em árvores de decisão;\n> É uma evolução do algoritmo Gradient Boosting, que por sua vez é uma evolução do Random Forest;\n> Apresenta aderência a grande variedade de aplicações;\n> Roda em várias linguagens de programação, nos principais sistemas operacionais e em nuvem;\n'

# 58. XGBoost no Python

In [167]:
# XGBOOST
"""https//xgboost.readthedocs.io/en/stable/"""

'https//xgboost.readthedocs.io/en/stable/'

In [168]:
from xgboost import XGBClassifier

In [169]:
xg = XGBClassifier(max_depth=2, learning_rate=0.05, n_estimators=250, objective='binary:logistic', random_state=3)
xg.fit(x_treino, y_treino)

In [170]:
previsoes_xg = xg.predict(x_teste)
previsoes_xg

array([1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1,
       1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1,
       1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1,
       0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0,
       1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0,
       1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1,
       1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1])

In [171]:
y_teste

array([1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1,
       1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1,
       1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1,
       0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0,
       0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0,
       1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1], dtype=int64)

In [172]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [173]:
print(f'''Acurácia: {accuracy_score(y_teste, previsoes_xg)*100:.2f}%''')

Acurácia: 87.68%


In [174]:
confusion_matrix(y_teste, previsoes_xg)

array([[104,  17],
       [ 17, 138]], dtype=int64)

In [175]:
print(classification_report(y_teste, previsoes_xg))

              precision    recall  f1-score   support

           0       0.86      0.86      0.86       121
           1       0.89      0.89      0.89       155

    accuracy                           0.88       276
   macro avg       0.87      0.87      0.87       276
weighted avg       0.88      0.88      0.88       276


In [176]:
# Análise dados de treino
previsoes_treino = xg.predict(x_treino)
previsoes_treino

array([1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1,
       1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0,
       1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1,
       0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1,
       0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0,
       1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0,
       1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1,
       1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1,

In [177]:
accuracy_score(y_treino, previsoes_treino)

0.9157566302652106

In [178]:
confusion_matrix(y_treino, previsoes_treino)

array([[254,  35],
       [ 19, 333]], dtype=int64)

In [179]:
# Validação Cruzada
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [180]:
# Separando os dados em folds
kfold = KFold(n_splits=30, shuffle=True, random_state=5)

In [181]:
# Criando o modelo
modelo = XGBClassifier(max_depth=2, learning_rate=0.05, n_estimators=250, objective='binary:logistic', random_state=3)
resultado = cross_val_score(modelo, previsores, alvo, cv=kfold)
resultado

array([0.87096774, 0.93548387, 0.83870968, 0.90322581, 0.93548387,
       0.87096774, 0.80645161, 0.87096774, 0.83870968, 0.80645161,
       0.83870968, 0.83870968, 0.96774194, 0.90322581, 0.87096774,
       0.90322581, 0.83870968, 0.96666667, 0.93333333, 0.93333333,
       0.76666667, 0.83333333, 0.9       , 0.7       , 0.86666667,
       0.96666667, 0.86666667, 0.86666667, 0.83333333, 0.86666667])

In [182]:
# Usando a média e o desvio padrão
print(f'Acurácia Média: {resultado.mean()*100:.2f}%')

Acurácia Média: 87.13%


# 59. Light GBM: teoria

In [183]:
# Light Gradient Boosting Machine
"""
> Baseado em árvores de decisão;
> É uma evolução do Gradient Boosting e do Random Forest;
> Alta velocidade de processamento;
> No LGBM a árvore cresce em folhas ao invés de níveis;

Observações:
> O LGBM é feito para trabalhar com grandes quantidades de dados.
> Possui mais de 100 hiperparâmetros;
"""

'\n> Baseado em árvores de decisão;\n> É uma evolução do Gradient Boosting e do Random Forest;\n> Alta velocidade de processamento;\n> No LGBM a árvore cresce em folhas ao invés de níveis;\n\nObservações:\n> O LGBM é feito para trabalhar com grandes quantidades de dados.\n> Possui mais de 100 hiperparâmetros;\n'

# 60. LGBM no Python

In [352]:
# LGMB
"""https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.train.html"""

'https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.train.html'

In [353]:
import lightgbm as lgb

In [366]:
# Dataset para treino
dataset = lgb.Dataset(x_treino, label=y_treino)

In [367]:
# Parâmetros
parametros = {
    'num_leaves':250,
    'objective':'binary',
    'max_depth':2,
    'learning_rate':0.05,
    'max_bin':250
}

In [368]:
lgbm = lgb.train(parametros, dataset, num_boost_round=150)

[LightGBM] [Info] Number of positive: 352, number of negative: 289
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000229 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 354
[LightGBM] [Info] Number of data points in the train set: 641, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.549142 -> initscore=0.197204
[LightGBM] [Info] Start training from score 0.197204


In [369]:
"""# Marcação de tempo de execução
from datetime import datetime
inicio = datetime.now()
lgbm = lgb.train(parametros, dataset)
fim = datetime.now()
print(f'Tempo de execução: {fim - inicio}')"""

"# Marcação de tempo de execução\nfrom datetime import datetime\ninicio = datetime.now()\nlgbm = lgb.train(parametros, dataset)\nfim = datetime.now()\nprint(f'Tempo de execução: {fim - inicio}')"

In [370]:
previsoes_lgbm = lgbm.predict(x_teste)
previsoes_lgbm

array([0.89657674, 0.79376542, 0.96611784, 0.0345637 , 0.13164081,
       0.32808911, 0.42011256, 0.02537998, 0.05898933, 0.08452429,
       0.89221534, 0.90140987, 0.96633094, 0.9429581 , 0.41218974,
       0.90669656, 0.66820624, 0.92499868, 0.90940891, 0.16515864,
       0.9006004 , 0.93603904, 0.74183747, 0.22011269, 0.04246358,
       0.02223214, 0.94077573, 0.97032438, 0.59886293, 0.61467825,
       0.94414578, 0.04762212, 0.07400267, 0.0670975 , 0.78816801,
       0.09527251, 0.17931325, 0.05683303, 0.66382839, 0.01511374,
       0.02193676, 0.92546141, 0.92516916, 0.95050782, 0.95207891,
       0.07706578, 0.01166637, 0.90214505, 0.37468361, 0.96673338,
       0.90973708, 0.14357247, 0.10166874, 0.40031337, 0.9598319 ,
       0.14179928, 0.08635226, 0.13797462, 0.55797668, 0.10313952,
       0.94889437, 0.8967696 , 0.5851622 , 0.13936401, 0.95897641,
       0.62845591, 0.95952399, 0.30669151, 0.91749655, 0.13936401,
       0.87882209, 0.88958389, 0.31503401, 0.9598319 , 0.93522

In [371]:
previsoes_lgbm.shape

(276,)

In [372]:
# Quando for menor que 5 considera 0 e quando for maior ou igual a 5 considera 1
for i in range(0, 276):
    if previsoes_lgbm[i] >= 0.5:
        previsoes_lgbm[i] = 1
    else:
        previsoes_lgbm[i] = 0

In [373]:
previsoes_lgbm

array([1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 0., 1., 1.,
       1., 1., 0., 1., 1., 1., 0., 0., 0., 1., 1., 1., 1., 1., 0., 0., 0.,
       1., 0., 0., 0., 1., 0., 0., 1., 1., 1., 1., 0., 0., 1., 0., 1., 1.,
       0., 0., 0., 1., 0., 0., 0., 1., 0., 1., 1., 1., 0., 1., 1., 1., 0.,
       1., 0., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 1.,
       1., 1., 1., 1., 0., 1., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 1.,
       1., 1., 0., 1., 0., 0., 0., 1., 0., 1., 1., 1., 0., 1., 0., 1., 1.,
       1., 1., 1., 1., 0., 1., 1., 0., 1., 0., 1., 0., 1., 1., 1., 0., 0.,
       0., 1., 1., 0., 0., 1., 0., 1., 0., 0., 1., 1., 1., 0., 0., 1., 1.,
       0., 1., 0., 1., 1., 1., 0., 1., 1., 1., 0., 1., 0., 1., 0., 1., 0.,
       1., 1., 0., 0., 1., 0., 1., 0., 0., 1., 1., 1., 0., 1., 1., 1., 0.,
       1., 1., 0., 1., 0., 1., 1., 1., 1., 0., 0., 1., 0., 1., 1., 0., 1.,
       1., 0., 0., 0., 1., 0., 1., 1., 0., 1., 0., 0., 1., 0., 1., 1., 1.,
       0., 0., 1., 1., 1.

In [374]:
y_teste

array([1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1,
       1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1,
       1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1,
       0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0,
       0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0,
       1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1], dtype=int64)

In [375]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [376]:
print(f'''Acurácia: {accuracy_score(y_teste, previsoes_lgbm)*100:.2f}%''')

Acurácia: 87.32%


In [377]:
confusion_matrix(y_teste, previsoes_lgbm)

array([[104,  17],
       [ 18, 137]], dtype=int64)

In [378]:
print(classification_report(y_teste, previsoes_lgbm))

              precision    recall  f1-score   support

           0       0.85      0.86      0.86       121
           1       0.89      0.88      0.89       155

    accuracy                           0.87       276
   macro avg       0.87      0.87      0.87       276
weighted avg       0.87      0.87      0.87       276


In [379]:
# Análise dados de treino
previsoes_treino = lgbm.predict(x_treino)
previsoes_treino

array([0.91322533, 0.46783483, 0.93277683, 0.87912667, 0.79091118,
       0.67168101, 0.12526164, 0.01399622, 0.05855133, 0.06401418,
       0.91401951, 0.04033972, 0.84519125, 0.77457595, 0.90740634,
       0.00679826, 0.02648392, 0.16975228, 0.28564554, 0.9429581 ,
       0.3296561 , 0.68937129, 0.90380512, 0.87293038, 0.772692  ,
       0.1023139 , 0.15520584, 0.84926588, 0.62665664, 0.91199231,
       0.96305324, 0.00679826, 0.90222525, 0.96363648, 0.0443194 ,
       0.25524952, 0.7945205 , 0.6084711 , 0.03649648, 0.87081326,
       0.09244642, 0.93825883, 0.08784896, 0.08940363, 0.93407525,
       0.96476373, 0.08899818, 0.94439448, 0.95059933, 0.83084074,
       0.50661071, 0.08533556, 0.80833912, 0.29979011, 0.95120529,
       0.36930553, 0.89667239, 0.03281802, 0.86306095, 0.07242802,
       0.04057756, 0.04994571, 0.59371247, 0.01399622, 0.92910388,
       0.85988102, 0.0778941 , 0.57920137, 0.9838597 , 0.00859929,
       0.5489168 , 0.92336416, 0.96830816, 0.96304375, 0.04182

In [380]:
previsoes_treino.shape

(641,)

In [381]:
# Quando for menor que 5 considera 0 e quando for maior ou igual a 5 considera 1
for i in range(0, 641):
    if previsoes_treino[i] >= 0.5:
        previsoes_treino[i] = 1
    else:
        previsoes_treino[i] = 0

In [382]:
previsoes_treino

array([1., 0., 1., 1., 1., 1., 0., 0., 0., 0., 1., 0., 1., 1., 1., 0., 0.,
       0., 0., 1., 0., 1., 1., 1., 1., 0., 0., 1., 1., 1., 1., 0., 1., 1.,
       0., 0., 1., 1., 0., 1., 0., 1., 0., 0., 1., 1., 0., 1., 1., 1., 1.,
       0., 1., 0., 1., 0., 1., 0., 1., 0., 0., 0., 1., 0., 1., 1., 0., 1.,
       1., 0., 1., 1., 1., 1., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 0.,
       1., 1., 1., 1., 1., 0., 1., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1.,
       0., 1., 1., 1., 1., 0., 1., 1., 0., 0., 1., 0., 1., 0., 1., 0., 1.,
       0., 0., 0., 0., 1., 0., 1., 1., 0., 1., 1., 1., 0., 1., 1., 0., 0.,
       1., 0., 1., 1., 0., 1., 1., 1., 1., 1., 0., 0., 0., 1., 1., 0., 1.,
       0., 1., 0., 1., 1., 1., 0., 1., 0., 0., 0., 1., 0., 1., 0., 0., 0.,
       0., 1., 1., 0., 1., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 1., 1.,
       1., 1., 1., 0., 0., 1., 0., 0., 1., 1., 1., 1., 1., 0., 0., 0., 1.,
       1., 1., 1., 1., 0., 0., 1., 0., 1., 1., 1., 1., 1., 0., 0., 1., 1.,
       0., 0., 1., 1., 1.

In [383]:
accuracy_score(y_treino, previsoes_treino)

0.9017160686427457

In [384]:
confusion_matrix(y_treino, previsoes_treino)

array([[246,  43],
       [ 20, 332]], dtype=int64)

In [385]:
# Validação Cruzada
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [386]:
# Separando os dados em folds
kfold = KFold(n_splits =30, shuffle=True, random_state=5)

In [387]:
# Criando o modelo
modelo = lgb.LGBMClassifier(num_leaves=250, objective='binary', max_depth=2, learning_rate=0.05, max_bin=250)
resultado = cross_val_score(modelo, previsores, alvo, cv=kfold)
resultado

[LightGBM] [Info] Number of positive: 489, number of negative: 397
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000093 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 395
[LightGBM] [Info] Number of data points in the train set: 886, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.551919 -> initscore=0.208426
[LightGBM] [Info] Start training from score 0.208426
[LightGBM] [Info] Number of positive: 490, number of negative: 396
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000120 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 394
[LightGBM] [Info] Number of data points in the train set: 886, number of used features: 11
[LightGBM] [Info] [binary:BoostFro

array([0.87096774, 0.93548387, 0.87096774, 0.80645161, 0.93548387,
       0.87096774, 0.74193548, 0.87096774, 0.80645161, 0.83870968,
       0.87096774, 0.83870968, 0.96774194, 0.90322581, 0.87096774,
       0.87096774, 0.80645161, 0.96666667, 0.93333333, 0.9       ,
       0.8       , 0.83333333, 0.83333333, 0.66666667, 0.83333333,
       0.96666667, 0.86666667, 0.86666667, 0.83333333, 0.86666667])

In [388]:
# Usando a média e o desvio padrão
print(f'Acurácia Média: {resultado.mean()*100:.2f}%')

Acurácia Média: 86.15%
