In [28]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose       import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

In [2]:
df = pd.read_csv('/content/drive/MyDrive/2024 1/ML/Course/1/heart_tratado.csv')

In [None]:
df.head()

In [None]:
df.shape

(917, 12)

# Tratamento nominal para categórico (Manual)

In [3]:
df2 = pd.DataFrame.copy(df)

In [4]:
df2['Sex'].replace({'M': 0, 'F': 1}, inplace=True)
df2['ChestPainType'].replace({'TA': 0, 'ATA': 1, 'NAP': 2, 'ASY': 3}, inplace=True)
df2['RestingECG'].replace({'Normal': 0, 'ST': 1, 'LVH': 2}, inplace=True)
df2['ExerciseAngina'].replace({'N': 0, 'Y': 1}, inplace=True)
df2['ST_Slope'].replace({'Up': 0, 'Flat':1, 'Down': 2}, inplace=True)

In [None]:
df2.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,0,1,140,289.0,0,0,172,0,0.0,0,0
1,49,1,2,160,180.0,0,0,156,0,1.0,1,1
2,37,0,1,130,283.0,0,1,98,0,0.0,0,0
3,48,1,3,138,214.0,0,0,108,1,1.5,1,1
4,54,0,2,150,195.0,0,0,122,0,0.0,0,0


In [None]:
df2.dtypes

Age                 int64
Sex                 int64
ChestPainType       int64
RestingBP           int64
Cholesterol       float64
FastingBS           int64
RestingECG          int64
MaxHR               int64
ExerciseAngina      int64
Oldpeak           float64
ST_Slope            int64
HeartDisease        int64
dtype: object

# Atributos Previsores

In [7]:
previsores = df2.drop('HeartDisease', axis=1).values
# previsores = df2.iloc[:, 0:11].values

In [8]:
previsores

array([[40. ,  0. ,  1. , ...,  0. ,  0. ,  0. ],
       [49. ,  1. ,  2. , ...,  0. ,  1. ,  1. ],
       [37. ,  0. ,  1. , ...,  0. ,  0. ,  0. ],
       ...,
       [57. ,  0. ,  3. , ...,  1. ,  1.2,  1. ],
       [57. ,  1. ,  1. , ...,  0. ,  0. ,  1. ],
       [38. ,  0. ,  2. , ...,  0. ,  0. ,  0. ]])

In [9]:
alvo = df2['HeartDisease'].values
# alvo = df2.iloc[:, 11] #pandas.core.series.Series

In [10]:
alvo

array([0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0,
       1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1,
       1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,

# Análise das escalas dos atributos

In [11]:
df2.describe()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
count,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0
mean,53.509269,0.210469,2.251908,132.540894,244.635389,0.23337,0.604144,136.789531,0.40458,0.886696,0.63795,0.55289
std,9.437636,0.407864,0.931502,17.999749,53.347125,0.423206,0.806161,25.467129,0.491078,1.06696,0.60727,0.497466
min,28.0,0.0,0.0,80.0,85.0,0.0,0.0,60.0,0.0,-2.6,0.0,0.0
25%,47.0,0.0,2.0,120.0,214.0,0.0,0.0,120.0,0.0,0.0,0.0,0.0
50%,54.0,0.0,3.0,130.0,244.635389,0.0,0.0,138.0,0.0,0.6,1.0,1.0
75%,60.0,0.0,3.0,140.0,267.0,0.0,1.0,156.0,1.0,1.5,1.0,1.0
max,77.0,1.0,3.0,200.0,603.0,1.0,2.0,202.0,1.0,6.2,2.0,1.0


In [12]:
previsores_esc = StandardScaler().fit_transform(previsores)

In [13]:
previsores_df = pd.DataFrame(previsores_esc)
previsores_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,-1.432206,-0.516309,-1.344701,0.414627,0.832075,-0.551733,-0.749818,1.383339,-0.82431,-0.831502,-1.051095
1,-0.478057,1.936826,-0.27058,1.52636,-1.212261,-0.551733,-0.749818,0.754736,-0.82431,0.106251,0.596519
2,-1.750256,-0.516309,-1.344701,-0.14124,0.719543,-0.551733,0.491306,-1.523953,-0.82431,-0.831502,-1.051095
3,-0.584074,1.936826,0.803541,0.303453,-0.574578,-0.551733,-0.749818,-1.131075,1.213136,0.575128,0.596519
4,0.052026,-0.516309,-0.27058,0.970493,-0.930931,-0.551733,-0.749818,-0.581047,-0.82431,-0.831502,-1.051095


In [14]:
previsores_df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
count,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0
mean,1.859654e-16,7.748558e-18,1.046055e-16,7.767929e-16,-1.86934e-16,4.649135e-17,0.0,-5.114048e-16,-1.046055e-16,7.748558000000001e-17,-3.8742790000000005e-17
std,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546
min,-2.704405,-0.5163086,-2.418822,-2.920572,-2.994023,-0.5517333,-0.749818,-3.016886,-0.8243101,-3.269662,-1.051095
25%,-0.6900904,-0.5163086,-0.2705801,-0.6971063,-0.5745784,-0.5517333,-0.749818,-0.6596226,-0.8243101,-0.8315022,-1.051095
50%,0.05202558,-0.5163086,0.803541,-0.1412398,0.0,-0.5517333,-0.749818,0.04755658,-0.8243101,-0.26885,0.5965186
75%,0.688125,-0.5163086,0.803541,0.4146267,0.4194568,-0.5517333,0.491306,0.7547357,1.213136,0.5751284,0.5965186
max,2.490407,1.936826,0.803541,3.749826,6.721265,1.81247,1.73243,2.561971,1.213136,4.982571,2.244132


# LabelEncoder
## transformação de variáveis categóricas em numéricas

In [15]:
previsores2 = df.copy()

In [16]:
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289.0,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180.0,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283.0,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214.0,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195.0,0,Normal,122,N,0.0,Up,0


In [17]:
# previsores2.drop('Sex', axis=1)
previsores2['Sex'] = LabelEncoder().fit_transform(df['Sex'])
previsores2['ChestPainType'] = LabelEncoder().fit_transform(df['ChestPainType'])
previsores2['RestingECG'] = LabelEncoder().fit_transform(df['RestingECG'])
previsores2['ExerciseAngina'] = LabelEncoder().fit_transform(df['ExerciseAngina'])
previsores2['ST_Slope'] = LabelEncoder().fit_transform(df['ST_Slope'])

In [18]:
previsores2.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,1,1,140,289.0,0,1,172,0,0.0,2,0
1,49,0,2,160,180.0,0,1,156,0,1.0,1,1
2,37,1,1,130,283.0,0,2,98,0,0.0,2,0
3,48,0,0,138,214.0,0,1,108,1,1.5,1,1
4,54,1,2,150,195.0,0,1,122,0,0.0,2,0


In [19]:
previsores2.shape

(917, 12)

## ColumnTransformer

In [20]:
previsores2.drop('HeartDisease', axis=1, inplace=True)

In [21]:
previsores3 = ColumnTransformer(
    transformers=[('OneHot', OneHotEncoder(), ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope'])],
    remainder='passthrough', verbose=True
    ).fit_transform(previsores2)

[ColumnTransformer] ........ (1 of 2) Processing OneHot, total=   0.0s
[ColumnTransformer] ..... (2 of 2) Processing remainder, total=   0.0s


In [22]:
pd.DataFrame(previsores3).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,40.0,140.0,289.0,0.0,172.0,0.0
1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,49.0,160.0,180.0,0.0,156.0,1.0
2,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,37.0,130.0,283.0,0.0,98.0,0.0
3,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,48.0,138.0,214.0,0.0,108.0,1.5
4,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,54.0,150.0,195.0,0.0,122.0,0.0


In [23]:
previsores3.shape

(917, 20)

## Escalonamento Again

In [24]:
previsores3_esc = StandardScaler().fit_transform(previsores3)

In [25]:
previsores3_df = pd.DataFrame(previsores3_esc)

In [26]:
previsores3_df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
count,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0
mean,1.084798e-16,-1.472226e-16,1.937139e-17,-3.8742790000000005e-17,3.8742790000000005e-17,6.973702000000001e-17,0.0,-9.298269e-17,1.549712e-17,-4.2617070000000006e-17,4.2617070000000006e-17,8.523413e-17,0.0,-3.8742790000000005e-17,1.859654e-16,7.884157e-16,3.014189e-15,-1.549712e-17,-5.114048e-16,-1.859654e-16
std,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546
min,-0.5163086,-1.936826,-1.085425,-0.4822104,-0.5315237,-0.2298105,-0.507826,-1.226974,-0.490781,-1.213136,-0.8243101,-0.2716072,-1.001091,-0.8698879,-2.704405,-2.920572,-2.994023,-0.5517333,-3.016886,-3.269662
25%,-0.5163086,0.5163086,-1.085425,-0.4822104,-0.5315237,-0.2298105,-0.507826,-1.226974,-0.490781,-1.213136,-0.8243101,-0.2716072,-1.001091,-0.8698879,-0.6900904,-0.6971063,-0.5745784,-0.5517333,-0.6596226,-0.8315022
50%,-0.5163086,0.5163086,0.9212982,-0.4822104,-0.5315237,-0.2298105,-0.507826,0.8150134,-0.490781,0.8243101,-0.8243101,-0.2716072,0.99891,-0.8698879,0.05202558,-0.1412398,3.19836e-15,-0.5517333,0.04755658,-0.26885
75%,-0.5163086,0.5163086,0.9212982,-0.4822104,-0.5315237,-0.2298105,-0.507826,0.8150134,-0.490781,0.8243101,1.213136,-0.2716072,0.99891,1.149573,0.688125,0.4146267,0.4194568,-0.5517333,0.7547357,0.5751284
max,1.936826,0.5163086,0.9212982,2.073784,1.881384,4.351412,1.969177,0.8150134,2.037569,0.8243101,1.213136,3.681787,0.99891,1.149573,2.490407,3.749826,6.721265,1.81247,2.561971,4.982571


# Treino & Teste

**alvo** = variável que se pretende atingir (tem ou não doença cardíaca).

**previsores** = conjunto de variáveis previsoras com as variáveis categóricas transformadas em numéricas manualmente, sem escalonar.

**previsores_esc** = conjunto de variáveis previsoras com as variáveis categóricas transformadas em numéricas, escalonada.

**previsores2** = conjunto de variáveis previsoras com as variáveis categóricas transformadas em numéricas pelo labelencoder.

**previsores3** = conjunto de variáveis previsoras transformadas pelo labelencoder e onehotencoder, sem escalonar.

**previsores3_esc** = conjunto de variáveis previsoras transformadas pelo labelencoder e onehotencoder escalonada.

In [34]:
X_train, X_test, y_train, y_test = train_test_split(
    previsores3_esc, alvo, test_size=0.3, random_state=0)

In [None]:
y_train

array([1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0,
       1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1,
       0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0,
       1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0,
       1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1,
       1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0,
       1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0,
       1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1,

# NAIVE BAYES

In [None]:
naive = GaussianNB()
naive.fit(X_train, y_train)

In [None]:
previsoes_naive = naive.predict(X_test)
previsoes_naive

array([1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1,
       1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0,
       1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1,
       1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0])

In [None]:
# accuracy_score(y_test, previsoes_naive)
accuracy_score(y_test, naive.predict(X_test))

0.8442028985507246

In [None]:
confusion_matrix(y_test, previsoes_naive)

array([[102,  19],
       [ 24, 131]])

In [None]:
print(classification_report(y_test, previsoes_naive))

              precision    recall  f1-score   support

           0       0.81      0.84      0.83       121
           1       0.87      0.85      0.86       155

    accuracy                           0.84       276
   macro avg       0.84      0.84      0.84       276
weighted avg       0.85      0.84      0.84       276



## Dados de Treino

In [None]:
previsoes_treino = naive.predict(X_train)

In [None]:
accuracy_score(y_train, previsoes_treino)

0.8361934477379095

In [None]:
confusion_matrix(y_train, previsoes_treino)

array([[243,  46],
       [ 59, 293]])

## Validação Cruzada

In [None]:
#Separando os dados em folds
kfold = KFold(n_splits=30, shuffle=True, random_state=5)

In [None]:
#CCriando o modelo
modelo = GaussianNB()
resultado = cross_val_score(modelo, previsores3_esc, alvo, cv=kfold)
resultado

array([0.87096774, 0.87096774, 0.90322581, 0.80645161, 0.77419355,
       0.90322581, 0.70967742, 0.87096774, 0.77419355, 0.87096774,
       0.90322581, 0.77419355, 0.93548387, 0.90322581, 0.87096774,
       0.87096774, 0.87096774, 0.93333333, 0.93333333, 0.9       ,
       0.76666667, 0.83333333, 0.8       , 0.7       , 0.83333333,
       0.93333333, 0.86666667, 0.9       , 0.86666667, 0.8       ])

In [None]:
#média e desvio padrão
resultado.mean()

0.8516845878136201

Nas validações cruzadas, os resultados não podem diferir muito comparado com a acurácia do modelo.
No caso do modelo, a acurácia foi de *85,42%*
No caso da validação cruzada, a acurácia foi de *85,16*

# SVM

In [35]:
svm = SVC(kernel='rbf', C=2, random_state=1)

In [36]:
svm.fit(X_train, y_train)

In [38]:
previsoes_svm = svm.predict(X_train)

In [39]:
accuracy_score(y_train, previsoes_svm)

0.9282371294851794

In [42]:
confusion_matrix(y_train, previsoes_svm)

array([[257,  32],
       [ 14, 338]])

In [44]:
print(classification_report(y_train, previsoes_svm))

              precision    recall  f1-score   support

           0       0.95      0.89      0.92       289
           1       0.91      0.96      0.94       352

    accuracy                           0.93       641
   macro avg       0.93      0.92      0.93       641
weighted avg       0.93      0.93      0.93       641



## Análise dados de treino

In [45]:
previsoes_treino = svm.predict(X_train)

In [46]:
accuracy_score(y_train, previsoes_treino)

0.9282371294851794

In [47]:
confusion_matrix(y_train, previsoes_treino)

array([[257,  32],
       [ 14, 338]])

## Validação Cruzada

In [48]:
kfold = KFold(n_splits=30, shuffle=True, random_state=5)

In [50]:
modelo = SVC(kernel='rbf', C=2, random_state=1)
result = cross_val_score(modelo, previsores3_esc, alvo, cv=kfold)
result

array([0.96774194, 0.83870968, 0.87096774, 0.90322581, 0.83870968,
       0.87096774, 0.70967742, 0.87096774, 0.80645161, 0.80645161,
       0.87096774, 0.80645161, 0.96774194, 0.90322581, 0.80645161,
       0.83870968, 0.80645161, 0.93333333, 0.86666667, 0.93333333,
       0.83333333, 0.86666667, 0.86666667, 0.63333333, 0.83333333,
       0.93333333, 0.9       , 0.86666667, 0.9       , 0.86666667])

In [51]:
result.mean()

0.8572401433691755