Importando bibliotecas

In [64]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
import numpy as np

Carregando o dataframe

In [65]:
df = pd.read_csv('data/healthcare-dataset-stroke-data.csv')
df.dropna(inplace=True)
df.replace(0, -1, inplace=True)
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,-1,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
2,31112,Male,80.0,-1,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,-1,-1,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,-1,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,56669,Male,81.0,-1,-1,Yes,Private,Urban,186.21,29.0,formerly smoked,1


Convertendo os dados categóricos em numéricos

In [66]:
# bmi
# https://www.cdc.gov/obesity/basics/adult-defining.html#:~:text=If%20your%20BMI%20is%20less,falls%20within%20the%20obesity%20range.
bmi = df['bmi']

df['underweight'] = bmi< 18.5
df['healthy_weight'] = (18.5 <= bmi) & (bmi < 25)
df['overweight'] = (25 <= bmi) & (bmi < 30)
df['obesity'] = 30 <= bmi


# avg_glucose_level
# https://www.cdc.gov/diabetes/basics/getting-tested.html#:~:text=A%20fasting%20blood%20sugar%20level,higher%20indicates%20you%20have%20diabetes.
glucose = df['avg_glucose_level']
df['normal_glucose_level'] = glucose <= 99
df['prediabetes'] = (100 <= glucose) & (glucose <=125)
df['diabetes'] = 126 <= glucose

# age 
age = df['age']
df['children'] = age <= 12
df['teen'] = (13<=age ) & (age<=19)
df['adult'] = (20 <=age) & (age<=39)
df['middle_age'] = (40<=age) & (age<=59)
df['senior'] = 60 <= age 

df = pd.get_dummies(df, dtype=float)
df.replace(False, -1, inplace=True)
df.replace(True, 1, inplace=True)
df.replace(0, -1, inplace=True)
df.head(2)


Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,underweight,healthy_weight,overweight,...,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,9046,67.0,-1,1,228.69,36.6,1,-1,-1,-1,...,-1.0,1.0,-1.0,-1.0,-1.0,1.0,-1.0,1.0,-1.0,-1.0
2,31112,80.0,-1,1,105.92,32.5,1,-1,-1,-1,...,-1.0,1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,1.0,-1.0


In [67]:
# caracteríticas
X = df.drop(['id', 'stroke', 'age', 'avg_glucose_level', 'bmi'], axis=1)
X = X.astype('float64')

# resultados
y = df['stroke']
y = y.astype('float64')

Separando linhas de dados em treino e teste

In [68]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.5)
X_train,  X_test = X_train.to_numpy().T,  X_test.to_numpy().T
y_train = y_train.to_numpy()
y_test = y_test.to_numpy()


função de acurácia utilizada

In [84]:
def accuracy(y_test, y_est):
    return np.mean(np.sign(y_test)==np.sign(y_est))

def acuracia_hipotese_nula(y_test):
    sem_avc = np.count_nonzero(y_test == -1)
    com_avc = np.count_nonzero(y_test == 1)
    maior = max(sem_avc, com_avc)
    return maior / len(y_test)

print(acuracia_hipotese_nula(y_test))


def tabela_relevancia(w_):
    df_ = pd.DataFrame()
    df_['features'] = X.columns
    df_['peso'] = w_
    df_.sort_values(by='peso', inplace=True, ascending=False)
    return df_


0.9604887983706721


## Classificação linear

In [98]:
import autograd.numpy as np_ 
from autograd import grad

def loss( parametros ):
    w, b, pontos, val = parametros # parametros como vetores
    est = w.T @ pontos + b
    mse = np_.mean( (est - val)**2)

    return mse

g = grad(loss)

pontos = X_train
alvos = y_train

w = np.random.randn(X_train.shape[0],1)
b = 0.0
alpha = 10**-3

for n in range(10**4):
    grad_ = g( (w, b, pontos, alvos) )
    w -= alpha*grad_[0]
    b -= alpha*grad_[1]

w_linear = w
y_hat = w_linear.T @ X_test + b
acuracia_linear = accuracy(y_hat, y_test)
print(f'Acurácia: {acuracia_linear}')

Acurácia: 0.9446028513238289


In [99]:
df_linear = tabela_relevancia(w_linear)
df_linear.head(10)

Unnamed: 0,features,peso
24,Residence_type_Rural,1.103246
25,Residence_type_Urban,1.098704
9,children,1.097864
10,teen,0.89497
13,senior,0.850422
5,obesity,0.794295
4,overweight,0.791987
12,middle_age,0.77811
3,healthy_weight,0.777993
11,adult,0.772807


## Classificador por árvore

In [94]:
df_rotulo = y_train
df_features = X_train.T

In [95]:
tree = DecisionTreeClassifier(criterion='entropy')
tree.fit(df_features, df_rotulo)

In [74]:
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

# plt.figure( figsize=(20,20) )
# a = plot_tree(tree, feature_names=df_features.columns, fontsize=15, 
#               node_ids=False, impurity=False, filled=True)

In [96]:
w_arvore = tree.feature_importances_
y_est = w_arvore.T @ X_test 
acuracia_linear = accuracy(y_hat, y_test)
print(f'Acurácia: {acuracia_linear}')

Acurácia: 0.9604887983706721


In [97]:
df_arvore = tabela_relevancia(w_arvore)
df_arvore.head()

Unnamed: 0,features,peso
13,senior,0.162902
29,smoking_status_smokes,0.068002
4,overweight,0.06777
25,Residence_type_Urban,0.052239
21,work_type_Private,0.047899


In [77]:
from sklearn.metrics import accuracy_score

y_pred = [0, 2, 1, 3]
y_true = [0, 1, 2, 3]
print(accuracy_score(y_true, y_pred))
print(y_hat[0].shape, y_test.shape)
print(accuracy_score(y_test, y_hat[0]))

0.5
(2455,) (2455,)


ValueError: Classification metrics can't handle a mix of binary and continuous targets