In [46]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

Carregando o dataframe

In [47]:
df = pd.read_csv('data/healthcare-dataset-stroke-data.csv')
df.dropna(inplace=True)
df.replace(0, -1, inplace=True)
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,-1,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
2,31112,Male,80.0,-1,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,-1,-1,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,-1,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,56669,Male,81.0,-1,-1,Yes,Private,Urban,186.21,29.0,formerly smoked,1


Convertendo os dados categóricos em numéricos

In [48]:
# bmi
# https://www.cdc.gov/obesity/basics/adult-defining.html#:~:text=If%20your%20BMI%20is%20less,falls%20within%20the%20obesity%20range.
bmi = df['bmi']

df['underweight'] = bmi< 18.5
df['healthy_weight'] = (18.5 <= bmi) & (bmi < 25)
df['overweight'] = (25 <= bmi) & (bmi < 30)
df['obesity'] = 30 <= bmi


# avg_glucose_level
# https://www.cdc.gov/diabetes/basics/getting-tested.html#:~:text=A%20fasting%20blood%20sugar%20level,higher%20indicates%20you%20have%20diabetes.
glucose = df['avg_glucose_level']
df['normal_glucose_level'] = glucose <= 99
df['prediabetes'] = (100 <= glucose) & (glucose <=125)
df['diabetes'] = 126 <= glucose

# age 
age = df['age']
df['children'] = age <= 12
df['teen'] = (13<=age ) & (age<=19)
df['adult'] = (20 <=age) & (age<=39)
df['middle_age'] = (40<=age) & (age<=59)
df['senior'] = 60 <= age 

df = pd.get_dummies(df, dtype=int)
df.replace(False, -1, inplace=True)
df.replace(True, 1, inplace=True)
df.replace(0, -1, inplace=True)
df.head()


Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,underweight,healthy_weight,overweight,...,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,9046,67.0,-1,1,228.69,36.6,1,-1,-1,-1,...,-1,1,-1,-1,-1,1,-1,1,-1,-1
2,31112,80.0,-1,1,105.92,32.5,1,-1,-1,-1,...,-1,1,-1,-1,1,-1,-1,-1,1,-1
3,60182,49.0,-1,-1,171.23,34.4,1,-1,-1,-1,...,-1,1,-1,-1,-1,1,-1,-1,-1,1
4,1665,79.0,1,-1,174.12,24.0,1,-1,1,-1,...,-1,-1,1,-1,1,-1,-1,-1,1,-1
5,56669,81.0,-1,-1,186.21,29.0,1,-1,-1,1,...,-1,1,-1,-1,-1,1,-1,1,-1,-1


In [49]:
# caracteríticas
X = df.drop(['id', 'stroke', 'age', 'avg_glucose_level', 'bmi'], axis=1)
X = X.astype('float64')

# resultados
y = df['stroke']
y = y.astype('float64')

Separando linhas de dados em treino e teste

In [50]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.5)
X_train,  X_test = X_train.to_numpy().T,  X_test.to_numpy().T
y_train = y_train.to_numpy()
y_test = y_test.to_numpy()


In [51]:
X.head()

Unnamed: 0,hypertension,heart_disease,underweight,healthy_weight,overweight,obesity,normal_glucose_level,prediabetes,diabetes,children,...,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,-1.0,1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,1.0,-1.0,...,-1.0,1.0,-1.0,-1.0,-1.0,1.0,-1.0,1.0,-1.0,-1.0
2,-1.0,1.0,-1.0,-1.0,-1.0,1.0,-1.0,1.0,-1.0,-1.0,...,-1.0,1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,1.0,-1.0
3,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,1.0,-1.0,...,-1.0,1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,1.0
4,1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,...,-1.0,-1.0,1.0,-1.0,1.0,-1.0,-1.0,-1.0,1.0,-1.0
5,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,1.0,-1.0,...,-1.0,1.0,-1.0,-1.0,-1.0,1.0,-1.0,1.0,-1.0,-1.0


In [52]:
def accuracy(y_test, y_est):
    return np.mean(np.sign(y_test)==np.sign(y_est))

In [57]:
import autograd.numpy as np_ 
from autograd import grad

def loss( parametros ):
    w, b, pontos, val = parametros # parametros como vetores
    est = w.T @ pontos + b
    mse = np_.mean( (est - val)**2)

    return mse

g = grad(loss)

pontos = X_train
alvos = y_train

w = np.random.randn(X_train.shape[0],1)
b = 0.0
alpha = 10**-2

for n in range(10**3):
    grad_ = g( (w, b, pontos, alvos) )
    w -= alpha*grad_[0]
    b -= alpha*grad_[1]

y_hat = w.T @ X_test + b
print(accuracy(y_hat, y_test))
print(b)

0.9556008146639511
0.0324857123981846


In [58]:
df_1 =pd.DataFrame()
df_1['features'] = X.columns
df_1['peso'] = w
df_1.sort_values(by='peso', inplace=True, ascending=False)
df_1.head(30)

Unnamed: 0,features,peso
16,gender_Other,2.114014
22,work_type_Self-employed,0.646679
19,work_type_Govt_job,0.643627
21,work_type_Private,0.642554
24,Residence_type_Rural,0.636967
25,Residence_type_Urban,0.636834
23,work_type_children,0.577586
8,diabetes,0.215892
7,prediabetes,0.174891
6,normal_glucose_level,0.162831


In [55]:
df_ = pd.DataFrame()

features = X.columns
w_ordenado = np.sort(w, axis=0).T.tolist()[0]
list_w = w.T.tolist()[0]
ranking = []
peso = []
for i in w_ordenado:
    a = list_w.index(i)
    peso.append(i)
    ranking.append(features[a])
    # print(X.columns[a])
    # print(i)
df_['features'] = ranking[::-1]
df_['peso'] = peso[::-1]
df_.head(10)

Unnamed: 0,features,peso
0,gender_Female,1.01153
1,gender_Male,1.010819
2,work_type_Self-employed,0.452527
3,work_type_Govt_job,0.451395
4,work_type_Private,0.439479
5,smoking_status_formerly smoked,0.410044
6,smoking_status_smokes,0.409132
7,smoking_status_never smoked,0.407052
8,smoking_status_Unknown,0.40163
9,children,0.355718


In [56]:
from sklearn.metrics import accuracy_score

print(y_hat[0].shape, y_test.shape)
print(accuracy_score(y_test, y_hat[0]))

(2455,) (2455,)


ValueError: Classification metrics can't handle a mix of binary and continuous targets