# 0.0 - Carregando os dados

Importamos as bibliotecas necessárias para o notebook e carregamos a base de dados tratada

In [1]:
import pandas as pd
import pickle
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score

In [2]:
df = pd.read_csv('../data/data_clean.csv', sep=',')
print(df.shape)
df.head()

(43400, 11)


Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,1,3.0,0,0,0,4,0,95.12,18.0,1,0
1,1,58.0,1,0,1,2,1,87.96,39.2,1,0
2,0,8.0,0,0,0,2,1,110.89,17.6,1,0
3,0,70.0,0,0,1,2,0,69.04,35.9,0,0
4,1,14.0,0,0,0,1,0,161.28,19.1,1,0


In [3]:
df.describe()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
count,43400.0,43400.0,43400.0,43400.0,43400.0,43400.0,43400.0,43400.0,43400.0,43400.0,43400.0
mean,0.408894,42.217894,0.093571,0.047512,0.643733,2.185438,0.50129,104.48275,28.605038,0.978548,0.018041
std,0.492151,22.519649,0.291235,0.212733,0.478901,1.093158,0.500004,43.111751,7.638023,0.568678,0.133103
min,0.0,0.08,0.0,0.0,0.0,0.0,0.0,55.0,10.1,0.0,0.0
25%,0.0,24.0,0.0,0.0,0.0,2.0,0.0,77.54,23.4,1.0,0.0
50%,0.0,44.0,0.0,0.0,1.0,2.0,1.0,91.58,28.1,1.0,0.0
75%,1.0,60.0,0.0,0.0,1.0,3.0,1.0,112.07,32.6,1.0,0.0
max,2.0,82.0,1.0,1.0,1.0,4.0,1.0,291.05,97.6,2.0,1.0


## 0.1 - Preprocessamento da base

Aplicamos o preprocessamento tipo standartization na base de dados e salvamos em um pickle

In [4]:
dict_columns = {}

for cols in df.columns:
    u = df[cols].mean()
    s = df[cols].std()
    dict_columns[cols] = [round(u, 2),round(s,2)]

pickle.dump(dict_columns, open('../models/std_scalar.pkl', 'wb'))

In [5]:
dict_columns

{'gender': [0.41, 0.49],
 'age': [42.22, 22.52],
 'hypertension': [0.09, 0.29],
 'heart_disease': [0.05, 0.21],
 'ever_married': [0.64, 0.48],
 'work_type': [2.19, 1.09],
 'Residence_type': [0.5, 0.5],
 'avg_glucose_level': [104.48, 43.11],
 'bmi': [28.61, 7.64],
 'smoking_status': [0.98, 0.57],
 'stroke': [0.02, 0.13]}

In [19]:
df_scalar = pd.DataFrame()

for cols in df.columns:
    z_prod = (df[cols] - dict_columns[cols][0])/dict_columns[cols][1]
    df_scalar[cols] = z_prod

df_scalar

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,1.204082,-1.741563,-0.310345,-0.238095,-1.333333,1.660550,-1.0,-0.217119,-1.388743,0.035088,-0.153846
1,1.204082,0.700710,3.137931,-0.238095,0.750000,-0.174312,1.0,-0.383206,1.386126,0.035088,-0.153846
2,-0.836735,-1.519538,-0.310345,-0.238095,-1.333333,-0.174312,1.0,0.148689,-1.441099,0.035088,-0.153846
3,-0.836735,1.233570,-0.310345,-0.238095,0.750000,-0.174312,-1.0,-0.822083,0.954188,-1.719298,-0.153846
4,1.204082,-1.253108,-0.310345,-0.238095,-1.333333,-1.091743,-1.0,1.317560,-1.244764,0.035088,-0.153846
...,...,...,...,...,...,...,...,...,...,...,...
43395,-0.836735,-1.430728,-0.310345,-0.238095,-1.333333,1.660550,1.0,-1.063326,-1.074607,0.035088,-0.153846
43396,-0.836735,0.611901,-0.310345,-0.238095,0.750000,-2.009174,1.0,2.531431,3.506545,-1.719298,-0.153846
43397,-0.836735,1.766430,3.137931,-0.238095,0.750000,-0.174312,1.0,-0.290884,0.037958,-1.719298,-0.153846
43398,1.204082,-0.098579,-0.310345,-0.238095,0.750000,-0.174312,1.0,-0.123405,0.600785,0.035088,-0.153846


In [20]:
df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,1,3.0,0,0,0,4,0,95.12,18.0,1,0
1,1,58.0,1,0,1,2,1,87.96,39.2,1,0
2,0,8.0,0,0,0,2,1,110.89,17.6,1,0
3,0,70.0,0,0,1,2,0,69.04,35.9,0,0
4,1,14.0,0,0,0,1,0,161.28,19.1,1,0
...,...,...,...,...,...,...,...,...,...,...,...
43395,0,10.0,0,0,0,4,1,58.64,20.4,1,0
43396,0,56.0,0,0,1,0,1,213.61,55.4,0,0
43397,0,82.0,1,0,1,2,1,91.94,28.9,0,0
43398,1,40.0,0,0,1,2,1,99.16,33.2,1,0


## 0.2 - Divisão da Base em Treinamento e Teste

Dividimos randomicamente o dataset em teste e treino de x e y, repectivamente, ao descompactar a tupla de retorno. O valor `random_state` significa um número de seed dado com fins de reprodutibilidade da divisão.

In [49]:
X = df.drop(columns=['stroke'])
y = df['stroke']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"train {X_train.shape}")
print(f"test {X_test.shape}")

train (34720, 10)
test (8680, 10)


In [50]:
y.shape

(43400,)

In [51]:
X.shape

(43400, 10)

# 1.0 - Modelo de SVM

In [52]:
# Crie um objeto SVM
clf = svm.SVC()

# Ajuste o modelo aos dados de treinamento
clf.fit(X_train, y_train)

In [54]:
# Faça previsões nos dados de teste
y_pred = clf.predict(X_test)
y_train_pred = clf.predict(X_train)

In [55]:
# Calcule a precisão (accuracy) do modelo
accuracy = accuracy_score(y_test, y_pred)

print("Acurácia:", accuracy)

Acurácia: 0.9814516129032258


In [56]:
recall = recall_score(y_test,y_pred)
recall

0.0

In [57]:
f1 = f1_score(y_pred=y_pred,y_true=y_test)
f1

0.0

# 2.0 - Modelo de Regressão Logística

In [58]:
# Crie um objeto de regressão logística
logreg = LogisticRegression()

# Ajuste o modelo aos dados de treinamento
logreg.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [59]:
# Faça previsões nos dados de teste
y_pred_log = logreg.predict(X_test)

In [60]:
# Calcule a acurácia do modelo
accuracy_log = accuracy_score(y_test, y_pred_log)
print("Acurácia:", accuracy_log)

Acurácia: 0.9814516129032258


# 3.0 - Salvando Modelo

In [61]:
# Salvar o modelo em um arquivo pickle
with open('../models/modelo_SVM.pkl', 'wb') as file:
    pickle.dump(clf, file)