### Importando as bibliotecas e realizando a análise exploratória
---

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
data = pd.read_csv("diabetes.csv")

### Tratamento de dados
---

In [3]:
data['age'] = data['age'].astype('int32')

In [4]:
def gender_adjust(gender):
    if gender == 'Male':
        return 0
    elif gender == 'Female':
        return 1
    else:
        return 2

In [5]:
data['gender'] = data['gender'].apply(gender_adjust)
data.gender.value_counts()

gender
1    58552
0    41430
2       18
Name: count, dtype: int64

In [6]:
from sklearn.preprocessing import OneHotEncoder

In [7]:
smoking_encoder = OneHotEncoder(handle_unknown='ignore')

In [8]:
smoking_encoder = smoking_encoder.fit(data[['smoking_history']])

In [9]:
smk_transform = smoking_encoder.transform(data[['smoking_history']]).toarray()
smk_columns = smoking_encoder.get_feature_names_out()

In [10]:
pre_smk_data = pd.DataFrame(smk_transform, columns=smk_columns, dtype='int32')

In [11]:
one_smk_data = pd.concat([data, pre_smk_data], axis=1)
one_smk_data.drop(['smoking_history'], axis=1, inplace=True)
one_smk_data.head()

Unnamed: 0,gender,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes,smoking_history_No Info,smoking_history_current,smoking_history_ever,smoking_history_former,smoking_history_never,smoking_history_not current
0,1,80,0,1,25.19,6.6,140,0,0,0,0,0,1,0
1,1,54,0,0,27.32,6.6,80,0,1,0,0,0,0,0
2,0,28,0,0,27.32,5.7,158,0,0,0,0,0,1,0
3,1,36,0,0,23.45,5.0,155,0,0,1,0,0,0,0
4,0,76,1,1,20.14,4.8,155,0,0,1,0,0,0,0


In [12]:
data.smoking_history.value_counts()

smoking_history
No Info        35816
never          35095
former          9352
current         9286
not current     6447
ever            4004
Name: count, dtype: int64

In [13]:
def smoking_adjust(smoking):
    if smoking == 'No Info':
        return 0
    elif smoking == 'never':
        return 1
    elif smoking == 'former':
        return 2
    elif smoking == 'not current':
        return 3
    elif smoking == 'current':
        return 4
    elif smoking == 'ever':
        return 5

In [14]:
smk_data = data
smk_data['smoking_history'] = smk_data['smoking_history'].apply(smoking_adjust)
smk_data.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,1,80,0,1,1,25.19,6.6,140,0
1,1,54,0,0,0,27.32,6.6,80,0
2,0,28,0,0,1,27.32,5.7,158,0
3,1,36,0,0,4,23.45,5.0,155,0
4,0,76,1,1,4,20.14,4.8,155,0


In [15]:
smk_data.smoking_history.value_counts()

smoking_history
0    35816
1    35095
2     9352
4     9286
3     6447
5     4004
Name: count, dtype: int64

In [16]:
no_smk_data = data.drop('smoking_history', axis=1)
no_smk_data.head()

Unnamed: 0,gender,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes
0,1,80,0,1,25.19,6.6,140,0
1,1,54,0,0,27.32,6.6,80,0
2,0,28,0,0,27.32,5.7,158,0
3,1,36,0,0,23.45,5.0,155,0
4,0,76,1,1,20.14,4.8,155,0


### KNN
---

In [17]:
X_one_smk = one_smk_data.drop('diabetes', axis=1)
y_one_smk = one_smk_data['diabetes']

In [18]:
X_smk = smk_data.drop('diabetes', axis=1)
y_smk = smk_data['diabetes']

In [19]:
X_no_smk = no_smk_data.drop(['diabetes'], axis=1)
y_no_smk = no_smk_data['diabetes']

In [20]:
from sklearn.preprocessing import StandardScaler

In [21]:
scaler = StandardScaler()

In [22]:
scaler_one_smoke = scaler.fit_transform(X_one_smk)
scaler_smk = scaler.fit_transform(X_smk)
scaler_no_smk = scaler.fit_transform(X_no_smk)

In [23]:
from sklearn.model_selection import train_test_split

In [24]:
X_one_smk_train, X_one_smk_test, y_one_smk_train, y_one_smk_test = train_test_split(scaler_one_smoke, y_one_smk, test_size=0.3, random_state=101)

In [25]:
X_smk_train, X_smk_test, y_smk_train, y_smk_test = train_test_split(scaler_smk, y_smk, test_size=0.3, random_state=101)

In [26]:
X_no_smk_train, X_no_smk_test, y_no_smk_train, y_no_smk_test = train_test_split(scaler_no_smk, y_no_smk, test_size=0.3, random_state=101)

In [27]:
from sklearn.neighbors import KNeighborsClassifier

In [28]:
knn_one_smk = KNeighborsClassifier()
knn_smk = KNeighborsClassifier()
knn_no_smk = KNeighborsClassifier()

In [29]:
knn_one_smk.fit(X_one_smk_train, y_one_smk_train)
knn_smk.fit(X_smk_train, y_smk_train)
knn_no_smk.fit(X_no_smk_train, y_no_smk_train)

In [30]:
knn_one_smk_pred = knn_one_smk.predict(X_one_smk_test)
knn_smk_pred = knn_smk.predict(X_smk_test)
knn_no_smk_pred = knn_no_smk.predict(X_no_smk_test)

### Resultados
---

In [31]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [32]:
print('One Smoke: ', accuracy_score(y_one_smk_test, knn_one_smk_pred))
print('Smoke: ', accuracy_score(y_smk_test, knn_smk_pred))
print('No Smoke: ', accuracy_score(y_no_smk_test, knn_no_smk_pred))

One Smoke:  0.9608333333333333
Smoke:  0.9614333333333334
No Smoke:  0.9629333333333333


In [33]:
print('One Smoke:\n', confusion_matrix(y_one_smk_test, knn_one_smk_pred))
print('Smoke:\n', confusion_matrix(y_smk_test, knn_smk_pred))
print('No Smoke:\n', confusion_matrix(y_no_smk_test, knn_no_smk_pred))

One Smoke:
 [[27308   167]
 [ 1008  1517]]
Smoke:
 [[27307   168]
 [  989  1536]]
No Smoke:
 [[27282   193]
 [  919  1606]]


In [34]:
print('One Smoke:\n', classification_report(y_one_smk_test, knn_one_smk_pred))
print('Smoke:\n', classification_report(y_smk_test, knn_smk_pred))
print('No Smoke:\n', classification_report(y_no_smk_test, knn_no_smk_pred))

One Smoke:
               precision    recall  f1-score   support

           0       0.96      0.99      0.98     27475
           1       0.90      0.60      0.72      2525

    accuracy                           0.96     30000
   macro avg       0.93      0.80      0.85     30000
weighted avg       0.96      0.96      0.96     30000

Smoke:
               precision    recall  f1-score   support

           0       0.97      0.99      0.98     27475
           1       0.90      0.61      0.73      2525

    accuracy                           0.96     30000
   macro avg       0.93      0.80      0.85     30000
weighted avg       0.96      0.96      0.96     30000

No Smoke:
               precision    recall  f1-score   support

           0       0.97      0.99      0.98     27475
           1       0.89      0.64      0.74      2525

    accuracy                           0.96     30000
   macro avg       0.93      0.81      0.86     30000
weighted avg       0.96      0.96      0.96

### Hiperparametrização

In [35]:
best_knn_one_smk = KNeighborsClassifier(metric='euclidean', n_neighbors=15, weights='distance')
best_knn_smk = KNeighborsClassifier(metric='euclidean', n_neighbors=15, weights='distance')
best_knn_no_smk = KNeighborsClassifier(metric='euclidean', n_neighbors=15, weights='distance')

In [36]:
best_knn_one_smk.fit(X_one_smk_train, y_one_smk_train)
best_knn_smk.fit(X_smk_train, y_smk_train)
best_knn_no_smk.fit(X_no_smk_train, y_no_smk_train)

In [37]:
best_knn_one_smk_pred = best_knn_one_smk.predict(X_one_smk_test)
best_knn_smk_pred = best_knn_smk.predict(X_smk_test)
best_knn_no_smk_pred = best_knn_no_smk.predict(X_no_smk_test)

In [38]:
print('One Smoke: ', accuracy_score(y_one_smk_test, best_knn_one_smk_pred))
print('Smoke: ', accuracy_score(y_smk_test, best_knn_smk_pred))
print('No Smoke: ', accuracy_score(y_no_smk_test, best_knn_no_smk_pred))

One Smoke:  0.9601
Smoke:  0.9617
No Smoke:  0.9634333333333334


In [39]:
print('One Smoke:\n', confusion_matrix(y_one_smk_test, best_knn_one_smk_pred))
print('Smoke:\n', confusion_matrix(y_smk_test, best_knn_smk_pred))
print('No Smoke:\n', confusion_matrix(y_no_smk_test, best_knn_no_smk_pred))

One Smoke:
 [[27362   113]
 [ 1084  1441]]
Smoke:
 [[27360   115]
 [ 1034  1491]]
No Smoke:
 [[27347   128]
 [  969  1556]]


In [40]:
print('One Smoke:\n', classification_report(y_one_smk_test, best_knn_one_smk_pred))
print('Smoke:\n', classification_report(y_smk_test, best_knn_smk_pred))
print('No Smoke:\n', classification_report(y_no_smk_test, best_knn_no_smk_pred))

One Smoke:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98     27475
           1       0.93      0.57      0.71      2525

    accuracy                           0.96     30000
   macro avg       0.94      0.78      0.84     30000
weighted avg       0.96      0.96      0.96     30000

Smoke:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98     27475
           1       0.93      0.59      0.72      2525

    accuracy                           0.96     30000
   macro avg       0.95      0.79      0.85     30000
weighted avg       0.96      0.96      0.96     30000

No Smoke:
               precision    recall  f1-score   support

           0       0.97      1.00      0.98     27475
           1       0.92      0.62      0.74      2525

    accuracy                           0.96     30000
   macro avg       0.94      0.81      0.86     30000
weighted avg       0.96      0.96      0.96