In [1]:
# Importing Pandas Library 
import pandas as pd

In [2]:
# Load the diabetes dataset from the CSV file into a DataFrame.
# header=None: no row is used as a header, so the columns get integer labels.
df = pd.read_csv('diabetes.csv', header=None)
# Preview the first rows to check that the file was parsed as expected.
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
# Rename the integer column labels to descriptive names.
# The result is assigned back instead of using inplace=True, which keeps the
# cell chainable and safe to re-run (labels that are already renamed are
# simply ignored by rename).
column_names = {
    0: 'Gravidez',
    1: 'Glicose',
    2: 'Pressão',
    3: 'Dobra',
    4: 'Insulina',
    5: 'IMC',
    6: 'Genética',
    7: 'Idade',
    8: 'Diabetes',
}
df = df.rename(columns=column_names)
df.head()

Unnamed: 0,Gravidez,Glicose,Pressão,Dobra,Insulina,IMC,Genética,Idade,Diabetes
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
# Dimensions of the DataFrame as (rows, columns).
df.shape

(768, 9)

Na base existe um total de 768 mulheres. Não é um número necessariamente baixo, mas precisamos avaliar se alguns registros precisarão ser deletados para melhorar a análise. De toda forma é uma quantidade considerável de dados.

In [5]:
# Count the NaN entries in each column.
# Only real NaN values are counted here; zeros used as placeholders are not detected.
df.isna().sum()

Gravidez    0
Glicose     0
Pressão     0
Dobra       0
Insulina    0
IMC         0
Genética    0
Idade       0
Diabetes    0
dtype: int64

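Por se tratar de uma base de dados já pronta, não há valores nulos (NaN): o tratamento dos dados faltantes (missing values) já foi realizado. Ainda assim, valores iguais a zero em algumas colunas podem representar dados ausentes e devem ser avaliados. Em praticamente todos os projetos essa é uma parte muito importante e não deve ser negligenciada.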

In [6]:
# Pairwise correlation matrix between all columns, including the Diabetes label.
df.corr()

Unnamed: 0,Gravidez,Glicose,Pressão,Dobra,Insulina,IMC,Genética,Idade,Diabetes
Gravidez,1.0,0.129459,0.141282,-0.081672,-0.073535,0.017683,-0.033523,0.544341,0.221898
Glicose,0.129459,1.0,0.15259,0.057328,0.331357,0.221071,0.137337,0.263514,0.466581
Pressão,0.141282,0.15259,1.0,0.207371,0.088933,0.281805,0.041265,0.239528,0.065068
Dobra,-0.081672,0.057328,0.207371,1.0,0.436783,0.392573,0.183928,-0.11397,0.074752
Insulina,-0.073535,0.331357,0.088933,0.436783,1.0,0.197859,0.185071,-0.042163,0.130548
IMC,0.017683,0.221071,0.281805,0.392573,0.197859,1.0,0.140647,0.036242,0.292695
Genética,-0.033523,0.137337,0.041265,0.183928,0.185071,0.140647,1.0,0.033561,0.173844
Idade,0.544341,0.263514,0.239528,-0.11397,-0.042163,0.036242,0.033561,1.0,0.238356
Diabetes,0.221898,0.466581,0.065068,0.074752,0.130548,0.292695,0.173844,0.238356,1.0


In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column.
df.describe()

Unnamed: 0,Gravidez,Glicose,Pressão,Dobra,Insulina,IMC,Genética,Idade,Diabetes
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


**Gravidez**: Aparentemente tem distribuição normal, mas com uma variância muito grande. O valor máximo como 17 não é impossível, mas pode estar distorcido da maioria. Deve ser avaliado.	
**Glicose**: Distribuição normal (média e mediana parecidas) e aparentemente o único problema são os valores zero.	
**Pressão**: Distribuição normal, o problema são os valores zero.		
**Dobra**: Distribuição normal, um problema são os valores zero.	
**Insulina**: Distribuição muito estranha. Precisa ser melhor avaliada.	
**IMC**: Distribuição normal, o problema são os valores zero.			
**Genética**: Distribuição parece normal, apesar da grande variância. Maior problema são valores acima de 1.	
**Idade**: Distribuição normal, aparentemente tudo certo.	
**Diabetes**: Deve ser avaliada a distribuição de cada classe.

In [8]:
# Frequency of each distinct value in the Gravidez column, most frequent first.
df.Gravidez.value_counts()

1     135
0     111
2     103
3      75
4      68
5      57
6      50
7      45
8      38
9      28
10     24
11     11
13     10
12      9
14      2
15      1
17      1
Name: Gravidez, dtype: int64

In [4]:
# Discard the rows whose Gravidez value is above 12 (treated as low-frequency cases)
within_limit = df['Gravidez'] <= 12
df = df[within_limit]
df.shape

(754, 9)

In [None]:
# Importing libraries
import joblib # Model persistence
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score # Methods to evaluate model 
from sklearn.model_selection import train_test_split # Data slicing
from sklearn.neighbors import KNeighborsClassifier # KNN model
from sklearn.neural_network import MLPClassifier # Multilayer Perceptron model
from sklearn.preprocessing import MinMaxScaler # Data transformation 
from sklearn.tree import DecisionTreeClassifier # Decision Tree model

In [None]:
# Subset of the rows where the Diabetes column equals 1.
# The columns were renamed earlier in the notebook, so the label column is
# selected by its name; the old integer label 8 would raise a KeyError.
df2 = df.loc[df['Diabetes'] == 1]

In [None]:
# Summary statistics of the df2 subset.
df2.describe()

In [None]:
# Subset of the rows where the Diabetes column equals 0.
# The label column is selected by its renamed label 'Diabetes'; the old
# integer label 8 no longer exists after the rename and would raise a KeyError.
df3 = df.loc[df['Diabetes'] == 0]

In [None]:
# Summary statistics of the df3 subset.
df3.describe()

In [None]:
# Number of rows per value of the Diabetes column (class balance).
# The column is accessed by its renamed label; df[8] would raise a KeyError.
df['Diabetes'].value_counts()

# **Construção dos Modelos e Previsão**

In [None]:
# Data normalization setup
# Feature columns handed to the scaler, referenced by their renamed labels
# (the original integer labels 0-7 no longer exist after the rename).
raw_variables = ['Gravidez', 'Glicose', 'Pressão', 'Dobra', 'Insulina', 'IMC', 'Genética', 'Idade']
# Scaler instance; it is fitted when the input variables are built and is
# persisted at the end of the notebook.
adjusted_variables = MinMaxScaler()

In [None]:
# Defining input and output variables
# Inputs: the feature columns rescaled by the MinMaxScaler (fit_transform fits
# the scaler and returns the transformed values).
# NOTE(review): the scaler is fitted on the whole dataset before the train/test
# split, so information from the test slice leaks into the scaling. Consider
# fitting on the training slice only.
input_variables = adjusted_variables.fit_transform(df[raw_variables])
# Output: the label column, selected by its renamed label (df[8] would raise a KeyError).
output_variables = df['Diabetes']

In [None]:
# Wrap the scaled feature array in a DataFrame and show its summary statistics
resumo_normalizacao = pd.DataFrame(input_variables)
resumo_normalizacao.describe()

In [None]:
# Features (x) and labels (y) that will be split
x, y = input_variables, output_variables

# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.30,
    random_state=42,
)

**KNN (K nearest neighbor)**

In [None]:
# Build a KNN classifier with 5 neighbors (K=5).
# Requires `from sklearn.neighbors import KNeighborsClassifier` in the imports
# cell; without it this cell fails with a NameError on a fresh kernel.
clf_KNN = KNeighborsClassifier(n_neighbors=5)

# Train the KNN model on the training slice
clf_KNN.fit(x_train, y_train)

# Predict the labels of the test slice
y_forecast_KNN = clf_KNN.predict(x_test)

**Decision Tree**

In [None]:
# Decision tree with a fixed random_state so repeated runs build the same tree
clf_tree = DecisionTreeClassifier(random_state=1)

# Fit on the training slice, then predict the labels of the test slice
clf_tree.fit(x_train, y_train)
y_forecast_tree = clf_tree.predict(x_test)

**Multilayer Perceptron (MLP)**

In [None]:
# Build an MLP classifier with two hidden layers (5 and 10 units), the lbfgs
# solver and a fixed random_state.
# Requires `from sklearn.neural_network import MLPClassifier` in the imports
# cell; without it this cell fails with a NameError on a fresh kernel.
clf_mlp = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 10), random_state=1)

# Train the MLP model on the training slice
clf_mlp.fit(x_train, y_train)

# Predict the labels of the test slice
y_forecast_mlp = clf_mlp.predict(x_test)

**Evaluating Model Accuracy**

In [None]:
# Accuracy of each model on the test slice, expressed as a percentage
acc_knn = accuracy_score(y_test, y_forecast_KNN)*100
acc_tree = accuracy_score(y_test, y_forecast_tree)*100
acc_mlp = accuracy_score(y_test, y_forecast_mlp)*100

# Report the three accuracies side by side
print(f'Acurácia do Modelo KNN: {acc_knn}')
print(f'Acurácia do Modelo Decision Tree: {acc_tree}')
print(f'Acurácia do Modelo MLP: {acc_mlp}')

# **Salvando Melhor Modelo**

In [None]:
# Persist the trained MLP classifier to disk with joblib.
# NOTE(review): the MLP is hard-coded as the "best" model; nothing here compares
# the accuracies printed above — confirm the choice against those results.
mlp_model = 'mlp_model.sav'
joblib.dump(clf_mlp, mlp_model)

# **Salvando a Normalização dos Dados**

In [None]:
# Persist the fitted MinMaxScaler with joblib so the same scaling can be
# applied to new inputs before they are passed to the saved model.
scale_data = "normalização.sav"
joblib.dump(adjusted_variables, scale_data)