# Redes Neuronales - Trabajo Práctico N° 2 - Ejercicio 1 - Regresión Logística
# Notebook #3: Implementación de un modelo MLP
En esta notebook se busca aprovechar los conocimientos de las anteriores e implementar un modelo MLP para estimar la condición de diabético de un paciente perteneciente al Pima Indians Dataset, analizado en la primera notebook.

# 1. Cargando base de datos

In [1]:
import numpy as np

In [2]:
import pandas as pd

In [3]:
import importlib

In [4]:
# Load the diabetes table from its comma-separated file (path is relative to
# the notebook's working directory)
csv_path = '../../databases/diabetes.csv'
df = pd.read_csv(csv_path, delimiter=',')

# Preview the first rows to confirm the columns were parsed as expected
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


# 2. Preprocesamiento de los datos

## 2.1 Filtrado de valores inválidos

In [5]:
# The notebook treats a reading of 0 in these measurement columns as an invalid
# entry, so every 0 is converted to NaN (missing).
zero_as_missing = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

# The result is assigned back to `df` instead of calling
# `df[col].replace(..., inplace=True)`: that form modifies the object returned
# by `df[col]`, which is a temporary under pandas Copy-on-Write and would leave
# `df` itself unchanged.
df[zero_as_missing] = df[zero_as_missing].replace(0, np.nan)

## 2.2 Remoción de Outliers

In [6]:
from src.helper import remove_outliers

INFO:tensorflow:Enabling eager execution
INFO:tensorflow:Enabling v2 tensorshape
INFO:tensorflow:Enabling resource variables
INFO:tensorflow:Enabling tensor equality
INFO:tensorflow:Enabling control flow v2


In [7]:
# Names of the feature (input) columns and of the target (output) column
x_labels = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
]
y_labels = ['Outcome']

# Run the project's outlier filter on each feature column in turn.
# NOTE(review): the return value is discarded, so remove_outliers presumably
# modifies `df` in place -- confirm in src.helper.
for feature_name in x_labels:
    remove_outliers(df, feature_name)

# Summary statistics after the outlier filter
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,764.0,763.0,719.0,538.0,370.0,749.0,739.0,759.0,768.0
mean,3.786649,121.686763,72.115438,28.903346,132.610811,32.204005,0.429832,32.805007,0.348958
std,3.278714,30.535641,11.239072,9.86548,74.285393,6.491385,0.249684,11.113182,0.476951
min,0.0,44.0,40.0,7.0,14.0,18.2,0.078,21.0,0.0
25%,1.0,99.0,64.0,22.0,75.0,27.4,0.238,24.0,0.0
50%,3.0,117.0,72.0,29.0,120.0,32.0,0.356,29.0,0.0
75%,6.0,141.0,80.0,36.0,177.5,36.5,0.587,40.0,1.0
max,13.0,199.0,104.0,56.0,360.0,50.0,1.191,66.0,1.0


# 3. Separación del conjunto de entrenamiento y evaluación

In [8]:
from sklearn import model_selection

In [9]:
from sklearn import preprocessing

In [10]:
# Separate the table into model inputs (features) and model output (target)
df_x, df_y = df[x_labels], df[y_labels]

In [11]:
# First cut: hold out 20% of the rows as the test subset; the remaining 80%
# is kept together for the second cut below.
x_train_valid, x_test, y_train_valid, y_test = model_selection.train_test_split(
    df_x,
    df_y,
    test_size=0.2,
    shuffle=True,
    random_state=15,
)

# Second cut: 30% of the remaining rows become the validation subset, which
# leaves roughly 56% / 24% / 20% of the data for train / valid / test.
x_train, x_valid, y_train, y_valid = model_selection.train_test_split(
    x_train_valid,
    y_train_valid,
    test_size=0.3,
    shuffle=True,
    random_state=23,
)

In [12]:
# Summary statistics of the training features. describe() counts only
# non-missing entries, so a column count below the number of rows signals NaNs.
x_train.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
count,427.0,426.0,399.0,286.0,203.0,416.0,414.0,425.0
mean,3.859485,120.514085,72.045113,28.811189,132.458128,32.000962,0.430853,32.868235
std,3.285896,29.742282,11.55685,9.853631,70.564358,6.568853,0.255626,11.111848
min,0.0,56.0,40.0,7.0,14.0,18.2,0.085,21.0
25%,1.0,100.0,64.0,22.0,76.0,27.1,0.238,24.0
50%,3.0,115.0,72.0,29.0,122.0,31.6,0.343,29.0
75%,6.0,138.0,80.0,35.75,179.0,36.1,0.60325,40.0
max,13.0,198.0,102.0,52.0,335.0,50.0,1.182,66.0


# 4. Reemplazo de valores inválidos

In [13]:
# Column-wise means are computed on the training features only, so the values
# used for imputation carry no information from the validation or test subsets.
train_mean_by_column = x_train.mean()

# Kept as a NumPy array under its original name for any later cell that reads it
train_means = train_mean_by_column.to_numpy()

# Fill the missing values of every subset with the training means. fillna()
# with a Series matches each value to its column by label and returns a new
# DataFrame. Rebinding the names replaces the previous in-place replace() on a
# column slice, which raised SettingWithCopyWarning and does not write through
# under pandas Copy-on-Write.
x_train = x_train.fillna(train_mean_by_column)
x_valid = x_valid.fillna(train_mean_by_column)
x_test = x_test.fillna(train_mean_by_column)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().replace(


In [14]:
# Re-inspect the training features after filling missing values: every column
# count should now equal the number of rows.
x_train.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
count,429.0,429.0,429.0,429.0,429.0,429.0,429.0,429.0
mean,3.859485,120.514085,72.045113,28.811189,132.458128,32.000962,0.430853,32.868235
std,3.278209,29.637862,11.144462,8.040755,48.477386,6.468323,0.251107,11.059801
min,0.0,56.0,40.0,7.0,14.0,18.2,0.085,21.0
25%,1.0,100.0,64.0,26.0,126.0,27.3,0.24,24.0
50%,3.0,116.0,72.045113,28.811189,132.458128,32.0,0.351,29.0
75%,6.0,138.0,80.0,32.0,132.458128,36.0,0.591,40.0
max,13.0,198.0,102.0,52.0,335.0,50.0,1.182,66.0


# 5. Normalización de datos de entrada. Z Score. 

In [15]:
# IMPORTANT! Backup unnormalized subsets for further utilization.
# .copy() is required: a plain assignment would only create a second name for
# the same DataFrame, and the normalization below would overwrite the backup.
# Re-running this cell on its own would back up already-normalized data; re-run
# from the split instead.
x_train_un = x_train.copy()
x_valid_un = x_valid.copy()
x_test_un = x_test.copy()

# Apply z-score to all sub-datasets
scalable_variables = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction','Age']

if scalable_variables:
    # A single StandardScaler instance handles all selected columns
    scaler = preprocessing.StandardScaler()

    # Fit on the training subset only; validation and test are transformed
    # with the training statistics.
    scaler.fit(x_train_un[scalable_variables])

    # Work on fresh copies so nothing is written into a slice of another frame
    # (the source of the SettingWithCopyWarning).
    x_train = x_train_un.copy()
    x_test = x_test_un.copy()
    x_valid = x_valid_un.copy()

    # Replace the selected columns with their normalized values
    x_train[scalable_variables] = scaler.transform(x_train_un[scalable_variables])
    x_test[scalable_variables] = scaler.transform(x_test_un[scalable_variables])
    x_valid[scalable_variables] = scaler.transform(x_valid_un[scalable_variables])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_wi

In [16]:
# Check the z-score result on the training features: column means should be
# close to 0 and standard deviations close to 1.
x_train.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
count,429.0,429.0,429.0,429.0,429.0,429.0,429.0,429.0
mean,6.832142000000001e-17,-7.349728000000001e-17,2.204918e-16,5.465713e-16,-1.6562770000000002e-17,-3.5195880000000006e-17,-9.937661e-17,7.867315e-17
std,1.001168,1.001168,1.001168,1.001168,1.001168,1.001168,1.001168,1.001168
min,-1.178689,-2.179287,-2.878786,-2.715747,-2.446428,-2.136114,-1.378921,-1.074349
25%,-0.8732887,-0.6929662,-0.7227362,-0.3500257,-0.1333749,-0.7276152,-0.7609333,-0.8027802
50%,-0.2624873,-0.1524859,0.0,4.423542e-16,0.0,-0.000148827,-0.3183741,-0.3501647
75%,0.6537149,0.5906746,0.7146307,0.3970441,0.0,0.6189715,0.6385106,0.6455895
max,2.79152,2.617476,2.69101,2.887277,4.182947,2.785893,2.994839,2.99919


# 6. Multi Layer Perceptron (MLP) - Test #1

In [17]:
# Load the TensorBoard notebook extension (registers the %tensorboard magic),
# used to log and inspect the learning process
%load_ext tensorboard

In [18]:
from keras.models import Sequential, load_model
from keras.layers import Dense, BatchNormalization, Dropout
from keras.optimizers import SGD, Adam
from keras.metrics import SensitivityAtSpecificity
from keras.callbacks import TensorBoard, ModelCheckpoint
import datetime

In [19]:
# Architecture hyperparameters for this first MLP test
units_per_layer = 3   # neurons in each hidden layer
hidden_layers = 2     # number of hidden layers
input_dim = 8         # number of input features fed to the network
dropout_prob = 0      # dropout rate; 0 keeps every unit active

# Define MLP model: `hidden_layers` blocks of Dense(relu) + Dropout, followed
# by a single sigmoid unit for the binary outcome.
mlp_model = Sequential()
for layer in range(hidden_layers):
    # Only the first layer of a Sequential model uses the input shape; later
    # layers infer it from the previous layer's output, so it is not repeated.
    shape_kwargs = {'input_shape': (input_dim,)} if layer == 0 else {}
    mlp_model.add(Dense(units_per_layer, activation='relu', use_bias=True, **shape_kwargs))
    mlp_model.add(Dropout(dropout_prob))
mlp_model.add(Dense(1, activation='sigmoid', use_bias=True)) # Output layer

In [20]:
# Configure training: binary cross-entropy loss minimized with SGD (default
# settings, selected by name), tracking AUC as the reported metric
mlp_model.compile(
    loss='binary_crossentropy',
    optimizer='SGD',
    metrics=['AUC'],
)

# Print the layer-by-layer structure and parameter counts
mlp_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 3)                 27        
_________________________________________________________________
dropout (Dropout)            (None, 3)                 0         
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 12        
_________________________________________________________________
dropout_1 (Dropout)          (None, 3)                 0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 4         
Total params: 43
Trainable params: 43
Non-trainable params: 0
_________________________________________________________________


In [21]:
# Train silently for 200 epochs in shuffled mini-batches of 32, evaluating on
# the validation subset after each epoch; the History object is the cell output
mlp_model.fit(
    x=x_train,
    y=y_train,
    validation_data=(x_valid, y_valid),
    epochs=200,
    batch_size=32,
    shuffle=True,
    verbose=0,
)

<keras.callbacks.History at 0x2079c162220>

In [61]:
from src import mlp_helper

In [62]:
# Build, train and evaluate an MLP through the project helper with the given
# hyperparameters (one hidden layer of 8 ReLU units, SGD optimizer).
# Fix: the comma after `tensorboard_on=False` was missing, which made this cell
# a SyntaxError.
# NOTE(review): `learning_rate=2` is unusually large for SGD, and `beta_1` /
# `beta_2` are presumably only used by Adam -- confirm in mlp_helper.run_model.
# NOTE(review): the result is stored as `mae`; verify which metric run_model
# actually returns.
mae = mlp_helper.run_model(x_train, y_train, x_valid, y_valid, x_test, y_test,
                           hidden_layers=1,
                           units_per_layer=8,
                           hidden_layer_activation='relu',
                           tensorboard_on=False,
                           epochs=1000,
                           batch_size=32,
                           learning_rate=2, 
                           decay_rate=0.01,
                           optimizer='sgd',
                           beta_1=0.99,
                           beta_2=0.999
                          )

Model logs at tb-logs/mlp/20210528-175238
Model checkpoints at checkpoints/mlp/20210528-175238
Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_12 (Dense)             (None, 8)                 72        
_________________________________________________________________
dense_13 (Dense)             (None, 1)                 9         
Total params: 81
Trainable params: 81
Non-trainable params: 0
_________________________________________________________________


ValueError: Expected scalar shape, saw shape: (1,).