# Fuentes

# 1. Cargando base de datos

In [72]:
import pandas as pd
# Switch off pandas' chained-assignment check for the whole session, so no
# SettingWithCopyWarning is emitted by the assignments made further below.
pd.options.mode.chained_assignment = None  # default='warn'

In [73]:
import matplotlib.pyplot as plt

In [74]:
import numpy as np

In [75]:
import importlib

In [76]:
import sys

In [77]:
# Put the parent directory first on the module search path.
# The path is relative, so it depends on the kernel's working directory.
sys.path.insert(0, '..')

In [78]:
# Put the directory two levels up first on the module search path as well.
# NOTE(review): presumably this is where the `src` package imported below lives — confirm.
sys.path.insert(0, '../..')

In [79]:
# Load the insurance table from its CSV file into the pandas DataFrame `df`.
# The path is relative to the kernel's working directory.
df = pd.read_csv('../../databases/insurance.csv')

# 2. Preprocesamiento de los datos

In [80]:
from sklearn import preprocessing

In [81]:
# Project-local helper module (provides `remove_outliers`, used in section 2.2).
from src import helper
# Reload it so edits to the module are picked up without restarting the kernel;
# the trailing ';' keeps the returned module object out of the cell output.
importlib.reload(helper);

## 2.1. Codificación de variables no numéricas o categóricas

In [82]:
# Integer-code the `sex` feature: the encoder learns the distinct labels and
# the resulting codes are stored in the new `sex-encoded` column.
# The fitted encoder is kept so the codes can be mapped back to labels later.
sex_encoder = preprocessing.LabelEncoder()
df['sex-encoded'] = sex_encoder.fit_transform(df['sex'])

In [83]:
# Integer-code the `smoker` feature: the encoder learns the distinct labels and
# the resulting codes are stored in the new `smoker-encoded` column.
# The fitted encoder is kept so the codes can be mapped back to labels later.
smoker_encoder = preprocessing.LabelEncoder()
df['smoker-encoded'] = smoker_encoder.fit_transform(df['smoker'])

In [84]:
# Integer-code the `region` feature: the encoder learns the distinct labels and
# the resulting codes are stored in the new `region-encoded` column.
# The fitted encoder is kept so the codes can be mapped back to labels later.
region_encoder = preprocessing.LabelEncoder()
df['region-encoded'] = region_encoder.fit_transform(df['region'])

In [85]:
# Preview the first rows to check the three new "-encoded" columns
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,sex-encoded,smoker-encoded,region-encoded
0,19,female,27.9,0,yes,southwest,16884.924,0,1,3
1,18,male,33.77,1,no,southeast,1725.5523,1,0,2
2,28,male,33.0,3,no,southeast,4449.462,1,0,2
3,33,male,22.705,0,no,northwest,21984.47061,1,0,1
4,32,male,28.88,0,no,northwest,3866.8552,1,0,1


## 2.2. Eliminando outliers

In [86]:
# Flag BMI outliers: the helper is expected to overwrite the outlying `bmi`
# entries of `df` with NaN in place (its return value is not used here).
helper.remove_outliers(df, 'bmi')

# Discard every row that holds at least one NaN, which removes the flagged rows
df = df.dropna(axis=0, how='any')

## 2.3. Filtrado de variables

In [87]:
# Keep only the model inputs (numeric and encoded columns) and the target
feature_columns = [
    'age',
    'bmi',
    'smoker-encoded',
    'children',
    'sex-encoded',
    'region-encoded',
]
target_column = 'charges'

df_x = df[feature_columns]
df_y = df[target_column]

# 3. Separación del conjunto de entrenamiento y evaluación

In [88]:
from sklearn import model_selection

## 3.1. Separación de los conjuntos

In [89]:
# Hold out 20% of the rows as the test set; the remaining 80% is the
# train/validation pool that the K-Fold procedure partitions later on.
# The fixed seed makes the shuffled split reproducible.
x_train_valid, x_test, y_train_valid, y_test = model_selection.train_test_split(
    df_x,
    df_y,
    test_size=0.2,
    random_state=5,
    shuffle=True,
)

## 3.2. Normalización de variables

In [90]:
# Select the variables where the z-score will be applied
scalable_variables = ['bmi', 'age']

if scalable_variables:
    # A single StandardScaler standardizes all selected columns, column by column
    scaler = preprocessing.StandardScaler()

    # Learn mean and standard deviation from the train/validation pool only,
    # so that no statistics of the test set leak into the preprocessing
    scaler.fit(x_train_valid.loc[:, scalable_variables])

    # `age` is loaded as an integer column (see the df.head() preview), while the
    # z-scores are floats. Cast explicitly before assigning: writing floats into an
    # integer column through .loc relies on silent upcasting, which pandas >= 2.1
    # deprecates. astype() also returns a new frame, so the assignment below never
    # targets a slice of another DataFrame.
    x_test = x_test.astype({name: 'float64' for name in scalable_variables})

    # Standardize the test set with the statistics learned above.
    # NOTE: this overwrites x_test, so re-running this cell without re-running the
    # split would standardize the already-standardized values a second time.
    x_test.loc[:, scalable_variables] = scaler.transform(x_test.loc[:, scalable_variables])

# 4. Multilayer Perceptron y K-Folding

In [91]:
# Project-local module that provides `run_model`, used by the K-Fold loop below.
from src.ej2 import mlp_helper
# Reload it so edits to the module are picked up without restarting the kernel;
# the trailing ';' keeps the returned module object out of the cell output.
importlib.reload(mlp_helper);

In [110]:
# Amount of folds
number_folds = 5

# K-Fold splitter; shuffling with a fixed seed keeps the partition reproducible
kf = model_selection.KFold(n_splits=number_folds, random_state=15, shuffle=True)

# One MAE per fold for the train, validation and test sets
train_metrics = np.zeros(number_folds)
valid_metrics = np.zeros(number_folds)
test_metrics = np.zeros(number_folds)

# Variables where the z-score will be applied (loop-invariant, so defined once)
scalable_variables = ['bmi', 'age']

# Iterate through each fold; `train` and `valid` are positional row indices
for i, (train, valid) in enumerate(kf.split(x_train_valid, y_train_valid)):

    # Get the train set (copied so the shared train/validation pool stays untouched)
    x_train = x_train_valid.iloc[train].copy()
    y_train = y_train_valid.iloc[train].copy()

    # Get the valid set
    x_valid = x_train_valid.iloc[valid].copy()
    y_valid = y_train_valid.iloc[valid].copy()

    if scalable_variables:
        # `age` is an integer column and the z-scores are floats: cast explicitly
        # before assigning, because writing floats into an integer column through
        # .loc relies on silent upcasting, which pandas >= 2.1 deprecates.
        float_columns = {name: 'float64' for name in scalable_variables}
        x_train = x_train.astype(float_columns)
        x_valid = x_valid.astype(float_columns)

        # Fit the scaler on this fold's training rows only, so that validation
        # statistics do not leak into the preprocessing
        scaler = preprocessing.StandardScaler()
        scaler.fit(x_train.loc[:, scalable_variables])

        # Standardize both subsets with the training statistics
        x_train.loc[:, scalable_variables] = scaler.transform(x_train.loc[:, scalable_variables])
        x_valid.loc[:, scalable_variables] = scaler.transform(x_valid.loc[:, scalable_variables])

    # Train and evaluate the MLP on this fold.
    # NOTE(review): x_test was standardized earlier with statistics fitted on the
    # whole train/validation pool, not with this fold's scaler — confirm this is intended.
    # NOTE(review): the recorded run of this cell aborts with "Layer model expects
    # 2 input(s), but it received 1 input tensors" (a single (None, 6) tensor).
    # The cause is not visible in this notebook; check how mlp_helper.run_model
    # feeds the model's two inputs.
    mae_train, mae_valid, mae_test = mlp_helper.run_model(x_train, y_train, x_valid, y_valid, x_test, y_test,
                                                          hidden_layers=1,
                                                          units_per_layer=1500,
                                                          hidden_layer_activation='tanh',
                                                          epochs=500,
                                                          batch_size=64,
                                                          learning_rate=0.5,
                                                          decay_rate=0.01,
                                                          optimizer='adam',
                                                          beta_1=0.99,
                                                          beta_2=0.999,
                                                          tag='k-fold',
                                                          verbose=2
                                                         )

    # Save metrics
    train_metrics[i] = mae_train
    valid_metrics[i] = mae_valid
    test_metrics[i] = mae_test

# Report the MAE averaged over all folds
print(f'[MAE] Train: {train_metrics.mean()} Valid: {valid_metrics.mean()} Test: {test_metrics.mean()}')

Model logs at tb-logs/mlp/k-fold/20210530-215151
Model checkpoints at checkpoints/mlp/k-fold/20210530-215151
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 1, 2)         8           input_2[0][0]                    
__________________________________________________________________________________________________
input_1 (InputLayer)            [(None, 5)]          0                                            
__________________________________________________________________________________________________
flatten (Flatten)               (None, 2)            0           embedding[0][0]    

ValueError: in user code:

    C:\anaconda\lib\site-packages\tensorflow\python\keras\engine\training.py:805 train_function  *
        return step_function(self, iterator)
    C:\anaconda\lib\site-packages\tensorflow\python\keras\engine\training.py:795 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    C:\anaconda\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:1259 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    C:\anaconda\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:2730 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    C:\anaconda\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:3417 _call_for_each_replica
        return fn(*args, **kwargs)
    C:\anaconda\lib\site-packages\tensorflow\python\keras\engine\training.py:788 run_step  **
        outputs = model.train_step(data)
    C:\anaconda\lib\site-packages\tensorflow\python\keras\engine\training.py:754 train_step
        y_pred = self(x, training=True)
    C:\anaconda\lib\site-packages\tensorflow\python\keras\engine\base_layer.py:998 __call__
        input_spec.assert_input_compatibility(self.input_spec, inputs, self.name)
    C:\anaconda\lib\site-packages\tensorflow\python\keras\engine\input_spec.py:204 assert_input_compatibility
        raise ValueError('Layer ' + layer_name + ' expects ' +

    ValueError: Layer model expects 2 input(s), but it received 1 input tensors. Inputs received: [<tf.Tensor 'IteratorGetNext:0' shape=(None, 6) dtype=float64>]
