# Fuentes

# 1. Cargando base de datos

In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'

In [2]:
import numpy as np

In [3]:
import matplotlib.pyplot as plt

In [4]:
import importlib

In [5]:
import sys

In [6]:
sys.path.insert(0, '../..')

In [7]:
# Load the insurance dataset from its CSV file into a pandas DataFrame
csv_path = '../../databases/insurance.csv'
df = pd.read_csv(csv_path)

# 2. Preprocesamiento de los datos

In [8]:
from sklearn import preprocessing

## 2.1. Codificación de variables no numéricas

In [9]:
# Label-encode the 'sex' column: the encoder is fitted on the observed values and
# the resulting integer codes are stored in a new 'sex-encoded' column
sex_encoder = preprocessing.LabelEncoder()
df['sex-encoded'] = sex_encoder.fit_transform(df['sex'])

In [10]:
# Label-encode the 'smoker' column and keep the fitted encoder around;
# the integer codes go into a new 'smoker-encoded' column
smoker_encoder = preprocessing.LabelEncoder().fit(df['smoker'])
smoker_codes = smoker_encoder.transform(df['smoker'])
df['smoker-encoded'] = smoker_codes

In [11]:
# One-hot encode the 'region' column; the values are reshaped into a single
# 2-D column before being handed to the encoder
region_values = df['region'].to_numpy().reshape(-1, 1)
region_encoder = preprocessing.OneHotEncoder()
encoded_regions = region_encoder.fit_transform(region_values).toarray()

# Store every one-hot indicator as its own '<region>-encoded' column
for category, indicator in zip(region_encoder.categories_[0], encoded_regions.T):
    df[f'{category}-encoded'] = indicator

In [12]:
# Keep only the columns used as model inputs, plus the regression target
feature_columns = [
    'age',
    'bmi',
    'smoker-encoded',
    'children',
    'sex-encoded',
    'northwest-encoded',
    'northeast-encoded',
    'southwest-encoded',
    'southeast-encoded',
]
df_x = df[feature_columns]
df_y = df['charges']

# 3. Separación del conjunto de entrenamiento y evaluación

In [13]:
from sklearn import model_selection

In [14]:
from sklearn import preprocessing

## 3.1. Separación de los conjuntos
Es importante notar que la separación del conjunto de datos original en **train**, **valid** y **test** se realiza por fuera del framework de Keras, para garantizar un tratamiento adecuado de los conjuntos acorde a la metodología empleada. En otras palabras, de esta forma nos aseguramos de que cualquier preprocesamiento o normalización sobre validación (valid) y evaluación (test) se realice a partir de la información obtenida en entrenamiento.

In [75]:
# Hold out 20% of the samples as the test set; the remainder is kept together
# as the train+valid portion
train_valid_and_test = model_selection.train_test_split(
    df_x,
    df_y,
    test_size=0.2,
    random_state=15,
    shuffle=True,
)
x_train_valid, x_test, y_train_valid, y_test = train_valid_and_test

In [76]:
# Divide the train+valid portion: 30% of it becomes the validation set,
# the rest is the training set
train_and_valid = model_selection.train_test_split(
    x_train_valid,
    y_train_valid,
    test_size=0.3,
    random_state=23,
    shuffle=True,
)
x_train, x_valid, y_train, y_valid = train_and_valid

## 3.2. Data augmentation

In [77]:
import copy

In [78]:
# Gaussian white noise: the power (variance) is fixed here and the standard
# deviation is derived from it
noise_power = 0.001
noise_std = np.sqrt(noise_power)

In [79]:
# Work on plain NumPy arrays of the training features and targets
x, y = x_train.to_numpy(), y_train.to_numpy()

In [80]:
# Every training sample yields `augment_factor` additional copies, so the
# number of new samples is that factor times the number of rows in x
augment_factor = 20
number_samples = augment_factor * x.shape[0]

In [81]:
# Build `augment_factor` noisy copies of every training sample.
# Only feature column 1 ('bmi' in the column order selected for df_x) is perturbed,
# with zero-mean Gaussian noise of standard deviation `noise_std`; every copy keeps
# the target value of the sample it was generated from.
# A dedicated, seeded generator makes the augmented set reproducible across kernel
# restarts, in line with the fixed `random_state` used for the dataset splits.
augmentation_rng = np.random.default_rng(0)

# np.repeat places the copies of sample i in rows
# [i * augment_factor, (i + 1) * augment_factor), i.e. the same layout as filling
# the arrays sample by sample.
new_x = np.repeat(x, augment_factor, axis=0).astype(float)
new_x[:, 1] += augmentation_rng.normal(0, noise_std, size=new_x.shape[0])
new_y = np.repeat(y, augment_factor).astype(float)

In [82]:
# Stack the augmented samples below the original training samples
x = np.concatenate((x, new_x), axis=0)

In [83]:
# Extend the targets with those of the augmented samples, in the same order
y = np.concatenate((y, new_y), axis=0)

In [84]:
# Shape of the feature matrix after appending the augmented samples to the originals
x.shape

(15729, 9)

In [85]:
# Number of target values after augmentation (flattened to one dimension)
y.reshape(-1).shape

(15729,)

In [86]:
# Wrap the augmented arrays back into pandas objects for the model helper.
# The feature frame reuses the original column labels so that it stays consistent
# with x_valid and x_test, which still carry the named columns selected for df_x
# (a bare pd.DataFrame(x) would relabel the columns 0..8).
x_train = pd.DataFrame(x, columns=x_train.columns)
# NOTE(review): y_train becomes a single-column DataFrame while y_valid and y_test
# remain Series — confirm that rl_helper.run_model accepts both forms.
y_train = pd.DataFrame(y)

# 4. Regresión Lineal

In [87]:
# Import the helper module from the `src` package and reload it, so that edits
# made to its source are picked up without restarting the kernel
from src import rl_helper
importlib.reload(rl_helper);

In [88]:
# Run model experiment.
# The hyperparameters are collected in one mapping and forwarded as keyword
# arguments to rl_helper.run_model together with the train/valid/test data.
experiment_params = dict(
    learning_rate=1.0,
    degree=8,
    scheduler='exponential-decay',
    decay_rate=0.01,
    optimizer='adam',
    beta_1=0.9,
    beta_2=0.99,
    patience=50,
    min_delta=10,
    epochs=2000,
    batch_size=128,
)
# presumably the returned values are the MAE on each split, judging by the
# names — confirm in rl_helper
mae_train, mae_valid, mae_test = rl_helper.run_model(
    x_train, y_train, x_valid, y_valid, x_test, y_test,
    **experiment_params,
)

KeyboardInterrupt: 

In [59]:
# Run model experiment: this configuration additionally passes an 'l2'
# regularizer with lambda 1e-4 and uses batches of 32 samples.
# The hyperparameters are gathered in one mapping and unpacked into the call.
experiment_params = {
    'learning_rate': 5.0,
    'degree': 8,
    'regularizer': 'l2',
    'regularizer_lambda': 1e-4,
    'scheduler': 'exponential-decay',
    'decay_rate': 0.01,
    'optimizer': 'adam',
    'beta_1': 0.9,
    'beta_2': 0.99,
    'patience': 50,
    'min_delta': 10,
    'epochs': 2000,
    'batch_size': 32,
}
experiment_data = (x_train, y_train, x_valid, y_valid, x_test, y_test)
mae_train, mae_valid, mae_test = rl_helper.run_model(*experiment_data, **experiment_params)

Model logs at tb-logs/rl/20210527-163758
Model checkpoints at checkpoints/rl/20210527-163758
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 1)                 24310     
Total params: 24,310
Trainable params: 24,310
Non-trainable params: 0
_________________________________________________________________


ValueError: Expected scalar shape, saw shape: (1,).

In [29]:
# Run model experiment: degree 8, learning rate 5.0, batches of 128 samples.
# Data splits are passed positionally, hyperparameters as keyword arguments.
experiment_data = (x_train, y_train, x_valid, y_valid, x_test, y_test)
experiment_params = dict(
    learning_rate=5.0,
    degree=8,
    scheduler='exponential-decay',
    decay_rate=0.01,
    optimizer='adam',
    beta_1=0.9,
    beta_2=0.99,
    patience=50,
    min_delta=10,
    epochs=2000,
    batch_size=128,
)
mae_train, mae_valid, mae_test = rl_helper.run_model(*experiment_data, **experiment_params)

Model logs at tb-logs/rl/20210527-180034
Model checkpoints at checkpoints/rl/20210527-180034
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 1)                 24310     
Total params: 24,310
Trainable params: 24,310
Non-trainable params: 0
_________________________________________________________________


KeyboardInterrupt: 

In [None]:
# Run model experiment: degree 9, learning rate 5.0, batches of 128 samples.
# The hyperparameters are kept in one mapping and unpacked into the call.
experiment_params = {
    'learning_rate': 5.0,
    'degree': 9,
    'scheduler': 'exponential-decay',
    'decay_rate': 0.01,
    'optimizer': 'adam',
    'beta_1': 0.9,
    'beta_2': 0.99,
    'patience': 50,
    'min_delta': 10,
    'epochs': 2000,
    'batch_size': 128,
}
mae_train, mae_valid, mae_test = rl_helper.run_model(
    x_train, y_train, x_valid, y_valid, x_test, y_test,
    **experiment_params,
)

Model logs at tb-logs/rl/20210527-181314
Model checkpoints at checkpoints/rl/20210527-181314
Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_2 (Dense)              (None, 1)                 48620     
Total params: 48,620
Trainable params: 48,620
Non-trainable params: 0
_________________________________________________________________
