#### Importing dependencies
---

In [None]:
import pandas as pd
import numpy as np
import os

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

In [None]:
# When the kernel starts inside a folder named 'Notebooks', step up to its parent
# so that the relative paths used below resolve from there.
# Only the final path component is inspected: a plain substring replace would also
# mangle any other directory whose name happens to contain 'Notebooks'.
cwd = os.getcwd()
if os.path.basename(cwd) == 'Notebooks':
    os.chdir(os.path.dirname(cwd))
os.getcwd()

In [None]:
from GitMarco.tf.utils import limit_memory
# NOTE(review): presumably caps the GPU memory TensorFlow may allocate (2000 looks
# like a size in MB) — confirm against the GitMarco documentation.
limit_memory(2000)

#### Loading data (Excel format)
---

In [None]:
# Read the whole dataset from a single Excel workbook; it is split into train and
# test sets later in the notebook. The path is relative to the working directory.
# (`os` is already imported in the first cell; the re-import below is harmless.)
import os
df = pd.read_excel('tabular/df.xlsx')

#### Check data type
---

In [None]:
# Column dtypes and non-null counts of the freshly loaded frame
df.info()

#### Drop desired columns from the dataset
---

In [None]:
# Column label(s) to drop from `df` in the next cell; keep None to drop nothing
rem_cols = None

In [None]:
# Remove the columns listed in `rem_cols`, if any were requested.
# A new frame is bound to `df` instead of mutating the existing one in place.
if rem_cols is not None:
    df = df.drop(columns=rem_cols)

# Preview the result. This has to be the cell's last top-level expression:
# an expression nested inside the `if` body is never rendered by Jupyter.
df.head()

#### Cleaning
---

In [None]:
# Shape before and after the manual row filter; the filter itself is currently
# commented out, so both prints show the same shape.
print(df.shape)
# df = df[df.CNT_CHILDREN != 12]
print(df.shape)

#### Removing NaN values
---


In [None]:
# Drop every row that contains at least one missing value and report the shape
# before and after.
# Only `how` is passed: recent pandas versions treat `how` and `thresh` as mutually
# exclusive and raise a TypeError when both are supplied, even as `thresh=None`.
print('Before', df.shape)
df = df.dropna(axis=0, how='any')
print('After', df.shape)

#### Fixing data
---

In [None]:
# df.DAYS_EMPLOYED[df.DAYS_EMPLOYED == 365243] = 1
# df.head()

#### Performing Factorization
---
We transform the desired columns into factorized classes

In [None]:
# Columns to replace by integer category codes; keep None to skip the step
cols = None
if cols is not None:
    # pd.factorize returns (codes, uniques); only the integer codes are kept
    df[cols] = df[cols].apply(lambda x: pd.factorize(x)[0])

# Preview the frame. Kept at top level because an expression nested inside the
# `if` body would not be rendered by Jupyter.
df.head()

#### Checking again data dtype
---

In [None]:
# Re-inspect dtypes and non-null counts after the cleaning steps above
df.info()

#### Checking stats
---

In [None]:
# Summary statistics, transposed so that each row describes one dataset column
df.describe().T

#### Distribution
---

In [None]:
# Histogram of the fourth-from-last column. The trailing semicolon keeps the
# returned axes array out of the cell output.
df.hist(column=df.columns[-4]);

#### Preparing data for training, validation and test
---

In [None]:
# Work on a copy so later processing does not touch the cleaned `df`
input_data = df.copy()
input_data.head()

#### Normalizing data
---

In [None]:
# Standardise every column of `input_data` (features and labels alike) and keep the
# fitted scaler around so the transformation can be reused or inverted later.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split below, so test-set statistics influence the scaling —
# consider fitting on the training rows only.
scaler = StandardScaler().fit(input_data)
scaled_values = scaler.transform(input_data)
normed_data = pd.DataFrame(scaled_values, columns=input_data.columns)
normed_data.describe().T

#### Splitting data into training, validation and testing
---

In [None]:
n_labels = 1     # number of trailing columns used as regression targets
split_seed = 22  # fixed seed so the train/test split is reproducible across runs

# The last `n_labels` columns are the targets, everything before them the features.
# From here on `normed_data` holds the feature columns only (create_model reads its
# width), so this cell is meant to be run once after the normalisation cell.
labels = normed_data.drop(columns=df.columns[:-n_labels])
normed_data = normed_data.drop(columns=df.columns[-n_labels:])

# 80/20 shuffled split; without `random_state` every run would produce a different
# partition and the saved train/test files could not be regenerated.
train_data, test_data, train_labels, test_labels = train_test_split(
    normed_data, labels, test_size=0.2, shuffle=True, random_state=split_seed)

#### Creating a parametric sequential model
---

In [None]:
from GitMarco.tf.metrics import r_squared
from GitMarco.graphics.matplotlib import validation_plot

def create_model(dropout_rate: float = 0.0,
                 neurons: int = 32,
                 activation: str = 'relu',
                 n_layers: int = 2,
                 learning_rate: float = 0.001,
                 optimizer = tf.keras.optimizers.Adam,
                 nesterov: bool = True,
                 momentum: float = .9,
                 ) -> tf.keras.Model:
    """Build and compile a fully connected regression network.

    The network is `n_layers` blocks of Dense(`neurons`) + Dropout followed by a
    linear Dense output layer. Input width is taken from the notebook-level
    `normed_data` and output width from `train_labels`, so both must exist
    before this function is called.

    Args:
        dropout_rate: Rate passed to every Dropout layer.
        neurons: Units in each hidden Dense layer.
        activation: Activation of the hidden Dense layers.
        n_layers: Number of hidden Dense/Dropout blocks (at least one is always built).
        learning_rate: Learning rate handed to the optimizer.
        optimizer: Optimizer *class* (not an instance); it is instantiated here.
        nesterov: Nesterov flag, used only when `optimizer` is SGD or a subclass.
        momentum: Momentum, used only when `optimizer` is SGD or a subclass.

    Returns:
        The compiled model, using mean squared error as loss.
    """
    model = tf.keras.models.Sequential()
    model.add(tf.keras.layers.Dense(neurons,
                                    activation=activation,
                                    input_shape=(normed_data.shape[1], )))
    model.add(tf.keras.layers.Dropout(dropout_rate))

    # The first hidden block is added above, hence the range starting at 1.
    for _ in range(1, n_layers):
        model.add(tf.keras.layers.Dense(neurons, activation=activation))
        model.add(tf.keras.layers.Dropout(dropout_rate))
    model.add(tf.keras.layers.Dense(train_labels.shape[1]))

    # `optimizer` is a class, so it must be tested with issubclass: an isinstance
    # check against SGD is always False and would silently drop nesterov/momentum.
    optimizer_kwargs = {'learning_rate': learning_rate}
    if isinstance(optimizer, type) and issubclass(optimizer, tf.keras.optimizers.SGD):
        optimizer_kwargs.update(nesterov=nesterov, momentum=momentum)

    model.compile(optimizer=optimizer(**optimizer_kwargs),
                  loss=tf.keras.losses.mean_squared_error,
                  # metrics=[r_squared]
                  )
    model.summary()
    return model

#### Wrap keras model with sklearn
---

In [None]:
# Wrap the model builder in a scikit-learn compatible regressor so that
# GridSearchCV can construct and fit it.
# NOTE(review): `keras.wrappers.scikit_learn` is deprecated and missing from recent
# Keras releases — confirm the pinned version still ships it.
from keras.wrappers.scikit_learn import KerasRegressor
model = KerasRegressor(build_fn=create_model, verbose=0)

#### Performing cross-validation and hyper-parameters optimization
---

In [None]:
# We specify here the parameters of the grid search, in the form of lists
learning_rate = [0.01,]
dropout_rate = [0.2]
batch_size = [100]
epochs = [2000, ]
neurons = [256,]
activation = ['relu',]
n_layers = [2,]
nesterov = [True,]
momentum = [.9, ]
optimizer = [
             tf.keras.optimizers.Adam,
             # tf.keras.optimizers.SGD,
             # tf.keras.optimizers.RMSprop,
             ]

# Make a dictionary of the grid search parameters
param_grid = dict(learning_rate=learning_rate,
                  dropout_rate=dropout_rate,
                  batch_size=batch_size,
                  epochs=epochs,
                  neurons=neurons,
                  activation=activation,
                  n_layers=n_layers,
                  optimizer=optimizer,
                  nesterov=nesterov,
                  momentum=momentum)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score

seed = 22    # random state of the cross-validation splitter
jobs = 1     # parallel processes (-1 => all available cores)
n_folds = 5  # number of cross-validation folds

# Shuffled K-fold splitter, seeded so the folds are the same on every run
cv_splitter = KFold(n_splits=n_folds, shuffle=True, random_state=seed)

# Exhaustive search over `param_grid`, scored by negative mean squared error;
# error_score='raise' makes a failing fit abort the search instead of being scored.
grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=cv_splitter,
    scoring='neg_mean_squared_error',
    error_score='raise',
    verbose=0,
    n_jobs=jobs,
)

# The extra verbose=0 is forwarded to the estimator's fit call
grid_results = grid.fit(train_data, train_labels, verbose=0)

#### Explore Results
---

In [None]:
# Summarize the results in a readable format
# Best cross-validated score together with the parameter set that produced it
best_score, best_params = grid_results.best_score_, grid_results.best_params_
print("Best: {0}, using {1} \n".format(best_score, best_params))

In [None]:
# One line per evaluated configuration: mean score, its standard deviation across
# the folds, and the parameters used.
cv_results = grid_results.cv_results_
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']
params = cv_results['params']

for idx in range(len(params)):
    print('{0} ({1}) with: {2}\n'.format(means[idx], stds[idx], params[idx]))

In [None]:
# Names of the attributes stored on the fitted grid-search object
grid_results.__dict__.keys()

The results of the grid search can be post-processed in various ways to produce detailed reports describing how the performance of the regressor changes with its hyperparameters. **TODO**

#### Extracting the best model
---

In [None]:
# Names of the attributes stored on the best estimator found by the search
grid_results.best_estimator_.__dict__.keys()

In [None]:
# Take the underlying Keras model out of the best scikit-learn wrapper
final_model = grid_results.best_estimator_.model

#### Evaluating the best model on the test set
---

In [None]:
# Loss of the selected model on the training split
train_scores = final_model.evaluate(x=train_data, y=train_labels)
print('Train loss: {0}'.format(train_scores))

In [None]:
# R2 of every output column on the training split, followed by their mean.
# The prediction is computed once up front: calling predict inside the loop
# re-ran inference on the whole training set for every label column.
train_predictions = final_model.predict(train_data)
train_targets = train_labels.to_numpy()

scores = []
for i in range(train_targets.shape[1]):
    R2_ = r2_score(train_targets[:, i], train_predictions[:, i])
    print(R2_)
    scores.append(R2_)
print('\n')
np.mean(scores)

In [None]:
# Loss of the selected model on the held-out test split
test_scores = final_model.evaluate(x=test_data, y=test_labels)
print('Test loss: {0}'.format(test_scores))

In [None]:
# R2 of the selected model on the held-out test split
test_predictions = final_model.predict(test_data)
r2_score(y_true=test_labels, y_pred=test_predictions)

#### Saving training data, test data and best model
---

In [None]:
# Output folder (relative to the working directory) for the saved data and model
results_path = 'results_tabular'

In [None]:
import shutil

# Start from an empty results folder: remove a previous one, then create it anew.
# shutil.rmtree replaces the `rm -r` shell call, which is not portable, breaks on
# paths containing spaces and has its exit status ignored by os.system.
if os.path.exists(results_path):
    shutil.rmtree(results_path)
os.mkdir(results_path)

In [None]:
def np_to_csv(x, y):
    """Write array-like `x` as comma-separated text to the file `<y>.csv`."""
    target_file = f"{y}.csv"
    return np.savetxt(target_file, x, delimiter=",")

In [None]:
# Feature frames are written with DataFrame.to_csv ...
for frame, filename in ((train_data, 'train_data.csv'), (test_data, 'test_data.csv')):
    frame.to_csv(os.path.join(results_path, filename))

# ... while the label frames go through np_to_csv, which appends '.csv' itself
for frame, stem in ((train_labels, 'train_labels'), (test_labels, 'test_labels')):
    np_to_csv(frame, os.path.join(results_path, stem))

In [None]:
# Persist the selected Keras model under <results_path>/best_model
final_model.save(os.path.join(results_path, 'best_model'))

In [None]:
!zip -r results_tabular/best_model.zip results_tabular/best_model

In [None]:
import matplotlib.pyplot as plt

# The seaborn style sheets were renamed with a 'seaborn-v0_8-' prefix in
# matplotlib 3.6 and the old names were later removed; use whichever name this
# installation provides and fall back to the default style otherwise.
style_name = next(
    (name for name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid')
     if name in plt.style.available),
    'default',
)
plt.style.use(style_name)

# Predict once; calling predict inside the loop repeated inference per label column
train_predictions = final_model.predict(train_data)
for j in range(train_labels.shape[1]):
    plot = validation_plot(train_labels.iloc[:, j].to_numpy(), train_predictions[:, j],
                           show=True, title='Training', marker_color='red')

In [None]:
# Validation plot for every label column of the test split.
# Predict once; calling predict inside the loop repeated inference per label column.
test_predictions = final_model.predict(test_data)
for k in range(test_labels.shape[1]):
    plot = validation_plot(test_labels.iloc[:, k].to_numpy(), test_predictions[:, k],
                           show=True, title='Test', marker_color='red')