# Boston Housing Codeathon
Nikki Aaron

In [None]:
#!pip install -q tensorflow_data_validation
#!pip install -q -U tensorflow_transform
#!pip install talos

In [1]:
import pandas as pd
import numpy as np
from typing import Tuple
import os

import talos as ta
from talos.model.normalizers import lr_normalizer
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Dropout, Flatten, Dense, BatchNormalization
import tensorflow_data_validation as tfdv
import tensorflow_transform as tft

In [2]:
path = './'
filename = 'housing.csv'
colnames = ['CRIM','ZN','INDUS','CHAS','NOX', 'RM','AGE','DIS','RAD','TAX', 'PTRATIO','MEDV','B','LSTAT']
response_col = 'MEDV'

housing = pd.read_table(f'{path}{filename}', header=None) 
housing = pd.DataFrame(housing[0].str.split().to_list(), columns=colnames) 
housing = housing.astype('float32') 
housing.head() 

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,MEDV,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.199997,4.09,1.0,296.0,15.3,396.899994,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.900002,4.9671,2.0,242.0,17.799999,396.899994,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.099998,4.9671,2.0,242.0,17.799999,392.829987,4.03,34.700001
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.799999,6.0622,3.0,222.0,18.700001,394.630005,2.94,33.400002
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.200001,6.0622,3.0,222.0,18.700001,396.899994,5.33,36.200001


## EDA

In [5]:
housing.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,MEDV,B,LSTAT
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.13678,0.06917,0.554695,6.284634,68.574898,3.795043,9.549407,408.237152,18.455534,356.674042,12.653064,22.532806
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537109,2.164946,91.294861,7.141061,9.197104
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,5.0
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.377487,6.95,17.025
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.440002,11.36,21.200001
75%,3.677083,12.5,18.1,0.0,0.624,6.6235,94.074999,5.188425,24.0,666.0,20.200001,396.225006,16.954999,25.0
max,88.976196,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.899994,37.970001,50.0


In [4]:
stats = tfdv.generate_statistics_from_dataframe(dataframe=housing)

In [7]:
tfdv.visualize_statistics(stats)

## Split dataset

In [3]:
ds_size = len(housing)
train_ratio = 0.90
train_size = int(ds_size*train_ratio)

ds = tf.data.Dataset.from_tensor_slices((
    housing[[col for col in housing.columns if col != response_col]].values, 
    housing[response_col].values))

tf.random.set_seed(49)
shuffled_ds = ds.shuffle(ds_size, reshuffle_each_iteration=False)
train_ds = shuffled_ds.take(train_size)
test_ds = shuffled_ds.skip(train_size)

In [4]:
for i in test_ds.take(3):
    print((i[0].numpy(), i[1].numpy()))

(array([  0.53412,  20.     ,   3.97   ,   0.     ,   0.647  ,   7.52   ,
        89.4    ,   2.1398 ,   5.     , 264.     ,  13.     ,   7.26   ,
        43.1    ], dtype=float32), 388.37)
(array([  2.73397,   0.     ,  19.58   ,   0.     ,   0.871  ,   5.597  ,
        94.9    ,   1.5257 ,   5.     , 403.     ,  14.7    ,  21.45   ,
        15.4    ], dtype=float32), 351.85)
(array([  0.35809,   0.     ,   6.2    ,   1.     ,   0.507  ,   6.951  ,
        88.5    ,   2.8617 ,   8.     , 307.     ,  17.4    ,   9.71   ,
        26.7    ], dtype=float32), 391.7)


In [5]:
print(f'Training data : {len(train_ds)}')
print(f'Test data : {len(test_ds)}')

Training data : 456
Test data : 50


## Preprocessing

In [44]:
def preprocess(ds):
    inputs = ds.copy()
    for col in inputs.columns:
        """Preprocess input columns into transformed columns."""
        if inputs[col].dtype == 'object':
            inputs[col] = tft.compute_and_apply_vocabulary(inputs[col])
        else:
            inputs[col] = tft.mean(inputs[col].values) * tft.scale_to_0_1(inputs[col].values)
    return inputs

In [None]:
df_preprocessed = preprocess(housing)
df_preprocessed

## Neural Network

In [18]:
def build_model(x_train, y_train, x_val, y_val, params):
    model = keras.Sequential()
    
    for i in range(params['hidden_layers']):
        model.add(BatchNormalization(input_shape=x_train.shape[1:]))
        model.add(Dense(params['first_neuron'], input_dim=(len(colnames) - 1),
                        activation=params['activation'],
                        kernel_initializer='normal'))

        model.add(Dropout(params['dropout']))
   
    # then we finish again with completely standard Keras way
    model.add(Dense(1, activation=params['last_activation'],
                    kernel_initializer='normal'))
    
  
    model.compile(loss=params['losses'],
                  optimizer=params['optimizer'](),
                  # here we add a regulizer normalization function from Talos
#                   optimizer=params['optimizer'](lr=lr_normalizer(params['lr'],params['optimizer'])),
                  metrics=tf.keras.metrics.RootMeanSquaredError)

    out = model.fit(x_train,
                    y_train,
                    batch_size=params['batch_size'],
                    epochs=params['epochs'],
                    validation_data=(x_val, y_val),
                    callbacks=[ta.utils.early_stopper(params['epochs'])],
                    workers=4,
                    verbose=0)

    # finally we have to make sure that history object and model are returned
    return out, model

In [7]:
train_data = np.array([list(x[0].numpy()) for x in list(train_ds)])
train_targets = np.array([x[1].numpy() for x in list(train_ds)])

test_data = np.array([list(x[0].numpy()) for x in list(test_ds)])
test_targets = np.array([x[1].numpy() for x in list(test_ds)])

## Hyperparameter tuning

In [21]:
p = {'lr': (0.5, 5, 10),
     'first_neuron':[4, 8, 16, 32, 64],
     'hidden_layers':[1, 2, 3],
     'batch_size': (2, 30, 10),
     'epochs': [10, 15, 25],
     'dropout': (0, 0.5, 5),
     'weight_regulizer':[None],
     'emb_output_dims': [None],
#      'shapes':['brick','long_funnel'],
     'optimizer': [keras.optimizers.Adam, keras.optimizers.Nadam, keras.optimizers.RMSprop],
     'losses': [keras.losses.logcosh, keras.losses.mean_squared_error],
     'activation':[keras.activations.relu, keras.activations.elu],
     'last_activation': [keras.activations.sigmoid, keras.activations.softmax]}

In [None]:
t = ta.Scan(x=train_data,
            y=train_targets,
            x_val=test_data,
            y_val=test_targets,
            model=build_model,
            fraction_limit=0.01, 
            params=p,
            experiment_name='exp',
            disable_progress_bar=True)



In [None]:
t.best_model(metric='mse', asc=False)

In [None]:
t.data.columns

In [None]:
min(t.data['mse'])

In [None]:
model_id = t.data['mse'].astype('float').argmax() - 1
# Clear any previous TensorFlow session.
tf.keras.backend.clear_session()

# Load the model parameters from the scanner.
from tensorflow.keras.models import model_from_json
model = model_from_json(t.saved_models[model_id])
model.set_weights(t.saved_weights[model_id])
model.summary()
model.save('best_model.h5')

In [None]:
r = Analyze('experiment_log.csv')
r.data

# returns the highest value for 'val_fmeasure'
r.low('val_mse')

# draws a histogram for 'val_acc'
r.plot_hist()

r.best_params

In [None]:
t.evaluate_models(
            x_val=test_data,
            y_val=test_targets,
            n_models=10,
            metric='mse',
            folds=5,
            shuffle=True,
            average='binary',
            asc=False)