In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import scipy.stats as st

pd.set_option("display.max_columns", None)

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import r2_score, mean_squared_error

from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler, Normalizer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import make_column_transformer

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import keras_tuner as kt
#import tensorflow_addons as tfa
tf.config.set_visible_devices([], 'GPU') #disables GPU

import random
import os

SEED = 42


def seed_everything(seed=42):
    """Seed every random number generator this notebook relies on.

    Exports PYTHONHASHSEED, seeds Python's ``random``, NumPy and TensorFlow
    (both the TF2 and the TF1-compat entry points), and installs a TF1-compat
    Keras session configured with a single intra-op and a single inter-op
    thread.

    Args:
        seed: Integer seed applied to all generators.
    """
    # Python / NumPy level randomness.
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)

    # TensorFlow level randomness.
    tf.random.set_seed(seed)
    tf.compat.v1.set_random_seed(seed)

    # Restrict op execution to one thread each way
    # (presumably to avoid thread-scheduling nondeterminism -- TODO confirm).
    single_thread_cfg = tf.compat.v1.ConfigProto(
        intra_op_parallelism_threads=1,
        inter_op_parallelism_threads=1,
    )
    compat_session = tf.compat.v1.Session(
        graph=tf.compat.v1.get_default_graph(),
        config=single_thread_cfg,
    )
    tf.compat.v1.keras.backend.set_session(compat_session)


seed_everything(SEED)
import warnings
warnings.simplefilter('ignore')

from sklearn.model_selection import train_test_split

# Load Data

In [None]:
# Read the measurements and discard the 'date' column before modelling.
data = pd.read_csv("BC-Data-Set.csv")
data = data.drop(columns=['date'])
# Optional: uncomment to restrict the frame to a hand-picked column subset.
#data = data[['BC','N_CPC', 'PM-2.5', 'PM-1.0', 'O3', 'CO', 'NOX', 'TEMP', 'HUM']] #include BC

In [None]:
# Hold out 10% of the rows (shuffled, seeded) as the final test set.
train, test = train_test_split(
    data,
    test_size=0.1,
    shuffle=True,
    random_state=SEED,
)

target = 'BC'
# Every column other than 'BC' is used as a model input.
features = test.columns.drop('BC').to_list()

# Features

In [None]:
# Summary statistics, one row per column: 'std' shaded red, 'mean' shaded green.
(
    train.describe().T.style
    .background_gradient(subset=['std'], cmap='Reds')
    .background_gradient(subset=['mean'], cmap='Greens')
)

# One row of diagnostics per feature:
#   column 0 - kernel density estimate on a symlog x-axis
#   column 1 - box plot
#   column 2 - probability (Q-Q) plot from scipy.stats.probplot
qty = len(features)
ncols = 3
nrows = qty
# squeeze=False keeps `axs` two-dimensional even when there is a single feature,
# so the `axs[r, c]` indexing below never fails on a 1-D array.
fig, axs = plt.subplots(nrows, ncols, figsize=(20, 5*nrows), squeeze=False)
for r, f in enumerate(features):
    sns.kdeplot(train[f], color='r', label='train', ax=axs[r, 0])
    axs[r, 0].set_xscale('symlog')
    axs[r, 0].set_ylabel(f)
    sns.boxplot(x=train[f], ax=axs[r, 1], color='magenta')
    st.probplot(train[f], plot=axs[r, 2])
plt.tight_layout()
plt.show()

# Outliers

# Tedious phase: inspect the data to decide whether these rows really should be
# removed. Each entry is an upper bound; training rows above it are treated as
# outliers.
optional_outlier_limits = {
    'PM-10': 70,
    'N_CPC': 70,
    'PM-2.5': 60,
    'SO2': 7,
    'CO': 1.75,
    'NO': 225,
    'NOX': 410,
}

# The 'BC' column is always checked.
outlier_parts = [train.loc[train['BC'] > 10]]
# The remaining columns may be absent (e.g. when a column subset is selected at
# load time), so skip missing ones explicitly instead of swallowing every
# exception with a bare `except`.
outlier_parts += [
    train.loc[train[column] > limit]
    for column, limit in optional_outlier_limits.items()
    if column in train.columns
]

outliers = pd.concat(outlier_parts, axis=0)
# A row can exceed several limits; keep a single copy of each.
outliers = outliers[~outliers.duplicated()]
print (outliers.shape)

train = train.drop(outliers.index)

# Preprocessing

In [None]:
# Standardize the input features using statistics learned from the training rows only.
scaler = StandardScaler()

# Work on explicit copies: `train`/`test` come out of train_test_split and may be
# flagged as slices of `data`, in which case assigning columns would be a
# chained assignment (SettingWithCopyWarning, hidden by the warnings filter).
train = train.copy()
test = test.copy()

# NOTE(review): the validation split is carved out of `train` after this step,
# so the scaler statistics also include the future validation rows -- confirm
# whether that is acceptable.
train[features] = scaler.fit_transform(train[features])
test[features] = scaler.transform(test[features])

In [None]:
# Carve a validation set (15%) out of the training rows. The split is seeded
# explicitly so that it does not depend on the global NumPy RNG state, i.e. on
# which cells happened to run before this one.
train_df, val_df = train_test_split(train, test_size=0.15, random_state=SEED)

# Targets as 1-D arrays.
train_labels = np.array(train_df[target])
val_labels = np.array(val_df[target])
test_labels = np.array(test[target])

# Inputs as 2-D arrays with one column per entry of `features`.
train_features = np.array(train_df[features])
val_features = np.array(val_df[features])
test_features = np.array(test[features])

print('train features shape:', train_features.shape)
print('train labels   shape:', train_labels.shape)
print('val features shape:', val_features.shape)
print('val labels   shape:', val_labels.shape)
print('test  features shape:', test_features.shape)
print('test  labels   shape:', test_labels.shape)

In [None]:
# Training budget shared by the tuner and the final fit.
BATCH_SIZE = 32
EPOCHS = 500

# Metrics reported next to the loss while tuning.
METRICS = [keras.metrics.MeanSquaredError()]
metrics = METRICS

# Feature-wise normalization layer adapted to the training inputs.
# It is only referenced from commented-out lines in the model definitions below.
normalize = layers.Normalization(axis=-1)
normalize.adapt(train_features)

# Model tuning
* Tune the structure of the model - number of layers and their parameters
* Comment out this cell after the model is tuned

In [None]:
tune = True


def _hp_name(base, stage):
    """Return the tuner key for hyperparameter `base` of the 1-based `stage`.

    Keeps the key names of the search space: the first stage uses the bare
    names ('stddev', 'units', 'rate'), later stages append '_<stage>', and the
    batch-norm switch is always suffixed ('batch_1', 'batch_2', ...).
    """
    if base == 'batch':
        return f'batch_{stage}'
    return base if stage == 1 else f'{base}_{stage}'


def _add_dense_stage(model, hp, stage):
    """Append one tunable stage: [GaussianNoise] -> Dense(relu) -> [BatchNorm] -> [Dropout]."""
    # Additive zero-centered Gaussian noise; helps mitigate overfitting and can
    # be seen as a form of random data augmentation for real-valued inputs.
    stddev = hp.Choice(_hp_name('stddev', stage), values=[0.0, 0.05, 0.1, 0.2])
    if stddev > 0:
        model.add(layers.GaussianNoise(stddev))

    units = hp.Int(_hp_name('units', stage), min_value=16, max_value=128, step=16)
    model.add(layers.Dense(units=units, activation='relu'))

    use_batch_norm = hp.Choice(_hp_name('batch', stage), values=[0, 1])
    if use_batch_norm > 0:
        model.add(layers.BatchNormalization())

    # Dropout randomly zeroes inputs with frequency `rate` during training and
    # scales the remaining ones by 1/(1 - rate); helps prevent overfitting.
    rate = hp.Choice(_hp_name('rate', stage), values=[0.0, 0.1, 0.2, 0.5])
    if rate > 0:
        model.add(layers.Dropout(rate))


def model_builder(hp):
    """Build and compile a regression MLP from KerasTuner hyperparameters.

    The network has 1 to 3 tunable stages (see `_add_dense_stage`) followed by
    a single linear output unit, and is compiled with an MSE loss and Adam.

    Args:
        hp: KerasTuner hyperparameter container used to sample every choice.

    Returns:
        The compiled Keras model.
    """
    model = keras.Sequential()

    # Normalization layer, no parameters to tune
    #model.add(normalize)

    _add_dense_stage(model, hp, 1)

    hp_num_layers = hp.Choice('num_layers', values=[1, 2, 3])
    # Each extra stage reads its own hyperparameters; in particular the third
    # Dense layer is sized by 'units_3' (it previously reused 'units_2').
    for stage in range(2, hp_num_layers + 1):
        _add_dense_stage(model, hp, stage)

    model.add(layers.Dense(1))

    hp_learning_rate = hp.Choice('learning_rate', values=[0.02, 0.01, 0.005, 0.001])
    model.compile(
        loss=tf.keras.losses.MeanSquaredError(),
        optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=hp_learning_rate),
        metrics=metrics,
    )

    return model
    
# Hyperband search over the space defined in `model_builder`, minimizing the
# validation loss. The tuner is seeded so that the sampled trials are
# reproducible across runs.
tuner = kt.Hyperband(model_builder,
                     objective='val_loss',
                     max_epochs=EPOCHS,
                     factor=4,
                     seed=SEED,
                    )

# Abort a trial once val_loss has not improved by at least 0.02 for 40 epochs.
stop_early = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=40,
    min_delta=0.02,
    mode='min')

if tune:
    tuner.search(train_features, train_labels, epochs=EPOCHS, validation_data=(val_features, val_labels), callbacks=[stop_early])

    # Get the optimal hyperparameters
    best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
    # results_summary() prints the report itself and returns None, so it must
    # not be wrapped in print() (that would emit a stray "None").
    tuner.results_summary()

    print('The hyperparameter search is complete.')

without outliers 
Trial 0410 summary
Hyperparameters:
stddev: 0.0
units: 112
batch_1: 0
rate: 0.2
num_layers: 2
learning_rate: 0.001
stddev_2: 0.05
units_2: 80
batch_2: 0
rate_2: 0.0
stddev_3: 0.1
units_3: 96
batch_3: 0
rate_3: 0.2
tuner/epochs: 500
tuner/initial_epoch: 125
tuner/bracket: 4
tuner/round: 4
tuner/trial_id: 0406
Score: 0.20870405435562134

with outliers
Trial 0405 summary
Hyperparameters:
stddev: 0.05
units: 48
batch_1: 0
rate: 0.1
num_layers: 2
learning_rate: 0.01
stddev_2: 0.0
units_2: 16
batch_2: 0
rate_2: 0.1
stddev_3: 0.2
units_3: 96
batch_3: 1
rate_3: 0.5
tuner/epochs: 125
tuner/initial_epoch: 32
tuner/bracket: 4
tuner/round: 3
tuner/trial_id: 0389
Score: 0.23247030377388

In [None]:
if tune:
    # Report the winning configuration stage by stage so it can be copied into
    # the hand-written model below.
    # Each entry: ordinal used in the message, indentation of the message's
    # second line, then the stddev / units / batch-norm / dropout keys.
    stage_specs = [
        ('first', '    ', 'stddev', 'units', 'batch_1', 'rate'),
        ('2nd', '        ', 'stddev_2', 'units_2', 'batch_2', 'rate_2'),
        ('3rd', '        ', 'stddev_3', 'units_3', 'batch_3', 'rate_3'),
    ]

    # The search space allows at most three stages, so only the first
    # `num_layers` specs describe layers that are part of the best model.
    num_layers = best_hps.get('num_layers')
    for ordinal, pad, stddev_key, units_key, batch_key, rate_key in stage_specs[:num_layers]:
        stddev = best_hps.get(stddev_key)
        if stddev > 0:
            print(f'Add GaussianNoise({stddev})')

        print(f'The optimal number of units in the {ordinal} densely-connected\n'
              f'{pad}layer is {best_hps.get(units_key)}')

        # The batch-normalization switch is tuned as well, so report it too.
        if best_hps.get(batch_key) > 0:
            print('Add BatchNormalization()')

        rate = best_hps.get(rate_key)
        if rate > 0:
            print(f'Add Dropout({rate})')

    print(f'''The optimal learning rate for the optimizer
    is {best_hps.get('learning_rate')}.''')

    best_model = tuner.get_best_models(num_models=1)[0]
    #best_model.summary()

# Evaluation

In [None]:
def get_uncompiled_model():
    """Assemble the hand-configured regression network without compiling it.

    Layout: GaussianNoise(0.05) -> Dense(112, relu) -> Dropout(0.1)
            -> GaussianNoise(0.0) -> Dense(112, relu) -> Dropout(0.1)
            -> Dense(16, relu) -> Dense(1).

    Returns:
        The uncompiled Keras Sequential model.
    """
    net = tf.keras.Sequential()
    # Optional input normalization (disabled): net.add(normalize)

    # Stage 1 (BatchNormalization disabled)
    net.add(layers.GaussianNoise(0.05))
    net.add(layers.Dense(112, activation='relu'))
    net.add(layers.Dropout(0.1))

    # Stage 2 (BatchNormalization disabled)
    net.add(layers.GaussianNoise(0.00))
    net.add(layers.Dense(112, activation='relu'))
    net.add(layers.Dropout(0.1))

    # Stage 3 (GaussianNoise, BatchNormalization and Dropout disabled)
    net.add(layers.Dense(16, activation='relu'))

    # Single linear output unit for the regression target.
    net.add(layers.Dense(1))
    return net
    
def get_compiled_model():
    """Build the hand-configured network and compile it for regression.

    Uses a mean-squared-error loss and the Adam optimizer with a learning
    rate of 0.001.

    Returns:
        The compiled Keras model.
    """
    net = get_uncompiled_model()
    mse_loss = tf.keras.losses.MeanSquaredError()
    adam = tf.keras.optimizers.Adam(learning_rate=0.001)
    net.compile(loss=mse_loss, optimizer=adam)
    return net

# Stop the final fit once val_loss has not improved by at least 0.02 for
# 100 epochs, then restore the best weights seen.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    mode='min',
    min_delta=0.02,
    patience=100,
    restore_best_weights=True,
    verbose=1,
)

model = get_compiled_model()
#model.summary()
#model.summary()

In [None]:
# Train the hand-configured model, monitoring the validation split.
history = model.fit(
    x=train_features,
    y=train_labels,
    validation_data=(val_features, val_labels),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=[early_stopping],
    verbose=2,
)

In [None]:
def plot_metrics(history, metrics=('loss',)):
    """Plot training and validation curves for each requested metric.

    Args:
        history: Object returned by ``model.fit``; its ``epoch`` list and
            ``history`` dict are read. For every metric ``m`` both ``m`` and
            ``'val_' + m`` must be present in ``history.history``.
        metrics: Names of the metrics to plot, one panel per metric laid out
            side by side. Defaults to the loss only.
    """
    for n, metric in enumerate(metrics):
        name = metric.replace("_", " ").capitalize()
        # One panel per metric so several metrics are not drawn on top of each other.
        plt.subplot(1, len(metrics), n + 1)
        plt.plot(history.epoch, history.history[metric], color='r', label='Train')
        plt.plot(history.epoch, history.history['val_' + metric],
                 color='b', linestyle="--", label='Val')
        plt.xlabel('Epoch')
        plt.ylabel(name)
        if metric == 'loss':
            # Anchor the loss axis at zero, keeping the automatic upper limit.
            plt.ylim([0, plt.ylim()[1]])
        else:
            plt.ylim([-0.2, 1])
        plt.legend()


plot_metrics(history)

In [None]:
# Raw model outputs; `predict` is called on 2-D feature arrays.
train_probas = model.predict(train_features, batch_size=BATCH_SIZE)
test_probas = model.predict(test_features, batch_size=BATCH_SIZE)
val_probas = model.predict(val_features, batch_size=BATCH_SIZE)

# The model is trained with an MSE loss on a continuous target, so its outputs
# are used as-is. Rounding them to integers (a leftover from classification
# code) would throw away precision and distort R2 / RMSE.
train_predictions = train_probas.ravel()
test_predictions = test_probas.ravel()
val_predictions = val_probas.ravel()

In [None]:
# Score the held-out test set.
score = r2_score(test_labels, test_predictions)
rmse_score = np.sqrt(mean_squared_error(test_labels, test_predictions))

# '\R' is not a valid escape sequence: it printed a literal backslash and
# triggers an invalid-escape warning on recent Python versions.
print('R2:', score)
print('RMSE:', rmse_score)