# Forecasting Ugandan Air Quality using Deep Learning

# Module loading
Load all the packages needed for your assignment. Some modules are provided in advance; feel free to add more if you need them.

In [None]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, MinMaxScaler

import warnings
warnings.filterwarnings('ignore')

import tensorflow as tf

from tensorflow.keras import layers
from tensorflow.keras import regularizers
from tensorflow.keras.layers.experimental import preprocessing

import pathlib
import shutil
import tempfile

print(tf.__version__)

In [None]:
#!pip install -q git+https://github.com/tensorflow/docs

import tensorflow_docs as tfdocs
import tensorflow_docs.modeling
import tensorflow_docs.plots

In [None]:
# Single seed shared by the train/test split and the NumPy/TensorFlow RNGs.
rseed = 42
# Seed the global NumPy and TensorFlow generators as well. Previously only the
# train/test split used rseed, so weight initialisation and batch shuffling
# differed on every run and the reported metrics could not be reproduced.
np.random.seed(rseed)
tf.random.set_seed(rseed)
# Use float64 as the default Keras float type.
tf.keras.backend.set_floatx('float64')
# Default figure size for plots that do not pass their own figsize.
plt.rcParams['figure.figsize'] = (15, 10)

## Data loading
Load the data from your ML project here. You can use either `pandas` or `numpy` to format your data.

In [None]:
# Load the air-quality dataset (CSV, path relative to the notebook) into a DataFrame
df=pd.read_csv("./data/air_quality_final.csv")

In [None]:
# Preview the first rows to check that the file was parsed as expected
df.head()

## Data Preparation
Train-Test-Split and dummy encoding (if necessary) 

In [None]:
# Separate the feature matrix (everything except 'target') from the target vector
X = df.drop('target', axis=1)
y = df['target']
# Report the dimensions as a quick sanity check
print(f"We have {X.shape[0]} observations in our dataset and {X.shape[1]} features")
print(f"Our target vector has also {y.shape[0]} values")

In [None]:
# Dummy-encode the location feature. drop_first=True omits one level so the
# dummy columns are not perfectly collinear. The dtype is pinned to uint8
# (the default before pandas 2.0): newer pandas returns bool columns, which
# would turn the mixed float/bool feature frame into an object array when it
# is converted for the network.
location = pd.get_dummies(X['location'], prefix='location', drop_first=True, dtype='uint8')
# Append the dummy columns to the feature frame. The raw 'location' column is
# kept for now because the train/test split stratifies on it.
X = pd.concat([X, location], axis=1)

In [None]:
# Train-test split, stratified on 'location' so that both splits keep the same
# mix of locations; rseed makes the split repeatable. The test size is left at
# scikit-learn's default.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=X['location'], random_state=rseed)

In [None]:
# Take the raw multiclass 'location' column out of both splits in one step:
# pop() removes the column from the frame and returns it, so the labels stay
# available for colour-coding the residual plots while the feature frames
# keep only the dummy-encoded location columns.
X_train_loc = X_train.pop('location')
X_test_loc = X_test.pop('location')

In [None]:
# Names of all float64 columns; these are the features that get z-standardised
num_features = X_train.select_dtypes(include='float64').columns.tolist()

In [None]:
# scale features using z-transformation
scaler = StandardScaler()

# fit the scaler on the training data only and transform the training data
X_train[num_features] = scaler.fit_transform(X_train[num_features])
# apply the already-fitted transform to the test data (no refit, so no
# information from the test set enters the scaling)
X_test[num_features] = scaler.transform(X_test[num_features])

## DNN without regularization

### Define model architecture and compile

In [None]:
# Define model architecture and compile in function
def get_compiled_model(input_dim=26, learning_rate=0.01):
    """Build and compile the fully connected regression network without regularization.

    The network stacks four hidden Dense layers (64 units, ReLU, 'uniform'
    kernel initializer) and a single linear output unit. It is compiled with
    the Adam optimizer, mean absolute error as loss and RMSE as an extra metric.

    Args:
        input_dim: Number of input features. Defaults to 26, the value that
            used to be hard-coded; pass ``X_train.shape[1]`` if the feature
            set changes.
        learning_rate: Learning rate handed to Adam. Defaults to 0.01.

    Returns:
        The compiled ``tf.keras.Sequential`` model.
    """
    model = tf.keras.Sequential([
      tf.keras.layers.Dense(64, kernel_initializer='uniform', activation='relu', input_dim=input_dim),
      tf.keras.layers.Dense(64, kernel_initializer='uniform', activation='relu'),
      tf.keras.layers.Dense(64, kernel_initializer='uniform', activation='relu'),
      tf.keras.layers.Dense(64, kernel_initializer='uniform', activation='relu'),
      # No activation: the output is the predicted target value itself.
      tf.keras.layers.Dense(1, kernel_initializer='uniform')
    ])

    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate, name='Adam'),
                  loss='mae',
                  metrics=[tf.keras.metrics.RootMeanSquaredError()])
    return model

In [None]:
# Instantiate the unregularized model on the CPU and print its summary
with tf.device('/cpu:0'):
    dnn_wo = get_compiled_model()
    # summary() prints the table itself and returns None; wrapping it in
    # print() only appended a stray "None" line to the output.
    dnn_wo.summary()

### Fit the model to the training data

In [None]:
# Train the unregularized model on the CPU. 20% of the training data is held
# out as validation data; the returned History object is kept for plotting.
with tf.device('/cpu:0'):
    train_history_dnn_wo = dnn_wo.fit(X_train,
                        y_train,
                        validation_split=0.2,
                        verbose=1,
                        batch_size = 48,
                        epochs=200)

In [None]:
# Plotting function for training metrics
def _plot_metric_curves(ax, history, metric, title, ylabel):
    """Draw the training and validation curve of one metric on ``ax``."""
    train_curve = history.history[metric]
    ax.plot(train_curve, label='Training Set', c='blue')
    ax.plot(history.history[f'val_{metric}'], label='Validation Set', c='red')

    y_ticks = np.arange(0, 81, 20)
    ax.set_yticks(ticks=y_ticks, labels=y_ticks, fontsize=12)
    # Derive the x ticks from the number of recorded epochs so that both
    # subplots share the same tick range (the last epoch included) and the
    # plot still works when the model is trained for a different epoch count.
    # For 200 epochs this gives ticks 0, 20, ..., 200.
    n_epochs = len(train_curve)
    x_ticks = np.arange(0, n_epochs + 1, max(1, n_epochs // 10))
    ax.set_xticks(ticks=x_ticks, labels=x_ticks, fontsize=12)

    ax.set_title(title, fontsize=16)
    ax.legend(fontsize=12)
    ax.set_ylabel(ylabel, fontsize=14)
    ax.set_xlabel('Epoch', fontsize=14)


def plot_training_metrics(history):
    """Plot RMSE and loss per epoch for the training and validation set.

    Creates a new 8x8 figure with two stacked subplots: RMSE on top, loss
    (MAE) below.

    Args:
        history: Object whose ``history`` attribute is a dict holding the
            per-epoch lists 'root_mean_squared_error',
            'val_root_mean_squared_error', 'loss' and 'val_loss' (as returned
            by ``model.fit`` for the models in this notebook).

    Returns:
        None. The figure is left as the current matplotlib figure.
    """
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(8, 8))
    _plot_metric_curves(ax1, history, 'root_mean_squared_error',
                        'Change in RMSE across training epochs', 'RMSE')
    _plot_metric_curves(ax2, history, 'loss',
                        '\nChange in Loss across training epochs', 'Loss (MAE)')

    plt.tight_layout()


In [None]:
# Plot RMSE and loss per epoch for the model without regularization
plot_training_metrics(train_history_dnn_wo)

### Evaluate model performance on test set

In [None]:
# Evaluate the model on the test set using .evaluate; it returns the loss
# (MAE) followed by the compiled metric (RMSE)
loss, rmse = dnn_wo.evaluate(X_test, y_test, verbose=2)
print(f'Model RMSE: {rmse}')
print('--------'*5)

# Predict values for test set and flatten to 1 dimension so they line up
# element-wise with y_test
y_pred_dnn_wo = dnn_wo.predict(X_test).flatten()

In [None]:
# calculate residuals (measured minus predicted)
residual_dnn_wo = y_test - y_pred_dnn_wo

# compute mean of residuals; a value far from 0 indicates a systematic
# over- or under-prediction
np.mean(residual_dnn_wo)

In [None]:
# Residuals against predictions for the DNN without regularization,
# colour-coded by measurement location
plt.figure(figsize=(6,5))
sns.scatterplot(x=y_pred_dnn_wo, y=residual_dnn_wo, hue=X_test_loc)
plt.xlabel('y_pred')
plt.ylabel('residual')
plt.title('Residual plot from DNN w/o regularization');

In [None]:
# Measured against predicted values for the DNN without regularization
# (single colour; the location colour-coding is disabled)
plt.figure(figsize=(6,5))
sns.scatterplot(x=y_pred_dnn_wo, y=y_test, color='b')#, hue=X_test_loc)
plt.xlabel('Predicted from Weather Data (in µg / m$^3$)')
plt.ylabel('Measured (in µg / m$^3$)')
plt.title('Measured and Predicted PM$_{2.5}$ Concentration');

### Convert actual and predicted PM2.5 levels from best models into air quality categories

In [None]:
# plotting function for color-coded confusion matrix
import itertools
from sklearn.metrics import confusion_matrix

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print the normalization mode and plot a colour-coded confusion matrix.

    Rows are labelled 'Measured Category', columns 'Predicted Category'.
    Source: http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html

    Args:
        cm: 2-D array of counts; ``classes`` must list the class names in the
            same order as the rows/columns of ``cm``.
        classes: Tick labels for both axes.
        normalize: If True, each row is divided by its row sum and the cells
            are printed with two decimals; otherwise raw integer counts are shown.
        title: Figure title.
        cmap: Matplotlib colormap used for the cells.

    Returns:
        None. A new 6x6 figure is created and left as the current figure.
    """
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # A class without any measured sample has an all-zero row. Dividing by
        # 1 instead of 0 keeps that row at 0.0 instead of filling it with NaN
        # (which would also turn the colour threshold below into NaN).
        cm = cm.astype('float') / np.where(row_totals == 0, 1, row_totals)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    plt.figure(figsize = (6, 6))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title, size = 16)
    plt.colorbar(aspect=4)
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45, size = 10)
    plt.yticks(tick_marks, classes, size = 10)

    fmt = '.2f' if normalize else 'd'
    # Cells above half of the maximum get white text so it stays readable on
    # the dark end of the colormap.
    thresh = cm.max() / 2.

    # Labeling the plot: write every cell value at its (column, row) position
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt), fontsize = 12,
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('Measured Category', size = 14)
    plt.xlabel('Predicted Category', size = 14)
    # Run tight_layout last so the axis labels are part of the layout
    # computation and are not clipped.
    plt.tight_layout()

In [None]:
from bisect import bisect_right

# Exclusive upper bounds of the first five air-quality categories (same unit
# as the target) and the category names in ascending order of severity.
# Values at or above the last bound fall into the final category.
_PM25_UPPER_BOUNDS = (13, 36, 56, 151, 251)
_PM25_CATEGORIES = ('Good', 'Moderate', 'Unhealthy (Sensitive)', 'Unhealthy',
                    'Very Unhealthy', 'Hazardous')


def pm25_to_category(value):
    """Return the air-quality category name for one PM2.5 value.

    A value equal to a bound belongs to the next higher category, i.e. the
    mapping is 'Good' for value < 13, 'Moderate' for value < 36, and so on.
    """
    return _PM25_CATEGORIES[bisect_right(_PM25_UPPER_BOUNDS, value)]


# categories of the measured (actual) test-set values
y_test_labels = [pm25_to_category(x) for x in y_test]

# categories of the values predicted by the DNN without regularization
y_pred_labels_dnn_wo = [pm25_to_category(x) for x in y_pred_dnn_wo]

### Compute confusion matrix and plot

In [None]:
# Pass the categories explicitly, in order of severity. Without `labels`,
# scikit-learn sorts the class names alphabetically and leaves out categories
# that never occur, so the rows/columns would not match the severity-ordered
# tick labels used when the matrix is plotted.
cm_wo = confusion_matrix(
    y_test_labels,
    y_pred_labels_dnn_wo,
    labels=['Good', 'Moderate', 'Unhealthy (Sensitive)', 'Unhealthy', 'Very Unhealthy', 'Hazardous'],
)

In [None]:
# Plot the raw-count confusion matrix; the class names are the tick labels and
# must be in the same order as the rows/columns of cm_wo
plot_confusion_matrix(cm_wo, classes=['Good', 'Moderate', 'Unhealthy\n(Sensitive)', 'Unhealthy', 'Very Unhealthy', 'Hazardous'],
                title='Confusion Matrix - DNN w/o regularization');

## DNN with L2 regularization

### Define model architecture and compile

In [None]:
# Define model architecture and compile in function
def get_compiled_l2model(input_dim=26, learning_rate=0.01, l2=0.01):
    """Build and compile the fully connected regression network with L2 regularization.

    The network stacks four hidden Dense layers (64 units, ReLU, 'uniform'
    kernel initializer, L2 kernel regularizer) and a single linear output
    unit without regularizer. It is compiled with the Adam optimizer, mean
    absolute error as loss and RMSE as an extra metric.

    Args:
        input_dim: Number of input features. Defaults to 26, the value that
            used to be hard-coded; pass ``X_train.shape[1]`` if the feature
            set changes.
        learning_rate: Learning rate handed to Adam. Defaults to 0.01.
        l2: L2 regularization factor applied to the kernel of every hidden
            layer. Defaults to 0.01.

    Returns:
        The compiled ``tf.keras.Sequential`` model.
    """
    model = tf.keras.Sequential([
      tf.keras.layers.Dense(64, kernel_initializer='uniform', kernel_regularizer=tf.keras.regularizers.L2(l2), activation='relu', input_dim=input_dim),
      tf.keras.layers.Dense(64, kernel_initializer='uniform', kernel_regularizer=tf.keras.regularizers.L2(l2), activation='relu'),
      tf.keras.layers.Dense(64, kernel_initializer='uniform', kernel_regularizer=tf.keras.regularizers.L2(l2), activation='relu'),
      tf.keras.layers.Dense(64, kernel_initializer='uniform', kernel_regularizer=tf.keras.regularizers.L2(l2), activation='relu'),
      # No activation and no regularizer: the output is the predicted target value.
      tf.keras.layers.Dense(1, kernel_initializer='uniform')
    ])

    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate, name='Adam'),
                  loss='mae',
                  metrics=[tf.keras.metrics.RootMeanSquaredError()])
    return model

In [None]:
# Instantiate the L2-regularized model on the CPU and print its summary
with tf.device('/cpu:0'):
    dnn_l2 = get_compiled_l2model()
    # summary() prints the table itself and returns None; wrapping it in
    # print() only appended a stray "None" line to the output.
    dnn_l2.summary()

### Fit the model to the training data

In [None]:
# Train the L2-regularized model on the CPU with the same settings as the
# unregularized model. 20% of the training data is held out as validation
# data; the returned History object is kept for plotting.
with tf.device('/cpu:0'):
    train_history_dnn_l2 = dnn_l2.fit(X_train,
                        y_train,
                        validation_split=0.2,
                        verbose=1,
                        batch_size = 48,
                        epochs=200)

In [None]:
# Plot RMSE and loss per epoch for the L2-regularized model
plot_training_metrics(train_history_dnn_l2)

### Evaluate model performance on test set

In [None]:
# Evaluate the L2-regularized model on the test set using .evaluate; it
# returns the loss followed by the compiled metric (RMSE). Note that `loss`
# and `rmse` overwrite the values of the previous model.
loss, rmse = dnn_l2.evaluate(X_test, y_test, verbose=2)
print(f'Model RMSE: {rmse}')
print('--------'*5)

# Predict values for test set and flatten to 1 dimension so they line up
# element-wise with y_test
y_pred_dnn_l2 = dnn_l2.predict(X_test).flatten()

In [None]:
# calculate residuals (measured minus predicted)
residual_dnn_l2 = y_test - y_pred_dnn_l2

# compute mean of residuals; a value far from 0 indicates a systematic
# over- or under-prediction
np.mean(residual_dnn_l2)

In [None]:
# Residuals against predictions for the L2-regularized DNN, colour-coded by
# measurement location
plt.figure(figsize=(6,5))
sns.scatterplot(x=y_pred_dnn_l2, y=residual_dnn_l2, hue=X_test_loc)
plt.xlabel('y_pred')
plt.ylabel('residual')
plt.title('Residual plot from DNN with L2 regularization');

In [None]:
# Measured against predicted values for the L2-regularized DNN
# (single colour; the location colour-coding is disabled)
plt.figure(figsize=(6,5))
sns.scatterplot(x=y_pred_dnn_l2, y=y_test, color='b')#, hue=X_test_loc)
plt.xlabel('Predicted from Weather Data (in µg / m$^3$)')
plt.ylabel('Measured (in µg / m$^3$)')
plt.title('Measured and Predicted PM$_{2.5}$ Concentration');

### Convert predicted PM2.5 concentrations into labels

In [None]:
# categories of the values predicted by the L2-regularized DNN; each threshold
# is an exclusive upper bound, anything from 251 upwards is 'Hazardous'
y_pred_labels_dnn_l2 = ['Good' if x < 13 else 'Moderate' if x < 36 else 'Unhealthy (Sensitive)' if x < 56 else 'Unhealthy' if x < 151 
                            else 'Very Unhealthy' if x < 251 else 'Hazardous' for x in y_pred_dnn_l2]

### Compute confusion matrix and plot

In [None]:
# Pass the categories explicitly, in order of severity. Without `labels`,
# scikit-learn sorts the class names alphabetically and leaves out categories
# that never occur, so the rows/columns would not match the severity-ordered
# tick labels used when the matrix is plotted.
cm_l2 = confusion_matrix(
    y_test_labels,
    y_pred_labels_dnn_l2,
    labels=['Good', 'Moderate', 'Unhealthy (Sensitive)', 'Unhealthy', 'Very Unhealthy', 'Hazardous'],
)

In [None]:
# Plot the raw-count confusion matrix; the class names are the tick labels and
# must be in the same order as the rows/columns of cm_l2
plot_confusion_matrix(cm_l2, classes=['Good', 'Moderate', 'Unhealthy\n(Sensitive)', 'Unhealthy', 'Very Unhealthy', 'Hazardous'],
                title='Confusion Matrix - DNN with L2 regularization');

## Save models

In [None]:
# Persist both trained models under the local 'models/' directory
dnn_wo.save('models/dnn_wo')
dnn_l2.save('models/dnn_l2')

## Conclusion

The DNN currently does not perform better than the XGBoost Regressor. The L2 regularization did not improve the prediction accuracy on the test set. Testing different model architectures and/or optimizers may further improve the DNN's accuracy.