Neural Network with One Hot encoding of the month and hour columns to address when categorical variables are encoded as integers but where no such ordinal relationship exists.  Note same result is obtained when year, month, day, and hour are One Hot encoded, which unfortunately is no improvement.

## Preprocessing

In [62]:
!pip install keras-tuner
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
# from sklearn.model_selection import StratifiedShuffleSplit
import pandas as pd
import tensorflow as tf
from math import exp




In [63]:
# Constants
DROP_URL = "https://project-4-group-6-air-quality.s3.us-east-2.amazonaws.com/data_drop.csv"

In [64]:
# Function for calculation of relative humidity
def relative_humidity(temp, dewp):
  # formula obtained from https://bmcnoldy.earth.miami.edu/Humidity.html retrieved 12.11.2023
  rh =100*(exp((17.625*dewp)/(243.04+dewp))/exp((17.625*temp)/(243.04+temp)))
  return rh

encoder = OneHotEncoder(sparse=False)

In [65]:
# test of function
relative_humidity(12.8, -10)

19.43471947745291

In [66]:
# read in our data
drop_df = pd.read_csv(DROP_URL)
drop_df.head()

Unnamed: 0.1,Unnamed: 0,year,month,day,hour,PM2.5,PM10,SO2,NO2,CO,O3,TEMP,PRES,DEWP,RAIN,wd,WSPM
0,0,2013,3,1,0,9.0,9.0,3.0,17.0,300.0,89.0,-0.5,1024.5,-21.4,0.0,NNW,5.7
1,1,2013,3,1,1,4.0,4.0,3.0,16.0,300.0,88.0,-0.7,1025.1,-22.1,0.0,NW,3.9
2,2,2013,3,1,5,4.0,4.0,9.0,25.0,300.0,78.0,-2.4,1027.5,-21.3,0.0,NW,2.4
3,3,2013,3,1,6,5.0,5.0,10.0,29.0,400.0,67.0,-2.5,1028.2,-20.4,0.0,NW,2.2
4,4,2013,3,1,7,3.0,6.0,12.0,40.0,400.0,52.0,-1.4,1029.5,-20.4,0.0,NNW,3.0


In [67]:
# calculation of relative humidity with our data
drop_df.apply(lambda row: relative_humidity(row['TEMP'], row['DEWP']), axis="columns")


0         18.911275
1         18.049201
2         21.931442
3         23.888875
4         22.028696
            ...    
382163    16.043923
382164    15.979718
382165    19.032232
382166    22.052187
382167    24.053001
Length: 382168, dtype: float64

In [68]:
# adding relative humidity to our data frame
drop_df['RelHum'] = drop_df.apply(lambda row: relative_humidity(row['TEMP'], row['DEWP']), axis="columns")

In [69]:
# Drop the non-beneficial columns
trim_drop_df = drop_df.drop(['Unnamed: 0', 'year', 'day', 'PM2.5',	'PM10',	'SO2',	'NO2',	'CO',	'PRES',	'DEWP',	'RAIN',	'wd',	'WSPM'],axis=1)

trim_drop_df.head()

Unnamed: 0,month,hour,O3,TEMP,RelHum
0,3,0,89.0,-0.5,18.911275
1,3,1,88.0,-0.7,18.049201
2,3,5,78.0,-2.4,21.931442
3,3,6,67.0,-2.5,23.888875
4,3,7,52.0,-1.4,22.028696


In [70]:
# One Hot encoding of the month integers
month_dum_drop_df = pd.get_dummies(trim_drop_df['month'])
month_dum_drop_df.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12
0,0,0,1,0,0,0,0,0,0,0,0,0
1,0,0,1,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,0,0
3,0,0,1,0,0,0,0,0,0,0,0,0
4,0,0,1,0,0,0,0,0,0,0,0,0


In [71]:
# One Hot encoding of the hour integers
hour_dum_drop_df = pd.get_dummies(trim_drop_df['hour'])
hour_dum_drop_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [72]:
# merge in the new month_dum_drop_df to trim_drop_df
trim_drop_df = trim_drop_df.merge(month_dum_drop_df, left_index=True, right_index=True)
trim_drop_df = trim_drop_df.drop(['month'], axis=1)
trim_drop_df.head()


Unnamed: 0,month,hour,O3,TEMP,RelHum,1,2,3,4,5,6,7,8,9,10,11,12
0,3,0,89.0,-0.5,18.911275,0,0,1,0,0,0,0,0,0,0,0,0
1,3,1,88.0,-0.7,18.049201,0,0,1,0,0,0,0,0,0,0,0,0
2,3,5,78.0,-2.4,21.931442,0,0,1,0,0,0,0,0,0,0,0,0
3,3,6,67.0,-2.5,23.888875,0,0,1,0,0,0,0,0,0,0,0,0
4,3,7,52.0,-1.4,22.028696,0,0,1,0,0,0,0,0,0,0,0,0


In [74]:
# merge in the new month_dum_drop_df to trim_drop_df
# note will work on renaming columns if this works
trim_drop_df = trim_drop_df.merge(hour_dum_drop_df, left_index=True, right_index=True)
trim_drop_df = trim_drop_df.drop(['hour'], axis=1)
trim_drop_df.head()

Unnamed: 0,O3,TEMP,RelHum,1_x,2_x,3_x,4_x,5_x,6_x,7_x,...,14,15,16,17,18,19,20,21,22,23
0,89.0,-0.5,18.911275,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,88.0,-0.7,18.049201,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,78.0,-2.4,21.931442,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,67.0,-2.5,23.888875,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,52.0,-1.4,22.028696,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [75]:
trim_drop_df['O3'].value_counts()

2.0000      40394
3.0000       8466
4.0000       7773
1.0000       6751
5.0000       6250
            ...  
107.5284        1
106.4574        1
103.2444        1
161.2926        1
432.0000        1
Name: O3, Length: 1035, dtype: int64

In [76]:

# Split our preprocessed data into our features and target arrays
y = trim_drop_df['O3']
X = trim_drop_df.drop(columns='O3')
# y.head()
X.head()

Unnamed: 0,TEMP,RelHum,1_x,2_x,3_x,4_x,5_x,6_x,7_x,8_x,...,14,15,16,17,18,19,20,21,22,23
0,-0.5,18.911275,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,-0.7,18.049201,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,-2.4,21.931442,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,-2.5,23.888875,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,-1.4,22.028696,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [80]:
# converts column names from integers to strings
X.columns = X.columns.astype(str)

In [81]:
# Checking Y set
y.head()

0    89.0
1    88.0
2    78.0
3    67.0
4    52.0
Name: O3, dtype: float64

In [82]:

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=10)
# with stratify=y  get valueError only 1 member issue, so recommended to remove stratify=y or try the next line
# X_train, X_test, y_train, y_test = StratifiedShuffleSplit(n_splits=1, test_size=0.25, random_state=10).split(X, y)

In [83]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Compile, Train and Evaluate the Model

In [85]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
# number_input_features = len(X_train[0]) # one neuron for each value in out input (column in X)
hidden_nodes_layer1 =  8
hidden_nodes_layer2 = 5

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=38, activation="relu")
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_3 (Dense)             (None, 8)                 312       
                                                                 
 dense_4 (Dense)             (None, 5)                 45        
                                                                 
 dense_5 (Dense)             (None, 1)                 6         
                                                                 
Total params: 363 (1.42 KB)
Trainable params: 363 (1.42 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [86]:
# Compile the model
nn.compile(loss="mse", optimizer="adam", metrics=['mae'])

In [89]:
# Train the model
fit_model = nn.fit(X_train_scaled,y_train,epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [90]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

2986/2986 - 4s - loss: 6414.0684 - mae: 56.4482 - 4s/epoch - 1ms/step
Loss: 6414.068359375, Accuracy: 56.44820022583008


In [91]:
# optimizing model via autotuning
# Create a method that creates a new Sequential model with hyperparameter options
def create_model(hp):  # hp stands for hyperparameters
    nn_model = tf.keras.models.Sequential()

    # Allow kerastuner to decide which activation function to use in hidden layers
    activation = hp.Choice('activation',['relu','tanh','sigmoid'])
    # choice = choose from list  also could have more or different fxns

    # Allow kerastuner to decide number of neurons in first layer
    # Int = choose from an integer
    nn_model.add(tf.keras.layers.Dense(units=hp.Int('first_units',
        min_value=1,
        max_value=8,
        step=2), activation=activation, input_dim=38))
# use the activation chosen above

    # Allow kerastuner to decide number of hidden layers and neurons in hidden layers
    for i in range(hp.Int('num_layers', 1, 4)):
        nn_model.add(tf.keras.layers.Dense(units=hp.Int('units_' + str(i),
            min_value=1,
            max_value=7,
            step=2),
            activation=activation))

  # always add the output layer
    nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

    # Compile the model
    nn_model.compile(loss="mse", optimizer='adam', metrics=['mae'])

    return nn_model



In [92]:
# Import the kerastuner library
import keras_tuner as kt

# Create a `Hyperband()` tuner instance
# uses the create_model defined above
# hyperband give validation_accuracy  (is like a free R  data held out from training)
tuner = kt.Hyperband(
    create_model,
    objective="val_mae",
    max_epochs=10,
    hyperband_iterations=2)

In [93]:
# Run the kerastuner search for best hyperparameters
tuner.search(X_train_scaled,y_train, epochs=10, validation_data=(X_test_scaled,y_test))
# for each model that it creates train that model for 20 epochs
# this number of epochs * (hyperband_iterations+1) = total number of trials

Trial 60 Complete [00h 04m 06s]
val_mae: 56.44820022583008

Best val_mae So Far: 56.44820022583008
Total elapsed time: 02h 03m 14s


In [94]:
# Get best model hyperparameters
best_hyper = tuner.get_best_hyperparameters(1)[0]
best_hyper.values

{'activation': 'tanh',
 'first_units': 1,
 'num_layers': 2,
 'units_0': 1,
 'tuner/epochs': 2,
 'tuner/initial_epoch': 0,
 'tuner/bracket': 2,
 'tuner/round': 0,
 'units_1': 1}

In [95]:
# Evaluate best model against full test data
best_model = tuner.get_best_models(1)[0]
model_loss, model_accuracy = best_model.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

2986/2986 - 5s - loss: 6414.0713 - mae: 56.4482 - 5s/epoch - 2ms/step
Loss: 6414.0712890625, Accuracy: 56.44820022583008


In [96]:
# Evaluate the top 3 models against the test dataset
top_model = tuner.get_best_models(3)
for model in top_model:
    model_loss, model_accuracy = model.evaluate(X_test_scaled,y_test,verbose=2)
    print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

2986/2986 - 4s - loss: 6414.0713 - mae: 56.4482 - 4s/epoch - 1ms/step
Loss: 6414.0712890625, Accuracy: 56.44820022583008
2986/2986 - 4s - loss: 6414.0684 - mae: 56.4482 - 4s/epoch - 1ms/step
Loss: 6414.068359375, Accuracy: 56.44820022583008
2986/2986 - 4s - loss: 6414.0684 - mae: 56.4482 - 4s/epoch - 1ms/step
Loss: 6414.068359375, Accuracy: 56.44820022583008


In [None]:
# Export our model to HDF5 file
#  YOUR CODE GOES HERE