## Preprocessing

In [None]:
!pip install keras-tuner
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# from sklearn.model_selection import StratifiedShuffleSplit
import pandas as pd
import tensorflow as tf


Collecting keras-tuner
  Downloading keras_tuner-1.4.6-py3-none-any.whl (128 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/128.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m122.9/128.9 kB[0m [31m4.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m128.9/128.9 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Collecting kt-legacy (from keras-tuner)
  Downloading kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Installing collected packages: kt-legacy, keras-tuner
Successfully installed keras-tuner-1.4.6 kt-legacy-1.0.5


In [None]:
# Constants
# Location of the CSV that is loaded with pd.read_csv in the next cell.
DROP_URL = "https://project-4-group-6-air-quality.s3.us-east-2.amazonaws.com/data_drop.csv"

In [None]:
# Read the hosted CSV into a DataFrame, then preview the first rows to
# sanity-check the parse.
drop_df = pd.read_csv(filepath_or_buffer=DROP_URL)
drop_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,year,month,day,hour,PM2.5,PM10,SO2,NO2,CO,O3,TEMP,PRES,DEWP,RAIN,wd,WSPM
0,0,2013,3,1,0,9.0,9.0,3.0,17.0,300.0,89.0,-0.5,1024.5,-21.4,0.0,NNW,5.7
1,1,2013,3,1,1,4.0,4.0,3.0,16.0,300.0,88.0,-0.7,1025.1,-22.1,0.0,NW,3.9
2,2,2013,3,1,5,4.0,4.0,9.0,25.0,300.0,78.0,-2.4,1027.5,-21.3,0.0,NW,2.4
3,3,2013,3,1,6,5.0,5.0,10.0,29.0,400.0,67.0,-2.5,1028.2,-20.4,0.0,NW,2.2
4,4,2013,3,1,7,3.0,6.0,12.0,40.0,400.0,52.0,-1.4,1029.5,-20.4,0.0,NNW,3.0


In [None]:
# Keep only the columns used for modelling. Removed here: the date/time
# fields, `wd`, the `Unnamed: 0` column, and PRES / DEWP / RAIN / WSPM.
trim_drop_df = drop_df.drop(
    columns=[
        'year', 'month', 'day', 'hour',
        'wd',
        'Unnamed: 0',
        'PRES', 'DEWP', 'RAIN', 'WSPM',
    ]
)

# Preview the trimmed frame.
trim_drop_df.head(n=5)

Unnamed: 0,PM2.5,PM10,SO2,NO2,CO,O3,TEMP,PRES,DEWP,RAIN,WSPM
0,9.0,9.0,3.0,17.0,300.0,89.0,-0.5,1024.5,-21.4,0.0,5.7
1,4.0,4.0,3.0,16.0,300.0,88.0,-0.7,1025.1,-22.1,0.0,3.9
2,4.0,4.0,9.0,25.0,300.0,78.0,-2.4,1027.5,-21.3,0.0,2.4
3,5.0,5.0,10.0,29.0,400.0,67.0,-2.5,1028.2,-20.4,0.0,2.2
4,3.0,6.0,12.0,40.0,400.0,52.0,-1.4,1029.5,-20.4,0.0,3.0


In [None]:
# Frequency of each distinct O3 reading, most common first.
trim_drop_df.loc[:, 'O3'].value_counts()

2.0000      40394
3.0000       8466
4.0000       7773
1.0000       6751
5.0000       6250
            ...  
107.5284        1
106.4574        1
103.2444        1
161.2926        1
432.0000        1
Name: O3, Length: 1035, dtype: int64

In [None]:
# Separate the frame into model inputs and the value to predict:
# every column except O3 forms the feature matrix X; O3 itself is the target y.
X = trim_drop_df.drop(columns=['O3'])
y = trim_drop_df.loc[:, 'O3']

# Preview the feature matrix.
X.head(n=5)

Unnamed: 0,PM2.5,PM10,SO2,NO2,CO,TEMP,PRES,DEWP,RAIN,WSPM
0,9.0,9.0,3.0,17.0,300.0,-0.5,1024.5,-21.4,0.0,5.7
1,4.0,4.0,3.0,16.0,300.0,-0.7,1025.1,-22.1,0.0,3.9
2,4.0,4.0,9.0,25.0,300.0,-2.4,1027.5,-21.3,0.0,2.4
3,5.0,5.0,10.0,29.0,400.0,-2.5,1028.2,-20.4,0.0,2.2
4,3.0,6.0,12.0,40.0,400.0,-1.4,1029.5,-20.4,0.0,3.0


In [None]:
# No `pd.get_dummies` encoding step is applied to X in this notebook
# (`wd` was dropped earlier and the remaining columns appear numeric).
# Preview the target values.
y.head(n=5)

0    89.0
1    88.0
2    78.0
3    67.0
4    52.0
Name: O3, dtype: float64

In [None]:

# Partition features and target into training and testing sets, using the
# default split proportions and a fixed seed so the partition is repeatable.
# Stratifying on y is deliberately not used: it raised a ValueError because
# some O3 values occur only once.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=6,
)

In [None]:
# Standardise the features. The scaler is fitted on the training rows only,
# and that same fitted transform is then applied to both partitions.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)

## Compile, Train and Evaluate the Model

In [None]:
# Define the model - a small deep neural net that regresses the O3 value.
# One input per feature column of the scaled training matrix, so the layer
# stays in sync if the set of feature columns changes.
number_input_features = X_train_scaled.shape[1]
hidden_nodes_layer1 = 8
hidden_nodes_layer2 = 5

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(
        units=hidden_nodes_layer1,
        input_dim=number_input_features,
        activation="relu",
    )
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Output layer: a single linear unit. The target is a continuous O3 reading
# (values such as 89.0 or 432.0), so a sigmoid output - bounded to (0, 1) -
# could never reach the target range under the MSE loss.
nn.add(tf.keras.layers.Dense(units=1, activation="linear"))

# Check the structure of the model
nn.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_3 (Dense)             (None, 8)                 88        
                                                                 
 dense_4 (Dense)             (None, 5)                 45        
                                                                 
 dense_5 (Dense)             (None, 1)                 6         
                                                                 
Total params: 139 (556.00 Byte)
Trainable params: 139 (556.00 Byte)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# Configure training: Adam optimiser minimising mean squared error, with
# mean absolute error tracked as an additional metric.
nn.compile(
    optimizer="adam",
    loss="mse",
    metrics=["mae"],
)

In [None]:
# Fit the network on the scaled training features for 30 epochs, keeping the
# object returned by fit() for later inspection.
fit_model = nn.fit(
    x=X_train_scaled,
    y=y_train,
    epochs=30,
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [None]:
# Evaluate the model using the test data.
# evaluate() returns [loss, *metrics]; the model is compiled with loss="mse"
# and metrics=["mae"], so the second value is mean absolute error, not accuracy.
model_loss, model_mae = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss (MSE): {model_loss}, MAE: {model_mae}")

2986/2986 - 4s - loss: -1.0879e+11 - accuracy: 0.0179 - 4s/epoch - 1ms/step
Loss: -108788670464.0, Accuracy: 0.01787695474922657


In [None]:
# optimizing model via autotuning
# Create a method that creates a new Sequential model with hyperparameter options
def create_model(hp):  # hp stands for hyperparameters
    """Build and compile a regression network from a keras-tuner search space.

    Args:
        hp: The hyperparameter object supplied by the tuner. This function
            registers the choices 'activation', 'first_units', 'num_layers'
            and 'units_<i>' on it.

    Returns:
        A compiled Sequential model (MSE loss, Adam optimizer, MAE metric)
        with a single linear output unit.
    """
    nn_model = tf.keras.models.Sequential()

    # Allow kerastuner to decide which activation function to use in hidden layers
    # (Choice = pick one value from the given list).
    activation = hp.Choice('activation',['relu','tanh','sigmoid'])

    # Allow kerastuner to decide number of neurons in first layer
    # (Int = pick an integer from the given range); the layer uses the
    # activation chosen above.
    nn_model.add(tf.keras.layers.Dense(units=hp.Int('first_units',
        min_value=1,
        max_value=10,
        step=2), activation=activation, input_dim=6))

    # Allow kerastuner to decide number of hidden layers and neurons in hidden layers
    for i in range(hp.Int('num_layers', 1, 5)):
        nn_model.add(tf.keras.layers.Dense(units=hp.Int('units_' + str(i),
            min_value=1,
            max_value=10,
            step=2),
            activation=activation))

    # Always add the output layer. It is linear because the target is a
    # continuous value trained with MSE; a sigmoid output is bounded to (0, 1)
    # and cannot represent targets outside that range.
    nn_model.add(tf.keras.layers.Dense(units=1, activation="linear"))

    # Compile the model
    nn_model.compile(loss="mse", optimizer='adam', metrics=["mae"])

    return nn_model



In [None]:
# Import the kerastuner library
import keras_tuner as kt

# Create a `Hyperband()` tuner instance that builds candidates with create_model.
# The objective must be a metric the models actually report: they are compiled
# with loss="mse" and metrics=["mae"], so the validation metric is `val_mae`
# (lower is better). `val_accuracy` is never logged by these models.
# overwrite=True starts every search from scratch instead of silently reloading
# trials saved by an earlier run with a different search space or objective.
tuner = kt.Hyperband(
    create_model,
    objective=kt.Objective("val_mae", direction="min"),
    max_epochs=20,
    hyperband_iterations=2,
    overwrite=True)

In [None]:
# Run the kerastuner search for best hyperparameters.
# Validation data is carved out of the training set (validation_split) so the
# test set is not used to pick hyperparameters and stays unseen for the final
# evaluation cells.
# No `epochs` argument is passed: Hyperband assigns each trial its own epoch
# budget (up to the tuner's max_epochs), overriding any value given here.
tuner.search(X_train_scaled, y_train, validation_split=0.2)

Trial 5 Complete [00h 01m 37s]
val_accuracy: 0.01787695474922657

Best val_accuracy So Far: 0.01787695474922657
Total elapsed time: 00h 09m 42s

Search: Running Trial #6

Value             |Best Value So Far |Hyperparameter
sigmoid           |sigmoid           |activation
7                 |1                 |first_units
3                 |5                 |num_layers
9                 |5                 |units_0
7                 |1                 |units_1
9                 |1                 |units_2
7                 |1                 |units_3
9                 |1                 |units_4
3                 |3                 |tuner/epochs
0                 |0                 |tuner/initial_epoch
2                 |2                 |tuner/bracket
0                 |0                 |tuner/round

Epoch 1/3
Epoch 2/3
Epoch 3/3

In [None]:
# Retrieve the top-ranked hyperparameter set from the tuner and show the
# values it holds.
best_hyper = tuner.get_best_hyperparameters(num_trials=1)[0]
best_hyper.values

{'activation': 'tanh',
 'first_units': 9,
 'num_layers': 1,
 'units_0': 3,
 'units_1': 1,
 'units_2': 3,
 'units_3': 5,
 'units_4': 9,
 'units_5': 3,
 'tuner/epochs': 3,
 'tuner/initial_epoch': 0,
 'tuner/bracket': 2,
 'tuner/round': 0}

In [None]:
# Evaluate best model against full test data.
# Tuned models are compiled with loss="mse" and metrics=["mae"], so evaluate()
# returns (MSE loss, mean absolute error) - there is no accuracy value.
best_model = tuner.get_best_models(1)[0]
model_loss, model_mae = best_model.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss (MSE): {model_loss}, MAE: {model_mae}")

268/268 - 1s - loss: 0.5620 - accuracy: 0.7272 - 589ms/epoch - 2ms/step
Loss: 0.562040388584137, Accuracy: 0.7272303104400635


In [None]:
# Evaluate the top 3 models against the test dataset.
# Each model reports (MSE loss, mean absolute error); the second value is MAE,
# not accuracy, because the models are compiled with metrics=["mae"].
top_model = tuner.get_best_models(3)
for model in top_model:
    model_loss, model_mae = model.evaluate(X_test_scaled,y_test,verbose=2)
    print(f"Loss (MSE): {model_loss}, MAE: {model_mae}")

268/268 - 1s - loss: 0.5620 - accuracy: 0.7272 - 562ms/epoch - 2ms/step
Loss: 0.562040388584137, Accuracy: 0.7272303104400635
268/268 - 1s - loss: 0.5574 - accuracy: 0.7270 - 612ms/epoch - 2ms/step
Loss: 0.5574286580085754, Accuracy: 0.7269970774650574
268/268 - 1s - loss: 0.5659 - accuracy: 0.7269 - 590ms/epoch - 2ms/step
Loss: 0.5658947229385376, Accuracy: 0.7268804907798767


In [None]:
# Export our model