# TabularDNN

Use `ssh -L 8080:localhost:8080 username@example.com` to port forward 8080 on remote which will be running jupyter notebook to 8080 local then run `jupyter-notebook --no-browser --port 8080 .` on remote. After jupyter comes up use the provided links in terminal to open up jupyter notebook on your local browser. 

Use `/usr/bin/python3.11 -m venv venvName` to make venv. Use `source venvName/bin/activate` to enter venv. Python will be 3.11 when used either `python` or `python3`. To get out of venv use `deactivate`. `rm -r venvName` to delete venv.

Use `pip install tensorflow[and-cuda]` to install and check with `python3 -c "import tensorflow as tf; print(tf.config.list_physical_devices('GPU'))"` to see if GPU is set up. If GPU devices are shown then everything is OK.

Use `pip install jupyter` to have jupyter-notebook in venv. Check with `which jupyter` to see if correct jupyter is used. Use `python -m ipykernel install --user --name=venvName` to add venv to jupyter kernels. When creating new notebook instead of using python3 use venvName. To remove venv kernel from jupyter use `jupyter kernelspec remove venvName`

In [1]:
! python --version

Python 3.11.8


In [2]:
import pandas as pd
print(f"Pandas Version: {pd.__version__}")

Pandas Version: 2.2.3


In [3]:
df = pd.read_csv("dataset/AQI.csv")
df.head()

Unnamed: 0,Date,Time,Temperature,Real Feel,Weather Status,Wind Gusts,Humidity,Indoor Humidity,Humidity Status,Dew Point,...,AQI,AQI-PM2.5,AQI-PM10,AQI-NO2,AQI-O3,Amount-PM2.5,Amount-PM10,Amount-NO2,Amount-O3,Filename
0,"FRIDAY, 18 OCTOBER",23:50,12,9,Cloudy,25,0.92,0.53,Ideal Humidity,10,...,35,20,17,35,11,5,12,17,31,Seattle/142.png
1,"WEDNESDAY, 9 OCTOBER",17:53,15,16,Mostly cloudy,8,0.74,0.53,Ideal Humidity,10,...,28,16,22,28,16,3,17,13,47,Seattle/27.png
2,"WEDNESDAY, 9 OCTOBER",19:13,15,15,Partly cloudy,7,0.75,0.54,Ideal Humidity,11,...,50,21,24,50,11,5,18,25,33,Seattle/28.png
3,"THURSDAY, 17 OCTOBER",06:22,9,9,Mostly cloudy,7,0.89,0.42,Ideal Humidity,7,...,25,11,9,25,10,2,7,12,29,Seattle/72.png
4,"TUESDAY, 8 OCTOBER",14:06,17,19,Mostly cloudy,10,0.89,0.74,Slightly Humid,15,...,41,20,18,41,6,4,13,20,17,Seattle/8.png


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2080 entries, 0 to 2079
Data columns (total 25 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Date                2080 non-null   object 
 1   Time                2080 non-null   object 
 2   Temperature         2080 non-null   int64  
 3   Real Feel           2080 non-null   int64  
 4   Weather Status      2080 non-null   object 
 5   Wind Gusts          2080 non-null   int64  
 6   Humidity            2080 non-null   float64
 7   Indoor Humidity     2080 non-null   float64
 8   Humidity Status     2080 non-null   object 
 9   Dew Point           2080 non-null   int64  
 10  Pressure            2080 non-null   int64  
 11  Pressure Direction  2080 non-null   object 
 12  Cloud Cover         2080 non-null   float64
 13  Visibility          2080 non-null   float64
 14  Cloud Ceiling       2080 non-null   int64  
 15  AQI                 2080 non-null   int64  
 16  AQI-PM

In [5]:
# Used in training --> Conversion required for warning in normalization!
df = df.astype({
    "Temperature": "float64",
    "Real Feel": "float64",
    "Wind Gusts": "float64",
    "Dew Point": "float64",
    "Pressure": "float64",
    "Cloud Ceiling": "float64"
})

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2080 entries, 0 to 2079
Data columns (total 25 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Date                2080 non-null   object 
 1   Time                2080 non-null   object 
 2   Temperature         2080 non-null   float64
 3   Real Feel           2080 non-null   float64
 4   Weather Status      2080 non-null   object 
 5   Wind Gusts          2080 non-null   float64
 6   Humidity            2080 non-null   float64
 7   Indoor Humidity     2080 non-null   float64
 8   Humidity Status     2080 non-null   object 
 9   Dew Point           2080 non-null   float64
 10  Pressure            2080 non-null   float64
 11  Pressure Direction  2080 non-null   object 
 12  Cloud Cover         2080 non-null   float64
 13  Visibility          2080 non-null   float64
 14  Cloud Ceiling       2080 non-null   float64
 15  AQI                 2080 non-null   int64  
 16  AQI-PM

#### Removing unnecessary columns + Only predicting AQI values for pollutants for now so drop others!

In [7]:
df = df.drop(["Date", "Time", "AQI", "Amount-PM2.5", "Amount-PM10", "Amount-NO2", "Amount-O3", "Filename"], axis = 1)

In [8]:
input_df = df.iloc[:, :-4]
input_df

Unnamed: 0,Temperature,Real Feel,Weather Status,Wind Gusts,Humidity,Indoor Humidity,Humidity Status,Dew Point,Pressure,Pressure Direction,Cloud Cover,Visibility,Cloud Ceiling
0,12.0,9.0,Cloudy,25.0,0.92,0.53,Ideal Humidity,10.0,1021.0,↔,0.99,15.0,800.0
1,15.0,16.0,Mostly cloudy,8.0,0.74,0.53,Ideal Humidity,10.0,1021.0,↑,0.76,23.0,6100.0
2,15.0,15.0,Partly cloudy,7.0,0.75,0.54,Ideal Humidity,11.0,1022.0,↑,0.37,19.0,10700.0
3,9.0,9.0,Mostly cloudy,7.0,0.89,0.42,Ideal Humidity,7.0,1013.0,↔,0.76,16.0,1200.0
4,17.0,19.0,Mostly cloudy,10.0,0.89,0.74,Slightly Humid,15.0,1015.0,↔,0.77,16.0,900.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2075,26.0,29.0,Partly sunny,22.0,0.74,0.74,Very Humid,21.0,1017.0,↔,0.45,16.0,6500.0
2076,27.0,29.0,Mostly sunny,26.0,0.72,0.72,Very Humid,22.0,1020.0,↑,0.25,16.0,9100.0
2077,25.0,26.0,Partly cloudy,16.0,0.77,0.77,Very Humid,21.0,1016.0,↑,0.39,16.0,9100.0
2078,28.0,28.0,Mostly sunny,39.0,0.60,0.60,Very Humid,19.0,1019.0,↔,0.22,16.0,9100.0


In [9]:
output_df = df.iloc[:, -4:]
output_df

Unnamed: 0,AQI-PM2.5,AQI-PM10,AQI-NO2,AQI-O3
0,20,17,35,11
1,16,22,28,16
2,21,24,50,11
3,11,9,25,10
4,20,18,41,6
...,...,...,...,...
2075,27,18,1,11
2076,35,24,2,15
2077,29,20,1,11
2078,34,24,1,16


In [10]:
weatherStatus = pd.get_dummies(df["Weather Status"], dtype = float, drop_first = True)
humidityStatus = pd.get_dummies(df["Humidity Status"], dtype = float, drop_first = True)
pressureDirection = pd.get_dummies(df["Pressure Direction"], dtype = float, drop_first = True)

print(f"Weather Status One-Hot Shape: f{weatherStatus.shape}")
print(f"Humidity Status One-Hot Shape: f{humidityStatus.shape}")
print(f"Pressure Direction One-Hot Shape: f{pressureDirection.shape}")

Weather Status One-Hot Shape: f(2080, 17)
Humidity Status One-Hot Shape: f(2080, 8)
Pressure Direction One-Hot Shape: f(2080, 2)


In [11]:
input_df = input_df.drop(["Weather Status", "Humidity Status", "Pressure Direction"], axis = 1)
input_df = pd.concat([input_df, weatherStatus, humidityStatus, pressureDirection], axis = 1)
input_df

Unnamed: 0,Temperature,Real Feel,Wind Gusts,Humidity,Indoor Humidity,Dew Point,Pressure,Cloud Cover,Visibility,Cloud Ceiling,...,Dry,Extremely Humid,Humid,Ideal Humidity,Slightly Dry,Slightly Humid,Very Dry,Very Humid,↓,↔
0,12.0,9.0,25.0,0.92,0.53,10.0,1021.0,0.99,15.0,800.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,15.0,16.0,8.0,0.74,0.53,10.0,1021.0,0.76,23.0,6100.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,15.0,15.0,7.0,0.75,0.54,11.0,1022.0,0.37,19.0,10700.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,9.0,9.0,7.0,0.89,0.42,7.0,1013.0,0.76,16.0,1200.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,17.0,19.0,10.0,0.89,0.74,15.0,1015.0,0.77,16.0,900.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2075,26.0,29.0,22.0,0.74,0.74,21.0,1017.0,0.45,16.0,6500.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
2076,27.0,29.0,26.0,0.72,0.72,22.0,1020.0,0.25,16.0,9100.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2077,25.0,26.0,16.0,0.77,0.77,21.0,1016.0,0.39,16.0,9100.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2078,28.0,28.0,39.0,0.60,0.60,19.0,1019.0,0.22,16.0,9100.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0


In [12]:
output_df

Unnamed: 0,AQI-PM2.5,AQI-PM10,AQI-NO2,AQI-O3
0,20,17,35,11
1,16,22,28,16
2,21,24,50,11
3,11,9,25,10
4,20,18,41,6
...,...,...,...,...
2075,27,18,1,11
2076,35,24,2,15
2077,29,20,1,11
2078,34,24,1,16


In [13]:
import keras
from sklearn.model_selection import KFold
import numpy as np
print(f"Keras Version: {keras.__version__}")

2024-10-20 00:57:45.484732: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-20 00:57:45.522032: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-20 00:57:45.533576: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-20 00:57:45.560599: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Keras Version: 3.6.0


## Model

In [14]:
def get_model():
    inputs = keras.Input(shape = (input_df.shape[1],))
    x = keras.layers.Dense(16, activation = "relu")(inputs)
    x = keras.layers.Dense(8, activation = "relu")(x)
    outputs = keras.layers.Dense(4)(x)
    
    model = keras.Model(inputs = inputs, outputs = outputs, name = "TabularDNN")
    model.compile(
        loss = keras.losses.MeanAbsoluteError(),
        optimizer = keras.optimizers.Adam(learning_rate = 0.0005),
        metrics = ["accuracy", "f1_score"]
    )
    return model

In [15]:
get_model().summary()

In [16]:
keras.utils.plot_model(get_model(), show_shapes = True)

You must install graphviz (see instructions at https://graphviz.gitlab.io/download/) for `plot_model` to work.


In [17]:
i = 1
kf = KFold(n_splits = 5)
pred_accs = []; pred_losses = []; pred_f1s = []
for train_idx, test_idx in kf.split(input_df, output_df):
    print(f"========================= Fold {i} =========================")
    i += 1
    
    train_X = input_df.iloc[train_idx]; train_Y = output_df.iloc[train_idx]
    test_X = input_df.iloc[test_idx]; test_Y = output_df.iloc[test_idx]

    # Min Max Normalization
    # temp_train_X = train_X.loc[:, :"Cloud Ceiling"]
    # train_X.loc[:, :"Cloud Ceiling"] = (temp_train_X - temp_train_X.min()) / (temp_train_X.max() - temp_train_X.min())
    # temp_test_X = test_X.loc[:, :"Cloud Ceiling"]
    # test_X.loc[:, :"Cloud Ceiling"] = (temp_test_X - temp_test_X.min()) / (temp_test_X.max() - temp_test_X.min())

    # Z-score Normalization
    temp_train_X = train_X.loc[:, :"Cloud Ceiling"]
    train_X.loc[:, :"Cloud Ceiling"] = (temp_train_X - temp_train_X.mean()) / temp_train_X.std()
    temp_test_X = test_X.loc[:, :"Cloud Ceiling"]
    test_X.loc[:, :"Cloud Ceiling"] = (temp_test_X - temp_test_X.mean()) / temp_test_X.std()

    # One-hot columns Normalization
    # for col in train_X.loc[:, "Clear":].columns:
    #     if(train_X[col].std() == 0):
    #         train_X.loc[:, col] = 1 / len(train_X)
    #     else:
    #         train_X.loc[:, col] = (train_X[col] - train_X[col].mean()) / train_X[col].std()

    # for col in test_X.loc[:, "Clear":].columns:
    #     if(test_X[col].std() == 0):
    #         test_X.loc[:, col] = 1 / len(test_X)
    #     else:
    #         test_X.loc[:, col] = (test_X[col] - test_X[col].mean()) / test_X[col].std()
    
    
    # In normalization because of one-hot there were some NaN values --> all zeros in one training group, all ones in testing group
    # This shows where is training we have NaN --> If we have NaN in training then we will have NaN loss in training!
    # print(train_X.columns[train_X.isnull().any()])
    # print(test_X.columns[test_X.isnull().any()])
    
    model = get_model()
    
    history = model.fit(
        train_X,
        train_Y,
        batch_size = 16,
        epochs = 100,
        validation_data = (test_X, test_Y)
    )
    
    pred_loss, pred_acc, pred_f1 = model.evaluate(test_X, test_Y)
    pred_accs.append(pred_acc); pred_losses.append(pred_loss); pred_f1s.append(pred_f1)

Epoch 1/100
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.1952 - f1_score: 0.4785 - loss: 26.9103 - val_accuracy: 0.1490 - val_f1_score: 0.4790 - val_loss: 24.8766
Epoch 2/100
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.1973 - f1_score: 0.4107 - loss: 26.4002 - val_accuracy: 0.1490 - val_f1_score: 0.4754 - val_loss: 23.7327
Epoch 3/100
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.1772 - f1_score: 0.4132 - loss: 24.6719 - val_accuracy: 0.1442 - val_f1_score: 0.4826 - val_loss: 21.7163
Epoch 4/100
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.1788 - f1_score: 0.4742 - loss: 21.9406 - val_accuracy: 0.1418 - val_f1_score: 0.5292 - val_loss: 18.3976
Epoch 5/100
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.1977 - f1_score: 0.3341 - loss: 18.3052 - val_accuracy: 0.1178 - val_f1

In [19]:
print(f"Accuracy: {np.mean(pred_accs)} ± {np.std(pred_accs)}")
print(f"F1 Score: {np.mean(pred_f1s)} ± {np.std(pred_f1s)}")
print(f"Loss: {np.mean(pred_losses)} ± {np.std(pred_losses)}")

Accuracy: 0.551442313194275 ± 0.04567561545054976
F1 Score: 0.6399177312850952 ± 0.43816402554512024
Loss: 10.98988208770752 ± 1.2876797267050448
