In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf

In [3]:
df = pd.read_csv('dataset_AirQual.csv')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43824 entries, 0 to 43823
Data columns (total 13 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   No      43824 non-null  int64  
 1   year    43824 non-null  int64  
 2   month   43824 non-null  int64  
 3   day     43824 non-null  int64  
 4   hour    43824 non-null  int64  
 5   pm2.5   41757 non-null  float64
 6   DEWP    43824 non-null  int64  
 7   TEMP    43824 non-null  float64
 8   PRES    43824 non-null  float64
 9   cbwd    43824 non-null  object 
 10  Iws     43824 non-null  float64
 11  Is      43824 non-null  int64  
 12  Ir      43824 non-null  int64  
dtypes: float64(4), int64(8), object(1)
memory usage: 4.3+ MB


In [5]:
df['pm2.5'].describe()

count    41757.000000
mean        98.613215
std         92.050387
min          0.000000
25%         29.000000
50%         72.000000
75%        137.000000
max        994.000000
Name: pm2.5, dtype: float64

In [6]:
#use fillna() method to replace missing values with mean value
df['pm2.5'].fillna(df['pm2.5'].mean(), inplace = True)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43824 entries, 0 to 43823
Data columns (total 13 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   No      43824 non-null  int64  
 1   year    43824 non-null  int64  
 2   month   43824 non-null  int64  
 3   day     43824 non-null  int64  
 4   hour    43824 non-null  int64  
 5   pm2.5   43824 non-null  float64
 6   DEWP    43824 non-null  int64  
 7   TEMP    43824 non-null  float64
 8   PRES    43824 non-null  float64
 9   cbwd    43824 non-null  object 
 10  Iws     43824 non-null  float64
 11  Is      43824 non-null  int64  
 12  Ir      43824 non-null  int64  
dtypes: float64(4), int64(8), object(1)
memory usage: 4.3+ MB


In [8]:
df['cbwd'].unique()

array(['NW', 'cv', 'NE', 'SE'], dtype=object)

In [9]:
df['cbwd'].value_counts()

SE    15290
NW    14150
cv     9387
NE     4997
Name: cbwd, dtype: int64

In [10]:
#one hot encoding
cols = df.columns.tolist()
df_new = pd.get_dummies(df[cols])

In [11]:
df_new.head()

Unnamed: 0,No,year,month,day,hour,pm2.5,DEWP,TEMP,PRES,Iws,Is,Ir,cbwd_NE,cbwd_NW,cbwd_SE,cbwd_cv
0,1,2010,1,1,0,98.613215,-21,-11.0,1021.0,1.79,0,0,0,1,0,0
1,2,2010,1,1,1,98.613215,-21,-12.0,1020.0,4.92,0,0,0,1,0,0
2,3,2010,1,1,2,98.613215,-21,-11.0,1019.0,6.71,0,0,0,1,0,0
3,4,2010,1,1,3,98.613215,-21,-14.0,1019.0,9.84,0,0,0,1,0,0
4,5,2010,1,1,4,98.613215,-20,-12.0,1018.0,12.97,0,0,0,1,0,0


In [12]:
#put column pm2.5 at the end of the df
#avoid one of the column rearrangement steps
cols = df_new.columns.tolist()
cols_new = cols[:5] + cols[6:] + cols[5:6]
df_new = df_new[cols_new]
df_new.head()

Unnamed: 0,No,year,month,day,hour,DEWP,TEMP,PRES,Iws,Is,Ir,cbwd_NE,cbwd_NW,cbwd_SE,cbwd_cv,pm2.5
0,1,2010,1,1,0,-21,-11.0,1021.0,1.79,0,0,0,1,0,0,98.613215
1,2,2010,1,1,1,-21,-12.0,1020.0,4.92,0,0,0,1,0,0,98.613215
2,3,2010,1,1,2,-21,-11.0,1019.0,6.71,0,0,0,1,0,0,98.613215
3,4,2010,1,1,3,-21,-14.0,1019.0,9.84,0,0,0,1,0,0,98.613215
4,5,2010,1,1,4,-20,-12.0,1018.0,12.97,0,0,0,1,0,0,98.613215


In [13]:
#get matrix arrays of dependent and independent variables
X = df_new.iloc[:, :-1].values
y = df_new.iloc[:, -1].values

In [14]:
from sklearn.preprocessing import StandardScaler

#training the model
def train(X_train, y):
  #scale the training set data
  sc = StandardScaler()
  X_train_trans = sc.fit_transform(X_train)
  #inintialize ANN as sequence of layers
  ann = tf.keras.models.Sequential()
  #add input and first hidden layer
  ann.add(tf.keras.layers.Dense(units=7, activation='relu'))
  #add second hidden layer
  ann.add(tf.keras.layers.Dense(units=7, activation='relu'))
  #add third hidden layer
  ann.add(tf.keras.layers.Dense(units=7, activation='relu'))
  #add fourth hidden layer
  ann.add(tf.keras.layers.Dense(units=7, activation='relu'))
  #add output layer
  ann.add(tf.keras.layers.Dense(units=1))
  #compile the ANN
  ann.compile(optimizer='adam', loss='mean_squared_error')
  #train ANN on training set
  ann.fit(X_train_trans, y, batch_size=32, epochs=100)

  return ann



In [15]:
from sklearn.preprocessing import StandardScaler

#make predictions (apply model to new data)
def predict(X_val, ann):
  #scale the new data
  sc = StandardScaler()
  X_val_trans = sc.fit_transform(X_val)
  y_pred = ann.predict(X_val_trans)

  return y_pred

In [16]:
from sklearn.metrics import mean_squared_error

#do k-fold cross-validation
from sklearn.model_selection import KFold
kfold = KFold(n_splits=10, shuffle=True, random_state=1)
mse_list = []


for train_idx, val_idx in kfold.split(X):
  #split data in train & val sets
  X_train = X[train_idx]
  X_val = X[val_idx]
  y_train = y[train_idx]
  y_val = y[val_idx]
  #train model and make predictions
  model = train(X_train, y_train)
  y_pred = predict(X_val, model)
  #evaluate
  mse = mean_squared_error(y_val, y_pred)
  mse_list.append(mse)   

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [17]:
print('mse = %0.3f ± %0.3f' % (np.mean(mse_list), np.std(mse_list)))

mse = 3778.259 ± 195.038


In [20]:
#compare predicted values with real ones
np.set_printoptions(precision=2)
conc_vec = np.concatenate((y_pred.reshape(len(y_pred),1), y_val.reshape(len(y_val),1)), 1)
conc_vec[50:100]

array([[ 31.25,  98.61],
       [ 21.16,  98.61],
       [ 29.19,  98.61],
       [ 84.11,  98.61],
       [ 99.76,  98.61],
       [ 88.02, 127.  ],
       [174.96, 340.  ],
       [228.09, 298.  ],
       [232.9 , 299.  ],
       [126.46,  41.  ],
       [ 49.69,  16.  ],
       [ 32.29,  24.  ],
       [ 21.61,  36.  ],
       [ 25.29,  52.  ],
       [ 34.52,  33.  ],
       [ 85.85,  72.  ],
       [ 90.29,  62.  ],
       [ 41.57,  43.  ],
       [ 60.5 ,  87.  ],
       [ 35.88, 102.  ],
       [ 93.46,  51.  ],
       [ 56.68,  69.  ],
       [ 73.55,  71.  ],
       [252.73, 162.  ],
       [204.94, 185.  ],
       [221.46, 166.  ],
       [ 42.81,  91.  ],
       [ 93.72,  49.  ],
       [ 22.15,  16.  ],
       [ 26.82,  11.  ],
       [ 28.65,   9.  ],
       [ 26.16,  13.  ],
       [ 26.32,  17.  ],
       [ 30.41,  22.  ],
       [ 28.42,  26.  ],
       [ 26.14,  13.  ],
       [ 25.73,  13.  ],
       [ 35.56,  53.  ],
       [ 45.09,  51.  ],
       [ 41.17,   8.  ],
