In [2]:
import numpy as np
import pandas as pd
df = pd.read_csv('dataset_AirQual.csv')

#use fillna() method to replace missing values with mean value
df['pm2.5'].fillna(df['pm2.5'].mean(), inplace = True)

#one hot encoding
cols = df.columns.tolist()
df_new = pd.get_dummies(df[cols])

#put column pm2.5 at the end of the df
#avoid one of the column rearrangement steps
cols = df_new.columns.tolist()
cols_new = cols[:5] + cols[6:] + cols[5:6]
df_new = df_new[cols_new]
df_new.head()

Unnamed: 0,No,year,month,day,hour,DEWP,TEMP,PRES,Iws,Is,Ir,cbwd_NE,cbwd_NW,cbwd_SE,cbwd_cv,pm2.5
0,1,2010,1,1,0,-21,-11.0,1021.0,1.79,0,0,0,1,0,0,98.613215
1,2,2010,1,1,1,-21,-12.0,1020.0,4.92,0,0,0,1,0,0,98.613215
2,3,2010,1,1,2,-21,-11.0,1019.0,6.71,0,0,0,1,0,0,98.613215
3,4,2010,1,1,3,-21,-14.0,1019.0,9.84,0,0,0,1,0,0,98.613215
4,5,2010,1,1,4,-20,-12.0,1018.0,12.97,0,0,0,1,0,0,98.613215


Before I start to build, train and validate the model, I want to check the correlation between the indepependent variables and the dependent variable pm2.5. The higher the cumulated wind speed (lws) and the more the wind is blowin from north west (cbwd_NW), the lower the concentration of pm2.5. <br>
The more the wind is blowing from south west (cbwd_cv) and the higher the dew point (DEWP), the higher the concentration of pm2.5 in the air. The dew point indicates the absolute humidity. During times with high humidity, more pm2.5 particles can connect themselves with water droplets, that hover in the air.

In [15]:
indep_var = cols_new[:-1]

In [17]:
df_new[indep_var].corrwith(df_new['pm2.5']).sort_values()

Iws       -0.239969
cbwd_NW   -0.208616
TEMP      -0.088204
Ir        -0.050224
PRES      -0.046298
cbwd_NE   -0.032056
month     -0.023533
hour      -0.022573
No        -0.017294
year      -0.014354
Is         0.019263
day        0.080685
cbwd_SE    0.094797
cbwd_cv    0.152465
DEWP       0.167334
dtype: float64

In [None]:
#get matrix arrays of dependent and independent variables
X = df_new.iloc[:, :-1].values
y = df_new.iloc[:, -1].values

In [None]:
#train random forest regression model

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

#training the model
def train(X_train, y):
  #scale the training set data
  sc = StandardScaler()
  X_train_trans = sc.fit_transform(X_train)
  regressor = RandomForestRegressor(n_estimators = 10, random_state=1)
  regressor.fit(X_train_trans, y)

  return regressor

In [None]:
from sklearn.preprocessing import StandardScaler

#make predictions (apply model to new data)
def predict(X_val, regressor):
  #scale the new data
  sc = StandardScaler()
  X_val_trans = sc.fit_transform(X_val)
  y_pred = regressor.predict(X_val_trans)

  return y_pred

In [None]:
from sklearn.metrics import mean_squared_error

#do k-fold cross-validation
from sklearn.model_selection import KFold
kfold = KFold(n_splits=10, shuffle=True, random_state=1)
mse_list = []


for train_idx, val_idx in kfold.split(X):
  #split data in train & val sets
  X_train = X[train_idx]
  X_val = X[val_idx]
  y_train = y[train_idx]
  y_val = y[val_idx]
  #train model and make predictions
  model = train(X_train, y_train)
  y_pred = predict(X_val, model)
  #evaluate
  mse = mean_squared_error(y_val, y_pred)
  mse_list.append(mse)   

In [None]:
print('mse = %0.3f ± %0.3f' % (np.mean(mse_list), np.std(mse_list)))

mse = 2293.779 ± 135.446


In [None]:
#compare predicted values with real ones
np.set_printoptions(precision=2)
conc_vec = np.concatenate((y_pred.reshape(len(y_pred),1), y_val.reshape(len(y_val),1)), 1)
conc_vec[50:100]

array([[ 62.21,  98.61],
       [ 59.01,  98.61],
       [ 63.67,  98.61],
       [103.48,  98.61],
       [ 99.17,  98.61],
       [ 90.26, 127.  ],
       [156.28, 340.  ],
       [224.7 , 298.  ],
       [222.8 , 299.  ],
       [ 67.08,  41.  ],
       [ 57.55,  16.  ],
       [ 72.61,  24.  ],
       [ 51.25,  36.  ],
       [ 53.45,  52.  ],
       [ 66.71,  33.  ],
       [ 83.8 ,  72.  ],
       [ 74.4 ,  62.  ],
       [ 53.1 ,  43.  ],
       [ 79.9 ,  87.  ],
       [ 75.76, 102.  ],
       [ 61.3 ,  51.  ],
       [ 85.5 ,  69.  ],
       [ 78.9 ,  71.  ],
       [106.4 , 162.  ],
       [107.1 , 185.  ],
       [119.8 , 166.  ],
       [ 59.  ,  91.  ],
       [ 91.9 ,  49.  ],
       [ 47.9 ,  16.  ],
       [ 36.5 ,  11.  ],
       [ 18.  ,   9.  ],
       [ 22.8 ,  13.  ],
       [ 24.7 ,  17.  ],
       [ 20.  ,  22.  ],
       [ 24.3 ,  26.  ],
       [ 21.3 ,  13.  ],
       [ 23.1 ,  13.  ],
       [ 30.9 ,  53.  ],
       [ 55.5 ,  51.  ],
       [ 35.5 ,   8.  ],
