In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit
from tqdm.notebook import tqdm_notebook
from sklearn import metrics

In [6]:
provinces = ['BKK','Chiangmai','Khonkaen','Rayong','Saraburi','Surat']
province = provinces[3]
data = pd.read_csv('./'+province+'/train/'+province.lower()+'_train_format_2.csv')

y_train = data.pop('PM2.5')
X_train = data

In [7]:
tscv = TimeSeriesSplit(n_splits=3)
i = 1
score = []
for tr_index, val_index in tqdm_notebook(tscv.split(X_train)):
    X_tr, X_val = X_train.iloc[tr_index], X_train.iloc[val_index]
    y_tr, y_val = y_train.iloc[tr_index], y_train.iloc[val_index]
    for mf in tqdm_notebook(np.linspace(1, 7, 7)):
        for ne in np.linspace(50, 100, 6):
            for md in np.linspace(20, 40, 5):
                for msl in np.linspace(30, 100, 8):
                    rfr = RandomForestRegressor(
                        max_features=int(mf),
                        n_estimators=int(ne),
                        max_depth=int(md),
                        min_samples_leaf=int(msl),
                        n_jobs = -1)
                    rfr.fit(X_tr, y_tr)
                    y_pred = rfr.predict(X_val)
                    score.append([i,
                                  mf, 
                                  ne,
                                  md, 
                                  msl, 
                                  rfr.score(X_val, y_val),
                                  np.sqrt(metrics.mean_squared_error(y_val, y_pred))])
    i += 1

0it [00:00, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

In [8]:
import pickle

with open('rf_'+province.lower()+'_score.pickle', 'wb') as fp:
    pickle.dump(score, fp)

In [9]:
score2 = np.array(score)
print('best parem:',score[np.argmin(score2, axis=0)[-1]]) #please note all these params in note

best parem: [1, 2.0, 50.0, 30.0, 40.0, -0.8194841886020148, 9.29520811161377]


In [36]:
param_dict = {'BKK':[7.0, 70.0, 40.0, 100.0],'Chiangmai':[7.0, 50.0, 40.0, 100.0],'Khonkaen':[1.0, 70.0, 30.0, 90.0],'Rayong':[2.0, 50.0, 30.0, 40.0],'Saraburi':[3.0, 50.0, 20.0, 90.0],'Surat':[2.0, 60.0, 20.0, 30.0]}

In [16]:
province = 'BKK'

In [32]:
train_data = pd.read_csv('./'+province+'/train/'+province.lower()+'_train_format_2.csv')
# train_data.head()

y_train = train_data.pop('PM2.5')
X_train = train_data

In [33]:
test_data = pd.read_csv(province+'_clean.csv')
test_data['date_time'] = pd.to_datetime(test_data['date_time'])
test_data['year'] = test_data['date_time'].dt.year
test_data['month'] = test_data['date_time'].dt.month
test_data['day'] = test_data['date_time'].dt.day
test_data['hour'] = test_data['date_time'].dt.hour
test_data = test_data[['year','month','day','hour','temp','wind speed','wind dir','PM2.5']]
test_data.dropna(inplace=True)
# test_data.head()
y_test = test_data.pop('PM2.5')
X_test = test_data

In [34]:
rfr = RandomForestRegressor(
                        max_features=int(param_dict[province][0]),
                        n_estimators=int(param_dict[province][1]),
                        max_depth=int(param_dict[province][2]),
                        min_samples_leaf=int(param_dict[province][3]),
                        n_jobs = -1)
rfr.fit(X_train, y_train)
y_pred = rfr.predict(X_test)
print('RMSE:',np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

RMSE: 10.575646339818894


In [46]:
SE = []
n = 0
for province in param_dict:
    train_data = pd.read_csv('./'+province+'/train/'+province.lower()+'_train_format_2.csv')
    y_train = train_data.pop('PM2.5')
    X_train = train_data
    test_data = pd.read_csv(province+'_clean.csv')
    test_data['date_time'] = pd.to_datetime(test_data['date_time'])
    test_data['year'] = test_data['date_time'].dt.year
    test_data['month'] = test_data['date_time'].dt.month
    test_data['day'] = test_data['date_time'].dt.day
    test_data['hour'] = test_data['date_time'].dt.hour
    test_data = test_data[['year','month','day','hour','temp','wind speed','wind dir','PM2.5']]
    test_data.dropna(inplace=True)
    y_test = test_data.pop('PM2.5')
    X_test = test_data
    rfr = RandomForestRegressor(
                        max_features=int(param_dict[province][0]),
                        n_estimators=int(param_dict[province][1]),
                        max_depth=int(param_dict[province][2]),
                        min_samples_leaf=int(param_dict[province][3]),
                        n_jobs = -1)
    rfr.fit(X_train, y_train)
    y_pred = rfr.predict(X_test)
    SE.append(np.sum(np.square(np.subtract(np.array(y_test), np.array(y_pred)))))
    n += len(y_pred)
    print('RMSE of '+province+':',np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print('all province RMSE:',np.sqrt(np.sum(np.array(SE))/n))

RMSE of BKK: 10.531763558991464
RMSE of Chiangmai: 21.129193184149443
RMSE of Khonkaen: 12.697137874589506
RMSE of Rayong: 9.225255646244463
RMSE of Saraburi: 15.02105761057381
RMSE of Surat: 7.430652517725827
all province RMSE: 13.418868100953738
