In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit
from tqdm.notebook import tqdm_notebook
from sklearn import metrics

In [3]:
provinces = ['BKK','Chiangmai','Khonkaen','Rayong','Saraburi','Surat']
province = provinces[4]
data = pd.read_csv('./'+province+'/train/'+province.lower()+'_train_format_2.csv')

y_train = data.pop('PM2.5')
X_train = data

In [6]:
tscv = TimeSeriesSplit(n_splits=3)
i = 1
score = []
for tr_index, val_index in tqdm_notebook(tscv.split(X_train)):
    X_tr, X_val = X_train.iloc[tr_index], X_train.iloc[val_index]
    y_tr, y_val = y_train.iloc[tr_index], y_train.iloc[val_index]
    for mf in tqdm_notebook(np.linspace(1, 7, 7)):
        for ne in np.linspace(20, 200, 10):
            for md in np.linspace(20, 80, 6):
                for msl in np.linspace(30, 200, 10):
                    rfr = RandomForestRegressor(
                        max_features=int(mf),
                        n_estimators=int(ne),
                        max_depth=int(md),
                        min_samples_leaf=int(msl),
                        n_jobs = -1)
                    rfr.fit(X_tr, y_tr)
                    y_pred = rfr.predict(X_val)
                    score.append([i,
                                  mf, 
                                  ne,
                                  md, 
                                  msl, 
                                  rfr.score(X_val, y_val),
                                  np.sqrt(metrics.mean_squared_error(y_val, y_pred))])
    i += 1

0it [00:00, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

In [7]:
import pickle

with open('rf_'+province.lower()+'_score_2.pickle', 'wb') as fp:
    pickle.dump(score, fp)

In [9]:
score2 = np.array(score)
print('best parem:',score[np.argmin(score2, axis=0)[-1]]) #please note all these params in note

best parem: [2, 2.0, 40.0, 20.0, 67.77777777777777, 0.35417922566285087, 17.113411589680062]


In [36]:
param_dict = {'BKK':[7.0, 70.0, 40.0, 100.0],'Chiangmai':[7.0, 50.0, 40.0, 100.0],'Khonkaen':[1.0, 70.0, 30.0, 90.0],'Rayong':[2.0, 50.0, 30.0, 40.0],'Saraburi':[3.0, 50.0, 20.0, 90.0],'Surat':[2.0, 60.0, 20.0, 30.0]}

In [16]:
province = 'BKK'

In [32]:
train_data = pd.read_csv('./'+province+'/train/'+province.lower()+'_train_format_2.csv')
# train_data.head()

y_train = train_data.pop('PM2.5')
X_train = train_data

In [33]:
test_data = pd.read_csv(province+'_clean.csv')
test_data['date_time'] = pd.to_datetime(test_data['date_time'])
test_data['year'] = test_data['date_time'].dt.year
test_data['month'] = test_data['date_time'].dt.month
test_data['day'] = test_data['date_time'].dt.day
test_data['hour'] = test_data['date_time'].dt.hour
test_data = test_data[['year','month','day','hour','temp','wind speed','wind dir','PM2.5']]
test_data.dropna(inplace=True)
# test_data.head()
y_test = test_data.pop('PM2.5')
X_test = test_data

In [34]:
rfr = RandomForestRegressor(
                        max_features=int(param_dict[province][0]),
                        n_estimators=int(param_dict[province][1]),
                        max_depth=int(param_dict[province][2]),
                        min_samples_leaf=int(param_dict[province][3]),
                        n_jobs = -1)
rfr.fit(X_train, y_train)
y_pred = rfr.predict(X_test)
print('RMSE:',np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

RMSE: 10.575646339818894


In [46]:
SE = []
n = 0
for province in param_dict:
    train_data = pd.read_csv('./'+province+'/train/'+province.lower()+'_train_format_2.csv')
    y_train = train_data.pop('PM2.5')
    X_train = train_data
    test_data = pd.read_csv(province+'_clean.csv')
    test_data['date_time'] = pd.to_datetime(test_data['date_time'])
    test_data['year'] = test_data['date_time'].dt.year
    test_data['month'] = test_data['date_time'].dt.month
    test_data['day'] = test_data['date_time'].dt.day
    test_data['hour'] = test_data['date_time'].dt.hour
    test_data = test_data[['year','month','day','hour','temp','wind speed','wind dir','PM2.5']]
    test_data.dropna(inplace=True)
    y_test = test_data.pop('PM2.5')
    X_test = test_data
    rfr = RandomForestRegressor(
                        max_features=int(param_dict[province][0]),
                        n_estimators=int(param_dict[province][1]),
                        max_depth=int(param_dict[province][2]),
                        min_samples_leaf=int(param_dict[province][3]),
                        n_jobs = -1)
    rfr.fit(X_train, y_train)
    y_pred = rfr.predict(X_test)
    SE.append(np.sum(np.square(np.subtract(np.array(y_test), np.array(y_pred)))))
    n += len(y_pred)
    print('RMSE of '+province+':',np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print('all province RMSE:',np.sqrt(np.sum(np.array(SE))/n))

RMSE of BKK: 10.531763558991464
RMSE of Chiangmai: 21.129193184149443
RMSE of Khonkaen: 12.697137874589506
RMSE of Rayong: 9.225255646244463
RMSE of Saraburi: 15.02105761057381
RMSE of Surat: 7.430652517725827
all province RMSE: 13.418868100953738


## Randomized Search

In [10]:
from sklearn.model_selection import RandomizedSearchCV
from pprint import pprint

forest = RandomForestRegressor(n_jobs=-1)

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 20, stop = 200, num = 5)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(1, 45, num = 3)]
# Minimum number of samples required to split a node
min_samples_split = [5, 10]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split}

pprint(random_grid)
# Use the random grid to search for best hyperparameters

# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = forest, param_distributions = random_grid, n_iter = 10, cv = 10, verbose=2, random_state=42, n_jobs = -1, scoring='neg_mean_squared_error')
# Fit the random search model
rf_random.fit(X_train, y_train)

{'max_depth': [1, 23, 45],
 'max_features': ['auto', 'sqrt'],
 'min_samples_split': [5, 10],
 'n_estimators': [20, 65, 110, 155, 200]}
Fitting 10 folds for each of 10 candidates, totalling 100 fits


RandomizedSearchCV(cv=10, estimator=RandomForestRegressor(n_jobs=-1), n_jobs=-1,
                   param_distributions={'max_depth': [1, 23, 45],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_split': [5, 10],
                                        'n_estimators': [20, 65, 110, 155,
                                                         200]},
                   random_state=42, scoring='neg_mean_squared_error',
                   verbose=2)

In [11]:
#now let's how the RMSE changes for each parameter configuration
cvres2 = rf_random.cv_results_
for mean_score, params in zip(cvres2["mean_test_score"], cvres2["params"]):
    print(np.sqrt(-mean_score), params)

20.523009241027463 {'n_estimators': 20, 'min_samples_split': 5, 'max_features': 'auto', 'max_depth': 1}
20.518289343784335 {'n_estimators': 20, 'min_samples_split': 10, 'max_features': 'auto', 'max_depth': 1}
18.91589175670347 {'n_estimators': 65, 'min_samples_split': 10, 'max_features': 'sqrt', 'max_depth': 23}
20.59557996481142 {'n_estimators': 20, 'min_samples_split': 10, 'max_features': 'auto', 'max_depth': 45}
21.948674538397665 {'n_estimators': 155, 'min_samples_split': 5, 'max_features': 'sqrt', 'max_depth': 1}
18.680708824980332 {'n_estimators': 200, 'min_samples_split': 5, 'max_features': 'sqrt', 'max_depth': 45}
18.84261319110746 {'n_estimators': 155, 'min_samples_split': 5, 'max_features': 'sqrt', 'max_depth': 23}
20.440627499975733 {'n_estimators': 155, 'min_samples_split': 10, 'max_features': 'auto', 'max_depth': 45}
21.934591024764497 {'n_estimators': 110, 'min_samples_split': 5, 'max_features': 'sqrt', 'max_depth': 1}
18.685189392489402 {'n_estimators': 110, 'min_samples