In [130]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit
from tqdm.notebook import tqdm_notebook
from sklearn import metrics
from datetime import datetime

In [2]:
province = 'BKK'

In [131]:
param_dict = {'BKK':[7.0, 70.0, 40.0, 100.0],
'Chiangmai':[7.0, 50.0, 40.0, 100.0],
'Rayong':[2.0, 50.0, 30.0, 40.0],
'Saraburi':[3.0, 50.0, 20.0, 90.0],
'Khonkaen':[1.0, 70.0, 30.0, 90.0],
'Surat':[2.0, 60.0, 20.0, 30.0]}

In [5]:
train_data = pd.read_csv('./'+province+'/train/'+province.lower()+'_train_format_2.csv')
# train_data.head()

y_train = train_data.pop('PM2.5')
X_train = train_data

In [3]:
test_data = pd.read_csv(province+'_clean.csv')
test_data['date_time'] = pd.to_datetime(test_data['date_time'])
test_data['year'] = test_data['date_time'].dt.year
test_data['month'] = test_data['date_time'].dt.month
test_data['day'] = test_data['date_time'].dt.day
test_data['hour'] = test_data['date_time'].dt.hour
test_data = test_data[['year','month','day','hour','temp','wind speed','wind dir','PM2.5']]
test_data.dropna(inplace=True)
# test_data.head()
y_test = test_data.pop('PM2.5')
X_test = test_data

In [6]:
rfr = RandomForestRegressor(
                        max_features=int(param_dict[province][0]),
                        n_estimators=int(param_dict[province][1]),
                        max_depth=int(param_dict[province][2]),
                        min_samples_leaf=int(param_dict[province][3]),
                        n_jobs = -1)
rfr.fit(X_train, y_train)
y_pred = rfr.predict(X_test)
print('RMSE:',np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

RMSE: 10.524302381945748


In [16]:
y_pred

array([10.39163601, 10.39163601, 10.39163601, ..., 14.08060402,
       14.0962378 , 14.15397429])

In [15]:
X_test.shape

(7814, 7)

In [17]:
df = X_test.copy()
df['PM2.5']=y_pred
df.tail()

Unnamed: 0,year,month,day,hour,temp,wind speed,wind dir,PM2.5
8779,2021,7,1,19,29.1,18,250,14.096238
8780,2021,7,1,20,29.5,21,255,14.071075
8781,2021,7,1,21,29.8,20,250,14.080604
8782,2021,7,1,22,29.1,16,245,14.096238
8783,2021,7,1,23,28.9,14,235,14.153974


In [18]:
df.shape

(7814, 8)

In [27]:
def formatDatetime(year,month,day,hour):
    year = int(year)
    month = int(month)
    day = int(day)
    hour = int(hour)
    return datetime(year,month,day,hour)

In [28]:
df['date_time'] = df.apply(lambda x: formatDatetime(x['year'],x['month'],x['day'],x['hour']),axis=1)
df.head()

Unnamed: 0,year,month,day,hour,temp,wind speed,wind dir,PM2.5,date_time
0,2020,7,1,0,29.4,35,270,10.391636,2020-07-01 00:00:00
1,2020,7,1,1,29.4,42,260,10.391636,2020-07-01 01:00:00
2,2020,7,1,2,28.5,42,260,10.391636,2020-07-01 02:00:00
3,2020,7,1,3,28.5,42,260,10.391636,2020-07-01 03:00:00
4,2020,7,1,4,28.3,55,270,10.391636,2020-07-01 04:00:00


In [30]:
df=df[['date_time','PM2.5']]
df.columns = ['predicted','PM2.5']

In [35]:
df.head()

Unnamed: 0,predicted,PM2.5
0,2020-07-01 00:00:00,10.391636
1,2020-07-01 01:00:00,10.391636
2,2020-07-01 02:00:00,10.391636
3,2020-07-01 03:00:00,10.391636
4,2020-07-01 04:00:00,10.391636


In [37]:
submit = pd.read_csv('submit_time_format.csv')
submit['predicted'] = pd.to_datetime(submit['predicted'])
submit.head()

Unnamed: 0,time,predicted
0,2020-07-01 00:00:00,2020-07-01 06:00:00
1,2020-07-01 00:00:00,2020-07-01 12:00:00
2,2020-07-01 00:00:00,2020-07-01 18:00:00
3,2020-07-01 00:00:00,2020-07-02 00:00:00
4,2020-07-01 00:00:00,2020-07-02 06:00:00


In [38]:
result = pd.merge(submit, df, how="left", on=["predicted"])

In [39]:
result.head()

Unnamed: 0,time,predicted,PM2.5
0,2020-07-01 00:00:00,2020-07-01 06:00:00,19.329812
1,2020-07-01 00:00:00,2020-07-01 12:00:00,17.072939
2,2020-07-01 00:00:00,2020-07-01 18:00:00,16.968878
3,2020-07-01 00:00:00,2020-07-02 00:00:00,10.391636
4,2020-07-01 00:00:00,2020-07-02 06:00:00,21.065477


# All province

In [100]:
submit = pd.read_excel('_submission_example\Example_df_kaggle_test2022_without_nan.xlsx')
submit.shape

(94248, 5)

In [101]:
submit.to_csv('Example_df_kaggle_test2022_without_nan.csv')

In [91]:
submit = pd.read_csv('Example_df_kaggle_test2022_with_nan.csv')
submit = submit.iloc[:17490,:]
submit['Predicted'] = pd.to_datetime(submit['Predicted'])
submit['Time'] = pd.to_datetime(submit['Time'])
submit['Time'] = submit['Time'].fillna(method='ffill')
submit = submit[['Time','Predicted']]

In [90]:
submit = pd.read_csv('Example_df_kaggle_test2022_with_nan.csv')
submit[submit['station']=='Chiangmai']

Unnamed: 0.1,Unnamed: 0,station,Time,Predicted,PM2.5
17490,17490,Chiangmai,2020-07-01,2020-07-01 06:00:00,86.0


In [92]:
submit.to_csv('submit_format.csv',index=False)

In [117]:
SE = []
n = 0
ans = pd.DataFrame.from_dict({'Time':[],'Predicted':[],'PM2.5':[],'Province':[]})
for province in param_dict:


    print(province)

    train_data = pd.read_csv('./'+province+'/train/'+province.lower()+'_train_format_2.csv')
    y_train = train_data.pop('PM2.5')
    X_train = train_data
    test_data = pd.read_csv(province+'_clean.csv')
    test_data['date_time'] = pd.to_datetime(test_data['date_time'])
    test_data['year'] = test_data['date_time'].dt.year
    test_data['month'] = test_data['date_time'].dt.month
    test_data['day'] = test_data['date_time'].dt.day
    test_data['hour'] = test_data['date_time'].dt.hour
    test_data = test_data[['year','month','day','hour','temp','wind speed','wind dir','PM2.5']]
    test_data.dropna(inplace=True)
    y_test = test_data.pop('PM2.5')
    X_test = test_data
    rfr = RandomForestRegressor(
                        max_features=int(param_dict[province][0]),
                        n_estimators=int(param_dict[province][1]),
                        max_depth=int(param_dict[province][2]),
                        min_samples_leaf=int(param_dict[province][3]),
                        n_jobs = -1)
    rfr.fit(X_train, y_train)
    y_pred = rfr.predict(X_test)

    df = X_test.copy()
    df['PM2.5']=y_pred
    df['date_time'] = df.apply(lambda x: formatDatetime(x['year'],x['month'],x['day'],x['hour']),axis=1)
    df=df[['date_time','PM2.5']]
    df.columns = ['Predicted','PM2.5']

    submit=pd.read_csv('submit_time_format_2.csv')
    submit['Predicted'] = pd.to_datetime(submit['Predicted'])
    submit['Time'] = pd.to_datetime(submit['Time'])

    result = pd.merge(submit, df, how="left", on=["Predicted"])

    result['Province'] = province
    ans = pd.concat([ans,result])
    

BKK
Chiangmai
Rayong
Saraburi
Khonkaen
Surat


In [103]:
result.head()

Unnamed: 0,Time,Predicted,PM2.5,Province
0,2020-07-01,2020-07-01 06:00:00,16.364234,Surat
1,2020-07-01,2020-07-01 12:00:00,11.780813,Surat
2,2020-07-01,2020-07-01 18:00:00,10.260545,Surat
3,2020-07-01,2020-07-02 00:00:00,14.669757,Surat
4,2020-07-01,2020-07-02 06:00:00,16.446531,Surat


In [118]:
ans.shape

(104940, 4)

In [119]:
ans.dropna(inplace=True)
ans.to_csv('ans.csv')

In [120]:
ans.isna().sum()

Time         0
Predicted    0
PM2.5        0
Province     0
dtype: int64

In [121]:
ans.shape

(94248, 4)

In [122]:
ans.columns = ['Time','Predicted','PM2.5','station']
ans.head()

Unnamed: 0,Time,Predicted,PM2.5,station
0,2020-07-01,2020-07-01 06:00:00,18.760174,BKK
1,2020-07-01,2020-07-01 12:00:00,17.207703,BKK
2,2020-07-01,2020-07-01 18:00:00,17.103964,BKK
3,2020-07-01,2020-07-02 00:00:00,10.330393,BKK
4,2020-07-01,2020-07-02 06:00:00,20.699504,BKK


In [126]:
ans.reset_index(drop=True,inplace=True)

In [127]:
ans.head()

Unnamed: 0,Time,Predicted,PM2.5,station
0,2020-07-01,2020-07-01 06:00:00,18.760174,BKK
1,2020-07-01,2020-07-01 12:00:00,17.207703,BKK
2,2020-07-01,2020-07-01 18:00:00,17.103964,BKK
3,2020-07-01,2020-07-02 00:00:00,10.330393,BKK
4,2020-07-01,2020-07-02 06:00:00,20.699504,BKK


In [128]:
kaggle =ans['PM2.5']
kaggle.columns = ['Predicted']
kaggle.head()

0    18.760174
1    17.207703
2    17.103964
3    10.330393
4    20.699504
Name: PM2.5, dtype: float64

In [129]:
kaggle.to_csv('kaggle.csv')

In [46]:
example = pd.read_excel('_submission_example\Example_df_kaggle_test2022_without_nan.xlsx')

In [49]:
example['Predicted'] = pd.to_datetime(example['Predicted'])
example.head()

Unnamed: 0,id,station,Time,Predicted,PM2.5
0,0,BKK,2020-07-01,2020-07-01 06:00:00,4
1,1,,NaT,2020-07-01 12:00:00,18
2,2,,NaT,2020-07-01 18:00:00,79
3,3,,NaT,2020-07-02 00:00:00,40
4,4,,NaT,2020-07-02 06:00:00,69


In [54]:
ans.tail()

Unnamed: 0,Time,Predicted,PM2.5,station
17521,2021-07-01 00:00:00,2021-07-01 12:00:00,13.645473,Surat
17522,2021-07-01 00:00:00,2021-07-01 18:00:00,11.832724,Surat
17532,2021-07-01 06:00:00,2021-07-01 12:00:00,13.645473,Surat
17533,2021-07-01 06:00:00,2021-07-01 18:00:00,11.832724,Surat
17544,2021-07-01 12:00:00,2021-07-01 18:00:00,11.832724,Surat


In [55]:
example.shape

(94248, 5)

In [57]:
example[example['station']=='Chiangmai']

Unnamed: 0,id,station,Time,Predicted,PM2.5
15510,15510,Chiangmai,2020-07-01,2020-07-01 06:00:00,86


In [53]:
example.tail()

Unnamed: 0,id,station,Time,Predicted,PM2.5
94243,94243,,NaT,2021-07-01 12:00:00,94
94244,94244,,NaT,2021-07-01 18:00:00,36
94245,94245,,2021-07-01 06:00:00,2021-07-01 12:00:00,5
94246,94246,,NaT,2021-07-01 18:00:00,41
94247,94247,,2021-07-01 12:00:00,2021-07-01 18:00:00,46


In [51]:
chk = pd.merge(ans,example,how='right',on=['Predicted'])

In [52]:
chk.isna().sum()

Time_x             0
Predicted          0
PM2.5_x            0
station_x          0
id                 0
station_y    6525540
Time_y       5920872
PM2.5_y            0
dtype: int64

In [163]:
ans = pd.DataFrame.from_dict({'Time':[],'Predicted':[],'PM2.5':[],'Province':[]})

In [154]:
param_dict

{'BKK': [7.0, 70.0, 40.0, 100.0],
 'Chiangmai': [7.0, 50.0, 40.0, 100.0],
 'Rayong': [2.0, 50.0, 30.0, 40.0],
 'Saraburi': [3.0, 50.0, 20.0, 90.0],
 'Khonkaen': [1.0, 70.0, 30.0, 90.0],
 'Surat': [2.0, 60.0, 20.0, 30.0]}

In [167]:
ans = pd.DataFrame.from_dict({'Time':[],'Predicted':[],'PM2.5':[],'Province':[]})
for key,value in param_dict.items():
    print(key,'-----------')
    print()
    df1 = pd.read_csv(key+'_clean.csv')
    df1.dropna(subset=['PM2.5'],inplace=True)
    print(' df1 shape after drop',df1.shape)
    print(df1.isna().sum())
    df2 = pd.read_csv('lstm_'+key.lower()+'.csv')
    print(' df2 shape',df2.shape)
    print(df2.isna().sum())

    df3 =  pd.DataFrame()
    df3['Predicted']=df1['date_time']
    df3['PM2.5']=df2['0']

    print(' -df3-')
    print(df3.isna().sum())
    print(df3.head())
    print(df3[df3['PM2.5'].isna()])

    df3['Predicted'] = pd.to_datetime(df3['Predicted'])

    submit=pd.read_csv('submit_time_format_2.csv')
    submit['Predicted'] = pd.to_datetime(submit['Predicted'])
    submit['Time'] = pd.to_datetime(submit['Time'])

    print(' -submit-')
    print(submit.isna().sum())
    print(submit.head())

    result = pd.merge(submit, df3, how="left", on=["Predicted"])

    result['Province'] = key

    print(' -result-')
    print(result.isna().sum())
    print(result.head())
    
    ans = pd.concat([ans,result])

BKK -----------

 df1 shape after drop (7814, 8)
Unnamed: 0    0
date_time     0
PM2.5         0
temp          0
wind speed    0
wind dir      0
lat           0
long          0
dtype: int64
 df2 shape (7814, 2)
Unnamed: 0    0
0             0
dtype: int64
 -df3-
Predicted      0
PM2.5        894
dtype: int64
             Predicted     PM2.5
0  2020-07-01 00:00:00  6.786786
1  2020-07-01 01:00:00  7.388136
2  2020-07-01 02:00:00  7.234481
3  2020-07-01 03:00:00  9.275022
4  2020-07-01 04:00:00  8.716820
                Predicted  PM2.5
7814  2021-05-22 14:00:00    NaN
7815  2021-05-22 15:00:00    NaN
7816  2021-05-22 16:00:00    NaN
7817  2021-05-22 17:00:00    NaN
7818  2021-05-22 18:00:00    NaN
...                   ...    ...
8779  2021-07-01 19:00:00    NaN
8780  2021-07-01 20:00:00    NaN
8781  2021-07-01 21:00:00    NaN
8782  2021-07-01 22:00:00    NaN
8783  2021-07-01 23:00:00    NaN

[894 rows x 2 columns]
 -submit-
Time         0
Predicted    0
dtype: int64
        Time       

In [144]:
ans.head()

Unnamed: 0,Time,Predicted,PM2.5,Province
0,2020-07-01,2020-07-01 06:00:00,9.098557,BKK
1,2020-07-01,2020-07-01 12:00:00,11.9814,BKK
2,2020-07-01,2020-07-01 18:00:00,14.420508,BKK
3,2020-07-01,2020-07-02 00:00:00,9.88724,BKK
4,2020-07-01,2020-07-02 06:00:00,9.686534,BKK


In [158]:
ans.shape

(104940, 4)

In [159]:
ans.isna().sum()

Time             0
Predicted        0
PM2.5        20256
Province         0
dtype: int64

In [160]:
ans.dropna(inplace=True)
ans.shape

(84684, 4)