In [24]:
import os
import math
import numpy as np
import pandas as pd
from utils.sarimax import sarimax_training, predict_target
from tqdm import tqdm
from sklearn.metrics import mean_squared_error

In [43]:
basepath = 'datasci_dataset_2022'
province = ['BKK', 'Chiangmai', 'Khonkaen', 'Rayong', 'Saraburi', 'Surat'] # Chiangmai, Khonkaen, Saraburi
idx_province = 4
folderpath = os.path.join(basepath, province[idx_province])

mode = 'train'
data = pd.read_csv(os.path.join(folderpath, f"{mode}\\{mode}ing.csv")).rename({'Unnamed: 0': 'date_time'}, axis=1)
data['date_time'] = pd.to_datetime(data['date_time'])
data = data.set_index('date_time')

train_data = data.reindex(pd.date_range("2017-07-01 00:00:00", "2019-07-01 00:00:00",freq='6h'), fill_value=np.nan)
val_data = data.reindex(pd.date_range("2019-07-01 06:00:00", "2020-06-30 23:00:00",freq='6h'), fill_value=np.nan)
# val_data = data.reindex(pd.date_range("2019-07-01 06:00:00", "2020-06-30 23:00:00",freq='6h'), fill_value=np.nan)

In [44]:
param = {
  'order': [(1, 1, 1), (1, 0, 1), (1, 0, 0), (0, 0, 1)],
  'seasonal order': [(1, 0, 0, 12), (1, 1, 0, 12), (0, 0, 1, 12), ((0, 1, 1, 12))],
}
best = (0, 0)

In [62]:
def run(order, seasonal_order):
  exog_columns = ['temp', 'wind speed', 'wind dir'] + ['Thailand'] # Hotspot + wind + temp
  target = 'PM2.5'

  result, results_exog = sarimax_training(order, seasonal_order, train_data, exog_columns, target)

  li = []
  update_result, update_results_exog = result, results_exog
  for i in tqdm(range(len(val_data)-1)):
    update_result = update_result.append(val_data['PM2.5'].iloc[i:i+1], exog=val_data[exog_columns].iloc[i:i+1])
    for idx, exog in enumerate(exog_columns):
      update_results_exog[idx] = update_results_exog[idx].append(val_data[exog].iloc[i:i+1])

    
    li.append(predict_target(update_result, update_results_exog, exog_columns, steps=12))

  df = pd.concat(li, keys=val_data.index)

  gt = []
  for data in tqdm(df.iteritems()):
    try:
      gt.append(val_data.loc[data[0][1]].values[0])
    except:
      gt.append(np.NaN)
  gt = np.array(gt)

  mask = np.isnan(gt)
  predicted = df.values[~mask]
  gt_target = gt[~mask]
  mse = mean_squared_error(gt_target, predicted)
  rmse = math.sqrt(mse)
  print('Test on SARIMAX with RMSE: %f' % (rmse, ))

In [63]:
for i in range(5):
  selected_order = param['order'][np.random.randint(4)]
  selected_seasonal_order = param['seasonal order'][np.random.randint(4)]
  print("Iter:", i+1)
  print("Order:", selected_order)
  print("Seasonal Order:", selected_seasonal_order)
  run(selected_order, selected_seasonal_order)
  print()

Iter: 1
Order: (1, 0, 0)
Seasonal Order: (0, 0, 1, 12)


100%|██████████| 1462/1462 [09:09<00:00,  2.66it/s]
17544it [00:02, 7745.71it/s]


Test on SARIMAX with RMSE: 22.032124

Iter: 2
Order: (1, 0, 0)
Seasonal Order: (0, 1, 1, 12)


100%|██████████| 1462/1462 [20:11<00:00,  1.21it/s]
17544it [00:02, 8016.28it/s]


Test on SARIMAX with RMSE: 17.187643

Iter: 3
Order: (1, 1, 1)
Seasonal Order: (1, 0, 0, 12)


100%|██████████| 1462/1462 [07:55<00:00,  3.07it/s]
17544it [00:02, 8289.05it/s]


Test on SARIMAX with RMSE: 17.251824

Iter: 4
Order: (1, 1, 1)
Seasonal Order: (1, 1, 0, 12)


100%|██████████| 1462/1462 [19:49<00:00,  1.23it/s]
17544it [00:02, 8233.97it/s]


Test on SARIMAX with RMSE: 20.503308

Iter: 5
Order: (1, 1, 1)
Seasonal Order: (0, 0, 1, 12)


100%|██████████| 1462/1462 [09:11<00:00,  2.65it/s]
17544it [00:02, 8086.26it/s]


Test on SARIMAX with RMSE: 17.681143



In [64]:
run((1, 0, 1), (1, 1, 0, 12))

100%|██████████| 1462/1462 [18:48<00:00,  1.30it/s]
17544it [00:02, 8626.02it/s]


Test on SARIMAX with RMSE: 17.873208


## SLJA

In [41]:
li = []
best_order = (1, 1, 1) # best_order = (1, 1, 1)
best_seasonal_order = (0, 0, 1, 12) # (1, 0, 0, 12)
exog_columns = ['temp', 'wind speed', 'wind dir'] + ['Thailand'] # Hotspot + wind + temp
target = 'PM2.5'

result, results_exog = sarimax_training(best_order, best_seasonal_order, train_data, exog_columns, target)

In [42]:
li = []
update_result, update_results_exog = result, results_exog
for i in tqdm(range(len(val_data)-1)):
  update_result = update_result.append(val_data['PM2.5'].iloc[i:i+1], exog=val_data[exog_columns].iloc[i:i+1])
  for idx, exog in enumerate(exog_columns):
    update_results_exog[idx] = update_results_exog[idx].append(val_data[exog].iloc[i:i+1])

  
  li.append(predict_target(update_result, update_results_exog, exog_columns, steps=12))

df = pd.concat(li, keys=val_data.index)

  4%|▍         | 60/1462 [00:20<07:51,  2.98it/s]


KeyboardInterrupt: 

In [35]:
gt = []
for data in tqdm(df.iteritems()):
  try:
    gt.append(val_data.loc[data[0][1]].values[0])
  except:
    gt.append(np.NaN)
gt = np.array(gt)

5832it [00:00, 7659.73it/s]


In [36]:
mask = np.isnan(gt)
predicted = df.values[~mask]
gt_target = gt[~mask]

In [25]:
mse = mean_squared_error(gt_target, predicted)
rmse = math.sqrt(mse)
print('Test on SARIMAX with RMSE: %f' % (rmse, ))

Test on SARIMAX with RMSE: 10.396560


In [30]:
mse = mean_squared_error(gt_target, predicted)
rmse = math.sqrt(mse)
print('Test on SARIMAX with RMSE: %f' % (rmse, ))

Test on SARIMAX with RMSE: 11.652615


In [37]:
mse = mean_squared_error(gt_target, predicted)
rmse = math.sqrt(mse)
print('Test on SARIMAX with RMSE: %f' % (rmse, ))

Test on SARIMAX with RMSE: 10.608445
