In [51]:
import os
import math
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import mean_squared_error

In [441]:
# --- Configuration -----------------------------------------------------------
# Root folder of the dataset; it holds one sub-folder per province.
basepath = 'datasci_dataset_2022'
province = ['BKK', 'Chiangmai', 'Rayong', 'Saraburi', 'Khonkaen', 'Surat']
# Position in `province` of the province the cells below read and write.
idx_province = 5
folderpath = os.path.join(basepath, province[idx_province])

predicted_path = f'predict/{province[idx_province].lower()}_predict_test.csv'
# Pass every path component separately so the platform's own separator is used
# (a literal "test\\" inside the file name only resolves on Windows).
gt_path = os.path.join(folderpath, "test", f"{province[idx_province].lower()}_test.csv")

# Post Processing

In [442]:
# Translate the headers found in the prediction CSV into the column names the
# rest of the notebook works with.
rename_encode = {"Unnamed: 0": "Time", "Unnamed: 1": "Predicted", "predicted_mean": "PM2.5"}
predicted_df = pd.read_csv(predicted_path).rename(columns=rename_encode)
print("Prediction PATH:", predicted_path)

Prediction PATH: predict/surat_predict_test.csv


In [443]:
# Smooth the PM2.5 predictions independently inside each run of 12 consecutive
# rows, so values from one run never leak into the next.
# Only the numeric 'PM2.5' column is rolled: running .rolling().mean() on the
# whole frame relies on pandas silently dropping the non-numeric columns, which
# newer pandas versions refuse to do.
# NOTE(review): the centred window (25) is wider than a 12-row run, so each
# row's window appears to span its entire run - confirm this is intended.
li = []
pm25 = predicted_df['PM2.5']
for i in range(0, predicted_df.shape[0], 12):
  chunk = pm25.iloc[i:i+12]
  li.append(chunk.rolling(window=25, min_periods=2, center=True, closed='both').mean())

In [444]:
# Stitch the per-chunk results back into one object and overwrite the raw
# predictions with the smoothed ones.
smoothed = pd.concat(li)
predicted_df['PM2.5'] = smoothed

In [445]:
# Persist the smoothed predictions; the evaluation section reads them back from
# this folder. Create the folder first so a fresh checkout does not fail here.
os.makedirs(os.path.join('predict', 'smooth'), exist_ok=True)
predicted_df.to_csv(f'predict/smooth/{province[idx_province].lower()}_predict_test.csv', index=False)

# Create Submission & Evaluate

In [446]:
def get_predicted_gt(predicted_path, gt_path):
  """Load predictions and pair them with the matching ground-truth values.

  Parameters
  ----------
  predicted_path : str
    CSV of predictions. Its second column is used as the timestamp key for
    the ground-truth lookup and it must contain a 'PM2.5' column (either
    directly or as 'predicted_mean', which is renamed).
  gt_path : str
    CSV of ground truth. Its 'Unnamed: 0' column is parsed as datetimes and
    becomes the index; the first remaining column supplies the target value.

  Returns
  -------
  (predicted, gt_target) : tuple of numpy.ndarray
    Equal-length arrays restricted to the rows for which a non-NaN ground
    truth value was found.
  """
  rename_encode = {"Unnamed: 0": "Time", "Unnamed: 1": "Predicted", "predicted_mean": "PM2.5"}
  predicted_df = pd.read_csv(predicted_path).rename(rename_encode, axis=1)
  print(f"Predicted Data: {predicted_path}")


  testPM = pd.read_csv(gt_path).rename({"Unnamed: 0": "date_time"}, axis=1)
  testPM['date_time'] = pd.to_datetime(testPM['date_time'])
  testPM = testPM.set_index('date_time')
  print(f"Ground truth: {gt_path}")

  gt = []
  # Walk the second column by position (.iloc) instead of `row[1]`: integer
  # keys on a string-labelled Series are a deprecated positional fallback.
  for target_time in predicted_df.iloc[:, 1]:
    try:
      gt.append(testPM.loc[target_time].values[0])
    except (LookupError, TypeError, ValueError):
      # No usable ground truth for this timestamp: mark it so it is masked
      # out below. Only lookup-style failures are treated as "missing";
      # anything else (e.g. KeyboardInterrupt) propagates.
      gt.append(np.nan)
  gt = np.array(gt)

  # Keep only the rows that have a ground-truth value to compare against.
  mask = np.isnan(gt)
  predicted = predicted_df['PM2.5'].values[~mask]
  gt_target = gt[~mask]

  return predicted, gt_target

In [447]:
# Score the smoothed predictions of every province and collect the aligned
# arrays so a pooled score can be computed afterwards.
# NOTE: the loop rebinds the notebook-level idx_province, folderpath,
# predicted_path and gt_path that the configuration cell also sets.
li_predicted, li_gt_target = [], []
for idx_province, province_name in enumerate(province):
  folderpath = os.path.join(basepath, province_name)
  predicted_path = f'predict/smooth/{province_name.lower()}_predict_test.csv'
  # Separate path components keep this portable; a literal "test\\" prefix
  # only resolves on Windows.
  gt_path = os.path.join(folderpath, "test", f"{province_name.lower()}_test.csv")
  predicted, gt_target = get_predicted_gt(predicted_path, gt_path)
  li_predicted.append(predicted)
  li_gt_target.append(gt_target)
  print(province_name)
  mse = mean_squared_error(gt_target, predicted)
  rmse = math.sqrt(mse)
  print('Test on SARIMAX with RMSE: %f' % (rmse, ))
  print()

# Pool all provinces into single arrays (province order follows `province`).
predicted = np.concatenate(li_predicted)
gt_target = np.concatenate(li_gt_target)

Predicted Data: predict/smooth/bkk_predict_test.csv
Ground truth: datasci_dataset_2022\BKK\test\bkk_test.csv
BKK
Test on SARIMAX with RMSE: 6.416884

Predicted Data: predict/smooth/chiangmai_predict_test.csv
Ground truth: datasci_dataset_2022\Chiangmai\test\chiangmai_test.csv
Chiangmai
Test on SARIMAX with RMSE: 10.240516

Predicted Data: predict/smooth/rayong_predict_test.csv
Ground truth: datasci_dataset_2022\Rayong\test\rayong_test.csv
Rayong
Test on SARIMAX with RMSE: 8.308127

Predicted Data: predict/smooth/saraburi_predict_test.csv
Ground truth: datasci_dataset_2022\Saraburi\test\saraburi_test.csv
Saraburi
Test on SARIMAX with RMSE: 11.540631

Predicted Data: predict/smooth/khonkaen_predict_test.csv
Ground truth: datasci_dataset_2022\Khonkaen\test\khonkaen_test.csv
Khonkaen
Test on SARIMAX with RMSE: 11.108259

Predicted Data: predict/smooth/surat_predict_test.csv
Ground truth: datasci_dataset_2022\Surat\test\surat_test.csv
Surat
Test on SARIMAX with RMSE: 5.928553



In [448]:
# Pooled score: RMSE over the concatenated arrays of all provinces.
mse = mean_squared_error(y_true=gt_target, y_pred=predicted)
rmse = math.sqrt(mse)
print('Test on SARIMAX with RMSE: %f' % rmse)

Test on SARIMAX with RMSE: 9.174101


In [346]:
# Sanity check: size of the pooled prediction array that goes into the
# submission frame built in the next cell.
predicted.shape

(94248,)

In [347]:
# Single-column frame holding the pooled predictions; its default integer index
# becomes the submission id when the file is written.
data = dict(Predicted=predicted)
submission_df = pd.DataFrame(data=data)

In [348]:
# Expose the row index as an explicit 'id' column, then write the submission.
submission_out = submission_df.rename_axis('id').reset_index()
submission_out.to_csv('submissionSmooth13.csv', index=False)