In [None]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

In [80]:
# Load the raw Surat export with the timestamp column as a chronologically sorted index.
df_surat = pd.read_csv(
    'data_preprocess/raw_data/surat_data.csv',
    parse_dates=["timestamp"],
    index_col="timestamp",
).sort_values(by="timestamp")

# Discard the unused columns *before* dropna, so a missing value in a column
# that is thrown away anyway cannot remove an otherwise complete row.
df_surat = df_surat.drop(columns=["Unnamed: 0", "timezone", "pm_2_5_sp"])

# Calendar features derived from the timestamp index.
df_surat['dayofweek'] = df_surat.index.dayofweek
df_surat['month'] = df_surat.index.month
df_surat['day'] = df_surat.index.day

# Lag features: shift() moves by rows, so lag k is the pm_2_5 reading k rows
# earlier (k sampling steps, not k days -- assumes a gap-free series; TODO confirm).
for lag in range(1, 8):
    df_surat[f'pm_2_5_lag_{lag}'] = df_surat['pm_2_5'].shift(lag)

# Remove rows with missing values, including the first 7 rows whose lags are undefined.
df_surat = df_surat.dropna()

# Remove outliers (|z-score| >= 3) one column at a time; each z-score is
# computed on the rows that survived the previous filter.
# NOTE: the lag columns are not filtered, so they can still contain pm_2_5
# values that the last filter removes from the target column.
for col in ('humidity', 'temperature', 'pm_2_5'):
    df_surat = df_surat[np.abs(stats.zscore(df_surat[col])) < 3]

In [81]:
# Summary statistics of every column in the cleaned Surat frame.
df_surat.describe()

Unnamed: 0,humidity,pm_2_5,temperature,dayofweek,month,day,pm_2_5_lag_1,pm_2_5_lag_2,pm_2_5_lag_3,pm_2_5_lag_4,pm_2_5_lag_5,pm_2_5_lag_6,pm_2_5_lag_7
count,25863.0,25863.0,25863.0,25863.0,25863.0,25863.0,25863.0,25863.0,25863.0,25863.0,25863.0,25863.0,25863.0
mean,81.792955,23.142502,28.858913,3.012566,6.39427,15.697444,23.222385,23.29132,23.339039,23.372646,23.399623,23.430825,23.453939
std,14.864375,13.690111,3.08032,2.001458,3.554239,8.7991,13.941962,14.179992,14.338276,14.44933,14.527723,14.624267,14.690078
min,37.254628,0.0,20.98126,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,71.295685,11.833333,26.447233,1.0,3.0,8.0,11.833333,11.833333,11.833333,11.833333,11.833333,11.833333,11.831933
50%,83.817064,21.733333,28.1262,3.0,6.0,16.0,21.733333,21.733333,21.733333,21.72,21.722689,21.716667,21.716667
75%,94.768443,32.907948,30.950287,5.0,10.0,23.0,32.925,32.933333,32.934155,32.934955,32.933333,32.932184,32.933333
max,100.0,68.420168,38.147769,6.0,12.0,31.0,126.016667,176.166667,176.166667,176.166667,176.166667,176.166667,176.166667


In [82]:
# Write the cleaned Surat frame to CSV; index=True keeps the timestamp index in the file.
df_surat.to_csv(
    "data_preprocess/clean_data/clean_surat.csv",
    index=True,
    encoding="utf-8",
)

In [84]:
# Load the raw engineering-station export with the timestamp column as a chronologically sorted index.
df_engineer = pd.read_csv(
    'data_preprocess/raw_data/export-pm25_eng-1h.csv',
    parse_dates=["timestamp"],
    index_col="timestamp",
).sort_values(by="timestamp")

# Discard the unused columns *before* dropna, so a missing value in a column
# that is thrown away anyway cannot remove an otherwise complete row.
df_engineer = df_engineer.drop(columns=["Unnamed: 0", "timezone", "pm_2_5_sp"])

# Calendar features derived from the timestamp index.
df_engineer['dayofweek'] = df_engineer.index.dayofweek
df_engineer['month'] = df_engineer.index.month
df_engineer['day'] = df_engineer.index.day

# Lag features: shift() moves by rows, so lag k is the pm_2_5 reading k rows
# earlier (k sampling steps, not k days -- assumes a gap-free series; TODO confirm).
for lag in range(1, 8):
    df_engineer[f'pm_2_5_lag_{lag}'] = df_engineer['pm_2_5'].shift(lag)

# Remove rows with missing values, including the first 7 rows whose lags are undefined.
df_engineer = df_engineer.dropna()

# Remove outliers (|z-score| >= 3) one column at a time; each z-score is
# computed on the rows that survived the previous filter.
# NOTE: the lag columns are not filtered, so they can still contain pm_2_5
# values that the last filter removes from the target column.
for col in ('humidity', 'temperature', 'pm_2_5'):
    df_engineer = df_engineer[np.abs(stats.zscore(df_engineer[col])) < 3]

In [113]:
# Summary statistics of every column in the cleaned engineering-station frame.
df_engineer.describe()

Unnamed: 0,humidity,pm_10,pm_2_5,temperature,dayofweek,month,day,pm_2_5_lag_1,pm_2_5_lag_2,pm_2_5_lag_3,pm_2_5_lag_4,pm_2_5_lag_5,pm_2_5_lag_6,pm_2_5_lag_7
count,14310.0,14310.0,14310.0,14310.0,14310.0,14310.0,14310.0,14310.0,14310.0,14310.0,14310.0,14310.0,14310.0,14310.0
mean,79.14684,24.080782,20.979503,29.279318,3.0,6.809504,15.575891,20.98906,20.973734,20.957115,20.93633,20.901114,20.862902,20.831636
std,12.86411,13.756745,12.452783,4.022695,2.012575,3.376493,8.819935,12.678102,12.747251,12.79803,12.839655,12.857274,12.874776,12.883832
min,39.496231,1.684211,1.421053,20.419051,0.0,1.0,1.0,1.421053,1.421053,1.421053,1.421053,1.421053,1.421053,1.421053
25%,68.090261,13.116667,11.283333,26.342927,1.0,4.0,8.0,11.243534,11.233333,11.169492,11.15,11.116667,11.066667,11.033333
50%,80.597502,22.366667,18.866667,28.370391,3.0,7.0,15.0,18.8,18.758333,18.7,18.647034,18.583333,18.529379,18.466667
75%,90.004054,34.0,29.307759,31.136898,5.0,10.0,23.0,29.230085,29.183333,29.151907,29.133333,29.083333,29.016667,28.966667
max,100.0,74.716667,60.6,43.98259,6.0,12.0,31.0,184.366667,184.366667,184.366667,184.366667,184.366667,184.366667,184.366667


In [85]:
# Write the cleaned engineering-station frame to CSV; index=True keeps the timestamp index in the file.
df_engineer.to_csv(
    "data_preprocess/clean_data/clean_engineer.csv",
    index=True,
    encoding="utf-8",
)

In [86]:
# Load the raw songkla_001 export with the timestamp column as a chronologically sorted index.
df_songkla_001 = pd.read_csv(
    'data_preprocess/raw_data/songkla_001-1h.csv',
    parse_dates=["timestamp"],
    index_col="timestamp",
).sort_values(by="timestamp")

# Discard the unused columns *before* dropna, so a missing value in a column
# that is thrown away anyway cannot remove an otherwise complete row.
df_songkla_001 = df_songkla_001.drop(columns=["Unnamed: 0", "timezone", "pm_2_5_sp"])

# Calendar features derived from the timestamp index.
df_songkla_001['dayofweek'] = df_songkla_001.index.dayofweek
df_songkla_001['month'] = df_songkla_001.index.month
df_songkla_001['day'] = df_songkla_001.index.day

# Lag features: shift() moves by rows, so lag k is the pm_2_5 reading k rows
# earlier (k sampling steps, not k days -- assumes a gap-free series; TODO confirm).
for lag in range(1, 8):
    df_songkla_001[f'pm_2_5_lag_{lag}'] = df_songkla_001['pm_2_5'].shift(lag)

# Remove rows with missing values, including the first 7 rows whose lags are undefined.
df_songkla_001 = df_songkla_001.dropna()

# Remove outliers (|z-score| >= 3) one column at a time; each z-score is
# computed on the rows that survived the previous filter.
# NOTE: the lag columns are not filtered, so they can still contain pm_2_5
# values that the last filter removes from the target column.
for col in ('humidity', 'temperature', 'pm_2_5'):
    df_songkla_001 = df_songkla_001[np.abs(stats.zscore(df_songkla_001[col])) < 3]

In [87]:
# Write the cleaned songkla_001 frame to CSV; index=True keeps the timestamp index in the file.
df_songkla_001.to_csv(
    "data_preprocess/clean_data/clean_songkla_001.csv",
    index=True,
    encoding="utf-8",
)

In [88]:
# Summary statistics of every column in the cleaned songkla_001 frame.
# NOTE(review): the recorded output of this cell is identical, value for value,
# to the recorded output of df_surat.describe() -- verify that
# songkla_001-1h.csv and surat_data.csv really hold different data.
df_songkla_001.describe()

Unnamed: 0,humidity,pm_2_5,temperature,dayofweek,month,day,pm_2_5_lag_1,pm_2_5_lag_2,pm_2_5_lag_3,pm_2_5_lag_4,pm_2_5_lag_5,pm_2_5_lag_6,pm_2_5_lag_7
count,25863.0,25863.0,25863.0,25863.0,25863.0,25863.0,25863.0,25863.0,25863.0,25863.0,25863.0,25863.0,25863.0
mean,81.792955,23.142502,28.858913,3.012566,6.39427,15.697444,23.222385,23.29132,23.339039,23.372646,23.399623,23.430825,23.453939
std,14.864375,13.690111,3.08032,2.001458,3.554239,8.7991,13.941962,14.179992,14.338276,14.44933,14.527723,14.624267,14.690078
min,37.254628,0.0,20.98126,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,71.295685,11.833333,26.447233,1.0,3.0,8.0,11.833333,11.833333,11.833333,11.833333,11.833333,11.833333,11.831933
50%,83.817064,21.733333,28.1262,3.0,6.0,16.0,21.733333,21.733333,21.733333,21.72,21.722689,21.716667,21.716667
75%,94.768443,32.907948,30.950287,5.0,10.0,23.0,32.925,32.933333,32.934155,32.934955,32.933333,32.932184,32.933333
max,100.0,68.420168,38.147769,6.0,12.0,31.0,126.016667,176.166667,176.166667,176.166667,176.166667,176.166667,176.166667


In [89]:
# Load the raw songkla_012 export with the timestamp column as a chronologically sorted index.
df_songkla_012 = pd.read_csv(
    'data_preprocess/raw_data/songkla_012-1h.csv',
    parse_dates=["timestamp"],
    index_col="timestamp",
).sort_values(by="timestamp")

# Discard the unused columns *before* dropna, so a missing value in a column
# that is thrown away anyway cannot remove an otherwise complete row.
df_songkla_012 = df_songkla_012.drop(columns=["Unnamed: 0", "timezone", "pm_2_5_sp"])

# Calendar features derived from the timestamp index.
df_songkla_012['dayofweek'] = df_songkla_012.index.dayofweek
df_songkla_012['month'] = df_songkla_012.index.month
df_songkla_012['day'] = df_songkla_012.index.day

# Lag features: shift() moves by rows, so lag k is the pm_2_5 reading k rows
# earlier (k sampling steps, not k days -- assumes a gap-free series; TODO confirm).
for lag in range(1, 8):
    df_songkla_012[f'pm_2_5_lag_{lag}'] = df_songkla_012['pm_2_5'].shift(lag)

# Remove rows with missing values, including the first 7 rows whose lags are undefined.
df_songkla_012 = df_songkla_012.dropna()

# Remove outliers (|z-score| >= 3) one column at a time; each z-score is
# computed on the rows that survived the previous filter.
# NOTE: the lag columns are not filtered, so they can still contain pm_2_5
# values that the last filter removes from the target column.
for col in ('humidity', 'temperature', 'pm_2_5'):
    df_songkla_012 = df_songkla_012[np.abs(stats.zscore(df_songkla_012[col])) < 3]

In [90]:
# Summary statistics of every column in the cleaned songkla_012 frame.
df_songkla_012.describe()

Unnamed: 0,humidity,pm_2_5,temperature,dayofweek,month,day,pm_2_5_lag_1,pm_2_5_lag_2,pm_2_5_lag_3,pm_2_5_lag_4,pm_2_5_lag_5,pm_2_5_lag_6,pm_2_5_lag_7
count,10080.0,10080.0,10080.0,10080.0,10080.0,10080.0,10080.0,10080.0,10080.0,10080.0,10080.0,10080.0,10080.0
mean,86.680017,13.62253,28.825,3.01627,6.732044,15.28125,13.652828,13.675982,13.693705,13.711838,13.722688,13.725176,13.725018
std,12.837446,9.642081,1.989222,2.000033,3.047223,8.762318,9.717587,9.766627,9.802332,9.834461,9.861495,9.879572,9.89991
min,48.309731,0.082645,22.921875,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,76.023882,6.072592,27.338312,1.0,4.0,8.0,6.066667,6.066667,6.066667,6.067087,6.066667,6.058701,6.053146
50%,86.770558,11.311191,28.517278,3.0,7.0,15.0,11.323657,11.338843,11.358333,11.36249,11.36249,11.359839,11.358333
75%,100.0,19.1,30.102979,5.0,9.0,23.0,19.125,19.127795,19.150315,19.166667,19.2,19.20625,19.2
max,100.0,44.983471,35.658992,6.0,12.0,31.0,56.563025,57.65,59.818182,63.537815,63.537815,63.537815,63.537815


In [91]:
# Write the cleaned songkla_012 frame to CSV; index=True keeps the timestamp index in the file.
df_songkla_012.to_csv(
    "data_preprocess/clean_data/clean_songkla_012.csv",
    index=True,
    encoding="utf-8",
)

In [92]:
# Load the raw songkla_013 export with the timestamp column as a chronologically sorted index.
df_songkla_013 = pd.read_csv(
    'data_preprocess/raw_data/songkla_013-1h.csv',
    parse_dates=["timestamp"],
    index_col="timestamp",
).sort_values(by="timestamp")

# Discard the unused columns *before* dropna, so a missing value in a column
# that is thrown away anyway cannot remove an otherwise complete row.
df_songkla_013 = df_songkla_013.drop(columns=["Unnamed: 0", "timezone", "pm_2_5_sp"])

# Calendar features derived from the timestamp index.
df_songkla_013['dayofweek'] = df_songkla_013.index.dayofweek
df_songkla_013['month'] = df_songkla_013.index.month
df_songkla_013['day'] = df_songkla_013.index.day

# Lag features: shift() moves by rows, so lag k is the pm_2_5 reading k rows
# earlier (k sampling steps, not k days -- assumes a gap-free series; TODO confirm).
for lag in range(1, 8):
    df_songkla_013[f'pm_2_5_lag_{lag}'] = df_songkla_013['pm_2_5'].shift(lag)

# Remove rows with missing values, including the first 7 rows whose lags are undefined.
df_songkla_013 = df_songkla_013.dropna()

# Remove outliers (|z-score| >= 3) one column at a time; each z-score is
# computed on the rows that survived the previous filter.
# NOTE: the lag columns are not filtered, so they can still contain pm_2_5
# values that the last filter removes from the target column.
for col in ('humidity', 'temperature', 'pm_2_5'):
    df_songkla_013 = df_songkla_013[np.abs(stats.zscore(df_songkla_013[col])) < 3]

In [93]:
# Summary statistics of every column in the cleaned songkla_013 frame.
df_songkla_013.describe()

Unnamed: 0,humidity,pm_2_5,temperature,dayofweek,month,day,pm_2_5_lag_1,pm_2_5_lag_2,pm_2_5_lag_3,pm_2_5_lag_4,pm_2_5_lag_5,pm_2_5_lag_6,pm_2_5_lag_7
count,22768.0,22768.0,22768.0,22768.0,22768.0,22768.0,22768.0,22768.0,22768.0,22768.0,22768.0,22768.0,22768.0
mean,48.845239,14.634044,34.478643,3.004304,6.746486,15.688247,14.692831,14.721374,14.738602,14.74821,14.752379,14.759187,14.767944
std,8.633279,10.152471,2.979333,2.001159,3.468225,8.881842,10.339809,10.410342,10.45089,10.479333,10.509118,10.524281,10.541074
min,23.247423,0.15,26.128387,0.0,1.0,1.0,0.15,0.15,0.15,0.15,0.15,0.15,0.15
25%,42.905174,6.65,32.35294,1.0,4.0,8.0,6.65,6.649781,6.65,6.648132,6.645028,6.644488,6.644488
50%,49.580837,12.14877,34.154388,3.0,7.0,16.0,12.15,12.152542,12.152542,12.152542,12.15,12.15,12.151271
75%,55.022704,20.65,36.393235,5.0,10.0,23.0,20.661916,20.672848,20.681338,20.681338,20.661916,20.655245,20.683333
max,73.614297,46.975,43.3365,6.0,12.0,31.0,148.5,148.5,148.5,148.5,148.5,148.5,148.5


In [94]:
# Write the cleaned songkla_013 frame to CSV; index=True keeps the timestamp index in the file.
df_songkla_013.to_csv(
    "data_preprocess/clean_data/clean_songkla_013.csv",
    index=True,
    encoding="utf-8",
)

In [95]:
# Load the raw songkla_014 export with the timestamp column as a chronologically sorted index.
df_songkla_014 = pd.read_csv(
    'data_preprocess/raw_data/songkla_014-1h.csv',
    parse_dates=["timestamp"],
    index_col="timestamp",
).sort_values(by="timestamp")

# Discard the unused columns *before* dropna, so a missing value in a column
# that is thrown away anyway cannot remove an otherwise complete row.
df_songkla_014 = df_songkla_014.drop(columns=["Unnamed: 0", "timezone", "pm_2_5_sp"])

# Calendar features derived from the timestamp index.
df_songkla_014['dayofweek'] = df_songkla_014.index.dayofweek
df_songkla_014['month'] = df_songkla_014.index.month
df_songkla_014['day'] = df_songkla_014.index.day

# Lag features: shift() moves by rows, so lag k is the pm_2_5 reading k rows
# earlier (k sampling steps, not k days -- assumes a gap-free series; TODO confirm).
for lag in range(1, 8):
    df_songkla_014[f'pm_2_5_lag_{lag}'] = df_songkla_014['pm_2_5'].shift(lag)

# Remove rows with missing values, including the first 7 rows whose lags are undefined.
df_songkla_014 = df_songkla_014.dropna()

# Remove outliers (|z-score| >= 3) one column at a time; each z-score is
# computed on the rows that survived the previous filter.
# NOTE: the lag columns are not filtered, so they can still contain pm_2_5
# values that the last filter removes from the target column.
for col in ('humidity', 'temperature', 'pm_2_5'):
    df_songkla_014 = df_songkla_014[np.abs(stats.zscore(df_songkla_014[col])) < 3]

In [96]:
# Summary statistics of every column in the cleaned songkla_014 frame.
df_songkla_014.describe()

Unnamed: 0,humidity,pm_2_5,temperature,dayofweek,month,day,pm_2_5_lag_1,pm_2_5_lag_2,pm_2_5_lag_3,pm_2_5_lag_4,pm_2_5_lag_5,pm_2_5_lag_6,pm_2_5_lag_7
count,24591.0,24591.0,24591.0,24591.0,24591.0,24591.0,24591.0,24591.0,24591.0,24591.0,24591.0,24591.0,24591.0
mean,68.680586,17.465881,29.962067,3.007482,6.715343,15.815786,17.523416,17.562169,17.573988,17.591381,17.594855,17.582148,17.56715
std,12.443725,12.322003,2.459195,2.004333,3.435547,8.881176,12.508136,12.722238,12.690681,12.852464,12.880478,12.87365,12.856487
min,31.26,0.07,24.11,0.0,1.0,1.0,0.07,0.07,0.07,0.07,0.07,0.07,0.07
25%,60.13,7.415,28.09,1.0,4.0,8.0,7.41,7.41,7.41,7.4,7.4,7.395,7.4
50%,69.09,14.67,29.63,3.0,7.0,16.0,14.67,14.67,14.67,14.67,14.66,14.64,14.63
75%,76.85,25.715,31.57,5.0,10.0,24.0,25.72,25.715,25.705,25.695,25.665,25.63,25.6
max,100.0,56.98,37.47,6.0,12.0,31.0,111.82,297.11,170.83,297.11,297.11,297.11,297.11


In [97]:
# Write the cleaned songkla_014 frame to CSV; index=True keeps the timestamp index in the file.
df_songkla_014.to_csv(
    "data_preprocess/clean_data/clean_songkla_014.csv",
    index=True,
    encoding="utf-8",
)

In [98]:
# Load the raw songkla_018 export with the timestamp column as a chronologically sorted index.
df_songkla_018 = pd.read_csv(
    'data_preprocess/raw_data/songkla_018-1h.csv',
    parse_dates=["timestamp"],
    index_col="timestamp",
).sort_values(by="timestamp")

# Discard the unused columns *before* dropna, so a missing value in a column
# that is thrown away anyway cannot remove an otherwise complete row.
df_songkla_018 = df_songkla_018.drop(columns=["Unnamed: 0", "timezone", "pm_2_5_sp"])

# Calendar features derived from the timestamp index.
df_songkla_018['dayofweek'] = df_songkla_018.index.dayofweek
df_songkla_018['month'] = df_songkla_018.index.month
df_songkla_018['day'] = df_songkla_018.index.day

# Lag features: shift() moves by rows, so lag k is the pm_2_5 reading k rows
# earlier (k sampling steps, not k days -- assumes a gap-free series; TODO confirm).
for lag in range(1, 8):
    df_songkla_018[f'pm_2_5_lag_{lag}'] = df_songkla_018['pm_2_5'].shift(lag)

# Remove rows with missing values, including the first 7 rows whose lags are undefined.
df_songkla_018 = df_songkla_018.dropna()

# Remove outliers (|z-score| >= 3) one column at a time; each z-score is
# computed on the rows that survived the previous filter.
# NOTE: the lag columns are not filtered, so they can still contain pm_2_5
# values that the last filter removes from the target column.
for col in ('humidity', 'temperature', 'pm_2_5'):
    df_songkla_018 = df_songkla_018[np.abs(stats.zscore(df_songkla_018[col])) < 3]

In [99]:
# Summary statistics of every column in the cleaned songkla_018 frame.
df_songkla_018.describe()

Unnamed: 0,humidity,pm_2_5,temperature,dayofweek,month,day,pm_2_5_lag_1,pm_2_5_lag_2,pm_2_5_lag_3,pm_2_5_lag_4,pm_2_5_lag_5,pm_2_5_lag_6,pm_2_5_lag_7
count,23322.0,23322.0,23322.0,23322.0,23322.0,23322.0,23322.0,23322.0,23322.0,23322.0,23322.0,23322.0,23322.0
mean,72.896773,18.753361,28.280665,2.996227,6.484135,15.725409,18.824135,18.852653,18.868478,18.876777,18.87372,18.872178,18.861795
std,15.622286,12.494189,3.605568,2.009536,3.476758,8.905816,12.682924,12.760604,12.836361,12.961647,12.99276,13.029488,13.0468
min,10.394804,0.024793,5.907572,0.0,1.0,1.0,0.024793,0.024793,0.024793,0.024793,0.024793,0.024793,0.024793
25%,62.684326,8.7,26.170185,1.0,3.0,8.0,8.716667,8.713065,8.708669,8.7,8.683333,8.666667,8.651293
50%,75.386735,16.237288,27.735147,3.0,7.0,16.0,16.266667,16.258333,16.25,16.233333,16.216667,16.194444,16.183333
75%,84.808372,27.3,30.259372,5.0,9.0,24.0,27.318662,27.333333,27.333333,27.316667,27.290323,27.283333,27.224078
max,98.994736,59.05,53.340333,6.0,12.0,31.0,152.875,127.95,127.95,211.25,211.25,211.25,211.25


In [100]:
# Write the cleaned songkla_018 frame to CSV; index=True keeps the timestamp index in the file.
df_songkla_018.to_csv(
    "data_preprocess/clean_data/clean_songkla_018.csv",
    index=True,
    encoding="utf-8",
)

In [114]:
# Stack the cleaned frames row-wise; ignore_index=False keeps each row's timestamp index.
df_songkla_concat = pd.concat(
    [
        df_engineer,
        df_songkla_001,
        df_songkla_014,
        df_songkla_018,
    ],
    ignore_index=False,
)

In [115]:
# Order the stacked rows chronologically by the timestamp index, then drop
# the pm_10 column. (The former no-op self-assignment has been removed.)
df_songkla_concat = (
    df_songkla_concat
    .sort_values(by="timestamp")
    .drop(columns=['pm_10'])
)


In [116]:
# Display the combined frame to inspect the result of the concatenation and sort.
df_songkla_concat

Unnamed: 0_level_0,humidity,pm_2_5,temperature,dayofweek,month,day,pm_2_5_lag_1,pm_2_5_lag_2,pm_2_5_lag_3,pm_2_5_lag_4,pm_2_5_lag_5,pm_2_5_lag_6,pm_2_5_lag_7
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2021-12-01 14:00:00,66.269500,2.000000,24.990000,2,12,1,4.066667,2.984848,2.681239,2.045555,2.656322,2.022951,1.566102
2021-12-01 15:00:00,60.997426,0.815789,25.950526,2,12,1,2.000000,4.066667,2.984848,2.681239,2.045555,2.656322,2.022951
2021-12-01 16:00:00,61.093943,1.083333,26.178000,2,12,1,0.815789,2.000000,4.066667,2.984848,2.681239,2.045555,2.656322
2021-12-01 17:00:00,61.953554,1.098361,26.292459,2,12,1,1.083333,0.815789,2.000000,4.066667,2.984848,2.681239,2.045555
2021-12-01 18:00:00,61.932943,1.133333,26.525667,2,12,1,1.098361,1.083333,0.815789,2.000000,4.066667,2.984848,2.681239
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-02-11 11:00:00,53.353909,31.842105,32.314087,1,2,11,40.848214,42.478992,43.263158,42.558333,37.806723,34.691667,35.283333
2025-02-11 11:12:18,52.840677,31.750000,32.533843,1,2,11,31.842105,40.848214,42.478992,43.263158,42.558333,37.806723,34.691667
2025-02-11 11:12:43,59.010000,12.920000,31.300000,1,2,11,15.500000,16.990000,37.260000,44.620000,35.930000,38.090000,39.530000
2025-02-11 11:13:14,62.618683,14.615385,30.839804,1,2,11,15.362069,17.850000,37.416667,38.133333,32.200000,25.847458,31.566667


In [117]:
# Write the combined frame to CSV; index=True keeps the timestamp index in the file.
df_songkla_concat.to_csv(
    "data_preprocess/clean_data/songkla_concat_data.csv",
    index=True,
    encoding="utf-8",
)

In [118]:
# Stack the three cleaned songkla frames row-wise; ignore_index=False keeps each row's timestamp index.
df_songkla_concat2 = pd.concat(
    [
        df_songkla_001,
        df_songkla_014,
        df_songkla_018,
    ],
    ignore_index=False,
)

In [None]:
# Order the stacked rows chronologically by the timestamp index.
# (The former no-op self-assignment has been removed.)
df_songkla_concat2 = df_songkla_concat2.sort_values(by="timestamp")

In [122]:
# Write the second combined frame to CSV; index=True keeps the timestamp index in the file.
df_songkla_concat2.to_csv(
    "data_preprocess/clean_data/test_songkla_concat_data.csv",
    index=True,
    encoding="utf-8",
)