In [2]:
import pandas as pd
import numpy as np
import holidays
from pycaret.time_series import TSForecastingExperiment
from scipy.stats import boxcox
from scipy.special import inv_boxcox

# อ่านข้อมูล CSV และตั้งค่า timestamp เป็น index
df = pd.read_csv("cleaned_data.csv", parse_dates=["timestamp"])
# df =df[-500:]
df["timestamp"] = pd.to_datetime(df["timestamp"], errors='coerce').dt.floor('s')  # ตัดมิลลิวินาทีออกไป
df.set_index("timestamp", inplace=True)
df = df[["pm_2_5"]]
# ลบค่าที่ซ้ำกันใน index
df = df[~df.index.duplicated(keep="last")]

# กำหนดความถี่ของข้อมูลเป็นรายชั่วโมง (H)
df = df.asfreq("H")

# เติมค่าหายไปโดยใช้ Interpolation แบบ Time-based
df.interpolate(method='time', inplace=True)

# กรองค่า pm_2_5 ให้อยู่ในช่วง 0 ถึง 80
df = df[(df['pm_2_5'] >= 0) & (df['pm_2_5'] <= 80)]

# เพิ่มฟีเจอร์ วัน เดือน ปี
df['day'] = df.index.day
df['month'] = df.index.month
df['year'] = df.index.year

# ฟังก์ชันระบุฤดูกาลของไทย
def get_thai_season(month):
    if month in [3, 4, 5]:
        return 'summer'
    elif month in [6, 7, 8, 9, 10]:
        return 'rainy'
    else:
        return 'winter'

# เพิ่มคอลัมน์ฤดูกาล
df['season'] = df['month'].apply(get_thai_season)
# One-hot encoding ฤดูกาล
df = pd.get_dummies(df, columns=['season'], prefix='season')

# # แปลงค่า True/False เป็น 1/0
df[['season_rainy', 'season_summer', 'season_winter']] = df[['season_rainy', 'season_summer', 'season_winter']].astype(int)

# เพิ่มคอลัมน์วันหยุดของไทย
thai_holidays = holidays.TH(years=sorted(df.index.year.unique()))
df['is_holiday'] = df.index.to_series().apply(lambda x: 1 if x in thai_holidays else 0)
df = df[-10000:]
print(df)

# แบ่งข้อมูล Train-Test
train_size = len(df) - 48
if train_size < 0:
    raise ValueError("จำนวนข้อมูลไม่เพียงพอสำหรับการแบ่ง Train-Test")
train_df = df.iloc[:train_size]
test_df = df.iloc[train_size:]

# แปลงค่า pm_2_5 ด้วย Box-Cox Transformation
train_df['pm_2_5'], lambda_ = boxcox(train_df['pm_2_5'] + 1)

# เติมค่าที่หายไปใน train_df
train_df['pm_2_5'].interpolate(method='time', inplace=True)

# Interpolation ข้อมูลที่หายไปใน test set
test_df.interpolate(method='time', inplace=True)
train_df = train_df.asfreq("H")
test_df = test_df.asfreq("H")
train_df['pm_2_5'].interpolate(method='time', inplace=True)
test_df['pm_2_5'].interpolate(method='time', inplace=True)
# ตรวจสอบค่าที่หายไป
print("Missing values in train_df['pm_2_5']:", train_df['pm_2_5'].isnull().sum())
print("Missing values in test_df['pm_2_5']:", test_df['pm_2_5'].isnull().sum())


                        pm_2_5  day  month  year  season_rainy  season_summer  \
timestamp                                                                       
2022-07-18 13:00:00  32.933333   18      7  2022             1              0   
2022-07-18 14:00:00  28.225806   18      7  2022             1              0   
2022-07-18 15:00:00  26.028986   18      7  2022             1              0   
2022-07-18 16:00:00  26.084034   18      7  2022             1              0   
2022-07-18 17:00:00  28.425000   18      7  2022             1              0   
...                        ...  ...    ...   ...           ...            ...   
2023-09-08 05:00:00  48.775000    8      9  2023             1              0   
2023-09-08 06:00:00  50.272727    8      9  2023             1              0   
2023-09-08 07:00:00  51.050420    8      9  2023             1              0   
2023-09-08 08:00:00  48.049587    8      9  2023             1              0   
2023-09-08 09:00:00  38.4201

In [3]:

# ตั้งค่า PyCaret
exp = TSForecastingExperiment()
exp.setup(
    data=train_df,
    target='pm_2_5',
    session_id=123,
    fh=48,
    use_gpu=True,
    seasonal_period=24,
    numeric_imputation_target='mean',  # เติมค่าที่หายไปด้วยค่าเฉลี่ย
    numeric_imputation_exogenous='mean'  # เติมค่าที่หายไปด้วยค่าเฉลี่ย
)

# สร้างและจูนโมเดล ARIMA
model = exp.create_model('arima', order=(1, 1, 1), seasonal_order=(1, 1, 1, 24))


Unnamed: 0,Description,Value
0,session_id,123
1,Target,pm_2_5
2,Approach,Univariate
3,Exogenous Variables,Present
4,Original data shape,"(9957, 8)"
5,Transformed data shape,"(9957, 8)"
6,Transformed train set shape,"(9909, 8)"
7,Transformed test set shape,"(48, 8)"
8,Rows with missing values,0.1%
9,Fold Generator,ExpandingWindowSplitter


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: gfx1035, Vendor: Advanced Micro Devices, Inc.
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score 0.500000
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: gfx1035, Vendor: Advanced Micro Devices, Inc.
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score 0.500000
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Dev

Unnamed: 0,cutoff,MASE,RMSSE,MAE,RMSE,MAPE,SMAPE,R2
0,2023-08-29 09:00,0.9151,0.8031,1.0072,1.152,0.3708,0.3133,0.5861
1,2023-08-31 09:00,1.2065,1.2198,1.327,1.7485,1.2424,0.5349,-0.7667
2,2023-09-02 09:00,0.5747,0.6185,0.6328,0.8879,0.3088,0.2862,0.3939
Mean,NaT,0.8988,0.8805,0.989,1.2628,0.6407,0.3781,0.0711
SD,NaT,0.2582,0.2515,0.2837,0.36,0.4263,0.1114,0.5976


In [4]:
model = exp.tune_model(model)


Unnamed: 0,cutoff,MASE,RMSSE,MAE,RMSE,MAPE,SMAPE,R2
0,2023-08-29 09:00,0.8699,0.7671,0.9574,1.1004,0.3385,0.2966,0.6224
1,2023-08-31 09:00,1.2582,1.0887,1.3838,1.5606,0.8852,1.1067,-0.4074
2,2023-09-02 09:00,1.6972,1.4132,1.8688,2.0286,1.2625,0.6794,-2.1642
Mean,NaT,1.2751,1.0897,1.4033,1.5632,0.8288,0.6942,-0.6497
SD,NaT,0.3379,0.2638,0.3723,0.3789,0.3793,0.3309,1.1505


Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed: 52.2min finished


In [5]:
model = exp.finalize_model(model)



In [None]:
# Convert the index of test_df back to datetime64[ns] before prediction
test_df.index = test_df.index.to_period("H")

# ทำนายค่า pm_2_5
forecast = exp.predict_model(model, fh=48, X=test_df.drop(columns="pm_2_5", errors='ignore'))

# แปลงค่าทำนายกลับจาก Box-Cox
forecast['y_pred'] = inv_boxcox(forecast['y_pred'], lambda_) - 1
forecast['y_pred'] = np.maximum(forecast['y_pred'], 0)

# แสดงผลลัพธ์
print(forecast)

MemoryError: Unable to allocate 198. MiB for an array with shape (51, 51, 9958) and data type float64

: 

In [None]:
print(test_df)

               pm_2_5  day  month  year  season_rainy  season_summer  \
2025-01-28  10.800574   28      1  2025             0              0   
2025-01-29  10.605639   29      1  2025             0              0   
2025-01-30  14.708435   30      1  2025             0              0   
2025-01-31  16.408323   31      1  2025             0              0   
2025-02-01  27.336896    1      2  2025             0              0   
2025-02-02  30.186564    2      2  2025             0              0   
2025-02-03  30.616034    3      2  2025             0              0   
2025-02-04  18.877155    4      2  2025             0              0   
2025-02-05   6.793629    5      2  2025             0              0   
2025-02-06  10.463530    6      2  2025             0              0   
2025-02-07  19.059025    7      2  2025             0              0   
2025-02-08  19.064854    8      2  2025             0              0   
2025-02-09  13.824501    9      2  2025             0           

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# ดึงค่าจริงจาก test_df
actual = test_df['pm_2_5']

# ดึงค่าทำนายจาก forecast
predicted = forecast['y_pred']

# คำนวณ MAE
mae = mean_absolute_error(actual, predicted)

# คำนวณ MSE
mse = mean_squared_error(actual, predicted)

# คำนวณ RMSE
rmse = np.sqrt(mse)

# คำนวณ MAPE
def calculate_mape(actual, predicted):
    return np.mean(np.abs((actual - predicted) / actual)) * 100

mape = calculate_mape(actual, predicted)

# คำนวณ R²
r2 = r2_score(actual, predicted)

# คำนวณความแม่นยำ (Accuracy)
mean_actual = np.mean(actual)
accuracy = (1 - (mae / mean_actual)) * 100

# แสดงผลลัพธ์
print(f"MAE: {mae:.4f}")
print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"MAPE: {mape:.2f}%")
print(r2)
print(f"R² (ความแม่นยำ): {r2 * 100:.2f}%")
print(f"ความแม่นยำ (Accuracy): {accuracy:.2f}%")

MAE: 5.4474
MSE: 47.0706
RMSE: 6.8608
MAPE: 37.70%
0.09595384725230871
R² (ความแม่นยำ): 9.60%
ความแม่นยำ (Accuracy): 70.12%
