In [None]:
import pandas as pd
import numpy as np
import holidays
from pycaret.time_series import TSForecastingExperiment
from scipy.stats import boxcox
from scipy.special import inv_boxcox

# อ่านข้อมูล CSV และตั้งค่า timestamp เป็น index
df = pd.read_csv("cleaned_data.csv", parse_dates=["timestamp"])
# Coerce anything that still fails to parse to NaT and truncate sub-second precision.
df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce").dt.floor("s")
# A NaT timestamp cannot be placed on the time axis, so such rows are discarded
# before the column becomes the index. Only the target column is kept.
df = df.dropna(subset=["timestamp"]).set_index("timestamp")[["pm_2_5"]]

# Keep only the last reading for each timestamp, then order chronologically.
df = df[~df.index.duplicated(keep="last")].sort_index()

# Re-grid to an hourly frequency; hours without a reading become NaN rows.
# NOTE(review): newer pandas releases prefer the lowercase "h" alias; "H" is kept
# because that is what this notebook was written against — confirm before changing.
df = df.asfreq("H")

# Treat readings outside 0..80 as missing rather than dropping their rows.
# Dropping rows here would punch holes in the hourly grid created above, and
# masking *before* interpolation keeps outliers from being used as anchors.
df["pm_2_5"] = df["pm_2_5"].where(df["pm_2_5"].between(0, 80))

# Fill missing values with time-weighted interpolation.
df["pm_2_5"] = df["pm_2_5"].interpolate(method="time")
# Drop whatever interpolation could not fill (e.g. NaNs before the first valid reading).
df = df.dropna(subset=["pm_2_5"])

# Calendar features derived from the timestamp index.
df["day"] = df.index.day
df["month"] = df.index.month
df["year"] = df.index.year

# ฟังก์ชันระบุฤดูกาลของไทย
def get_thai_season(month):
    """Map a month number to a Thai season label.

    Args:
        month: Month number (the notebook passes values of ``df['month']``).

    Returns:
        'summer' for months 3-5, 'rainy' for months 6-10, and 'winter'
        for every other value.
    """
    hot_months = (3, 4, 5)
    wet_months = (6, 7, 8, 9, 10)
    if month in hot_months:
        return 'summer'
    return 'rainy' if month in wet_months else 'winter'

# เพิ่มคอลัมน์ฤดูกาล
# Declaring the full category set up front guarantees that all three dummy
# columns exist even when the data does not cover every season.
df['season'] = pd.Categorical(
    df['month'].apply(get_thai_season),
    categories=['rainy', 'summer', 'winter'],
)

# One-hot encode the season directly as 0/1 integers
# (columns: season_rainy, season_summer, season_winter).
df = pd.get_dummies(df, columns=['season'], prefix='season', dtype=int)

# Flag Thai public holidays (1 = holiday, 0 = not) for every timestamp.
thai_holidays = holidays.TH(years=sorted(df.index.year.unique()))
df['is_holiday'] = [int(ts in thai_holidays) for ts in df.index]

# Train/test split: the last FORECAST_HORIZON rows are held out for evaluation.
FORECAST_HORIZON = 24
train_size = len(df) - FORECAST_HORIZON
if train_size <= 0:
    # An empty training set is as unusable as a negative size.
    raise ValueError("จำนวนข้อมูลไม่เพียงพอสำหรับการแบ่ง Train-Test")
# .copy() makes both frames independent of df so the column assignments below
# write to real data instead of a slice (avoids SettingWithCopyWarning).
train_df = df.iloc[:train_size].copy()
test_df = df.iloc[train_size:].copy()

# Box-Cox transform of the training target. The +1 shift keeps zero readings
# inside Box-Cox's strictly positive domain; lambda_ is fitted on the training
# data only and reused for the inverse transform of the forecast.
transformed_target, lambda_ = boxcox(train_df['pm_2_5'].to_numpy() + 1)
train_df['pm_2_5'] = transformed_target

# Put both frames on an explicit hourly grid. Any hour missing from the index
# is re-inserted as a NaN row; the target is interpolated here, while NaNs in
# the exogenous columns are left to the imputation configured in setup().
train_df = train_df.asfreq("H")
test_df = test_df.asfreq("H")
train_df['pm_2_5'] = train_df['pm_2_5'].interpolate(method='time')
test_df['pm_2_5'] = test_df['pm_2_5'].interpolate(method='time')

# Report any remaining gaps in the target.
print("Missing values in train_df['pm_2_5']:", train_df['pm_2_5'].isnull().sum())
print("Missing values in test_df['pm_2_5']:", test_df['pm_2_5'].isnull().sum())

# Configure the PyCaret experiment.
# use_gpu is disabled: in the recorded run it only produced LightGBM GPU probe
# output, and the ARIMA model built below is not expected to use a GPU
# (NOTE(review): confirm against the PyCaret time-series documentation).
exp = TSForecastingExperiment()
exp.setup(
    data=train_df,
    target='pm_2_5',
    session_id=123,
    fh=FORECAST_HORIZON,
    use_gpu=False,
    seasonal_period=24,
    numeric_imputation_target='mean',  # fill missing target values with the mean
    numeric_imputation_exogenous='mean'  # fill missing exogenous values with the mean
)

# Build a seasonal ARIMA model, tune it, then refit it with finalize_model.
model = exp.create_model('arima', order=(1, 1, 1), seasonal_order=(1, 1, 1, 24))
model = exp.tune_model(model)
model = exp.finalize_model(model)

# Forecast the held-out horizon, passing the test-period exogenous features.
forecast = exp.predict_model(
    model,
    fh=FORECAST_HORIZON,
    X=test_df.drop(columns="pm_2_5", errors='ignore'),
)

# Undo the Box-Cox transform (and the +1 shift), then clip negatives to 0.
forecast['y_pred'] = inv_boxcox(forecast['y_pred'], lambda_) - 1
forecast['y_pred'] = np.maximum(forecast['y_pred'], 0)

# Last expression of the cell: shown with the notebook's rich display.
forecast

Missing values in train_df['pm_2_5']: 0
Missing values in test_df['pm_2_5']: 0


Unnamed: 0,Description,Value
0,session_id,123
1,Target,pm_2_5
2,Approach,Univariate
3,Exogenous Variables,Present
4,Original data shape,"(12917, 8)"
5,Transformed data shape,"(12917, 8)"
6,Transformed train set shape,"(12893, 8)"
7,Transformed test set shape,"(24, 8)"
8,Rows with missing values,0.0%
9,Fold Generator,ExpandingWindowSplitter


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: gfx1035, Vendor: Advanced Micro Devices, Inc.
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score 0.500000
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: gfx1035, Vendor: Advanced Micro Devices, Inc.
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score 0.500000
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Dev

Exception ignored on calling ctypes callback function: <function _log_callback at 0x000002978848D7E0>
Traceback (most recent call last):
  File "c:\termproject1-2_cleandata\venv\lib\site-packages\lightgbm\basic.py", line 287, in _log_callback
    def _log_callback(msg: bytes) -> None:
KeyboardInterrupt: 


[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score 0.500000
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: gfx1035, Vendor: Advanced Micro Devices, Inc.
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
GPU programs have been built
[LightGBM] [Info] Start training from score 0.500000
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: gfx1035, Vendor: Advanced Micro Devices, Inc.
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score 0.500000
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data poin

Processing:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
# Bare last expression: the notebook renders the frame as a rich table
# instead of the wrapped plain-text dump that print() produces.
test_df

               pm_2_5  day  month  year  season_rainy  season_summer  \
2025-01-28  10.800574   28      1  2025             0              0   
2025-01-29  10.605639   29      1  2025             0              0   
2025-01-30  14.708435   30      1  2025             0              0   
2025-01-31  16.408323   31      1  2025             0              0   
2025-02-01  27.336896    1      2  2025             0              0   
2025-02-02  30.186564    2      2  2025             0              0   
2025-02-03  30.616034    3      2  2025             0              0   
2025-02-04  18.877155    4      2  2025             0              0   
2025-02-05   6.793629    5      2  2025             0              0   
2025-02-06  10.463530    6      2  2025             0              0   
2025-02-07  19.059025    7      2  2025             0              0   
2025-02-08  19.064854    8      2  2025             0              0   
2025-02-09  13.824501    9      2  2025             0           

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# ดึงค่าจริงจาก test_df
# Work with plain NumPy arrays so every metric compares values by position.
# With pandas Series, arithmetic aligns on index labels, so differing indexes
# on test_df and forecast would silently turn element-wise results into NaN.
actual = test_df['pm_2_5'].to_numpy()

# Predictions produced by the forecasting cell.
predicted = forecast['y_pred'].to_numpy()

if len(actual) != len(predicted):
    raise ValueError(
        f"actual has {len(actual)} values but predicted has {len(predicted)}"
    )

# Mean absolute error.
mae = mean_absolute_error(actual, predicted)

# Mean squared error.
mse = mean_squared_error(actual, predicted)

# Root mean squared error.
rmse = np.sqrt(mse)

# คำนวณ MAPE
def calculate_mape(actual, predicted):
    """Return the mean absolute percentage error, in percent.

    Values are compared by position (inputs are converted to NumPy arrays),
    so pandas inputs with different indexes are handled correctly. Points
    whose actual value is 0 are skipped because their percentage error is
    undefined.

    Args:
        actual: Array-like of observed values.
        predicted: Array-like of predicted values, same shape as ``actual``.

    Returns:
        The MAPE as a float percentage, or NaN when no actual value is non-zero.

    Raises:
        ValueError: If ``actual`` and ``predicted`` have different shapes.
    """
    actual_arr = np.asarray(actual, dtype=float)
    predicted_arr = np.asarray(predicted, dtype=float)
    if actual_arr.shape != predicted_arr.shape:
        raise ValueError(
            f"shape mismatch: actual {actual_arr.shape} vs predicted {predicted_arr.shape}"
        )

    nonzero = actual_arr != 0
    if not nonzero.any():
        return np.nan

    relative_error = (actual_arr[nonzero] - predicted_arr[nonzero]) / actual_arr[nonzero]
    return np.mean(np.abs(relative_error)) * 100

# Percentage error and goodness of fit on the held-out data.
mape = calculate_mape(actual, predicted)
r2 = r2_score(actual, predicted)

# "Accuracy" here is 1 - MAE / mean(actual), expressed as a percentage.
mean_actual = np.mean(actual)
accuracy = (1 - (mae / mean_actual)) * 100

# Assemble the report and emit it in one go (same lines, same order).
report_lines = [
    f"MAE: {mae:.4f}",
    f"MSE: {mse:.4f}",
    f"RMSE: {rmse:.4f}",
    f"MAPE: {mape:.2f}%",
    str(r2),
    f"R² (ความแม่นยำ): {r2 * 100:.2f}%",
    f"ความแม่นยำ (Accuracy): {accuracy:.2f}%",
]
print("\n".join(report_lines))

MAE: 5.4474
MSE: 47.0706
RMSE: 6.8608
MAPE: 37.70%
0.09595384725230871
R² (ความแม่นยำ): 9.60%
ความแม่นยำ (Accuracy): 70.12%
