In [4]:
#Regression

import pandas as pd
from pycaret.regression import *

# 1) อ่านข้อมูล
df = pd.read_csv(r"C:\Users\snpdp\pm2.5-Forecast-Dashboard\Data_csv\merge_data\unknow-m_1d.csv")

# 2) แปลง timestamp เป็น datetime แล้วเรียงตามเวลา
df['date'] = pd.to_datetime(df['date'])
df.sort_values(by='date', inplace=True)

# 3) สร้างคอลัมน์วันแบบตัวเลข (ordinal) เพื่อให้โมเดลจับลำดับของวันได้ง่ายขึ้น
df['date_ordinal'] = df['date'].map(pd.Timestamp.toordinal)

# 4) ในที่นี้ เราจะทำนาย pm_2_5 โดยใช้ฟีเจอร์ 4 ตัว: [timestamp_ordinal, humidity, pm_10, temperature]
# สร้าง DataFrame ใหม่ที่มีเฉพาะฟีเจอร์ + target
data = df[['date_ordinal', 'humidity', 'temperature', 'pm_2_5']].copy()

# 5) ตั้งค่า PyCaret ในโหมด Regression
#    - target = 'pm_2_5'
#    - fold_strategy = 'timeseries' เพื่อบอกให้ PyCaret split ข้อมูลตามเวลา
#    - fold = 3 (หรือกำหนดเองได้)
#    - session_id = 123 (เพื่อความสามารถในการ reproduce)
reg_setup = setup(
    data=data,
    target='pm_2_5',
    fold_strategy='timeseries',
    fold=3,
    session_id=123,
    numeric_features=['date_ordinal', 'humidity', 'temperature'],
    data_split_shuffle=False,  # ✅ ปิดการสุ่มข้อมูลตอนแบ่ง train/test
    fold_shuffle=False  # ✅ ปิดการ shuffle ข้อมูลใน cross-validation
)


# 6) เปรียบเทียบโมเดลต่าง ๆ เพื่อดูว่าโมเดลไหนแม่นที่สุด (ใช้ค่า default metric เช่น R2, RMSE, MAE)
best_model = compare_models()

# 7) ปรับแต่งโมเดลที่เลือก (tune) หรือสร้างโมเดลเอง
# tuned_model = tune_model(best_model)

# 8) เมื่อได้โมเดลที่พอใจแล้ว ทำการ final เพื่อเตรียมใช้งานจริง
final_model = finalize_model(best_model)

# 9) ประเมินโมเดลบนชุดข้อมูลทดสอบ (PyCaret จะแบ่งให้ตาม time-based folds)
# predict_model() จะให้ผลลัพธ์ (prediction) + คำนวณ error metrics
pred_results = predict_model(final_model)
print(pred_results.head())

# 10) หากต้องการทำนายข้อมูลใหม่ (unseen data) สมมติเราเตรียม data_unseen ไว้
# data_unseen ควรมีคอลัมน์ timestamp_ordinal, humidity, pm_10, temperature
# unseen_preds = predict_model(final_model, data=data_unseen)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,pm_2_5
2,Target type,Regression
3,Original data shape,"(1452, 4)"
4,Transformed data shape,"(1452, 4)"
5,Transformed train set shape,"(1016, 4)"
6,Transformed test set shape,"(436, 4)"
7,Numeric features,3
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
huber,Huber Regressor,7.084,80.428,8.8697,-0.1931,0.5732,0.6745,0.0133
par,Passive Aggressive Regressor,7.6785,84.3422,9.0632,-0.2281,0.6324,0.9155,0.0167
dummy,Dummy Regressor,7.3812,80.5549,8.9373,-0.2291,0.5964,0.7807,0.01
ada,AdaBoost Regressor,7.8975,96.3455,9.5682,-0.352,0.629,0.8222,0.05
et,Extra Trees Regressor,7.8961,102.679,9.932,-0.4696,0.6432,0.7679,0.0567
knn,K Neighbors Regressor,8.3298,109.9428,10.2121,-0.5382,0.6524,0.8722,0.03
lightgbm,Light Gradient Boosting Machine,7.9423,107.0583,10.2158,-0.5696,0.6629,0.7486,0.1233
rf,Random Forest Regressor,8.1265,109.9976,10.3387,-0.6131,0.6831,0.7523,0.0733
lr,Linear Regression,8.3216,128.6745,10.8573,-0.886,0.8444,0.5898,0.0133
lar,Least Angle Regression,8.3216,128.6745,10.8573,-0.886,0.8444,0.5898,0.0133


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Huber Regressor,5.8907,49.31,7.0221,-0.0011,0.6173,0.8734


      date_ordinal   humidity  temperature     pm_2_5  prediction_label
1016        738849  92.228432    26.323557  11.844190         12.426985
1017        738850  91.814728    26.528961   7.732240         12.427002
1018        738851  96.790894    24.615288   5.285687         12.427019
1019        738852  96.302872    24.793573   4.043248         12.427036
1020        738853  89.681732    27.159864   9.947243         12.427052


In [8]:
from pycaret.regression import setup, compare_models, predict_model

# แบ่งชุดข้อมูลเองก่อน
train_size = int(len(df)*0.8)
train_df = df.iloc[:train_size]
test_df = df.iloc[train_size:]

# setup เฉพาะ train_df
reg_setup = setup(
    data=train_df,
    target='pm_2_5',
    numeric_features=['date_ordinal','humidity','temperature'],
    data_split_shuffle=False,
    session_id=123,
    # silent=True  # เอาออก
)


best_model = compare_models()

# เมื่อได้โมเดลที่พอใจแล้ว
final_model = finalize_model(best_model)

# ทำนาย test_df
pred_df = predict_model(final_model, data=test_df)

# ดู metrics
from sklearn.metrics import mean_squared_error, r2_score
rmse = mean_squared_error(test_df['pm_2_5'], pred_df['Label'], squared=False)
r2 = r2_score(test_df['pm_2_5'], pred_df['Label'])
print("RMSE:", rmse)
print("R2:", r2)


Unnamed: 0,Description,Value
0,Session id,123
1,Target,pm_2_5
2,Target type,Regression
3,Original data shape,"(1161, 6)"
4,Transformed data shape,"(1161, 8)"
5,Transformed train set shape,"(812, 8)"
6,Transformed test set shape,"(349, 8)"
7,Numeric features,3
8,Date features,1
9,Preprocess,True


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
gbr,Gradient Boosting Regressor,0.3185,0.2636,0.4733,0.9941,0.0264,0.0211,0.035
et,Extra Trees Regressor,0.3302,0.3359,0.5173,0.9933,0.027,0.0212,0.048
ridge,Ridge Regression,0.3627,0.2466,0.4735,0.993,0.0327,0.0283,0.009
br,Bayesian Ridge,0.3627,0.2466,0.4735,0.993,0.0327,0.0283,0.011
lr,Linear Regression,0.3652,0.2503,0.4769,0.9929,0.0328,0.0284,0.01
lar,Least Angle Regression,0.3833,0.2729,0.4948,0.9927,0.0346,0.0297,0.009
rf,Random Forest Regressor,0.3468,0.3935,0.5358,0.9927,0.0287,0.0225,0.072
en,Elastic Net,0.3757,0.2398,0.4726,0.9923,0.036,0.0315,0.008
lasso,Lasso Regression,0.378,0.2411,0.4742,0.9921,0.0364,0.0319,0.012
llar,Lasso Least Angle Regression,0.378,0.2411,0.4742,0.9921,0.0364,0.0319,0.01


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Gradient Boosting Regressor,0.2414,0.0903,0.3006,0.9982,0.0384,0.0343


KeyError: 'Label'