In [None]:
# # optuna
# !pip install optuna

# # Boost
# !pip install xgboost
# !pip install catboost
# !pip install lightgbm

In [3]:
# # Uninstall incompatible versions
# !pip uninstall keras -y
# !pip uninstall tensorflow -y

# Install pinned versions that are meant to be compatible with each other
!pip install tensorflow==2.15.0 -q
!pip install keras==2.15.0 -q
!pip install scikeras==0.12.0 -q

# Install the other required packages
# NOTE(review): scikeras and keras are already installed (pinned) just above;
# the two unpinned lines below look redundant — confirm before removing.
!pip install scikeras -q
!pip install keras -q
!pip install prophet -q
!pip install catboost -q
!pip install lightgbm -q
!pip install xgboost -q
!pip install optuna -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 MB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m380.1/380.1 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.0/233.0 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [1]:
import os
import json
import logging
import pandas as pd
import numpy as np
import xgboost as xgb
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, make_scorer
import matplotlib.pyplot as plt
import joblib

In [2]:
# Mount Google Drive at /content/drive so that files stored on Drive can be
# accessed through ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
# Configure logging.
# basicConfig() does nothing when the root logger already has handlers (which is
# commonly the case in hosted notebook kernels such as Colab), so the INFO
# messages below would be silently dropped. force=True replaces any existing
# root handlers so the configuration is always applied.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    force=True,
)
logger = logging.getLogger()

# Load the dataset
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/detailed_usage_for_xgboost.csv')

# Target column is 'new_per_ping'; 'year_month_for_combine' is excluded from the features too.
y = df['new_per_ping']
X = df.drop(columns=['new_per_ping', 'year_month_for_combine'])

# 70% train / 30% hold-out, then the hold-out is split 20% validation / 80% test.
# NOTE(review): X_val / y_val are not used anywhere in this cell, and test_size=0.8
# is an unusual ratio for the second split — confirm that this is intended.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.8, random_state=42)

# Initialise the base CatBoost model. iterations, depth and learning_rate are
# overridden by every grid candidate below; the remaining settings stay fixed.
CatBoost_model = CatBoostRegressor(iterations=60, depth=2, learning_rate=0.29042538244092253, loss_function='RMSE', random_seed=42, verbose=0)

# Parameter grid: 3 * 3 * 3 * 3 = 81 candidates
param_grid = {
    'iterations': [100, 200, 300],
    'depth': [3, 6, 9],
    'learning_rate': [0.01, 0.1, 0.2],
    'l2_leaf_reg': [1, 3, 5] }

# Tune hyper-parameters with GridSearchCV (5-fold CV). No `scoring` is given,
# so candidates are ranked with the estimator's own score() method.
grid_search = GridSearchCV(estimator=CatBoost_model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=1)

# # Fully spelled-out template, assuming `model` and `param_grid` are already defined
# # (use a regression scorer such as 'r2'; 'accuracy' only applies to classifiers)
# grid_search = GridSearchCV(
#     estimator=model,
#     param_grid=param_grid,
#     scoring='r2',
#     cv=2,
#     n_jobs=-1,
#     verbose=1,
#     refit=True,
#     return_train_score=True,
#     pre_dispatch='2*n_jobs',
#     error_score=0
#   )
grid_search.fit(X_train, y_train)

# Best model. With the default refit=True, GridSearchCV has already re-trained
# best_estimator_ on the whole of X_train using the winning parameters, so it is
# used directly instead of being fitted a second time.
best_CatBoost_model = grid_search.best_estimator_
# Log the best estimator, its parameters and the cross-validation score
best_params = grid_search.best_params_
best_score = grid_search.best_score_
logger.info(f"Best estimator: {best_CatBoost_model}")
logger.info(f"Best params: {best_params}")
logger.info(f"Best cross-validation score: {best_score}")

# Predict on the training set and on the held-out test set
y_pred_train = best_CatBoost_model.predict(X_train)
y_pred_test = best_CatBoost_model.predict(X_test)

# Evaluate. Values are cast to built-in floats so the JSON export below never
# has to deal with NumPy scalar types.
mse_train = float(mean_squared_error(y_train, y_pred_train))
mse_test = float(mean_squared_error(y_test, y_pred_test))

r2_train = float(r2_score(y_train, y_pred_train))
r2_test = float(r2_score(y_test, y_pred_test))

rmse_train = float(np.sqrt(mse_train))
rmse_test = float(np.sqrt(mse_test))

logger.info("-" * 50)
logger.info(f"CatBoost Mean MSE (Train): {mse_train}")
logger.info(f"CatBoost RMSE (Train): {rmse_train}")
logger.info(f"CatBoost Mean R² (Train): {r2_train}")
logger.info("-" * 50)
logger.info(f"CatBoost Mean MSE (Test): {mse_test}")
logger.info(f"CatBoost RMSE (Test): {rmse_test}")
logger.info(f"CatBoost Mean R² (Test): {r2_test}")
logger.info("=" * 50)

# Collect the results for export as a JSON file
results = {
    "best_params": best_params,
    "best_score": float(best_score),
    "metrics": {
        "train": {
            "mse": mse_train,
            "rmse": rmse_train,
            "r2": r2_train,
        },
        "test": {
            "mse": mse_test,
            "rmse": rmse_test,
            "r2": r2_test,
        }
    }
}

# Make sure the target directory exists
output_dir = '/content/drive/MyDrive/Colab Notebooks'
os.makedirs(output_dir, exist_ok=True)

output_path = os.path.join(output_dir, 'CatBoostRegressor_model_GridSearchCV.json')
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(results, f)

logger.info(f"Results saved to {output_path}")

Fitting 5 folds for each of 81 candidates, totalling 405 fits


In [14]:
# Alternative model / parameter-grid configurations, one per string literal.
# Each snippet is wrapped in a triple-quoted string so that it is NOT executed;
# presumably the contents are meant to be pasted into the grid-search cell in
# place of its model and param_grid definitions (every snippet defines `model`
# and `param_grid`).
# The last string in this cell is its final expression, so the notebook shows
# it as the cell output.
'''
# LightGBM

# 初始化模型
model = LGBMRegressor(n_estimators=70, max_depth=5, learning_rate=0.07717356617261285, random_state=42)

# 定義參數網格
param_grid = {
    'n_estimators': [50, 100, 150],
    'num_leaves': [31, 62, 127],
    'learning_rate': [0.01, 0.1, 0.2],
    'min_child_samples': [20, 50, 100]
  }
'''

# Fixed in this snippet: heading typo ("KNNeighborsRegressor") and the missing
# closing brace of param_grid, which made the snippet a syntax error when pasted.
'''
# KNeighborsRegressor

# 初始化模型
model = KNeighborsRegressor(n_neighbors=10)

# 定義模型的參數網格
param_grid = {
  'n_neighbors': [3, 5, 7],
  'weights': ['uniform', 'distance']
  }
'''

'''
# RandomForestRegressor

# 初始化模型
model = RandomForestRegressor(n_estimators=51, max_depth=8)

# 定義模型的參數網格
param_grid = {
  'n_estimators': [50, 100, 150],
  'max_depth': [None, 10, 20, 30],
  'min_samples_split': [2, 5, 10]
  }
'''

# Fixed in this snippet: the grid was named `param_grids`, unlike every other
# snippet here, which names it `param_grid`.
'''
# MLPRegressor

# 初始化模型

model = MLPRegressor(hidden_layer_sizes=(65,), max_iter=500, learning_rate_init=0.01, alpha=2.257104482357874e-05, random_state=42)

# 定義模型的參數網格
param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50)],
    'activation': ['relu', 'tanh'],
    'learning_rate_init': [0.001, 0.01]
  }

'''

'''
# XGBoost

# 初始化模型
model = xgb.XGBRegressor(n_estimators=100, max_depth=3, learning_rate=0.29042538244092253, random_state=42)

# 定義模型的參數網格
param_grid = {
  'n_estimators': [50, 100, 150],
  'max_depth': [3, 5, 7],
  'learning_rate': [0.01, 0.1, 0.2]
  }

'''

'\n# 其他模型\nmodels = {\n    "KNeighborsRegressor": KNeighborsRegressor(n_neighbors=10),\n    "RandomForestRegressor": RandomForestRegressor(n_estimators=51, max_depth=8),\n    "MLPRegressor": MLPRegressor(hidden_layer_sizes=(65,), max_iter=500, learning_rate_init=0.01, alpha=2.257104482357874e-05, random_state=42),\n    "XGBoost": xgb.XGBRegressor(n_estimators=79, max_depth=13, learning_rate=0.23868392277107967, reg_alpha=0.1, reg_lambda=0.1, random_state=42),\n    "CatBoost": CatBoostRegressor(iterations=60, depth=2, learning_rate=0.29042538244092253, loss_function=\'RMSE\', random_seed=42, verbose=0),\n    "LightGBM": LGBMRegressor(n_estimators=70, max_depth=5, learning_rate=0.07717356617261285, random_state=42),\n  }\n\n# 定義模型的參數網格\nparam_grids = {\n    "KNeighborsRegressor": {\n        \'n_neighbors\': [3, 5, 7],\n        \'weights\': [\'uniform\', \'distance\']\n    },\n    "RandomForestRegressor": {\n        \'n_estimators\': [50, 100, 150],\n        \'max_depth\': [None, 10, 20, 