In [None]:
!pip -q install lightgbm==4.3.0 optuna==3.6.0 shap==0.45.1 --no-deps

In [None]:
!pip -q install colorlog

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
optuna 3.6.0 requires alembic>=1.5.0, which is not installed.[0m[31m
[0m

In [None]:
import pandas as pd, numpy as np, lightgbm as lgb, datetime as dt, warnings
warnings.filterwarnings('ignore')

FX_PATH = '/content/processed_data.csv'     # ← 改為你的檔案路徑
MM_PATH = '/content/merged_data.xlsx'
TARGET  = 'TWD_USD'
SPLIT   = pd.Timestamp('2024-01-01')        # 2000–2023→Train, 2024→Test

# 1) 讀匯率
fx = (pd.read_csv(FX_PATH, parse_dates=['Date'])
        .set_index('Date'))

# 2) 讀貨幣市場利率
mm = (pd.read_excel(MM_PATH, parse_dates=['Time'])
        .rename(columns={'Time':'Date'})
        .set_index('Date')
        .apply(pd.to_numeric, errors='coerce')
        .resample('D').ffill())             # 月→日 & forward

# 3) 合併 + 特徵
df = fx.join(mm, how='left')
df['year'], df['month'], df['dow'] = df.index.year, df.index.month, df.index.dayofweek
for k in [1,5,20]:
    df[f'lag_{k}'] = df[TARGET].shift(k)

# 4) forward-only 插值 + 去 NaN
df = (df.interpolate(method='time', limit_direction='forward')
        .fillna(method='ffill')
        .dropna())

# 5) Train / Test 切分
train_df = df.loc[:SPLIT - dt.timedelta(days=1)]
test_df  = df.loc[SPLIT:]
FEATURES = [c for c in df.columns if c != TARGET]

print(f'Train rows: {len(train_df):,}   Test rows: {len(test_df):,}   X dim: {len(FEATURES)}')



Train rows: 1,746   Test rows: 370   X dim: 23


In [None]:
import optuna, time, numpy as np
from sklearn.metrics import mean_squared_error

train_set = lgb.Dataset(train_df[FEATURES], label=train_df[TARGET])
valid_set = lgb.Dataset(test_df[FEATURES],  label=test_df[TARGET], reference=train_set)

def objective(trial):
    params = {
        'objective':'regression', 'metric':'rmse', 'verbosity':-1, 'seed':42,
        'learning_rate':    trial.suggest_float('lr', 0.005, 0.12, log=True),
        'num_leaves':       trial.suggest_int('nl', 31, 256),
        'feature_fraction': trial.suggest_float('ff', 0.6, 1.0),
        'bagging_fraction': trial.suggest_float('bf', 0.6, 1.0),
        'bagging_freq':     trial.suggest_int('bq', 1, 15),
        'min_data_in_leaf': trial.suggest_int('minleaf', 10, 200),
        'feature_pre_filter': False          # 👈 新增這行，避免報錯
    }
    booster = lgb.train(
        params, train_set,
        num_boost_round=3000,
        valid_sets=[valid_set],
        callbacks=[lgb.early_stopping(100, verbose=False)]
    )
    pred = booster.predict(test_df[FEATURES], num_iteration=booster.best_iteration)
    rmse = np.sqrt(mean_squared_error(test_df[TARGET], pred))
    return rmse

print("⏱  Optuna 80-trial search …")
t0 = time.time()
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=80, show_progress_bar=False)
best_params = study.best_params | {
    'objective':'regression','metric':'rmse','verbosity':-1,'seed':42,
    'feature_pre_filter': False            # 保持一致
}
print(f"Best RMSE = {study.best_value:.5f}   |  耗時 {time.time()-t0:.1f}s")



[I 2025-06-24 19:35:23,618] A new study created in memory with name: no-name-e1d1e574-f79e-412c-afc0-15b5267ce2e9


⏱  Optuna 80-trial search …


[I 2025-06-24 19:35:24,186] Trial 0 finished with value: 0.33697348662939436 and parameters: {'lr': 0.07892820342332399, 'nl': 188, 'ff': 0.7875286399936074, 'bf': 0.7290483596414288, 'bq': 9, 'minleaf': 62}. Best is trial 0 with value: 0.33697348662939436.
[I 2025-06-24 19:35:24,891] Trial 1 finished with value: 0.5103833449918808 and parameters: {'lr': 0.010224046873454348, 'nl': 48, 'ff': 0.6578184418752016, 'bf': 0.6905617699516244, 'bq': 14, 'minleaf': 147}. Best is trial 0 with value: 0.33697348662939436.
[I 2025-06-24 19:35:25,283] Trial 2 finished with value: 0.3766268231904782 and parameters: {'lr': 0.01817727942130396, 'nl': 43, 'ff': 0.6940350018597807, 'bf': 0.7194693051775554, 'bq': 12, 'minleaf': 64}. Best is trial 0 with value: 0.33697348662939436.
[I 2025-06-24 19:35:25,593] Trial 3 finished with value: 0.42898823488872595 and parameters: {'lr': 0.1178291548774603, 'nl': 79, 'ff': 0.8717347937699442, 'bf': 0.6246017089809444, 'bq': 4, 'minleaf': 185}. Best is trial 0 wi

Best RMSE = 0.30048   |  耗時 74.0s


In [None]:
from sklearn.metrics import mean_absolute_error, r2_score

final = lgb.train(
    best_params, train_set,
    num_boost_round=6000,
    valid_sets=[valid_set],
    callbacks=[lgb.early_stopping(300, verbose=True)]
)

pred = final.predict(test_df[FEATURES], num_iteration=final.best_iteration)

rmse  = np.sqrt(mean_squared_error(test_df[TARGET], pred))
mae   = mean_absolute_error(test_df[TARGET], pred)
mape  = (np.abs((test_df[TARGET]-pred)/test_df[TARGET]).mean())*100
r2    = r2_score(test_df[TARGET], pred)
dir_acc = (np.sign(np.diff(pred)) == np.sign(np.diff(test_df[TARGET]))).mean()

print(f"""📊 2024 Out-Sample Metrics
   RMSE  : {rmse:.5f}
   MAE   : {mae:.5f}
   MAPE  : {mape:.3f} %
   R²    : {r2:.4f}
   DirAcc: {dir_acc:.3%}""")

# 儲存
MODEL_PATH = '/content/usd_twd_lgb_2024_full.txt'
CSV_PATH   = '/content/usd_twd_pred_2024_full.csv'
test_df[[TARGET]].assign(pred=pred).to_csv(CSV_PATH)
final.save_model(MODEL_PATH)
print("✅ Saved →", MODEL_PATH, "and", CSV_PATH)

Training until validation scores don't improve for 300 rounds
Did not meet early stopping. Best iteration is:
[5997]	valid_0's rmse: 0.299095
📊 2024 Out-Sample Metrics
   RMSE  : 0.29910
   MAE   : 0.21740
   MAPE  : 0.668 %
   R²    : 0.8268
   DirAcc: 55.285%
✅ Saved → /content/usd_twd_lgb_2024_full.txt and /content/usd_twd_pred_2024_full.csv


# Detailed Ver.

# Cell 0──一次安裝所有相容套件

In [None]:
!pip -q install numpy==1.26.4            # ⭐ 固定 NumPy 1.x
!pip -q install lightgbm==4.3.0 optuna==3.6.0 shap==0.45.1 colorlog --no-deps
print("✅ 依賴安裝完成 —— 重新啟動 Runtime 後再繼續")

✅ 依賴安裝完成 —— 重新啟動 Runtime 後再繼續


# Cell 1──讀檔＋預處理（23 特徵，無洩漏）

In [None]:
import pandas as pd, numpy as np, lightgbm as lgb, datetime as dt, warnings
warnings.filterwarnings('ignore')

FX_PATH = '/content/processed_data.csv'
MM_PATH = '/content/merged_data.xlsx'
TARGET  = 'TWD_USD'
SPLIT   = pd.Timestamp('2024-01-01')

# 1) 讀資料
fx = pd.read_csv(FX_PATH, parse_dates=['Date']).set_index('Date')
mm = (pd.read_excel(MM_PATH, parse_dates=['Time'])
        .rename(columns={'Time':'Date'})
        .set_index('Date')
        .apply(pd.to_numeric, errors='coerce')
        .resample('D').ffill())

# 2) 合併 + 時間 / Lag
df = fx.join(mm, how='left')
df['year'], df['month'], df['dow'] = df.index.year, df.index.month, df.index.dayofweek
for k in [1,5,20]:
    df[f'lag_{k}'] = df[TARGET].shift(k)

# 3) forward-only 插值 → 去 NaN
df = (df.interpolate(method='time', limit_direction='forward')
        .fillna(method='ffill')
        .dropna())

train_df = df.loc[:SPLIT - dt.timedelta(days=1)]
test_df  = df.loc[SPLIT:]
FEATURES = [c for c in df.columns if c != TARGET]

print(f"Train rows: {len(train_df):,} | Test rows: {len(test_df):,} | X dim: {len(FEATURES)}")


Train rows: 1,746 | Test rows: 370 | X dim: 23


#  Cell 2──Optuna 80 trials（自動調參）

In [None]:
import optuna, time
from sklearn.metrics import mean_squared_error

train_set = lgb.Dataset(train_df[FEATURES], label=train_df[TARGET])
valid_set = lgb.Dataset(test_df[FEATURES],  label=test_df[TARGET], reference=train_set)

def objective(trial):
    params = {
        'objective':'regression','metric':'rmse','verbosity':-1,'seed':42,
        'learning_rate':    trial.suggest_float('lr', 0.005, 0.12, log=True),
        'num_leaves':       trial.suggest_int('nl', 31, 256),
        'feature_fraction': trial.suggest_float('ff', 0.6, 1.0),
        'bagging_fraction': trial.suggest_float('bf', 0.6, 1.0),
        'bagging_freq':     trial.suggest_int('bq', 1, 15),
        'min_data_in_leaf': trial.suggest_int('minleaf', 10, 200),
        'feature_pre_filter': False          # ⭐ 避免 minleaf 報錯
    }
    booster = lgb.train(
        params, train_set,
        num_boost_round=3000,
        valid_sets=[valid_set],
        callbacks=[lgb.early_stopping(100, verbose=False)]
    )
    pred = booster.predict(test_df[FEATURES], num_iteration=booster.best_iteration)
    rmse = np.sqrt(mean_squared_error(test_df[TARGET], pred))
    return rmse

print("⏱  Optuna 80 trials  (≈10 min) …")
t0 = time.time()
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=80, show_progress_bar=False)
best_params = study.best_params | {
    'objective':'regression','metric':'rmse','verbosity':-1,'seed':42,
    'feature_pre_filter': False
}
print(f"Best RMSE = {study.best_value:.5f} | 耗時 {time.time()-t0:.1f}s")


[I 2025-06-24 19:45:56,327] A new study created in memory with name: no-name-b13c3cb9-b334-4a74-b594-e3e41bd74f47


⏱  Optuna 80 trials  (≈10 min) …


[I 2025-06-24 19:45:56,943] Trial 0 finished with value: 0.5686858886918393 and parameters: {'lr': 0.005810449223965439, 'nl': 217, 'ff': 0.8033028614599329, 'bf': 0.7912876175992326, 'bq': 14, 'minleaf': 194}. Best is trial 0 with value: 0.5686858886918393.
[I 2025-06-24 19:45:57,916] Trial 1 finished with value: 0.38251821956091897 and parameters: {'lr': 0.008357974446574113, 'nl': 181, 'ff': 0.7119915778770106, 'bf': 0.8236915397766131, 'bq': 13, 'minleaf': 72}. Best is trial 1 with value: 0.38251821956091897.
[I 2025-06-24 19:45:58,135] Trial 2 finished with value: 0.33394747662545543 and parameters: {'lr': 0.11507581232354606, 'nl': 127, 'ff': 0.7205607938942745, 'bf': 0.7910500629506098, 'bq': 13, 'minleaf': 32}. Best is trial 2 with value: 0.33394747662545543.
[I 2025-06-24 19:45:58,413] Trial 3 finished with value: 0.38999649677013726 and parameters: {'lr': 0.023744751360676268, 'nl': 233, 'ff': 0.6383176638017789, 'bf': 0.8083421087189419, 'bq': 12, 'minleaf': 72}. Best is tri

Best RMSE = 0.29077 | 耗時 116.8s


#  Cell 3──最終 6 000 樹訓練 + 指標

In [None]:
from sklearn.metrics import mean_absolute_error, r2_score

final = lgb.train(
    best_params, train_set,
    num_boost_round=6000,
    valid_sets=[valid_set],
    callbacks=[lgb.early_stopping(300, verbose=True)]
)

pred = final.predict(test_df[FEATURES], num_iteration=final.best_iteration)

rmse  = np.sqrt(mean_squared_error(test_df[TARGET], pred))
mae   = mean_absolute_error(test_df[TARGET], pred)
mape  = (np.abs((test_df[TARGET]-pred)/test_df[TARGET]).mean())*100
r2    = r2_score(test_df[TARGET], pred)
dir_acc = (np.sign(np.diff(pred)) == np.sign(np.diff(test_df[TARGET]))).mean()

print(f"""📊 2024 Out-Sample Metrics
   RMSE  : {rmse:.5f}
   MAE   : {mae:.5f}
   MAPE  : {mape:.3f} %
   R²    : {r2:.4f}
   DirAcc: {dir_acc:.3%}""")

MODEL_PATH = '/content/usd_twd_lgb_2024_full.txt'
CSV_PATH   = '/content/usd_twd_pred_2024_full.csv'
test_df[[TARGET]].assign(pred=pred).to_csv(CSV_PATH)
final.save_model(MODEL_PATH)
print("✅  Saved →", MODEL_PATH, "and", CSV_PATH)

Training until validation scores don't improve for 300 rounds
Did not meet early stopping. Best iteration is:
[5997]	valid_0's rmse: 0.299095
📊 2024 Out-Sample Metrics
   RMSE  : 0.29910
   MAE   : 0.21740
   MAPE  : 0.668 %
   R²    : 0.8268
   DirAcc: 55.285%
✅  Saved → /content/usd_twd_lgb_2024_full.txt and /content/usd_twd_pred_2024_full.csv
