In [1]:
# === 🧹 清除 train_df / test_df 等相關變數與釋放記憶體 ===
import gc

print("🧹 正在清除讀入的資料 DataFrame...")

# 確保變數存在才刪除，避免報錯
if 'train_df' in locals():
    del train_df
if 'test_df' in locals():
    del test_df

# 嘗試釋放記憶體
gc.collect()

print("✅ 清除完成，可重新執行資料載入程式")


🧹 正在清除讀入的資料 DataFrame...
✅ 清除完成，可重新執行資料載入程式


In [2]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import numpy as np
from tqdm import tqdm
import time

# === Step 1: 模擬讀取資料進度條 ===
print("📂 正在讀取 training.csv ...")
for _ in tqdm(range(100), desc="Loading training.csv"):
    time.sleep(0.002)

train_df = pd.read_csv("training.csv")

print("📂 正在讀取 public_x.csv ...")
for _ in tqdm(range(100), desc="Loading public_x.csv"):
    time.sleep(0.002)

test_df = pd.read_csv("public_x.csv")

📂 正在讀取 training.csv ...


Loading training.csv: 100%|██████████████████████████████████████████████████████████| 100/100 [00:01<00:00, 65.19it/s]


📂 正在讀取 public_x.csv ...


Loading public_x.csv: 100%|██████████████████████████████████████████████████████████| 100/100 [00:01<00:00, 64.61it/s]


In [5]:
# === 🧹 清除訓練結果並重設環境（保留資料） ===
import gc

print("🧹 清除之前模型的訓練結果和變數...")
for var in ['X', 'y', 'X_train', 'X_test', 'y_train', 'y_test', 'X_train', 'y_train',
            'model', 'best_model', 'grid_search', 'shap_values', 'shap_importance']:
    if var in globals():
        del globals()[var]

gc.collect()

print("🔄 重置完成，可以重新進行模型訓練")


🧹 清除之前模型的訓練結果和變數...
🔄 重置完成，可以重新進行模型訓練


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, RandomizedSearchCV
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, precision_recall_fscore_support, f1_score
import shap
from tqdm import tqdm
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams['font.family'] = 'Microsoft JhengHei'  # 支援中文字型

# === 📥 載入資料 ===
X = train_df[features].replace([np.inf, -np.inf], np.nan).fillna(0).astype(np.float32)
y = y.astype(int)

# === ✂️ 切分訓練 / 測試資料 ===
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# === ⚖️ 計算類別權重 ===
print("⚖️ 計算類別權重...")
scale_pos_weight = y_train.value_counts()[0] / y_train.value_counts()[1]

# === 📈 初始 LightGBM 模型訓練（for SHAP） ===
print("📈 訓練初始 LightGBM 模型（for SHAP 分析）...")
model = LGBMClassifier(
    n_estimators=300,
    max_depth=10,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=1,
    scale_pos_weight=scale_pos_weight,
    device='gpu',
    boosting_type='gbdt',
    objective='binary',
    random_state=42
)
model.fit(X_train, y_train)

# === 🔍 計算 SHAP 值 ===
print("🔍 計算 SHAP 值以篩選重要特徵...")
X_sample = X_train.sample(n=8000, random_state=42)
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_sample)

# 安全檢查 shap_values 是否為 list（新版 LightGBM 有可能是 list，也可能是 array）
if isinstance(shap_values, list):
    shap_vals = shap_values[1]  # 正類
else:
    shap_vals = shap_values     # 若不是 list 就直接使用

shap_importance = pd.DataFrame({
    'feature': X_sample.columns,
    'mean_abs_shap': np.abs(shap_vals).mean(axis=0)
}).sort_values(by='mean_abs_shap', ascending=False)

# 繪圖
shap.summary_plot(shap_vals, X_sample, max_display=30)

# 匯出重要特徵
top_features = shap_importance.head(500)['feature'].tolist()
shap_importance.head(100).to_csv("shap_top_features.csv", index=False)

# === 🔧 RandomizedSearchCV 調參 ===
print("🎯 開始 RandomizedSearchCV 調參...")
param_dist = {
    'n_estimators': [600, 800, 1000, 1200],
    'max_depth': [8, 10, 12, 20],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9],
    'reg_alpha': [0, 0.1],
    'reg_lambda': [1, 5, 10],
    'min_data_in_leaf': [10, 20, 50],
    'min_gain_to_split': [0, 0.001, 0.01],
    'max_bin': [255],
}

random_search = RandomizedSearchCV(
    estimator=LGBMClassifier(
        scale_pos_weight=scale_pos_weight,
        objective='binary',
        device='gpu',
        boosting_type='gbdt',
        random_state=42
    ),
    param_distributions=param_dist,
    n_iter=10,
    scoring='f1',
    cv=3,
    verbose=3,
    n_jobs=-1,
    random_state=42
)
random_search.fit(X_train[top_features], y_train)

print("✅ 調參完成！")
print("最佳參數:", random_search.best_params_)
print("最佳F1-score:", random_search.best_score_)

# === 📊 交叉驗證評估 ===
print("📊 執行交叉驗證...")
best_model = random_search.best_estimator_
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
f1_scores = cross_val_score(best_model, X_train[top_features], y_train, cv=cv, scoring='f1')
print(f"交叉驗證 F1-score 平均值: {f1_scores.mean():.4f}")

# === 🧠 找最佳 Threshold ===
print("🧠 搜尋最佳 Threshold 中（precision/recall/F1）...")
y_probs = best_model.predict_proba(X_test[top_features])[:, 1]
thresholds = np.arange(0.1, 0.91, 0.01)
best_f1 = 0
best_threshold = 0.4
f1_list, prec_list, recall_list = [], [], []

for threshold in thresholds:
    preds = (y_probs >= threshold).astype(int)
    precision, recall, f1, _ = precision_recall_fscore_support(y_test, preds, average='binary')
    f1_list.append(f1)
    prec_list.append(precision)
    recall_list.append(recall)
    if f1 > best_f1:
        best_f1 = f1
        best_threshold = threshold

print(f"\n🎯 最佳 threshold: {best_threshold:.2f}")
print(f"🔹 對應 F1-score: {best_f1:.4f}")

# === 📈 繪圖：Threshold vs Precision / Recall / F1 ===
plt.figure(figsize=(10, 6))
plt.plot(thresholds, prec_list, label='Precision')
plt.plot(thresholds, recall_list, label='Recall')
plt.plot(thresholds, f1_list, label='F1-score')
plt.axvline(best_threshold, color='r', linestyle='--', label=f'Best threshold = {best_threshold:.2f}')
plt.xlabel("Threshold")
plt.ylabel("Score")
plt.title("Threshold vs Precision / Recall / F1")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

# === 📤 測試資料預測，使用最佳 threshold ===
for col in top_features:
    if col not in test_df.columns:
        test_df[col] = np.nan
X_public = test_df[top_features].replace([np.inf, -np.inf], np.nan).fillna(0)
test_probs = best_model.predict_proba(X_public)[:, 1]
test_df["飆股"] = (test_probs >= best_threshold).astype(int)

# === 💾 輸出 submission 結果 ===
submission = test_df[["ID", "飆股"]]
submission.to_csv("public_result1.csv", index=False, encoding="utf-8", lineterminator='\n')
print("✅ 成功輸出 submission：public_result1.csv with optimal threshold")

NameError: name 'features' is not defined

In [14]:
#LSTM未做神經網路訓練
import pandas as pd
import numpy as np
import time
from tqdm import tqdm
import torch
import torch.nn as nn
from sklearn.preprocessing import StandardScaler

start_time = time.time()
print("\U0001F4E6 資料預處理開始...")

# === 🪜 處理缺失值與資料型態（推薦寫法）===
print("🪛 Cleaning training data...")
float64_cols_train = train_df.select_dtypes(include='float64').columns
train_df[float64_cols_train] = train_df[float64_cols_train].astype('float32').replace([np.inf, -np.inf], np.nan).fillna(0)

print("🪛 Cleaning testing data...")
float64_cols_test = test_df.select_dtypes(include='float64').columns
test_df[float64_cols_test] = test_df[float64_cols_test].astype('float32').replace([np.inf, -np.inf], np.nan).fillna(0)

# === 🌟 特定特徵群處理 ===
feature_groups = [
    "官股券商_", "個股券商分點", "個股主力買賣超統計",
    "日外資_", "日自營_", "日投信_",
    "技術指標_", "月營收_", "季IFRS財報_"
]

for prefix in tqdm(feature_groups, desc="Processing feature groups"):
    cols = [col for col in train_df.columns if col.startswith(prefix)]
    train_df[cols] = train_df[cols].replace([np.inf, -np.inf], np.nan).fillna(0)

# === 📈 使用 LSTM 處理時間序列特徵 ===
class SimpleLSTM(nn.Module):
    def __init__(self, input_size, hidden_size=16):
        super(SimpleLSTM, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)

    def forward(self, x):
        output, (h_n, _) = self.lstm(x)
        return h_n[-1]

# === LSTM 1: 個股價格/量/大盤 ===
seq_cols_1 = [
    [f"個股前{i}天收盤價" for i in range(1, 21)],
    [f"個股前{i}天成交量" for i in range(1, 21)],
    [f"上市加權指數前{i}天收盤價" for i in range(1, 21)],
    [f"上市加權指數前{i}天成交量" for i in range(1, 21)]
]
all_seq_cols_1 = sum(seq_cols_1, [])
scaler_1 = StandardScaler()

X_seq_1 = scaler_1.fit_transform(train_df[all_seq_cols_1])
X_seq_1 = X_seq_1.reshape(len(train_df), 20, -1)
X_seq_tensor_1 = torch.tensor(X_seq_1, dtype=torch.float32)
model_1 = SimpleLSTM(input_size=X_seq_1.shape[2])
with torch.no_grad():
    lstm_output_1 = model_1(X_seq_tensor_1).numpy()
lstm_cols_1 = [f'LSTM_seq1_embed_{i}' for i in range(lstm_output_1.shape[1])]
train_df.drop(columns=[col for col in lstm_cols_1 if col in train_df.columns], inplace=True)
lstm_df1 = pd.DataFrame(lstm_output_1, columns=lstm_cols_1)
train_df = pd.concat([train_df, lstm_df1], axis=1)

# 測試資料同步處理
X_seq_1_test = scaler_1.transform(test_df[all_seq_cols_1])
X_seq_1_test = X_seq_1_test.reshape(len(test_df), 20, -1)
X_seq_tensor_1_test = torch.tensor(X_seq_1_test, dtype=torch.float32)
with torch.no_grad():
    lstm_output_1_test = model_1(X_seq_tensor_1_test).numpy()
test_df.drop(columns=[col for col in lstm_cols_1 if col in test_df.columns], inplace=True)
lstm_df1_test = pd.DataFrame(lstm_output_1_test, columns=lstm_cols_1)
test_df = pd.concat([test_df, lstm_df1_test], axis=1)

# === LSTM 2: 主力券商資料 ===
seq_cols_2 = [
    col for col in train_df.columns if any(
        col.startswith(f"{side}超第{rank}名分點前") and not col.endswith("券商代號")
        for side in ["買", "賣"] for rank in range(1, 16)
    )
]
scaler_2 = StandardScaler()
X_seq_2 = scaler_2.fit_transform(train_df[seq_cols_2])
X_seq_2 = X_seq_2.reshape(len(train_df), 20, -1)
X_seq_tensor_2 = torch.tensor(X_seq_2, dtype=torch.float32)
model_2 = SimpleLSTM(input_size=X_seq_2.shape[2])
with torch.no_grad():
    lstm_output_2 = model_2(X_seq_tensor_2).numpy()
lstm_cols_2 = [f'LSTM_seq2_embed_{i}' for i in range(lstm_output_2.shape[1])]
train_df.drop(columns=[col for col in lstm_cols_2 if col in train_df.columns], inplace=True)
lstm_df2 = pd.DataFrame(lstm_output_2, columns=lstm_cols_2)
train_df = pd.concat([train_df, lstm_df2], axis=1)

# 測試資料同步處理
X_seq_2_test = scaler_2.transform(test_df[seq_cols_2])
X_seq_2_test = X_seq_2_test.reshape(len(test_df), 20, -1)
X_seq_tensor_2_test = torch.tensor(X_seq_2_test, dtype=torch.float32)
with torch.no_grad():
    lstm_output_2_test = model_2(X_seq_tensor_2_test).numpy()
test_df.drop(columns=[col for col in lstm_cols_2 if col in test_df.columns], inplace=True)
lstm_df2_test = pd.DataFrame(lstm_output_2_test, columns=lstm_cols_2)
test_df = pd.concat([test_df, lstm_df2_test], axis=1)

# === 🌿 特徵與標籤分離 ===
target = "飆股"
features = [col for col in train_df.columns if col not in ["ID", target]]
X = train_df[features]
y = train_df[target]

print(f"📊 處理完成後的特徵數量：{len(features)} 個")

end_time = time.time()
print(f"✅ 資料處理完成，耗時：{end_time - start_time:.2f} 秒")


📦 資料預處理開始...
🪛 Cleaning training data...
🪛 Cleaning testing data...


Processing feature groups: 100%|█████████████████████████████████████████████████████████| 9/9 [00:02<00:00,  3.33it/s]


📊 處理完成後的特徵數量：10244 個
✅ 資料處理完成，耗時：102.53 秒


In [7]:
import pandas as pd
import numpy as np
import time
from tqdm import tqdm

start_time = time.time()
print("📦 資料預處理開始...")

# === 🧼 缺失值與型態處理 ===
for col in tqdm(train_df.columns, desc="Cleaning training data"):
    if train_df[col].dtype == 'float64':
        train_df[col] = train_df[col].astype('float32').replace([np.inf, -np.inf], np.nan).fillna(0)

# === 🎯 特定特徵群補值處理 ===
feature_groups = [
    "官股券商_", "個股券商分點", "個股主力買賣超統計",
    "日外資_", "日自營_", "日投信_",
    "技術指標_", "月營收_", "季IFRS財報_"
]
for prefix in tqdm(feature_groups, desc="Processing feature groups"):
    cols = [col for col in train_df.columns if col.startswith(prefix)]
    train_df[cols] = train_df[cols].replace([np.inf, -np.inf], np.nan).fillna(0)

# === ✅ 建立特徵儲存框架 ===
train_features_df = train_df.copy()

# === 📊 MA 特徵工程 ===
types = [
    "張增減", "金額增減(千)", "買張", "賣張", "買金額(千)", "賣金額(千)",
    "買筆數", "賣筆數", "買均張", "賣均張", "買均價", "賣均價", "買均值(千)", "賣均值(千)"
]

print("📊 生成 MA 特徵")
for rank in tqdm(range(1, 16), desc="MA 特徵"):
    for t in types:
        for ma in [5, 10, 20]:
            buy_cols = [f"買超第{rank}名分點前{i}天{t}" for i in range(1, ma + 1)]
            sell_cols = [f"賣超第{rank}名分點前{i}天{t}" for i in range(1, ma + 1)]

            if all(col in train_df.columns for col in buy_cols):
                train_features_df[f"買超第{rank}名_{t}_MA{ma}"] = train_df[buy_cols].mean(axis=1)

            if all(col in train_df.columns for col in sell_cols):
                train_features_df[f"賣超第{rank}名_{t}_MA{ma}"] = train_df[sell_cols].mean(axis=1)

# === 📊 EMA 特徵工程 ===
print("📊 生成 EMA 特徵")
for rank in tqdm(range(1, 16), desc="EMA 特徵"):
    for t in types:
        for span in [5, 10, 20]:
            buy_cols = [f"買超第{rank}名分點前{i}天{t}" for i in range(1, span + 1)]
            sell_cols = [f"賣超第{rank}名分點前{i}天{t}" for i in range(1, span + 1)]

            if all(col in train_df.columns for col in buy_cols):
                reversed_buy = np.fliplr(train_df[buy_cols].values)
                ema_vals = pd.DataFrame(reversed_buy).ewm(span=span, adjust=False).mean().iloc[:, -1].values
                train_features_df[f"買超第{rank}名_{t}_EMA{span}"] = ema_vals

            if all(col in train_df.columns for col in sell_cols):
                reversed_sell = np.fliplr(train_df[sell_cols].values)
                ema_vals = pd.DataFrame(reversed_sell).ewm(span=span, adjust=False).mean().iloc[:, -1].values
                train_features_df[f"賣超第{rank}名_{t}_EMA{span}"] = ema_vals

# === 📊 技術指標 ===
train_features_df['RSI_diff'] = train_df['技術指標_RSI(10)'].diff().fillna(0)
train_features_df['乖離率_change'] = train_df['技術指標_乖離率(20日)'].pct_change().fillna(0)

# === 📊 個股報酬率與波動度 ===
close_cols = [f'個股前{i}天收盤價' for i in range(1, 21)]
train_features_df['個股1天報酬率'] = (train_df['個股收盤價'] - train_df['個股前1天收盤價']) / train_df['個股前1天收盤價']
train_features_df['個股5天報酬率'] = (train_df['個股收盤價'] - train_df['個股前5天收盤價']) / train_df['個股前5天收盤價']
train_features_df['個股10天報酬率'] = (train_df['個股收盤價'] - train_df['個股前10天收盤價']) / train_df['個股前10天收盤價']
train_features_df['個股20天報酬率'] = (train_df['個股收盤價'] - train_df['個股前20天收盤價']) / train_df['個股前20天收盤價']
train_features_df['個股5天波動度'] = train_df[close_cols[:5]].std(axis=1)
train_features_df['個股10天波動度'] = train_df[close_cols[:10]].std(axis=1)
train_features_df['個股20天波動度'] = train_df[close_cols].std(axis=1)
train_features_df['個股5天乖離率'] = (train_df['個股收盤價'] - train_df[close_cols[:5]].mean(axis=1)) / train_df[close_cols[:5]].mean(axis=1)
train_features_df['個股10天乖離率'] = (train_df['個股收盤價'] - train_df[close_cols[:10]].mean(axis=1)) / train_df[close_cols[:10]].mean(axis=1)
train_features_df['個股19天乖離率'] = (train_df['個股收盤價'] - train_df[close_cols[:19]].mean(axis=1)) / train_df[close_cols[:19]].mean(axis=1)

# === 📊 成交量波動度 ===
volume_cols = [f'個股前{i}天成交量' for i in range(1, 21)]
train_features_df['個股5天成交量波動度'] = train_df[volume_cols[:5]].std(axis=1)
train_features_df['個股10天成交量波動度'] = train_df[volume_cols[:10]].std(axis=1)
train_features_df['個股20天成交量波動度'] = train_df[volume_cols].std(axis=1)

# === 📊 上市加權指數特徵 ===
market_close_cols = [f'上市加權指數前{i}天收盤價' for i in range(1, 21)]
market_vol_cols = [f'上市加權指數前{i}天成交量' for i in range(1, 21)]
train_features_df['上市加權指數1天報酬率'] = (train_df['上市加權指數收盤價'] - train_df['上市加權指數前1天收盤價']) / train_df['上市加權指數前1天收盤價']
train_features_df['上市加權指數5天報酬率'] = (train_df['上市加權指數收盤價'] - train_df['上市加權指數前5天收盤價']) / train_df['上市加權指數前5天收盤價']
train_features_df['上市加權指數10天報酬率'] = (train_df['上市加權指數收盤價'] - train_df['上市加權指數前10天收盤價']) / train_df['上市加權指數前10天收盤價']
train_features_df['上市加權指數20天報酬率'] = (train_df['上市加權指數收盤價'] - train_df['上市加權指數前20天收盤價']) / train_df['上市加權指數前20天收盤價']
train_features_df['上市加權指數5天波動度'] = train_df[market_close_cols[:5]].std(axis=1)
train_features_df['上市加權指數10天波動度'] = train_df[market_close_cols[:10]].std(axis=1)
train_features_df['上市加權指數20天波動度'] = train_df[market_close_cols].std(axis=1)
train_features_df['上市加權指數5天乖離率'] = (train_df['上市加權指數收盤價'] - train_df[market_close_cols[:5]].mean(axis=1)) / train_df[market_close_cols[:5]].mean(axis=1)
train_features_df['上市加權指數10天乖離率'] = (train_df['上市加權指數收盤價'] - train_df[market_close_cols[:10]].mean(axis=1)) / train_df[market_close_cols[:10]].mean(axis=1)
train_features_df['上市加權指數19天乖離率'] = (train_df['上市加權指數收盤價'] - train_df[market_close_cols[:19]].mean(axis=1)) / train_df[market_close_cols[:19]].mean(axis=1)
train_features_df['上市加權指數5天成交量波動度'] = train_df[market_vol_cols[:5]].std(axis=1)
train_features_df['上市加權指數10天成交量波動度'] = train_df[market_vol_cols[:10]].std(axis=1)
train_features_df['上市加權指數20天成交量波動度'] = train_df[market_vol_cols].std(axis=1)

# === 🎯 特徵與標籤分離 ===
target = "飆股"
features = [col for col in train_features_df.columns if col not in ["ID", target]]
X = train_features_df[features]
y = train_features_df[target]

end_time = time.time()
print(f"✅ 資料處理完成，耗時：{end_time - start_time:.2f} 秒")


📦 資料預處理開始...


Cleaning training data: 100%|██████████████████████████████████████████████████| 12736/12736 [00:11<00:00, 1068.92it/s]
Processing feature groups: 100%|█████████████████████████████████████████████████████████| 9/9 [00:02<00:00,  3.95it/s]


MemoryError: Unable to allocate 9.53 GiB for an array with shape (12734, 200864) and data type float32

In [21]:
#初始版本(已寫入特徵工程)

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
import shap
from tqdm import tqdm
from sklearn.model_selection import RandomizedSearchCV
# === 🧪 切分資料 ===
X_train, X_test, y_train, y_test = train_test_split(
    train_df[features], y, stratify=y, test_size=0.2, random_state=42
)

# === 🔧 設定類別權重 ===
scale_pos_weight = y_train.value_counts()[0] / y_train.value_counts()[1]

# === 📈 模型直接訓練 (不使用SMOTE或ROS) ===
model = XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    gamma=1,
    reg_alpha=0.1,
    reg_lambda=1,
    scale_pos_weight=scale_pos_weight,
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)
model.fit(X_train, y_train)

# === 📊 SHAP重要特徵篩選（使用更多資料） ===
X_sample = X_train.sample(n=2000, random_state=42)
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_sample)
shap_importance = pd.DataFrame({
    'feature': X_sample.columns,
    'mean_abs_shap': np.abs(shap_values).mean(axis=0)
}).sort_values(by='mean_abs_shap', ascending=False)

top_features = shap_importance.head(50)['feature'].tolist()

# === ♻️ 使用top特徵與GridSearchCV進行調參 ===

param_dist = {
    'n_estimators': [600, 800, 1000],
    'max_depth': [6, 8, 10],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9],
    'gamma': [0, 1],
    'reg_alpha': [0, 0.1],
    'reg_lambda': [1, 5, 10]
}

random_search = RandomizedSearchCV(
    estimator=XGBClassifier(
        scale_pos_weight=scale_pos_weight,
        eval_metric='logloss',
        random_state=42
    ),
    param_distributions=param_dist,
    n_iter=100,  # 搜尋40組參數
    scoring='f1',
    cv=3,
    verbose=2,
    n_jobs=-1,
    random_state=42
)

random_search.fit(X_train[top_features], y_train)
print("最佳參數:", random_search.best_params_)
print("最佳F1-score:", random_search.best_score_)


# === 📈 最佳模型交叉驗證評估 ===
best_model = random_search.best_estimator_

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
f1_scores = cross_val_score(best_model, X_train[top_features], y_train, cv=cv, scoring='f1')
print(f"交叉驗證 F1-score 平均值: {f1_scores.mean():.4f}")

# === 📤 測試資料預測 ===
X_public = test_df[top_features].replace([np.inf, -np.inf], np.nan).fillna(0)
test_df["飆股"] = best_model.predict(X_public)

# === 💾 輸出 submission 結果 ===
submission = test_df[["ID", "飆股"]]
submission.to_csv("public_result1.csv", index=False, encoding="utf-8", lineterminator='\n')
print("✅ 成功輸出 submission：public_result1.csv")


Cleaning training data: 100%|█████████████████████████████████████████████████| 11476/11476 [00:00<00:00, 19190.20it/s]
Cleaning testing data: 100%|█████████████████████████████████████████████████| 10214/10214 [00:00<00:00, 292178.15it/s]
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Fitting 3 folds for each of 40 candidates, totalling 120 fits
最佳參數: {'subsample': 0.9, 'reg_lambda': 10, 'reg_alpha': 0.1, 'n_estimators': 800, 'max_depth': 8, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 0.9}
最佳F1-score: 0.7048661935785835
交叉驗證 F1-score 平均值: 0.7367
✅ 成功輸出 submission：public_result1.csv


In [11]:
# === 🎯 官股券商 & 籌碼分析特徵工程 ===

# 官股券商欄位
gov_cols = [col for col in train_df.columns if "官股券商_" in col]
train_df[gov_cols] = train_df[gov_cols].replace([np.inf, -np.inf], np.nan).fillna(0)

# 個股籌碼分析欄位（含熱門度、買賣分布係數、分點密度）
chip_cols = [col for col in train_df.columns if "個股券商分點" in col]
train_df[chip_cols] = train_df[chip_cols].replace([np.inf, -np.inf], np.nan).fillna(0)

# 主力買賣超統計資料
main_force_cols = [col for col in train_df.columns if "個股主力買賣超統計" in col]
train_df[main_force_cols] = train_df[main_force_cols].replace([np.inf, -np.inf], np.nan).fillna(0)

# 外資、自營、投信買賣欄位
inst_cols = [col for col in train_df.columns if col.startswith("日外資_") or col.startswith("日自營_") or col.startswith("日投信_")]
train_df[inst_cols] = train_df[inst_cols].replace([np.inf, -np.inf], np.nan).fillna(0)


In [12]:
# === 🎯 技術指標特徵處理 ===

# 抓出所有技術指標欄位
tech_cols = [col for col in train_df.columns if col.startswith("技術指標_")]

# 統一處理缺失與無限值
train_df[tech_cols] = train_df[tech_cols].replace([np.inf, -np.inf], np.nan).fillna(0)


In [13]:
# === 🎯 基本面（月營收）特徵處理 ===

# 所有以「月營收_」開頭的欄位統一抓取
fundamental_cols = [col for col in train_df.columns if col.startswith("月營收_")]

# 缺失處理與無限值補 0（某些財報欄位偶爾會空或除以 0）
train_df[fundamental_cols] = train_df[fundamental_cols].replace([np.inf, -np.inf], np.nan).fillna(0)


In [14]:
# === 📊 季IFRS財報特徵處理 ===

# 擷取所有季IFRS財報欄位
ifrs_cols = [col for col in train_df.columns if col.startswith("季IFRS財報_")]

# 替換 inf 與 NaN 為 0（避免報表計算錯誤或除以零）
train_df[ifrs_cols] = train_df[ifrs_cols].replace([np.inf, -np.inf], np.nan).fillna(0)
