In [59]:
%matplotlib inline
from pycaret.classification import setup, compare_models, create_model, tune_model, finalize_model, \
    evaluate_model, plot_model, predict_model, save_model
import pandas as pd
import os

# Step 1: Read Data

In [None]:
df_train = pd.read_csv("data/training.csv")
df_test = pd.read_csv("data/public_x.csv")

In [51]:
df_test.head()

Unnamed: 0,ID,外資券商_分點進出,外資券商_分點買賣力,外資券商_分點成交力(%),外資券商_分點吃貨比(%),外資券商_分點出貨比(%),外資券商_前1天分點進出,外資券商_前1天分點買賣力,外資券商_前1天分點成交力(%),外資券商_前1天分點吃貨比(%),...,上市加權指數前14天成交量,上市加權指數前15天成交量,上市加權指數前16天成交量,上市加權指數前17天成交量,上市加權指數前18天成交量,上市加權指數前19天成交量,上市加權指數前20天成交量,上市加權指數5天成交量波動度,上市加權指數10天成交量波動度,上市加權指數20天成交量波動度
0,PU-1,1.2219,,1.2413,0.5313,0.5329,1.2258,,1.2238,0.5228,...,0.3492,0.1014,0.4583,0.2233,0.563,0.2876,0.2664,0.863,0.4614,0.3086
1,PU-2,1.2219,,1.2413,0.5313,0.5329,1.2243,1.2072,1.2238,0.5228,...,1.8991,2.3784,1.922,2.3146,3.4425,2.8982,2.8924,1.6241,2.2758,1.8835
2,PU-3,3.2984,1.4356,1.8457,3.5096,2.8033,-2.3714,1.2767,1.3264,2.3178,...,0.6375,0.4582,0.8285,0.7927,1.4526,1.2864,1.1145,0.5992,2.3232,2.0103
3,PU-4,1.2211,-0.3363,0.7311,0.5313,1.0516,1.2258,,1.2104,0.5228,...,0.5781,0.9577,0.6808,0.5511,0.1747,0.029,0.1628,0.8125,0.7227,0.5577
4,PU-5,1.5689,1.5861,1.8969,2.4814,1.7553,1.0782,1.301,1.2881,1.7097,...,1.0682,1.0271,0.9876,1.739,1.3109,1.6806,1.3216,4.1865,2.7835,1.7238


In [None]:
cols = df_train.columns
for col in df_train.columns:
    if '毛利率' in col:
        print(col)

季IFRS財報_毛利率(%)
季IFRS財報_毛利率累季(%)


# Step 2: Feature Engineering

In [None]:
keywords = [
    '外資', '投信', '自營商', '賣賣超', # 籌碼
    '乖離率', '收盤價', '技術指標_K', '技術指標_D', 'MACD',  # 技術
    '月營收', '營業利益', '毛利率' # 基本
    ]
selected_cols = [col for col in df_train.columns if any(kw in col for kw in keywords)]

print(selected_cols)
print(len(selected_cols))

['外資券商_分點進出', '外資券商_分點買賣力', '外資券商_分點成交力(%)', '外資券商_分點吃貨比(%)', '外資券商_分點出貨比(%)', '外資券商_前1天分點進出', '外資券商_前1天分點買賣力', '外資券商_前1天分點成交力(%)', '外資券商_前1天分點吃貨比(%)', '外資券商_前1天分點出貨比(%)', '外資券商_前2天分點進出', '外資券商_前2天分點買賣力', '外資券商_前2天分點成交力(%)', '外資券商_前2天分點吃貨比(%)', '外資券商_前2天分點出貨比(%)', '外資券商_前3天分點進出', '外資券商_前3天分點買賣力', '外資券商_前3天分點成交力(%)', '外資券商_前3天分點吃貨比(%)', '外資券商_前3天分點出貨比(%)', '外資券商_前4天分點進出', '外資券商_前4天分點買賣力', '外資券商_前4天分點成交力(%)', '外資券商_前4天分點吃貨比(%)', '外資券商_前4天分點出貨比(%)', '外資券商_前5天分點進出', '外資券商_前5天分點買賣力', '外資券商_前5天分點成交力(%)', '外資券商_前5天分點吃貨比(%)', '外資券商_前5天分點出貨比(%)', '外資券商_前6天分點進出', '外資券商_前6天分點買賣力', '外資券商_前6天分點成交力(%)', '外資券商_前6天分點吃貨比(%)', '外資券商_前6天分點出貨比(%)', '外資券商_前7天分點進出', '外資券商_前7天分點買賣力', '外資券商_前7天分點成交力(%)', '外資券商_前7天分點吃貨比(%)', '外資券商_前7天分點出貨比(%)', '外資券商_前8天分點進出', '外資券商_前8天分點買賣力', '外資券商_前8天分點成交力(%)', '外資券商_前8天分點吃貨比(%)', '外資券商_前8天分點出貨比(%)', '外資券商_前9天分點進出', '外資券商_前9天分點買賣力', '外資券商_前9天分點成交力(%)', '外資券商_前9天分點吃貨比(%)', '外資券商_前9天分點出貨比(%)', '外資券商_前10天分點進出', '外資券商_前10天分點買賣力', '外資券商_前10天分點成交力(%)', '外資券商_前10天分點吃貨比(%)', '外資券商_前

In [None]:
df_train_selected = df_train[selected_cols + ['飆股']]
df_test_selected = df_test[selected_cols]

# Step 3: PyCaret Setup

In [None]:
exp_clf = setup(
    data = df_train_selected, 
    target = '飆股', 
    session_id = 123, 
    feature_selection = True,        # Enable feature filtering
    fold_strategy='stratifiedkfold',      # Make sure there are two categories for each fold
    ignore_features = ['ID'],
    use_gpu = True,
)

[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 4080 SUPER, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 4080 SUPER, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0

Unnamed: 0,Description,Value
0,Session id,123
1,Target,飆股
2,Target type,Binary
3,Original data shape,"(200864, 270)"
4,Transformed data shape,"(200864, 54)"
5,Transformed train set shape,"(140604, 54)"
6,Transformed test set shape,"(60260, 54)"
7,Numeric features,269
8,Rows with missing values,100.0%
9,Preprocess,True


[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 4080 SUPER, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 4080 SUPER, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0

In [19]:
exp_clf.get_config('pipeline')

# Step 4: Model training and adjustment

In [None]:
best_model = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.9941,0.9638,0.1973,1.0,0.3269,0.3254,0.4396,4.446
rf,Random Forest Classifier,0.9937,0.9525,0.1361,1.0,0.2379,0.2366,0.3647,5.57
gbc,Gradient Boosting Classifier,0.9934,0.962,0.1545,0.7209,0.2538,0.2519,0.3311,90.395
lightgbm,Light Gradient Boosting Machine,0.9929,0.969,0.2877,0.5384,0.3727,0.3694,0.389,4.507
ridge,Ridge Classifier,0.9927,0.934,0.0,0.0,0.0,0.0,0.0,3.468
dummy,Dummy Classifier,0.9927,0.5,0.0,0.0,0.0,0.0,0.0,3.069
knn,K Neighbors Classifier,0.9926,0.6808,0.034,0.4867,0.063,0.0621,0.1248,5.164
ada,Ada Boost Classifier,0.9925,0.9599,0.104,0.4432,0.167,0.1648,0.2106,20.142
svm,SVM - Linear Kernel,0.9923,0.7992,0.0049,0.0481,0.008,0.0073,0.0119,3.928
lr,Logistic Regression,0.9921,0.9243,0.0156,0.1457,0.028,0.0266,0.0451,4.312


In [55]:
tuned_model = tune_model(best_model) # Hyperparameter tuning

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9927,0.5,0.0,0.0,0.0,0.0,0.0
1,0.9927,0.5,0.0,0.0,0.0,0.0,0.0
2,0.9927,0.5,0.0,0.0,0.0,0.0,0.0
3,0.9927,0.5,0.0,0.0,0.0,0.0,0.0
4,0.9927,0.5,0.0,0.0,0.0,0.0,0.0
5,0.9927,0.5,0.0,0.0,0.0,0.0,0.0
6,0.9927,0.5,0.0,0.0,0.0,0.0,0.0
7,0.9927,0.5,0.0,0.0,0.0,0.0,0.0
8,0.9927,0.5,0.0,0.0,0.0,0.0,0.0
9,0.9927,0.5,0.0,0.0,0.0,0.0,0.0


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


In [None]:
final_model = finalize_model(tuned_model) # Train the model with all data

In [None]:
os.makedirs("model", exist_ok=True)
save_model(best_model, 'model/best_model_202503251210')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['外資券商_分點進出', '外資券商_分點買賣力',
                                              '外資券商_分點成交力(%)', '外資券商_分點吃貨比(%)',
                                              '外資券商_分點出貨比(%)', '外資券商_前1天分點進出',
                                              '外資券商_前1天分點買賣力',
                                              '外資券商_前1天分點成交力(%)',
                                              '外資券商_前1天分點吃貨比(%)',
                                              '外資券商_前1天分點出貨比(%)', '外資券商_前2天分點進出',
                                              '外資券商_前2天分點買賣力',
                                              '外資券商_前2天分點成交力(%)',
                                              '外資券商_前2天分點吃貨比(%)',
                                              '外資券商_前2天分點出貨...
                  ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0,
                                

# Step 5: Model evaluation

In [49]:
evaluate_model(best_model)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

# Step 6: Model Prediction

In [None]:
best_model = load_model('./model/best_model_202503251210')

Transformation Pipeline and Model Successfully Loaded


In [29]:
predictions = predict_model(best_model, data=df_test_selected)

In [30]:
predictions

Unnamed: 0,外資券商_分點進出,外資券商_分點買賣力,外資券商_分點成交力(%),外資券商_分點吃貨比(%),外資券商_分點出貨比(%),外資券商_前1天分點進出,外資券商_前1天分點買賣力,外資券商_前1天分點成交力(%),外資券商_前1天分點吃貨比(%),外資券商_前1天分點出貨比(%),...,上市加權指數前16天收盤價,上市加權指數前17天收盤價,上市加權指數前18天收盤價,上市加權指數前19天收盤價,上市加權指數前20天收盤價,上市加權指數5天乖離率,上市加權指數10天乖離率,上市加權指數19天乖離率,prediction_label,prediction_score
0,1.2219,,1.2413,0.5313,0.5329,1.2258,,1.2238,0.5228,0.5343,...,0.2350,0.2142,0.2106,0.2482,0.2772,0.9932,1.0880,1.1951,0,1.00
1,1.2219,,1.2413,0.5313,0.5329,1.2243,1.2072,1.2238,0.5228,0.7477,...,2.1812,2.0894,2.0801,2.2595,2.1809,0.6168,0.1937,0.5583,0,1.00
2,3.2984,1.4356,1.8457,3.5096,2.8033,-2.3714,1.2767,1.3264,2.3178,4.5040,...,1.6711,1.6184,1.6358,1.6796,1.7660,1.1539,0.6630,0.0192,0,0.99
3,1.2211,-0.3363,0.7311,0.5313,1.0516,1.2258,,1.2104,0.5228,0.5343,...,0.2202,0.2200,0.1790,0.1886,0.2123,1.0479,0.8022,0.7863,0,1.00
4,1.5689,1.5861,1.8969,2.4814,1.7553,1.0782,1.3010,1.2881,1.7097,2.0327,...,0.9382,0.9538,0.9505,0.8638,0.9011,-2.2153,-1.4432,-0.5617,0,1.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25103,1.2219,,1.2413,0.5313,0.5329,1.2250,1.2072,1.2238,0.5228,0.6311,...,2.0316,1.9386,1.9470,1.9270,1.8446,1.8832,0.9469,0.7410,0,1.00
25104,1.3073,1.2751,1.3396,3.4861,3.2950,-0.0939,1.2131,1.2345,3.0997,5.1106,...,2.5524,2.5708,2.5030,2.4973,2.4348,0.9875,1.1887,1.6128,0,0.98
25105,1.2041,-0.3363,0.3484,0.5313,1.4406,1.1811,1.0835,1.1827,0.5228,1.8184,...,0.1760,0.1847,0.1573,0.1994,0.1994,0.5661,0.6024,1.0461,0,1.00
25106,1.0892,0.6864,0.8777,0.8795,1.2398,1.4488,1.1700,1.2118,1.2927,1.0332,...,2.6849,2.6925,2.6192,2.6323,2.5999,0.3372,0.1103,-0.3843,0,0.99


In [None]:
if 'ID' in df_test.columns:
    submission = df_test[['ID']].copy()
    submission['飆股'] = predictions['prediction_label']
else:
    submission = predictions[['Label']].copy()
    submission['Id'] = range(1, len(predictions)+1)
    submission = submission[['Id', 'Label']]
    submission.columns = ['Id', 'Category']

os.makedirs("output", exist_ok=True)
submission.to_csv("output/submission_202503251210.csv", index=False)

submission.head()

Unnamed: 0,ID,飆股
0,PU-1,0
1,PU-2,0
2,PU-3,0
3,PU-4,0
4,PU-5,0
