In [None]:
! pip install pycaret



In [None]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/Colab Notebooks/AIGO

Mounted at /content/drive
/content/drive/MyDrive/Colab Notebooks/AIGO


In [None]:
%matplotlib inline
from pycaret.classification import setup, compare_models, create_model, tune_model, finalize_model, \
    evaluate_model, plot_model, predict_model, save_model
import pandas as pd
import os

# Step 1: Read Data

In [None]:
df_train = pd.read_csv("data/training.csv")
df_test = pd.read_csv("data/public_x.csv")

In [None]:
print(df_train.shape)
print(df_test.shape)

(200864, 10214)
(25108, 10213)


In [None]:
df_train.head()

Unnamed: 0,ID,外資券商_分點進出,外資券商_分點買賣力,外資券商_分點成交力(%),外資券商_分點吃貨比(%),外資券商_分點出貨比(%),外資券商_前1天分點進出,外資券商_前1天分點買賣力,外資券商_前1天分點成交力(%),外資券商_前1天分點吃貨比(%),...,上市加權指數前15天成交量,上市加權指數前16天成交量,上市加權指數前17天成交量,上市加權指數前18天成交量,上市加權指數前19天成交量,上市加權指數前20天成交量,上市加權指數5天成交量波動度,上市加權指數10天成交量波動度,上市加權指數20天成交量波動度,飆股
0,TR-1,1.2227,2.8303,1.2789,0.5707,0.5329,1.2219,1.2686,1.2255,0.5228,...,0.3121,0.1312,0.2415,0.3157,0.4439,0.0533,0.1951,0.1357,0.2388,0
1,TR-2,1.2297,2.8303,1.7995,1.117,0.5329,1.2313,1.4432,1.2657,1.066,...,0.9439,0.7407,0.4615,0.4663,0.6618,0.1356,0.4063,0.58,0.3199,0
2,TR-3,1.3127,1.2826,1.4307,5.0469,4.7141,1.1049,1.2271,1.2514,3.0514,...,0.4925,0.7805,1.2576,0.9081,1.1555,1.4803,0.1924,0.4615,0.7695,0
3,TR-4,1.2219,,1.2413,0.5313,0.5329,1.2258,,1.2238,0.5228,...,1.7776,0.846,0.8016,0.8309,1.6546,1.5733,0.7886,0.208,0.7321,0
4,TR-5,0.7988,1.0454,0.8652,1.8842,2.2257,0.7108,1.1333,1.1712,1.5242,...,0.7412,0.8641,0.8693,0.7486,0.2326,0.4883,1.8395,1.4787,0.9049,0


In [None]:
df_train['飆股'].value_counts()

Unnamed: 0_level_0,count
飆股,Unnamed: 1_level_1
0,199394
1,1470


# Step 2: Feature Engineering

In [None]:
cols = df_train.columns
for col in df_train.columns:
    if '毛利率' in col:
        print(col)

季IFRS財報_毛利率(%)
季IFRS財報_毛利率累季(%)


In [None]:
keywords = [
    '外資', '投信', '自營商', '賣賣超', # 籌碼
    '乖離率', '收盤價', '技術指標_K', '技術指標_D', 'MACD',  # 技術
    '月營收', '營業利益', '毛利率' # 基本
    ]
selected_cols = [col for col in df_train.columns if any(kw in col for kw in keywords)]

print(selected_cols)
print(len(selected_cols))

['外資券商_分點進出', '外資券商_分點買賣力', '外資券商_分點成交力(%)', '外資券商_分點吃貨比(%)', '外資券商_分點出貨比(%)', '外資券商_前1天分點進出', '外資券商_前1天分點買賣力', '外資券商_前1天分點成交力(%)', '外資券商_前1天分點吃貨比(%)', '外資券商_前1天分點出貨比(%)', '外資券商_前2天分點進出', '外資券商_前2天分點買賣力', '外資券商_前2天分點成交力(%)', '外資券商_前2天分點吃貨比(%)', '外資券商_前2天分點出貨比(%)', '外資券商_前3天分點進出', '外資券商_前3天分點買賣力', '外資券商_前3天分點成交力(%)', '外資券商_前3天分點吃貨比(%)', '外資券商_前3天分點出貨比(%)', '外資券商_前4天分點進出', '外資券商_前4天分點買賣力', '外資券商_前4天分點成交力(%)', '外資券商_前4天分點吃貨比(%)', '外資券商_前4天分點出貨比(%)', '外資券商_前5天分點進出', '外資券商_前5天分點買賣力', '外資券商_前5天分點成交力(%)', '外資券商_前5天分點吃貨比(%)', '外資券商_前5天分點出貨比(%)', '外資券商_前6天分點進出', '外資券商_前6天分點買賣力', '外資券商_前6天分點成交力(%)', '外資券商_前6天分點吃貨比(%)', '外資券商_前6天分點出貨比(%)', '外資券商_前7天分點進出', '外資券商_前7天分點買賣力', '外資券商_前7天分點成交力(%)', '外資券商_前7天分點吃貨比(%)', '外資券商_前7天分點出貨比(%)', '外資券商_前8天分點進出', '外資券商_前8天分點買賣力', '外資券商_前8天分點成交力(%)', '外資券商_前8天分點吃貨比(%)', '外資券商_前8天分點出貨比(%)', '外資券商_前9天分點進出', '外資券商_前9天分點買賣力', '外資券商_前9天分點成交力(%)', '外資券商_前9天分點吃貨比(%)', '外資券商_前9天分點出貨比(%)', '外資券商_前10天分點進出', '外資券商_前10天分點買賣力', '外資券商_前10天分點成交力(%)', '外資券商_前10天分點吃貨比(%)', '外資券商_前

In [None]:
df_train_selected = df_train[['ID'] + selected_cols + ['飆股']]
df_test_selected = df_test[['ID'] + selected_cols]

In [None]:
df_train_selected

Unnamed: 0,ID,外資券商_分點進出,外資券商_分點買賣力,外資券商_分點成交力(%),外資券商_分點吃貨比(%),外資券商_分點出貨比(%),外資券商_前1天分點進出,外資券商_前1天分點買賣力,外資券商_前1天分點成交力(%),外資券商_前1天分點吃貨比(%),...,上市加權指數前15天收盤價,上市加權指數前16天收盤價,上市加權指數前17天收盤價,上市加權指數前18天收盤價,上市加權指數前19天收盤價,上市加權指數前20天收盤價,上市加權指數5天乖離率,上市加權指數10天乖離率,上市加權指數19天乖離率,飆股
0,TR-1,1.2227,2.8303,1.2789,0.5707,0.5329,1.2219,1.2686,1.2255,0.5228,...,-0.1848,-0.2071,-0.2070,-0.1941,-0.2030,-0.2347,-0.1436,0.0509,0.5606,0
1,TR-2,1.2297,2.8303,1.7995,1.1170,0.5329,1.2313,1.4432,1.2657,1.0660,...,-0.0834,-0.0038,0.0036,0.0731,0.0614,0.0727,1.0661,0.5333,0.8673,0
2,TR-3,1.3127,1.2826,1.4307,5.0469,4.7141,1.1049,1.2271,1.2514,3.0514,...,1.9106,1.9750,1.9706,1.9010,1.8601,1.9388,1.2908,0.9844,0.8960,0
3,TR-4,1.2219,,1.2413,0.5313,0.5329,1.2258,,1.2238,0.5228,...,2.3785,2.3049,2.2553,2.2237,2.2356,2.2834,2.2178,1.3338,1.2261,0
4,TR-5,0.7988,1.0454,0.8652,1.8842,2.2257,0.7108,1.1333,1.1712,1.5242,...,0.1199,0.1243,0.0784,0.0209,-0.0246,-0.0100,4.0936,3.5904,3.6015,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200859,TR-200860,1.2142,1.0947,0.9544,1.9464,2.1952,1.2462,1.1850,1.2088,2.1120,...,1.9401,1.9573,1.9157,1.9797,1.9540,1.9701,-0.7062,-0.5203,-0.0078,0
200860,TR-200861,1.2537,1.8772,1.7758,1.5163,0.9437,1.2729,1.3869,1.2519,1.1000,...,0.5524,0.5248,0.4608,0.4439,0.4056,0.4063,0.5096,0.8080,1.0108,0
200861,TR-200862,1.2219,,1.2413,0.5313,0.5329,1.2227,1.2072,1.2238,0.8381,...,0.2346,0.2350,0.2142,0.2106,0.2482,0.2772,0.9932,1.0880,1.1951,0
200862,TR-200863,2.5346,2.0940,3.6511,4.1595,1.5976,0.6009,1.3277,1.3877,2.7268,...,2.0940,2.0803,2.0655,2.0408,2.0436,2.0649,1.2413,0.9848,0.3719,0


# Step 3: PyCaret Setup

In [None]:
from imblearn.over_sampling import SMOTE

exp_clf = setup(
    data = df_train_selected,
    # data = df_train,
    target = '飆股',
    session_id = 123,
    feature_selection = True,        # Enable feature filtering
    fold_strategy='stratifiedkfold',      # Make sure there are two categories for each fold
    ignore_features = ['ID'],
    use_gpu = True,
    fix_imbalance = True,   # Increase the number of minority class
    fix_imbalance_method = SMOTE(),
    transformation = True,         # Do data distribution adjustment (nonlinear conversion)

)

[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bi

Unnamed: 0,Description,Value
0,Session id,123
1,Target,飆股
2,Target type,Binary
3,Original data shape,"(200864, 271)"
4,Transformed data shape,"(339410, 54)"
5,Transformed train set shape,"(279150, 54)"
6,Transformed test set shape,"(60260, 54)"
7,Ignore features,1
8,Numeric features,269
9,Rows with missing values,100.0%


[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Number of positive: 1, number of negative: 1


In [None]:
exp_clf.get_config('pipeline')

# Step 4: Model training and adjustment

In [None]:
best_model = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
xgboost,Extreme Gradient Boosting,0.9954,0.9828,0.6161,0.7167,0.6614,0.659,0.6616,44.613
et,Extra Trees Classifier,0.9947,0.9791,0.4616,0.7195,0.5618,0.5593,0.5736,49.719
dummy,Dummy Classifier,0.9927,0.5,0.0,0.0,0.0,0.0,0.0,43.426
lightgbm,Light Gradient Boosting Machine,0.9877,0.9781,0.6452,0.3306,0.4363,0.4308,0.456,46.342
dt,Decision Tree Classifier,0.9784,0.7176,0.4529,0.159,0.2353,0.2269,0.2596,89.518
knn,K Neighbors Classifier,0.9593,0.8292,0.6064,0.1052,0.1793,0.1689,0.2409,44.273
gbc,Gradient Boosting Classifier,0.9323,0.9637,0.859,0.0863,0.1568,0.1454,0.2597,465.978
ada,Ada Boost Classifier,0.919,0.9545,0.8416,0.0716,0.132,0.1201,0.2313,120.805
lr,Logistic Regression,0.8849,0.9437,0.8775,0.0533,0.1004,0.0878,0.1996,44.623
lda,Linear Discriminant Analysis,0.8702,0.9405,0.8678,0.047,0.0892,0.0763,0.184,44.949


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

[2025-03-26 04:52:54.492] [CUML] [info] Unused keyword parameter: n_jobs during cuML estimator initialization


In [None]:
tuned_model = tune_model(best_model, optimize='F1') # Hyperparameter tuning

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9942,0.9878,0.6893,0.5868,0.6339,0.631,0.6331
1,0.9943,0.9873,0.767,0.5852,0.6639,0.661,0.6672
2,0.9947,0.977,0.6408,0.6346,0.6377,0.635,0.635
3,0.9937,0.9815,0.6214,0.5614,0.5899,0.5867,0.5874
4,0.9946,0.9693,0.6667,0.6182,0.6415,0.6388,0.6393
5,0.9931,0.984,0.699,0.5217,0.5975,0.5941,0.6006
6,0.9932,0.9739,0.6505,0.5276,0.5826,0.5792,0.5824
7,0.9938,0.9814,0.6602,0.5667,0.6099,0.6068,0.6086
8,0.9945,0.9879,0.699,0.6102,0.6516,0.6488,0.6504
9,0.9945,0.9805,0.7282,0.6048,0.6608,0.6581,0.6609


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits
[LightGBM] [Info] Number of positive: 125617, number of negative: 125617
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.273376 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 62740
[LightGBM] [Info] Number of data points in the train set: 251234, number of used features: 257
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 125617, number of negative: 125617
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.177913 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 62667
[LightGBM] [Info] Number of data points in the train set: 251234, number of used features: 253
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 125617, num

Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


In [None]:
from sklearn.utils import estimator_html_repr
print(tuned_model.get_params() == best_model.get_params())  # True means the parameters are the same

True


In [None]:
final_model = finalize_model(tuned_model) # Train the model with all data

In [None]:
os.makedirs("model", exist_ok=True)
save_model(final_model, 'model/final_model_202503251210')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['外資券商_分點進出', '外資券商_分點買賣力',
                                              '外資券商_分點成交力(%)', '外資券商_分點吃貨比(%)',
                                              '外資券商_分點出貨比(%)', '外資券商_前1天分點進出',
                                              '外資券商_前1天分點買賣力',
                                              '外資券商_前1天分點成交力(%)',
                                              '外資券商_前1天分點吃貨比(%)',
                                              '外資券商_前1天分點出貨比(%)', '外資券商_前2天分點進出',
                                              '外資券商_前2天分點買賣力',
                                              '外資券商_前2天分點成交力(%)',
                                              '外資券商_前2天分點吃貨比(%)',
                                              '外資券商_前2天分點出貨...
                                importance_type=None,
                                interaction_const

# Step 5: Model evaluation

In [26]:
evaluate_model(best_model)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

# Step 6: Model Prediction

In [None]:
best_model = load_model('./model/final_model_202503251210')

Transformation Pipeline and Model Successfully Loaded


In [None]:
predictions = predict_model(best_model, data=df_test_selected)

In [None]:
predictions

Unnamed: 0,ID,外資券商_分點進出,外資券商_分點買賣力,外資券商_分點成交力(%),外資券商_分點吃貨比(%),外資券商_分點出貨比(%),外資券商_前1天分點進出,外資券商_前1天分點買賣力,外資券商_前1天分點成交力(%),外資券商_前1天分點吃貨比(%),...,上市加權指數前16天收盤價,上市加權指數前17天收盤價,上市加權指數前18天收盤價,上市加權指數前19天收盤價,上市加權指數前20天收盤價,上市加權指數5天乖離率,上市加權指數10天乖離率,上市加權指數19天乖離率,prediction_label,prediction_score
0,PU-1,1.2219,,1.2413,0.5313,0.5329,1.2258,,1.2238,0.5228,...,0.2350,0.2142,0.2106,0.2482,0.2772,0.9932,1.0880,1.1951,0,1.0000
1,PU-2,1.2219,,1.2413,0.5313,0.5329,1.2243,1.2072,1.2238,0.5228,...,2.1812,2.0894,2.0801,2.2595,2.1809,0.6168,0.1937,0.5583,0,0.9996
2,PU-3,3.2984,1.4356,1.8457,3.5096,2.8033,-2.3714,1.2767,1.3264,2.3178,...,1.6711,1.6184,1.6358,1.6796,1.7660,1.1539,0.6630,0.0192,0,1.0000
3,PU-4,1.2211,-0.3363,0.7311,0.5313,1.0516,1.2258,,1.2104,0.5228,...,0.2202,0.2200,0.1790,0.1886,0.2123,1.0479,0.8022,0.7863,0,0.9997
4,PU-5,1.5689,1.5861,1.8969,2.4814,1.7553,1.0782,1.3010,1.2881,1.7097,...,0.9382,0.9538,0.9505,0.8638,0.9011,-2.2153,-1.4432,-0.5617,0,0.9999
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25103,PU-25104,1.2219,,1.2413,0.5313,0.5329,1.2250,1.2072,1.2238,0.5228,...,2.0316,1.9386,1.9470,1.9270,1.8446,1.8832,0.9469,0.7410,0,1.0000
25104,PU-25105,1.3073,1.2751,1.3396,3.4861,3.2950,-0.0939,1.2131,1.2345,3.0997,...,2.5524,2.5708,2.5030,2.4973,2.4348,0.9875,1.1887,1.6128,0,0.9971
25105,PU-25106,1.2041,-0.3363,0.3484,0.5313,1.4406,1.1811,1.0835,1.1827,0.5228,...,0.1760,0.1847,0.1573,0.1994,0.1994,0.5661,0.6024,1.0461,0,1.0000
25106,PU-25107,1.0892,0.6864,0.8777,0.8795,1.2398,1.4488,1.1700,1.2118,1.2927,...,2.6849,2.6925,2.6192,2.6323,2.5999,0.3372,0.1103,-0.3843,0,0.9932


In [None]:
if 'ID' in df_test.columns:
    submission = df_test[['ID']].copy()
    submission['飆股'] = predictions['prediction_label']
else:
    submission = predictions[['Label']].copy()
    submission['Id'] = range(1, len(predictions)+1)
    submission = submission[['Id', 'Label']]
    submission.columns = ['Id', 'Category']

os.makedirs("output", exist_ok=True)
submission.to_csv("output/submission_202503251210.csv", index=False)

submission.head()

Unnamed: 0,ID,飆股
0,PU-1,0
1,PU-2,0
2,PU-3,0
3,PU-4,0
4,PU-5,0
