In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import math
import gc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from flash_attn import flash_attn_func
from xgboost import XGBRegressor, DMatrix
import xgboost as xgb

from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression

from kneed import KneeLocator
from collections import defaultdict

from gpu_pca import IncrementalPCAonGPU

# 自訂模組
from library import StockUniverse, FactorLibrary, MarketInfo, FileLoader, FactorLibrary2

📂 讀取: Y:\因子回測_江建彰\因子庫.pkl


In [3]:
stock_universe = 'TWSE'
# Raw f-string: the Windows path contains backslashes that are not valid escape
# sequences, which a plain f-string reports as an invalid-escape warning. The
# resulting path string is unchanged.
flib = FactorLibrary2(path=rf'Y:\因子回測_江建彰\因子庫{stock_universe}.pkl')

📂 讀取: Y:\因子回測_江建彰\因子庫TWSE.pkl


In [4]:
class AllDayFactorDataset(Dataset):
    """Bundle of the daily factor table, close prices and forward-return labels.

    Attributes set by ``__init__``:
        multi_df      factor table loaded through ``FileLoader``; its columns carry a
                      ``factor`` level and a ``ticker`` level.
        adj_close_df  daily close-price table read from a feather file.
        TPEX_df       whatever ``MarketInfo.TPEX_norm()`` returns.
        RoR_df        forward return ``(close[t+5] - close[t+1]) / close[t+1]``;
                      rows containing any NaN are dropped.
        stock_list    de-duplicated tickers in ``multi_df`` column order.
    """

    def __init__(self, stock_universe='TWSE'):
        """Load all tables for ``stock_universe`` and align them on a common ticker order.

        Raises:
            ValueError: if ``stock_universe`` is not 'TWSE', 'OTC' or 'all'.
            AssertionError: if the ticker order differs between the tables.
        """
        # Raw f-string: the path's backslashes are not valid escape sequences.
        self.multi_df = FileLoader.load(rf'Y:\因子回測_江建彰\因子庫{stock_universe}.pkl')
        self.adj_close_df = pd.read_feather(r'Y:\因子回測_江建彰\補上缺值日頻收盤價.ftr')
        # Also validates the universe name; the value is replaced below by the
        # ticker order of the factor table.
        self.stock_list = self.get_stock_list(stock_universe)

        self.TPEX_df = MarketInfo.TPEX_norm()
        next_close = self.adj_close_df.shift(-1)
        self.RoR_df = ((self.adj_close_df.shift(-5) - next_close) / next_close).dropna(axis=0)

        ticker_level = self.multi_df.columns.get_level_values('ticker')
        self.stock_list = ticker_level[~ticker_level.duplicated()]
        self.RoR_df = self.RoR_df[self.stock_list]
        self.adj_close_df = self.adj_close_df[self.stock_list]
        # Original author's note: every value here includes same-day information,
        # so it has to be shifted. NOTE(review): the only shift visible in this class
        # is the t+1 start of the return label - confirm nothing else is required.
        self.restrict_range()
        self.check_validility()

    def check_validility(self):
        """Assert that every table lists the same tickers in the same order.

        Checks ``stock_list``, the columns of ``RoR_df``, the de-duplicated ``ticker``
        level of ``multi_df``, and the ticker order inside every individual factor.
        Only column labels are inspected, so no factor data is copied.

        Raises:
            AssertionError: on any length or order mismatch.
        """
        expected = list(self.stock_list)
        ror_tickers = list(self.RoR_df.columns)
        ticker_level = self.multi_df.columns.get_level_values('ticker')
        factor_tickers = list(ticker_level[~ticker_level.duplicated()])

        assert len(expected) == len(ror_tickers) == len(factor_tickers), \
            "ticker counts differ between stock_list, RoR_df and multi_df"
        assert expected == ror_tickers == factor_tickers, \
            "ticker order differs between stock_list, RoR_df and multi_df"

        factor_level = self.multi_df.columns.get_level_values('factor')
        for factor_name in factor_level[~factor_level.duplicated()]:
            per_factor = list(ticker_level[factor_level == factor_name])
            assert per_factor == expected, \
                f"ticker order of factor {factor_name!r} differs from stock_list"

    def restrict_range(self, global_start='2020-04-01', global_end='2025-04-09'):
        """Slice every table to the label range ``global_start`` .. ``global_end``."""
        self.multi_df     = self.multi_df.loc[global_start : global_end]
        self.adj_close_df = self.adj_close_df.loc[global_start : global_end]
        self.TPEX_df      = self.TPEX_df.loc[global_start : global_end]
        self.RoR_df       = self.RoR_df.loc[global_start : global_end]

    def get_stock_list(self, stock_univserse):
        """Return tickers present in the universe, the factor table and the price table.

        Args:
            stock_univserse: 'TWSE', 'OTC' or 'all'.

        Returns:
            list of tickers (set intersection, so the order is not defined).

        Raises:
            ValueError: for any other universe name (previously an UnboundLocalError).
        """
        if stock_univserse == 'TWSE':
            ticker1 = StockUniverse.TWSE()
        elif stock_univserse == 'OTC':
            ticker1 = StockUniverse.OTC()
        elif stock_univserse == 'all':
            ticker1 = StockUniverse.all()
        else:
            raise ValueError(
                f"unknown stock universe {stock_univserse!r}; expected 'TWSE', 'OTC' or 'all'"
            )

        ticker2 = self.multi_df.columns.get_level_values('ticker')
        ticker3 = self.adj_close_df.columns
        return list(set(ticker1) & set(ticker2) & set(ticker3))


    

In [5]:
# Prefer the first CUDA device and fall back to the CPU when CUDA is unavailable.
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
factor_dataset = AllDayFactorDataset(stock_universe='TWSE')

⚡ 快取使用: Y:\因子回測_江建彰\因子庫TWSE.pkl


In [6]:
RoR_df = factor_dataset.RoR_df
# Cross-sectional z-score: for every date (row) subtract that row's mean and
# divide by that row's standard deviation.
z_return_df = (
    RoR_df
    .sub(RoR_df.mean(axis=1), axis=0)
    .div(RoR_df.std(axis=1), axis=0)
)

In [7]:
# Display the factor table; per the rendered output its columns have a `factor` level and a `ticker` level.
factor_dataset.multi_df

factor,factor_0,factor_0,factor_0,factor_0,factor_0,factor_0,factor_0,factor_0,factor_0,factor_0,...,factor_185,factor_185,factor_185,factor_185,factor_185,factor_185,factor_185,factor_185,factor_185,factor_185
ticker,1101,1102,1103,1104,1108,1109,1110,1201,1203,1210,...,9939,9940,9941,9942,9943,9944,9945,9946,9955,9958
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2020-04-01,-0.673939,-0.455914,-0.600366,-0.193277,1.754857,-0.967506,-1.611826,1.346149,-0.139294,0.163777,...,0.066802,0.072487,0.058731,0.070173,0.068826,0.072342,0.041585,0.107296,0.074130,0.078821
2020-04-06,-0.327203,-0.146773,-0.556454,-0.166725,1.027622,-0.758691,-2.130461,-0.347443,-1.021574,-0.870032,...,0.082190,0.082626,0.078126,0.092643,0.096279,0.101995,0.070161,0.119151,0.100967,0.086217
2020-04-07,0.098016,1.224380,0.101407,0.234013,0.153450,-0.239635,-1.987577,-0.916866,-0.984516,-0.930563,...,0.068940,0.073008,0.074640,0.075921,0.077645,0.111064,0.090337,0.051750,0.089592,0.078324
2020-04-08,-0.319991,0.674528,0.049169,0.289316,-0.621626,-0.291409,-1.360423,-0.868599,-0.781669,-0.766940,...,0.055604,0.033972,0.036535,0.031070,0.035563,0.098852,0.055969,-0.003307,0.017020,0.083563
2020-04-09,-0.232543,0.327937,0.066867,0.110226,-0.713421,-0.238935,-0.264998,-0.536980,-0.391901,-0.507913,...,0.049386,0.022256,0.040502,0.031352,0.046665,0.097971,0.031984,0.103135,0.003805,0.043055
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-04-01,0.340768,0.900310,0.464735,0.591395,0.006090,0.103580,1.969913,0.432038,0.489601,0.704419,...,-1.143642,-0.416881,-0.168819,0.081369,0.144244,1.824791,-0.507640,-1.891102,1.824791,0.835318
2025-04-02,1.004445,0.171334,-0.185994,0.407952,-0.374743,-0.487857,1.728988,1.728988,1.096996,1.728988,...,-0.992188,0.137274,1.358135,0.246138,0.452394,1.848424,-0.107221,-1.861662,1.848424,1.022157
2025-04-07,1.361418,1.361418,1.182799,1.361418,1.356861,1.361418,1.361418,1.361418,1.361418,1.361418,...,-0.008507,0.000583,0.073191,0.038328,0.005722,0.076073,0.036290,0.023162,0.076073,0.074417
2025-04-08,1.427229,1.427229,1.427229,1.427229,1.427229,1.427229,1.427229,1.427229,1.427229,1.427229,...,-0.027256,-0.034378,-0.011736,-0.036078,-0.000374,0.047075,0.018541,0.000232,0.104005,0.104005


In [8]:
# Display the row-wise standardised forward returns.
z_return_df

symbol_id,1101,1102,1103,1104,1108,1109,1110,1201,1203,1210,...,9939,9940,9941,9942,9943,9944,9945,9946,9955,9958
trade_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-04-01,-0.165141,0.396585,0.078443,0.089599,-0.619321,-0.148782,-0.933043,-0.686305,-0.901289,-0.731514,...,-0.416946,0.616390,0.865555,-0.155905,-0.268315,-1.961625,-1.344977,0.984428,0.483491,-0.488042
2020-04-06,-0.277162,0.299730,0.009240,0.353117,-0.010091,0.044924,0.815303,-0.652765,-0.360477,-0.378793,...,-0.460243,0.036481,0.434861,-0.045912,0.179594,-0.824940,-0.726843,3.296798,0.899926,-0.943563
2020-04-07,-0.386795,-0.370135,-0.196156,0.088294,0.371439,-0.151109,0.292804,-0.508770,-0.557872,0.005393,...,-0.385102,-0.630818,0.216666,-0.017604,-0.245133,-1.081967,-0.760730,1.611201,1.893717,-1.222098
2020-04-08,-0.337903,-0.311649,0.183218,-0.299780,0.037332,-0.164683,-0.697732,-0.454069,-0.728089,0.217207,...,-0.521642,-0.991485,0.152767,0.299659,0.876070,-1.256489,-0.638734,0.623656,1.738175,-0.584666
2020-04-09,-0.309666,-0.202460,-0.103384,-0.248170,0.039149,-0.059893,-0.340883,-0.429343,-0.537162,0.121838,...,-0.527237,-1.149834,-0.385602,0.331072,1.192523,-0.723895,-0.482876,0.157595,0.957960,-0.913058
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-04-01,1.402303,1.502139,1.127752,1.433449,1.246867,1.440701,1.367405,1.270386,2.186734,1.630942,...,1.727693,1.844437,0.736191,1.195100,1.699285,-0.427417,0.635624,-0.805535,-0.804724,-0.851156
2025-04-02,0.691778,1.509598,0.995400,1.080868,1.191060,1.187973,0.226545,0.860312,2.288222,1.404747,...,1.320865,1.150398,1.312095,0.869575,1.440482,-0.532895,0.617429,-1.453804,-0.249547,-0.339102
2025-04-07,-0.513137,-0.020134,0.020299,-0.345851,0.241752,-0.180007,-0.681870,-0.986001,0.524322,-0.283805,...,-0.341533,-0.537381,0.948872,0.289577,-0.466110,-1.111572,-0.524792,-1.388810,0.199153,2.128009
2025-04-08,-1.628228,-1.820185,-0.847520,-1.362537,-1.093070,-1.081226,-2.175328,-0.904898,-1.223189,-1.004073,...,-0.247416,-1.499600,0.679468,-0.348475,-1.614591,-1.061943,-0.538320,-0.860649,0.645606,2.593529


In [9]:
# ---------- 共用基底類別 ----------

class BaseFeatureSelector:
    """Shared elbow-based cut-off used by all feature selectors."""

    def _knee_select(self, scores_sorted, fallback_k=10):
        """Keep the features ranked before the elbow of a descending score curve.

        Args:
            scores_sorted: pandas Series of scores sorted in descending order and
                indexed by feature name.
            fallback_k: number of top features to keep when ``KneeLocator`` finds
                no knee (default 10, the previously hard-coded value).

        Returns:
            list of feature names. Empty only when ``scores_sorted`` is empty;
            otherwise at least one feature is returned.
        """
        if len(scores_sorted) == 0:
            return []

        x = list(range(len(scores_sorted)))
        y_score = scores_sorted.values
        kn = KneeLocator(x, y_score, curve="convex", direction="decreasing")
        elbow_idx = kn.knee if kn.knee is not None else fallback_k
        # A knee at position 0 would slice to an empty list and leave the
        # downstream model with no columns, so always keep the top feature.
        elbow_idx = max(1, int(elbow_idx))
        return scores_sorted.iloc[:elbow_idx].index.tolist()

# ---------- 特徵選擇器們 ----------

class XGBFeatureSelector(BaseFeatureSelector):
    """Rank features by the fitted model's ``feature_importances_`` and cut at the elbow."""

    def select(self, model, X, y):
        """Return the names of the features kept by the elbow rule.

        ``model`` must expose ``feature_importances_`` in the column order of ``X``;
        ``y`` is accepted for interface parity and not used.
        """
        ranked = pd.Series(
            model.feature_importances_, index=X.columns
        ).sort_values(ascending=False)
        return self._knee_select(ranked)
# Disabled variant of XGBFeatureSelector that additionally plots the sorted
# importances and the elbow position. It is kept inside a string literal, so none
# of it is executed.
"""



class XGBFeatureSelector(BaseFeatureSelector):
    def select(self, model, X, y):
        importances = pd.Series(model.feature_importances_, index=X.columns)
        scores_sorted = importances.sort_values(ascending=False)

        self._plot_feature_scores(scores_sorted, title="XGBoost Feature Importance")
        return self._knee_select(scores_sorted)

    def _plot_feature_scores(self, scores_sorted, title="Feature Importance + Elbow Point"):
        x = list(range(len(scores_sorted)))
        y = scores_sorted.values

        # 使用 KneeLocator 找 elbow 點
        kn = KneeLocator(x, y, curve="convex", direction="decreasing")
        elbow_idx = kn.knee if kn.knee is not None else 10

        # 繪圖
        plt.figure(figsize=(14, 6))
        plt.plot(x, y, marker='o', markersize=0.1 ,linewidth=1, label="Feature Score")
        plt.axvline(elbow_idx, color='red', linestyle='--', label=f'Elbow Point)')

        # 不顯示 X 軸特徵名稱
        plt.xlabel("Feature Rank")  # 或 plt.xlabel("") 如果你連文字都不要
        plt.ylabel("Importance Score")
        plt.title(title)
        plt.legend()
        plt.tight_layout()
        plt.show()

"""

class FRegressionFeatureSelector(BaseFeatureSelector):
    """Rank features by their univariate F-statistic against ``y`` and cut at the elbow."""

    def select(self, model, X, y):
        """Return the names of the features kept by the elbow rule.

        ``model`` is accepted for interface parity and not used. NaN F-scores are
        treated as 0 before ranking.
        """
        f_stat = f_regression(X, y)[0]
        ranked = (
            pd.Series(f_stat, index=X.columns)
            .fillna(0)
            .sort_values(ascending=False)
        )
        return self._knee_select(ranked)


class MutualInfoFeatureSelector(BaseFeatureSelector):
    """Rank features by estimated mutual information with ``y`` and cut at the elbow."""

    def select(self, model, X, y):
        """Return the names of the features kept by the elbow rule.

        ``model`` is accepted for interface parity and not used. NaN scores are
        treated as 0 before ranking.
        """
        ranked = (
            pd.Series(
                mutual_info_regression(X, y, discrete_features='auto'),
                index=X.columns,
            )
            .fillna(0)
            .sort_values(ascending=False)
        )
        return self._knee_select(ranked)


import torch
import torch.nn as nn
import torch.optim as optim

class LassoRegressionGPU(nn.Module):
    """Single linear layer with an L1 penalty helper on its weights."""

    def __init__(self, n_features, alpha=1.0):
        """Create an ``n_features -> 1`` linear map and store ``alpha`` on the module."""
        super().__init__()
        self.linear = nn.Linear(n_features, 1)
        self.alpha = alpha

    def forward(self, x):
        """Apply the linear layer to ``x``."""
        prediction = self.linear(x)
        return prediction

    def l1_penalty(self):
        """Sum of absolute values of the layer weights (the bias is excluded)."""
        return self.linear.weight.abs().sum()

class LassoFeatureSelectorGPU(BaseFeatureSelector):
    """Rank features by |weight| of an L1-penalised linear fit and cut at the elbow."""

    def __init__(self, alpha=1.0, lr=0.01, epochs=1000):
        """Store the L1 strength ``alpha``, the Adam learning rate and the epoch count."""
        self.alpha = alpha
        self.lr = lr
        self.epochs = epochs

    def select(self, model, X, y):
        """Fit the penalised regression on the full batch and return the kept feature names.

        Args:
            model: accepted for interface parity with the other selectors; not used.
            X: DataFrame of features; column names become the feature names.
            y: target values as an ndarray, pandas Series or list (any 1-D array-like).

        Returns:
            list of feature names kept by the elbow rule.
        """
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        X_tensor = torch.tensor(X.values, dtype=torch.float32).to(device)
        # np.asarray makes Series and list targets work; the original `y.reshape`
        # only worked for ndarrays.
        y_column = np.asarray(y, dtype=np.float32).reshape(-1, 1)
        y_tensor = torch.tensor(y_column, dtype=torch.float32).to(device)

        # Separate local name so the `model` argument is no longer shadowed.
        lasso = LassoRegressionGPU(X.shape[1], alpha=self.alpha).to(device)
        optimizer = optim.Adam(lasso.parameters(), lr=self.lr)
        loss_fn = nn.MSELoss()

        lasso.train()  # the mode never changes, so set it once outside the loop
        for _ in range(self.epochs):
            optimizer.zero_grad()
            preds = lasso(X_tensor)
            loss = loss_fn(preds, y_tensor) + self.alpha * lasso.l1_penalty()
            loss.backward()
            optimizer.step()

        weights = lasso.linear.weight.detach().cpu().numpy().flatten()
        scores = pd.Series(np.abs(weights), index=X.columns).fillna(0)
        scores_sorted = scores.sort_values(ascending=False)
        return self._knee_select(scores_sorted)

     
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd

class ElasticNetRegressionGPU(nn.Module):
    """Single linear layer with a mixed L1/L2 penalty helper on its weights."""

    def __init__(self, n_features, alpha=1.0, l1_ratio=0.5):
        """Create an ``n_features -> 1`` linear map and store the penalty settings.

        ``l1_ratio`` of 1.0 gives a pure L1 (Lasso) penalty, 0.0 a pure L2 (Ridge) one.
        """
        super().__init__()
        self.linear = nn.Linear(n_features, 1)
        self.alpha = alpha
        self.l1_ratio = l1_ratio

    def forward(self, x):
        """Apply the linear layer to ``x``."""
        prediction = self.linear(x)
        return prediction

    def elasticnet_penalty(self):
        """Return ``l1_ratio * sum|w| + (1 - l1_ratio) * sum(w^2)`` over the layer weights."""
        w = self.linear.weight
        l1_term = w.abs().sum()
        l2_term = w.pow(2).sum()
        return self.l1_ratio * l1_term + (1 - self.l1_ratio) * l2_term

class ElasticNetFeatureSelectorGPU(BaseFeatureSelector):
    """Rank features by |weight| of an elastic-net-penalised linear fit and cut at the elbow."""

    def __init__(self, alpha=1.0, l1_ratio=0.5, lr=0.01, epochs=1000):
        """Store the penalty strength, the L1/L2 mix, the Adam learning rate and the epoch count."""
        self.alpha = alpha
        self.l1_ratio = l1_ratio
        self.lr = lr
        self.epochs = epochs

    def select(self, model, X, y):
        """Fit the penalised regression on the full batch and return the kept feature names.

        Args:
            model: accepted for interface parity with the other selectors; not used.
            X: DataFrame of features; column names become the feature names.
            y: target values as an ndarray, pandas Series or list (any 1-D array-like).

        Returns:
            list of feature names kept by the elbow rule.
        """
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        X_tensor = torch.tensor(X.values, dtype=torch.float32).to(device)
        # np.asarray makes Series and list targets work; the original `y.reshape`
        # only worked for ndarrays.
        y_column = np.asarray(y, dtype=np.float32).reshape(-1, 1)
        y_tensor = torch.tensor(y_column, dtype=torch.float32).to(device)

        # Separate local name so the `model` argument is no longer shadowed.
        enet = ElasticNetRegressionGPU(
            X.shape[1], alpha=self.alpha, l1_ratio=self.l1_ratio
        ).to(device)
        optimizer = optim.Adam(enet.parameters(), lr=self.lr)
        loss_fn = nn.MSELoss()

        enet.train()  # the mode never changes, so set it once outside the loop
        for _ in range(self.epochs):
            optimizer.zero_grad()
            preds = enet(X_tensor)
            loss = loss_fn(preds, y_tensor) + self.alpha * enet.elasticnet_penalty()
            loss.backward()
            optimizer.step()

        weights = enet.linear.weight.detach().cpu().numpy().flatten()
        scores = pd.Series(np.abs(weights), index=X.columns).fillna(0)
        scores_sorted = scores.sort_values(ascending=False)
        return self._knee_select(scores_sorted)



# ---------- 模型訓練主類別 ----------

class XGBRegression:
    """Select features with ``feature_selector``, refit XGBoost on them and score on validation.

    Attributes:
        top_features  feature names returned by the selector.
        model         XGBRegressor fitted on ``X_train[top_features]``.
        loss          mean absolute error of ``model`` on the validation set.
    """

    def __init__(self, X_train, X_val, y_train, y_val, feature_selector):
        """Run selection, refit and validation scoring.

        Args:
            X_train, X_val: feature DataFrames sharing the same columns.
            y_train, y_val: targets aligned row-wise with the feature frames.
            feature_selector: object with ``select(model, X, y) -> list of column names``.
                A selector may set a boolean attribute ``requires_fitted_model`` to
                state explicitly whether it needs the pre-fitted model; without it,
                only ``XGBFeatureSelector`` instances get one.

        Raises:
            ValueError: if the selector returns no features.
        """
        self.X_train_full = X_train
        self.X_val_full = X_val
        self.y_train = y_train
        self.y_val = y_val
        self.feature_selector = feature_selector

        # Only importance-based selectors read the fitted model. Skipping the
        # full-width fit for the others avoids a training run whose result is discarded.
        needs_model = getattr(
            feature_selector,
            "requires_fitted_model",
            isinstance(feature_selector, XGBFeatureSelector),
        )
        pre_model = self._fit_model(X_train, y_train) if needs_model else None

        # Feature selection
        self.top_features = self.feature_selector.select(pre_model, X_train, y_train)
        if len(self.top_features) == 0:
            raise ValueError("feature selector returned no features")

        # Refit on the selected features only
        self.model = self._fit_model(X_train[self.top_features], y_train)

        # Validation loss
        self.loss = self._get_loss()

    def _fit_model(self, X, y):
        """Fit and return an XGBRegressor with the fixed hyper-parameters used throughout."""
        model = XGBRegressor(
            tree_method="hist",
            device="cuda",
            n_estimators=100,
            learning_rate=0.05,
            max_depth=6,
            random_state=42
        )
        model.fit(X, y)
        return model

    def _get_loss(self):
        """Mean absolute error of the refitted model on the selected validation columns."""
        y_pred = self.model.predict(self.X_val_full[self.top_features])
        return mean_absolute_error(self.y_val, y_pred)




In [10]:
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import TimeSeriesSplit

class PCAPreprocessor:
    """Per-fold PCA compression of the factor table followed by lagged-feature construction.

    Attributes:
        n_splits                 stored only; not read by any method of this class.
        explained_var_threshold  cumulative explained-variance share that the kept
                                 components must exceed.
        n_lags                   number of past rows appended as lagged copies
                                 (default 8, the previously hard-coded value).
    """

    def __init__(self, n_splits=10, explained_var_threshold=0.99, n_lags=8):
        self.n_splits = n_splits
        self.explained_var_threshold = explained_var_threshold
        self.n_lags = n_lags

    def get_multidf_shift(self, n, multi_df):
        """Return ``multi_df`` shifted down by ``n`` rows with factor names suffixed ``_shift{n}``.

        ``multi_df`` must have two-level columns; the result's levels are named
        ``factor`` and ``ticker``. The input frame is not modified.
        """
        shifted = multi_df.shift(n)  # shift() returns a new frame, so no extra copy is needed
        shifted.columns = pd.MultiIndex.from_tuples(
            [(f"{fac}_shift{n}", tic) for fac, tic in multi_df.columns],
            names=["factor", "ticker"],
        )
        return shifted

    def get_all_multidf_shift(self, multi_df):
        """Concatenate lags 0..``n_lags`` column-wise and drop every row containing a NaN."""
        shifts = [self.get_multidf_shift(lag, multi_df) for lag in range(0, self.n_lags + 1)]
        return pd.concat(shifts, axis=1).dropna(axis=0, how='any')

    def fit_pca(self, X, n_components):
        """Fit and return an ``IncrementalPCAonGPU`` with ``n_components`` on ``X``."""
        from gpu_pca import IncrementalPCAonGPU
        model = IncrementalPCAonGPU(n_components=n_components)
        model.fit(X)
        return model

    def reduce_dimension(self, model, X):
        """Project ``X`` with the fitted PCA ``model`` and return a NumPy array."""
        X_tensor = torch.tensor(X, dtype=torch.float32).to(model.device)
        return model.transform(X_tensor).cpu().detach().numpy()

    def prepare_pca_dataframe(self, df_flat, reduced_X, index_99):
        """Reshape flat PCA scores into a wide Date x (PCA factor, ticker) frame.

        Args:
            df_flat: long frame with ``Date`` and ``ticker`` columns, one row per
                row of ``reduced_X`` and in the same order.
            reduced_X: array of shape ``(len(df_flat), index_99)``.
            index_99: number of PCA components (columns of ``reduced_X``).
        """
        f_col_names = [f'PCA_factor_{j}' for j in range(index_99)]
        df = pd.DataFrame(reduced_X, columns=f_col_names, index=df_flat.index)
        df.insert(0, 'Date', df_flat['Date'])
        df.insert(1, 'ticker', df_flat['ticker'])
        return df.pivot(index='Date', columns='ticker')

    def _n_components_for_threshold(self, ratios):
        """Smallest component count whose cumulative variance share exceeds the threshold.

        Returns all components when the threshold is never exceeded; a plain
        ``argmax`` would silently return a single component in that case.
        """
        cum_ratios = np.cumsum(ratios) / np.sum(ratios)
        above = np.flatnonzero(cum_ratios > self.explained_var_threshold)
        return int(above[0]) + 1 if above.size else len(cum_ratios)

    def _flatten(self, wide):
        """Stack the ticker level into rows, sorted by Date then ticker."""
        return (wide
            .stack(level='ticker', future_stack=True)
            .reset_index()
            .sort_values(by=['Date', 'ticker'])
        )

    def _stack_features_and_labels(self, wide, z_return_df, part):
        """Build the model matrix and the label vector for one lagged wide frame.

        Labels are taken from ``z_return_df`` over the date span of ``wide`` and are
        matched to features by position only, so the row counts are validated here.
        """
        if wide.empty:
            raise ValueError(f"no {part} rows left after building lagged features")
        features = self._flatten(wide).drop(columns=['Date', 'ticker'])
        start_dt, end_dt = wide.index[0], wide.index[-1]
        labels = z_return_df.loc[start_dt:end_dt].stack().sort_index().values
        if len(labels) != len(features):
            raise ValueError(
                f"{part}: {len(features)} feature rows but {len(labels)} labels between "
                f"{start_dt} and {end_dt}; factor dates/tickers and return dates/tickers differ"
            )
        return features, labels

    def process_fold(self, i, train_idx, val_idx, factor_dataset, z_return_df):
        """Build lagged PCA features and labels for one cross-validation fold.

        Args:
            i: fold number; not used.
            train_idx, val_idx: positional row indices into ``factor_dataset.multi_df``.
            factor_dataset: object exposing the factor table as ``multi_df``.
            z_return_df: label frame indexed by date with one column per ticker.

        Returns:
            ``(train_features, train_labels, val_features, val_labels)``.

        Raises:
            ValueError: if features and labels do not have the same number of rows.
        """
        # Extend the validation window backwards so its first rows still have
        # n_lags of history; clamp at 0 because negative positions would wrap
        # around to the end of the table.
        first_val = max(0, val_idx[0] - self.n_lags)
        val_idx = list(range(first_val, val_idx[-1] + 1))

        df_flat_train = self._flatten(factor_dataset.multi_df.iloc[train_idx])
        df_flat_val = self._flatten(factor_dataset.multi_df.iloc[val_idx])

        X_train = df_flat_train.drop(columns=['Date', 'ticker']).values
        X_val = df_flat_val.drop(columns=['Date', 'ticker']).values

        # PCA is fitted on training rows only and then applied to validation rows.
        model_full = self.fit_pca(X_train, n_components=X_train.shape[1])
        ratios = model_full.explained_variance_ratio_.cpu().numpy()
        index_99 = self._n_components_for_threshold(ratios)

        print(f'篩選出{index_99}個PCA特徵')

        model_n = self.fit_pca(X_train, n_components=index_99)
        X_train_reduced = self.reduce_dimension(model_n, X_train)
        X_val_reduced = self.reduce_dimension(model_n, X_val)

        pca_multidf_train = self.prepare_pca_dataframe(df_flat_train, X_train_reduced, index_99)
        pca_multidf_val = self.prepare_pca_dataframe(df_flat_val, X_val_reduced, index_99)

        all_multidf_train = self.get_all_multidf_shift(pca_multidf_train)
        all_multidf_val = self.get_all_multidf_shift(pca_multidf_val)

        train_stacked, r_train_array = self._stack_features_and_labels(
            all_multidf_train, z_return_df, "train")
        val_stacked, r_val_array = self._stack_features_and_labels(
            all_multidf_val, z_return_df, "validation")

        return train_stacked, r_train_array, val_stacked, r_val_array


In [11]:
# Result stores: one list per selector name, appended to once per fold
feature_results, loss_results = defaultdict(list), defaultdict(list)
# Feature selectors under comparison
SELECTORS = dict(
    XGB=XGBFeatureSelector(),
    FRegression=FRegressionFeatureSelector(),
    LASSO=LassoFeatureSelectorGPU(alpha=0.1, epochs=500),
    Elastic=ElasticNetFeatureSelectorGPU(alpha=0.01, l1_ratio=0.7, epochs=500),
)
# Run one selector end to end and return what it selected and how it scored
def get_feature_result(selector_name, X_train, X_val, y_train, y_val):
    """Return ``(top_features, validation_loss)`` for the selector registered under ``selector_name``."""
    fitted = XGBRegression(
        X_train, X_val, y_train, y_val,
        feature_selector=SELECTORS[selector_name],
    )
    return fitted.top_features, fitted.loss
# Time-series cross-validation
tscv = TimeSeriesSplit(n_splits=10)
# PCA preprocessor
preprocessor = PCAPreprocessor()


restrict_start, restrict_end = '2020-04-01', '2024-09-30'

# Cross-validation must stay inside the training window (restrict_start, restrict_end).
# process_fold indexes factor_dataset.multi_df by absolute row position, so the fold
# indices produced on the restricted window are mapped back to positions in the full
# table. Without this mapping the folds are only correct while the window starts on
# the table's first row.
row_positions = pd.Series(
    np.arange(len(factor_dataset.multi_df)), index=factor_dataset.multi_df.index
)
cv_positions = row_positions.loc[restrict_start:restrict_end].to_numpy()

for i, (train_idx, val_idx) in enumerate(tscv.split(cv_positions)):
    train_idx, val_idx = cv_positions[train_idx], cv_positions[val_idx]
    print(f"Fold {i + 1}")
    X_train, y_train, X_val, y_val = preprocessor.process_fold(
        i, train_idx, val_idx, factor_dataset, z_return_df
    )

    for name in SELECTORS:
        # Best effort by design: one failing selector must not stop the other
        # selectors or the remaining folds.
        try:
            top_features, loss = get_feature_result(name, X_train, X_val, y_train, y_val)
            feature_results[name].append(top_features)
            loss_results[name].append(loss)
            print(f"{name}: {loss:.4f}, {len(top_features)} features")
        except Exception as e:
            print(f"❌ {name} failed: {e}")

      # 如需全跑可移除



    



Fold 1
篩選出80個PCA特徵


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)


XGB: 0.6372, 30 features
FRegression: 0.6399, 45 features
LASSO: 0.6378, 18 features
Elastic: 0.6390, 42 features
Fold 2
篩選出83個PCA特徵
XGB: 0.6518, 35 features
FRegression: 0.6502, 50 features
LASSO: 0.6522, 46 features
Elastic: 0.6513, 45 features
Fold 3
篩選出89個PCA特徵
XGB: 0.6495, 24 features
FRegression: 0.6502, 22 features
LASSO: 0.6498, 27 features
Elastic: 0.6514, 20 features
Fold 4
篩選出88個PCA特徵
XGB: 0.6413, 35 features
FRegression: 0.6405, 42 features
LASSO: 0.6409, 38 features
Elastic: 0.6410, 32 features
Fold 5
篩選出88個PCA特徵
XGB: 0.6600, 44 features
FRegression: 0.6606, 43 features
LASSO: 0.6603, 74 features
Elastic: 0.6603, 12 features
Fold 6
篩選出88個PCA特徵
XGB: 0.6585, 22 features
FRegression: 0.6594, 40 features
LASSO: 0.6581, 22 features
Elastic: 0.6585, 55 features
Fold 7
篩選出87個PCA特徵
XGB: 0.6316, 13 features
FRegression: 0.6295, 27 features
LASSO: 0.6313, 20 features
Elastic: 0.6301, 8 features
Fold 8
篩選出87個PCA特徵
XGB: 0.6300, 40 features
FRegression: 0.6307, 26 features
LASSO: 0.630

In [None]:

# Per-fold PCA component counts. The first eight values match the counts printed in the
# saved fold log; the last two are not visible in that output - TODO confirm.
pca_num_list = [80,83,89,88,88,88,87,87,86,86]

In [1]:
import psutil

# Report total and currently available system memory in MB (bytes / 1024 / 1024).
mem = psutil.virtual_memory()
total_MB, available_MB = (
    n_bytes / 1024 / 1024 for n_bytes in (mem.total, mem.available)
)

print(f"✅ 系統總記憶體: {total_MB:.2f} MB")
print(f"🟢 當前可用記憶體: {available_MB:.2f} MB")


✅ 系統總記憶體: 65298.48 MB
🟢 當前可用記憶體: 33635.85 MB


In [126]:
import numpy as np
import pandas as pd

# Per-selector summary of the fold losses: mean, population std (np.std default)
# and their ratio, which is inf when the std is 0.
summary = []

for name, losses in loss_results.items():
    mae_mean, mae_std = np.mean(losses), np.std(losses)
    if mae_std > 0:
        mae_to_std = mae_mean / mae_std
    else:
        mae_to_std = np.inf

    summary.append(dict(zip(
        ('Selector', 'MAE Mean', 'MAE Std', 'MAE Mean / Std'),
        (name, mae_mean, mae_std, mae_to_std),
    )))

summary_df = pd.DataFrame(summary).sort_values(by='MAE Mean')
print(summary_df)


      Selector  MAE Mean   MAE Std  MAE Mean / Std
2        LASSO  0.645199  0.009801       65.829817
1  FRegression  0.645215  0.010141       63.625243
3      Elastic  0.645258  0.010306       62.607741
0          XGB  0.645369  0.009890       65.252973


In [12]:
# Inspect the per-fold validation MAE lists collected for each selector.
loss_results

defaultdict(list,
            {'XGB': [0.6371521652280511,
              0.6518391206075661,
              0.649466554097214,
              0.6412612480876408,
              0.6599712763248164,
              0.6585298825669097,
              0.6316433739986268,
              0.6299590613687208,
              0.6439396595265641,
              0.6499289862859381],
             'FRegression': [0.6398752897588575,
              0.6502249588099752,
              0.6502253080225091,
              0.6404678686507277,
              0.6605854326012626,
              0.6593803803173828,
              0.6295238863225311,
              0.6306534043977898,
              0.641628809671446,
              0.6495854653051842],
             'LASSO': [0.6377775713605365,
              0.6522308616926162,
              0.649828604424157,
              0.6408839961129676,
              0.6603473505575431,
              0.6580647777695662,
              0.6312919068148891,
              0.6307112328002469,


In [127]:
from scipy.stats import friedmanchisquare

# Friedman test across the four selectors, using each fold as a block.
stat, p = friedmanchisquare(
    *(loss_results[key] for key in ('XGB', 'FRegression', 'LASSO', 'Elastic'))
)

print(f"Friedman statistic: {stat:.4f}")
print(f"p-value: {p:.4f}")

# Report significance at the 5% level.
print(
    "✅ 不同模型之間在 MAE 上存在顯著差異"
    if p < 0.05
    else "❌ 不同模型之間在 MAE 上沒有顯著差異"
)


Friedman statistic: 0.1200
p-value: 0.9893
❌ 不同模型之間在 MAE 上沒有顯著差異


In [128]:
import scikit_posthocs as sp
import pandas as pd

# Build a DataFrame with one column per model and one row per fold
data = pd.DataFrame(loss_results)
sp.posthoc_nemenyi_friedman(data)


Unnamed: 0,XGB,FRegression,LASSO,Elastic
XGB,1.0,0.998155,1.0,0.998155
FRegression,0.998155,1.0,0.998155,0.985723
LASSO,1.0,0.998155,1.0,0.998155
Elastic,0.998155,0.985723,0.998155,1.0
