In [21]:
from library import StockUniverse, FactorLibrary, MarketInfo
import pandas as pd
from torch.utils.data import Dataset
import numpy as np

# 上櫃公司OR ALL公司不能直接刪除資料
# 這一個 -> 檢查那些股票的因子是完整的，因為有些有殘缺，只保留沒有殘缺定在原本股票池的欄位
# 因為上櫃公司可能缺了某些因子

In [19]:
class FactorNormDataset(Dataset):
    def __init__(self, stock_universe='TWSE'):
        self.multi_df = FactorLibrary.multi_df
        self.adj_close_df = pd.read_feather(r'Y:\因子回測_江建彰\補上缺值日頻收盤價.ftr')
        self.stock_list = self.get_stock_list(stock_universe)

        self.TPEX_df = MarketInfo.TPEX()
        self.RoR_df = (self.adj_close_df.shift(-5) - self.adj_close_df.shift(-1)) / self.adj_close_df.shift(-1)
        self.RoR_df.dropna(axis=0 ,inplace=True)
        
        self.restrict_range()


        

        self.fill_na_value()
        #self.tensor_market = self.TPEX_df.values
        #self.tensor_return = self.RoR_df[self.stock_list]
        #print(f'stock list : {len(self.stock_list)}')

    def fill_na_value(self):
        factor_num = len(self.multi_df.columns.get_level_values('factor').unique())
        time = len(self.TPEX_df.index)
        """
        檢查那些股票的因子是完整的，因為有些有殘缺，只保留沒有殘缺定在原本股票池的欄位
        """
        new_stock_list = []
        for idx, ticker in enumerate(self.stock_list):
            numpy_array = self.multi_df.xs(ticker, axis=1, level='ticker').values
            shape = numpy_array.shape
            if(shape[1]==factor_num):
                new_stock_list.append(ticker)
        self.stock_list = new_stock_list
        
        self.multi_df = self.multi_df.loc[:, self.multi_df.columns.get_level_values(1).isin(self.stock_list)]

        """
        標準化區域(並且補上缺值)
        """
        for idx in range(factor_num):
            factor_name = f'factor_{idx}'
            if idx%20==0:
                print(idx)
            cols = self.multi_df.columns[self.multi_df.columns.get_level_values('factor') == factor_name]
            #print(cols)
            factor_df = self.multi_df.loc[:, cols]
            #print(factor_df)
    
            # 橫截面 winsorize + standardize
            transformed = factor_df.apply(self.winsorize_fillna_zero, axis=1)\
                                   .apply(self.standardize_fillna_zero, axis=1)
            
            self.multi_df.loc[:, cols] = transformed
            #print(self.multi_df.loc[ : , f'factor_{idx}'])
            #break
            
            

       

    @staticmethod
    def standardize_fillna_zero(row):
        valid = row[~row.isna()]                         # 有效值（非 NaN）
        if valid.empty:
            return row.fillna(0)
        standardized = (valid - valid.mean()) / valid.std(ddof=0)  # z-score 標準化
        row.update(standardized)                         # 更新原本 row
        return row.fillna(0)                             # 把剩下 NaN 補 0

    @staticmethod
    def winsorize_fillna_zero(row, n=2):
        """
        對每一橫截面 row 做 MAD 去極值，再補 NaN 為 0。
        :param row: pd.Series，一行（某一天）資料
        :param n: 幾倍 MAD
        :return: 去極值＋NaN 補 0 的 row
        """
        valid = row[~row.isna()]
        if valid.empty:
            return row.fillna(0)
        
        median = valid.median()
        mad = (valid - median).abs().median()
        up = median + n * 1.4826 * mad
        down = median - n * 1.4826 * mad
    
        clipped = valid.clip(lower=down, upper=up)
        row.update(clipped)
        return row.fillna(0)


   

    def restrict_range(self, global_start='2020-04-01', global_end='2025-04-09'):
        self.multi_df     = self.multi_df.loc[global_start : global_end]
        self.adj_close_df = self.adj_close_df.loc[global_start : global_end]
        self.TPEX_df      = self.TPEX_df.loc[global_start : global_end]
        self.RoR_df       = self.RoR_df.loc[global_start : global_end]
        
    def get_stock_list(self, stock_universe):
        if stock_universe=='TWSE':
            stock_list = StockUniverse.TWSE()
        elif stock_universe=='OTC':
            stock_list = StockUniverse.OTC()
        elif stock_universe=='all':
            stock_list = StockUniverse.all()
        
        ticker1 = stock_list # 可以改
        ticker2 = self.multi_df.columns.get_level_values('ticker')
        ticker3 = self.adj_close_df.columns
        return list(set(ticker1)&set(ticker2)&set(ticker3))

    def check(self):
        print(len(self.multi_df)==len(self.TPEX_df)==len(self.RoR_df)==len(self.adj_close_df))

    

In [20]:
e = FactorNormDataset(stock_universe='all')

0
20
40
60
80
100
120
140
160
180


In [13]:
e.multi_df

factor,factor_0,factor_0,factor_0,factor_0,factor_0,factor_0,factor_0,factor_0,factor_0,factor_0,...,factor_185,factor_185,factor_185,factor_185,factor_185,factor_185,factor_185,factor_185,factor_185,factor_185
ticker,1240,1268,1336,1565,1569,1570,1580,1584,1586,1591,...,8932,8933,8935,8936,8937,8938,8942,9950,9951,9962
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2020-04-01,0.163094,-0.840352,0.478392,-0.592883,1.796025,-0.687545,-0.390679,-1.100033,0.448920,-0.416707,...,0.194207,0.161518,0.179244,0.172759,-5.554008,0.177582,0.172258,0.168494,0.182445,0.168799
2020-04-06,-0.184074,-0.392232,0.063983,0.142768,1.802716,-0.184829,-0.199714,-0.294031,-0.348515,0.850856,...,-4.947570,0.187700,0.206338,0.192887,-4.947570,0.200033,0.197367,0.187700,0.211004,0.201992
2020-04-07,-0.687735,-0.846830,-0.729993,-0.004691,1.836616,-0.269753,0.030644,0.579134,-0.545959,1.525897,...,-5.554039,0.172373,0.168200,0.174731,0.179747,0.178105,0.180291,0.162302,0.196852,0.196852
2020-04-08,0.248219,-0.127206,-0.883969,0.498790,1.676479,-0.047411,1.098329,0.274384,-1.051445,0.971085,...,0.106605,0.115260,0.111040,0.138301,0.114737,0.127722,0.129975,0.117498,0.156556,0.152408
2020-04-09,0.355626,0.618663,-0.605220,-0.281240,1.842932,-0.098630,0.140168,0.179207,-0.996471,0.362546,...,0.125149,0.116628,0.121377,0.135606,0.104054,0.121316,0.128842,0.136271,0.153576,0.114072
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-04-01,0.447677,0.293960,-0.071632,1.969124,-0.231349,-0.423800,0.158500,1.969124,-0.695625,-0.770863,...,0.094204,0.079137,0.084567,0.096586,0.122160,0.106651,0.085284,0.078237,0.091593,0.093863
2025-04-02,0.849484,-0.234948,-0.359777,1.689098,0.570363,0.473812,-0.405979,1.689098,-0.405979,-0.571207,...,0.113477,0.091254,0.089351,0.101474,0.124640,0.115381,0.100588,0.099412,0.105007,0.126844
2025-04-07,1.364884,1.364884,-0.777285,1.364884,-0.462731,-0.282277,0.332624,1.364884,1.364884,-0.913147,...,0.119637,0.109726,0.101446,0.106768,0.124900,0.117203,0.106341,0.109235,0.109979,0.124900
2025-04-08,1.176484,1.176484,-0.868081,1.176484,-0.537910,-0.673632,1.176484,0.991692,1.176484,-0.909199,...,0.088543,-0.000769,-0.002791,-0.002365,0.043730,0.078030,0.032021,0.038212,0.090250,0.058972


In [14]:
e.multi_df.isna().sum().sum()

np.int64(0)

In [15]:
e.multi_df.to_pickle(r'Y:\因子回測_江建彰\因子庫all.pkl')