## Instruction
In this notebook, We use data which I donwload from Tushare to make up some alpha factors and risk factors.
1. Load data from Tushare time range bettwen 2017.1 - 2023.3.
2. Calculate portfolio risk by PCA and save idiosynchritic values view a factor
3. Make up some factors like we did in P4 project.
5. Evaluate factor returns in 5D, 20D, 60D, 120D. and seperate factors into two parts, risk factors and alpha factors.

## Load Data

In [1]:
import pandas as pd 
import numpy as np
from tqdm import tqdm

from matplotlib import pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = [10, 6]

import warnings
warnings.filterwarnings('ignore')

In [2]:
universe_raw = pd.read_csv('raw_20170103_20230317.csv').iloc[:,1:]
fundamental_df = pd.read_csv('fundamental_20170103_20230317.csv').iloc[:,1:]
#universe_raw = pd.read_csv('tmp.csv').iloc[:,1:]

In [3]:
# fill null to 0
universe = universe_raw.copy(deep=True)
universe['pb'] = np.where(universe_raw['pb'].isnull(), 100., universe_raw['pb'])
universe['dt_eps'] = np.where(universe['dt_eps'].isnull(), 0., universe['dt_eps'])
universe['dt_eps_yoy'] = np.where(universe['dt_eps_yoy'].isnull(), 0., universe['dt_eps_yoy'])

## Data Explain
  Some data columns about fundamental indicators below
  - 'cfps','revenue_ps', 'quick_ratio',  每股现金流，每股营业收入，速冻比率
  - 'dt_eps','basic_eps_yoy','dt_eps_yoy',  每股收益
  - 'bps','bps_yoy',  每股净资产
  - 'extra_item','profit_dedt', 扣非，扣非净利润
  - 'roe_dt','q_dt_roe','roe_yoy',  净资产收益
  - 'capital_rese_ps','surplus_rese_ps',  每股资本公积，每股公积盈余 
  - 'gross_margin','interestdebt','ca_to_assets', 毛利，带息债务, 流动资产/总资产
  - 'ebt_yoy','roe_yoy','or_yoy','equity_yoy' 总利润增长，净资产收益增长，营业收入增长，净资产增长

### Overnight Return
This factor we had did in P4, 

We calculate factor here, and add sma indicators later.

$factor=(open_{today}-close_{yesterday})\div close_{yesterday}$

In [None]:
class CloseToOpen(pd.DataFrame):
    """
        Overnight Return Factor Constructor
    """
    def __init__(self,data):
        super(CloseToOpen, self).__init__(data)
        self.df = self

    def calculate(self):
        '''
        add open-close as a column named close_to_return
        :return: dataframe
        '''
        unique_ticker = self.df.ts_code.unique()
        tmp_df = pd.DataFrame()
        for ts_code in tqdm(unique_ticker, desc='close_to_open'):
            stock = self.df.loc[self.df.ts_code == ts_code][["ts_code", "trade_date", "open", "close"]]
            stock['alpha_close2open'] = (stock['open'].shift(-1).fillna(method='ffill') - stock['close'])/stock['close']
            tmp_df = tmp_df.append(stock, ignore_index=True)
        self.df = self.df.merge(tmp_df[["ts_code", "trade_date", "alpha_close2open"]], on=["ts_code", "trade_date"], how="inner")
        self.df['date'] = pd.to_datetime(self.df['trade_date'], format='%Y%m%d')
        self.df = self.df.set_index(['date']).sort_values(by=['date'])
        return self.df
    

universe = CloseToOpen(universe).calculate().get_sma_factors()

## Add Technology Indicators

In [5]:
# add volume
universe['volume'] = universe['amount']/universe['close']
universe = universe.drop_duplicates(['trade_date','ts_code'])

In [6]:
import stockstats

class IndicatorHelper(pd.DataFrame):
    """
        add indicators to dataframe
    """

    def __init__(self, data):
        super(IndicatorHelper, self).__init__(data)

        self.stocks = stockstats.StockDataFrame.retype(data.copy())
        self.df = self

    def add_technical_indicator(self, tech_indicator_list):
        """
        calculate technical indicators
        use stockstats package to add technical inidactors
        :param ticker: (df) pandas dataframe
        :param tech_indeicator_list list
        :return: (df) pandas dataframe
        """
        unique_ticker = self.df.ts_code.unique()

        indicator_df = pd.DataFrame()
        for i in tqdm(range(len(unique_ticker)), desc='add tech indicators'):
            temp_indicator = self.stocks[self.stocks.ts_code == unique_ticker[i]]
            temp_indicator = temp_indicator[tech_indicator_list + ['ts_code','trade_date']]
            indicator_df = indicator_df.append(temp_indicator, ignore_index=True)
        
        self.df = self.df.merge(indicator_df, on=["ts_code", "trade_date"], how="inner")
        self.df['date'] = pd.to_datetime(self.df['trade_date'],format='%Y%m%d')
        self.df = self.df.set_index(['date']).sort_values(by=['date']).drop_duplicates(['trade_date','ts_code'])
        return self.df
    

In [7]:
# add tech indicators
tech_indicator_list = [
                       'supertrend','close_5_sma','close_20_sma','close_60_sma',
                       'log-ret','atr_5','cci_6', 'vwma_5','vwma_25',
                       'close_10_kama_2_30', 'close_10_kama_5_30','close_2_kama',
                       'alpha_close2open_5_sma', 'alpha_close2open_20_sma'
                      ] 
universe = IndicatorHelper(universe).add_technical_indicator(tech_indicator_list)
universe['alpha_close2open_20_sma'] = -universe['alpha_close2open_20_sma']
# 'close_2_kama_20_mstd'

add tech indicators: 100%|████████████████████| 746/746 [10:31<00:00,  1.18it/s]


## Constructor Factors Based on Indicators

### Supertrend Factors
This factor based on supertrend and close 5 days sma

In [8]:
universe['alpha_supertrend'] = universe['close_5_sma'] - universe['supertrend']

### CCI Factors
This factor based on cci and atr 5 days window

In [9]:
universe['alpha_cci'] = -np.where(universe['cci_6']>200, (universe['cci_6']-200)*universe['atr_5'], \
                      np.where(universe['cci_6']<-200, (universe['cci_6']+200)*universe['atr_5'], universe['atr_5']*30))

### KAMA Factors

In [10]:
# add KAMA alpha factor
def KAMA_filter(df):    
    unique_stocks = df.ts_code.unique()
    all_df = pd.DataFrame()
    for ts_code in tqdm(unique_stocks, desc='kama filter'):
        tmp = df.loc[df.ts_code == ts_code]
        tmp['close_2_kama'] = tmp['close_2_kama'].fillna(method='ffill')
        tmp['kama_filter'] = tmp['close_2_kama'].rolling(window=20).std().fillna(method='bfill') * 0.6
        tmp['kama_prior'] = tmp['close_2_kama'].shift(-5).fillna(method='ffill')
        tmp['alpha_kama'] = (tmp['close_10_kama_2_30'] - tmp['close_10_kama_5_30'])-(tmp['close_2_kama'] - tmp['kama_prior'] - tmp['kama_filter'])
        #tmp['alpha_kama'] = -(tmp['close_2_kama'] - tmp['kama_prior'] - tmp['kama_filter'])
        all_df = all_df.append(tmp[['ts_code','trade_date','alpha_kama']], ignore_index=True)
    df = df.merge(all_df, on=['ts_code','trade_date'], how='left')
    df['date'] = pd.to_datetime(df['trade_date'],format='%Y%m%d')
    df = df.set_index(['date']).sort_values(by=['date'])
    return df

universe = KAMA_filter(universe)

kama filter: 100%|████████████████████████████| 746/746 [01:01<00:00, 12.09it/s]


## Constructor Factors Based on Paper
### Overnight Return

In [12]:
# we had add overnight return factors

### Winner And Loser
This factor we also did in P4. It express a ticker how to reach a return in a spicific period time 

We use a time window as T, and regression d and v $return = T*d + T^2*v$  => $factor=d*v$ 

In [336]:
from statsmodels.formula.api import ols

class WinnerAndLoser(pd.DataFrame):
    """
        Winner and Loser Factor Constructor
    """
    def __init__(self,data, win_length=20):
        super(WinnerAndLoser, self).__init__(data)
        self.df = self
        self.win_lenth = win_length

    def _regression(self, data):
        df = pd.DataFrame(data, columns=['log-ret'])
        df['acc_ret'] = df['log-ret'].cumsum()
        df['t_dir'] = np.arange(self.win_lenth)+1
        df['t_velocity'] = df['t_dir'] ** 2
        regression = ols(formula='acc_ret ~ 0 + t_dir + t_velocity', data=df)
        model = regression.fit()
        data['alpha_winlos'] = -model.params.t_dir * model.params.t_velocity
        return  data['alpha_winlos']


    def calculate(self):
        '''
        convert time to value
        regress return to get mu and beta each time
        add facotor mu*beta to colomns
        :return: dataframe
        '''
        tickers = self.df.ts_code.unique()
        factor_df = pd.DataFrame()
        for ticker in tqdm(tickers, desc='winner and loser'):
            tmp_df = self.df.loc[self.df.ts_code == ticker][['trade_date', 'ts_code', 'log-ret']]
            tmp_df['alpha_winlos'] = tmp_df['log-ret'].rolling(self.win_lenth).apply(self._regression)
            tmp_df['alpha_winlos'].fillna(method='bfill',inplace=True)
            factor_df = factor_df.append(tmp_df, ignore_index=True)
        self.df = self.df.merge(factor_df[["ts_code", "trade_date", "alpha_winlos"]], on=["ts_code", "trade_date"], how="left")
        self.df['date'] = pd.to_datetime(self.df['trade_date'],format='%Y%m%d')
        self.df = self.df.set_index(['date']).sort_values(by=['date'])
        return  self.df
    
    
test = universe.loc[universe.ts_code=='603538.SH']
#test = universe.loc[universe.ts_code=='002038.SZ']
test = WinnerAndLoser(test).calculate()
#universe = WinnerAndLoser(universe).calculate()

winner and loser: 100%|███████████████████████████| 1/1 [00:14<00:00, 14.97s/it]


### Skew And Momentum
This factor we also did in P4. It express minority and majority sentiment of investor how to impact on market.

We calculate skew and median of log-return distribution in a period time, the skew view as marjority sentiment and median can view as minority sentiment.

$factor = abs(skew) * median * volume\_ratio$

In [13]:
class SkewandMomentum(pd.DataFrame):
    """
        Expected Skewness and Momentum Factor Constructor
    """
    def __init__(self,data, win_length=10):
        super(SkewandMomentum, self).__init__(data)
        self.df = self
        self.win_length = win_length

    def calculate(self):
        '''
        convert time to value
        regress return to get mu and beta each time
        add facotor mu*beta to colomns
        :return: dataframe
        '''
        def calculate_factor(data):
            return abs(data.skew()) * data.median()
        
        tmp_df = pd.DataFrame()
        for stock_tuple in tqdm(self.groupby('ts_code'), desc='skew and momentum'):
            stock = stock_tuple[1]
            stock['alpha_skew2sentiment'] = stock['log-ret'].rolling(self.win_length).apply(calculate_factor)
            stock['alpha_skew2sentiment'] = stock['alpha_skew2sentiment'].fillna(method='bfill') * stock['volume_ratio']
            tmp_df = tmp_df.append(stock,ignore_index=True)
        self.df = self.df.merge(tmp_df[["ts_code", "trade_date", "alpha_skew2sentiment"]], on=["ts_code", "trade_date"], how="left")
        self.df['date'] = pd.to_datetime(self.df['trade_date'],format='%Y%m%d')
        self.df = self.df.set_index(['date']).sort_values(by=['date'])
        return self.df


#test = universe.loc[universe.ts_code=='603538.SH']
#test = universe.loc[universe.ts_code=='002038.SZ']
#test = SkewandMomentum(test).calculate()
universe = SkewandMomentum(universe).calculate()

skew and momentum: 100%|██████████████████████| 746/746 [07:41<00:00,  1.62it/s]


## Fundamental Factor
This factor based on ticker fundamentals, it usually take a long period time to archive return. So called take a long line to catch a big fish! The factor we define as:

$ factor1 = mean(volume\_5\_windows) - std(volume\_25\_windows) * ((60 \div pe) + (5 \div pb))$

$ factor2 =  dt\_eps * dt\_eps\_yoy * type\_value \div 30 $

$ alpha\_factor = factor1 * factor2 $

- volume: trade volume
- pb: profit div balance
- pe: profit div net balance
- type_value: fundamentals prereport levels from -3 to 3
- dt_eps: prifit each share of stock
- dt_eps_yoy: profit increase percent of dt_esp

In [324]:
def fundamentals_alpha_fundamental(df):
    all_df = pd.DataFrame()
    for ts_code in tqdm(df.ts_code.unique(), desc='fundamental factor'):
        tmp = df.loc[df.ts_code==ts_code]
        tmp['alpha_fundamental'] = -tmp['vwma_5'] + tmp['vwma_25']
        tmp['alpha_fundamental'] = tmp['alpha_fundamental'] * (60/tmp['pe'] + 5/tmp['pb']) \
                        + (abs(tmp['dt_eps']) * tmp['dt_eps_yoy'] / 30  * tmp['type_value'])
        
        all_df = all_df.append(tmp, ignore_index=True)
        
    df = df.merge(all_df[['ts_code','trade_date','alpha_fundamental']], on=['ts_code','trade_date'], how='left')
    df['date'] = pd.to_datetime(df['trade_date'],format='%Y%m%d')
    df = df.set_index(['date']).sort_values(by=['date'])
    return df
    

universe = fundamentals_alpha_fundamental(universe)

fundamental factor: 100%|█████████████████████| 746/746 [03:36<00:00,  3.45it/s]


## Ticker Pool
After create factors we filte data by each day more in detail. 

We do this step after calculate factors in case of some tickers add in our portfolio calculate factor by historical data which not exists.

In real trade with this model factor use, download data each day, if you ensure add new tickers into portfolio, ensure get data 2 month ago at least, then calculate factos.

In [357]:
# remove tickers by each day not exist history
def remove_tickers(df, exist_ticker_list):
    diff_df = df.loc[df.ts_code.isin(exist_ticker_list)==False]
    if diff_df.empty == False:
        # remove tickers not exist over 3month from day
        diff1 = (pd.to_datetime(diff_df['trade_date'],format='%Y%m%d')\
                -pd.to_datetime(diff_df['list_date'],format='%Y%m%d')).apply(lambda x: x.days) < 90
        # remove tickers pe > 80
        diff2 = diff_df.pe > 80
        # remove tickers pb > 10
        diff3 = diff_df.pb > 10
        # remove fundamental bad
        #type_dict = {'不确定':0, '预增':2, '首亏':-2, '预减':-2, '扭亏':0, '续亏':-3, '略增':1, '续盈':3, '略减':-1}
        diff4 = diff_df['type_value']<-1
        # dt_eps <0 means profit is negative , remove it.
        diff5 = diff_df['dt_eps']<0
        # get remove df
        diff_df = diff_df.loc[diff1| diff2| diff3| diff4| diff5]
        # get rest data
        df = df.loc[df.ts_code.isin(diff_df.ts_code)==False]
    return df

# clean tickers pool day by day
calendar = universe.trade_date.unique()
universe_raw = pd.DataFrame()
for dt in tqdm(calendar, desc='filter tikers'):
    tmp = universe.loc[universe['trade_date']==dt]
    if universe_raw.empty:
        tmp = remove_tickers(tmp, [])
    else:
        tmp = remove_tickers(tmp, universe_raw.ts_code)
    universe_raw = universe_raw.append(tmp, ignore_index=True)

universe_raw['date'] = pd.to_datetime(universe_raw['trade_date'], format='%Y%m%d')
universe_raw = universe_raw.set_index(['date']).sort_values(by=['date'])
print(universe_raw.shape, len(universe_raw.ts_code.unique()))
universe_raw.head()

filter tikers: 100%|████████████████████████| 1508/1508 [03:34<00:00,  7.03it/s]


(720535, 78) 666


Unnamed: 0_level_0,ts_code,trade_date,turnover_rate,volume_ratio,pe,pb,total_share,free_share,total_mv,circ_mv,...,close_10_kama_5_30,close_2_kama,alpha_supertrend,alpha_cci,alpha_kama,alpha_close2open,alpha_close2open_5_sma,alpha_close2open_20_sma,alpha_skew2sentiment,alpha_fundamental
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-01-03,002424.SZ,20170103,0.5445,1.11,64.8012,9.4694,141120.0,40099.7083,2665757.0,1382108.0,...,18.07591,18.07591,-0.693765,-6.6027,0.028025,-0.001059,-0.001059,0.001059,-0.000114,0.0
2017-01-03,300039.SZ,20170103,0.664,1.02,30.3915,3.9229,82943.16,46646.814,855973.4,635955.5,...,7.29349,7.29349,-0.176675,-2.1201,0.103738,0.0,0.0,-0.0,-6.1e-05,0.0
2017-01-03,002099.SZ,20170103,0.9631,0.78,26.607,2.7737,162276.7253,69669.6358,1374484.0,690725.4,...,7.57607,7.57607,-0.31308,-3.2202,0.092117,0.0,0.0,-0.0,-7e-05,0.0
2017-01-03,300386.SZ,20170103,5.2164,0.74,57.8394,7.0644,41804.4,14146.8338,1061832.0,395563.8,...,25.02316,25.02316,-1.561465,-15.6639,0.437622,0.015748,0.015748,-0.015748,-6.8e-05,0.0
2017-01-03,002317.SZ,20170103,0.7086,1.4,34.4093,3.1101,81482.3076,44854.075,1020973.0,617050.2,...,11.27689,11.27689,-0.5085,-5.67,0.182228,0.0,0.0,-0.0,-6.2e-05,0.0


In [370]:
# filter from 20170405
# filte data columns as we use to next step
field_list = [
       'ts_code', 'trade_date', 'turnover_rate', 'open', 'close', 'log-ret', 'pe', 'pb','amount', 'total_mv', 'circ_mv', 
       'type', 'type_value', 'name', 'industry',
       'issue_price', 'issue_amount', 'cfps','revenue_ps',
       'quick_ratio', 'dt_eps', 'basic_eps_yoy', 'dt_eps_yoy',
       'bps', 'bps_yoy', 'profit_dedt', 'roe_dt', 'q_dt_roe',
       'roe_yoy', 'capital_rese_ps', 'surplus_rese_ps', 'gross_margin',
       'interestdebt', 'ca_to_assets', 'ebt_yoy', 'or_yoy', 'equity_yoy', 
       'alpha_cci', 'alpha_kama', 'alpha_close2open',
       'alpha_close2open_5_sma', 'alpha_close2open_20_sma',
       'alpha_skew2sentiment', 'alpha_fundamental']
universe_raw = universe_raw.loc[universe_raw['trade_date']>=20170405]
universe_factors = universe_raw[field_list]
universe_factors.head()

Unnamed: 0_level_0,ts_code,trade_date,turnover_rate,open,close,log-ret,pe,pb,amount,total_mv,...,ebt_yoy,or_yoy,equity_yoy,alpha_cci,alpha_kama,alpha_close2open,alpha_close2open_5_sma,alpha_close2open_20_sma,alpha_skew2sentiment,alpha_fundamental
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-04-05,600572.SH,20170405,0.4569,6.05644,6.12919,0.01194,38.4274,3.8664,63187.1,1692232.0,...,,,,-2.771452,-0.00229,0.0,-0.000293,0.000654,-1.5e-05,0.397315
2017-04-05,002275.SZ,20170405,0.3376,15.22203,15.48044,0.020554,30.037,4.642,34147.48,1131413.0,...,,,,-9.917674,0.056424,0.0,0.000114,0.000674,2.1e-05,0.221905
2017-04-05,600332.SH,20170405,1.1421,26.07224,25.86146,-0.00882,30.4236,2.6451,343342.1,4587982.0,...,19.4661,4.7636,105.2474,-20.017214,0.069222,-0.007796,-0.001846,0.000899,-1.9e-05,-1.91659
2017-04-05,300463.SZ,20170405,0.7519,20.86,21.29636,0.024804,49.9001,5.6289,50951.09,1252710.0,...,,,,-17.864264,0.698483,0.0,0.002281,0.000472,-1e-05,0.817148
2017-04-05,300250.SZ,20170405,2.3052,22.77656,23.44932,0.030824,77.225,3.3803,61754.77,553034.0,...,,,,-21.879821,-1.645608,-0.005821,-0.001599,0.001929,-5.9e-05,4.103632


In [388]:
# save tmp data
# fundamental_df.to_csv('fundamental_20170103_20230317.csv')
# universe_raw.to_csv('raw_20170103_20230317.csv')
# universe_factors.to_csv('factors_20170103_20230317.csv')
# stock_list = np.random.choice(fundamental_df.ts_code,99,replace=False)
# if '603538.SH' not in stock_list:
#     stock_list = np.append(stock_list,['603538.SH'])
# sample_df = universe_factors.loc[(universe_factors['trade_date']>20220101)&(universe_factors.ts_code.isin(stock_list))]
sample_df.columns

Index(['ts_code', 'trade_date', 'turnover_rate', 'open', 'close', 'log-ret',
       'pe', 'pb', 'amount', 'total_mv', 'circ_mv', 'type', 'type_value',
       'name', 'industry', 'issue_price', 'issue_amount', 'cfps', 'revenue_ps',
       'quick_ratio', 'dt_eps', 'basic_eps_yoy', 'dt_eps_yoy', 'bps',
       'bps_yoy', 'profit_dedt', 'roe_dt', 'q_dt_roe', 'roe_yoy',
       'capital_rese_ps', 'surplus_rese_ps', 'gross_margin', 'interestdebt',
       'ca_to_assets', 'ebt_yoy', 'or_yoy', 'equity_yoy', 'alpha_cci',
       'alpha_kama', 'alpha_close2open', 'alpha_close2open_5_sma',
       'alpha_close2open_20_sma', 'alpha_skew2sentiment', 'alpha_fundamental'],
      dtype='object')

In [396]:
from scipy.stats import zscore
tmp = sample_df.loc[sample_df.ts_code== '603538.SH']
#tmp = tmp.loc[(tmp['trade_date']>20170401) & (tmp['trade_date']<20180101)]
#tmp[['close_5_sma','alpha_fundamental']] = tmp[['close_5_sma','alpha_fundamental']].apply(zscore)

#tmp['alpha'] = tmp['alpha'].rank(method='min', pct=True)
tmp[['alpha_kama','alpha_fundamental','close']] = tmp[['alpha_kama','alpha_fundamental','close']].apply(zscore, axis=0)
tmp['alpha_fundamental'] = tmp['alpha_fundamental']
#tmp['close'] = tmp['close']
#tmp[['close','alpha_fundamental']].plot(grid=True)
#tmp[['vwma_5','vwma_25','close','vr_6']].plot(subplots=True, grid=True, figsize=(8, 10))
#tmp[['vwma_5','vwma_25','close']].plot(grid=True)