# Load Data From Tushare
We load data from 2018-2021. Due to some constraint of platform, we download thses data year by year and save seperately. After that we process these data.

In [1]:
import tushare as ts
import pandas as pd
import numpy as np

print(ts.__version__)

1.2.89


In [2]:
# register token
token = 'efb00e384fc87a6efe19bf75d4cf401319480f9e9b06c6dc4095890e' # your token
ts.set_token(token)
pro = ts.pro_api()

In [3]:
# got calendar from date range
start_date = '20180101'
end_date = '20210101'
calendar = pro.trade_cal(exchange='SSE', is_open='1', 
                            start_date=start_date, 
                            end_date=end_date, 
                            fields='cal_date')
# check all stocks exist in market today
stocks = pro.query('stock_basic', exchange='', list_status='L', market = '主板') # 主板/创业板/科创板/CDR/北交所
ts_code_list = ','.join(stocks.ts_code.values)
print(calendar.shape, stocks.shape)
calendar.tail()

(730, 1) (2188, 7)


Unnamed: 0,cal_date
725,20180108
726,20180105
727,20180104
728,20180103
729,20180102


In [4]:
# get fundamental data
# pick up market cap [5,30] billion
base_universe = pro.bak_daily(trade_date='20180102', 
                   fields='trade_date, ts_code, name, float_mv, total_mv, pe, turn_over, industry')
filte_stock = base_universe.loc[(base_universe.total_mv>=50) & (base_universe.total_mv<=300)]
filte_stock

Unnamed: 0,ts_code,trade_date,name,turn_over,pe,industry,float_mv,total_mv
3,600903.SH,20180102,贵州燃气,35.95,110.50,供气供热,22.62,150.81
9,000885.SZ,20180102,同力水泥,4.98,10.71,水泥,72.68,84.53
12,002372.SZ,20180102,伟星新材,1.75,27.46,其他建材,176.55,199.46
13,600507.SH,20180102,方大特钢,4.05,9.02,特种钢,185.12,185.12
14,300735.SZ,20180102,光弘科技,0.02,29.91,通信设备,14.04,56.15
...,...,...,...,...,...,...,...,...
3454,600074.SH,20180102,ST保千里,0.00,42.50,电脑设备,95.52,228.67
3455,600289.SH,20180102,ST信通,0.00,0.00,软件服务,53.42,59.57
3456,300630.SZ,20180102,普利制药,5.14,109.15,化学制药,22.02,88.08
3457,002127.SZ,20180102,南极电商,3.07,57.60,互联网,110.37,186.57


In [None]:
from helper import download_helper

In [None]:
# load total stock daily date for one year
ts_code_list = filte_stock.ts_code.values
all_stocks = download_helper.get_Daily_All(ts, ts_code_list, start_date, end_date)
print(all_stocks.shape)
all_stocks

In [None]:
# save data
universe = all_stocks.drop_duplicates()
universe.to_csv(start_date +'-'+ end_date + '.csv')
filte_stock.drop_duplicates().to_csv('fundamental_' + start_date +'.csv')

# Load Data by File
if we load data from saved file. 

In [1]:
# load data from csv
import pandas as pd
import numpy as np
universe = pd.read_csv('20180101-20210101.csv').iloc[:,1:]
fundamental = pd.read_csv('fundamental_20180101.csv').iloc[:,1:]

# Process Data
1. filter ma_v_120 top 500 stocks
2. add 'date' column as datetime type, and deascanding time
3. add industry infomation and boll indicator to stock 

In [2]:
from helper.factor_helper import IndicatorHelper

ind_helper = IndicatorHelper(universe)

# pick average amount 120 days top 500
universe = ind_helper.top(500, index='trade_date', ticker_column='ts_code', value_column='ma_v_120')

# add bollinger as indicator which will be used as a custom factor later
# the bollinger indicator make up by stockstats package which depends on column nameed close as default
tech_indicator_list = ['boll_ub','boll_lb']
universe = ind_helper.add_technical_indicator(tech_indicator_list)

# add industry and stock name
universe = ind_helper.add_by_basetable('ts_code', fundamental, ['industry', 'name'])

add tech indicators: 100%|████████████████████| 500/500 [00:55<00:00,  8.96it/s]
add fundamental info: 100%|███████████████████| 500/500 [00:21<00:00, 22.83it/s]


# Construct Factors
### Overnight Returns and Firm-Specific Investor Sentiment
the overnight return calculate by $\frac{open_t - close_{t-1}}{close_{t-1}}$
 
 paper calculate price by sum average 5 days as long factor, we just average 5 days
 
 use average 20 days of overnight return as a short factor

In [4]:
from helper.factor_helper import CloseToOpen

# cal close to open average moving 5day as long facor and 20day as short factor
cto = CloseToOpen(universe).calculate()
universe = cto.get_factors()
universe['close_to_open_25_sma'] = - universe['close_to_open_25_sma']

close_to_open: 100%|██████████████████████████| 500/500 [00:11<00:00, 43.12it/s]
add tech indicators: 100%|████████████████████| 500/500 [00:28<00:00, 17.47it/s]


###  Winners and Losers in Momentum Investing
 The stock price tragectories can be expressed by $p=\mu*time + \beta*time^2$ 
 
 We convert time as linner values and get $\mu$ and $\beta$ by regression method between price and constant values. 
 
 Final facotor expressed $\beta * \mu$
 
 This factor can express each stock tragectories relative convex. The $\mu$ be viewed as return direction and $\beta$ be viwed as return velocity

In [7]:
# regression use `statsmodels.formula.api` package
from helper.factor_helper import WinnerAndLoser
wl = WinnerAndLoser(universe).calculate()
universe = wl.get_factor()

 processing factors step/total 1/293968 processing factors step/total 2/293968 processing factors step/total 3/293968 processing factors step/total 4/293968 processing factors step/total 5/293968 processing factors step/total 6/293968 processing factors step/total 7/293968 processing factors step/total 8/293968 processing factors step/total 9/293968 processing factors step/total 10/293968 processing factors step/total 11/293968 processing factors step/total 12/293968 processing factors step/total 13/293968 processing factors step/total 14/293968 processing factors step/total 15/293968 processing factors step/total 16/293968 processing factors step/total 17/293968 processing factors step/total 18/293968 processing factors step/total 19/293968 processing factors step/total 20/293968 processing factors step/total 21/293968 processing factors step/total 22/293968 processing factors step/total 23/293968 processing factors step/total 24/293968 processing factors step

 processing factors step/total 293968/293968

###  Expected Skewness and Momentum
The skewness of returns distribution and media in a period time(20 trade day) can combine to be a factor.
 
factor = $skew * median$

In [8]:
from helper.factor_helper import SkewandMomentum
sm = SkewandMomentum(universe).calculate()
universe = sm.get_factor()

skew and momentum: 100%|██████████████████████| 500/500 [00:14<00:00, 34.92it/s]


In [1]:
# tmp file load
import pandas as pd
import numpy as np
universe = pd.read_csv('factor_finished.csv').iloc[:,1:]
universe.date = pd.to_datetime(universe.date)

### Arbitrage Asymmetry and the Idiosyncratic Volatility Puzzle
Based on the last parer, we use idiosyncratic martix and bollinger indicator to construct custom factors. 

### PCA risk model
we use log return to calculate covariance matrix $F=\frac{1}{N-1}rr^T$

In [2]:
from sklearn.decomposition import PCA
%matplotlib inline
import matplotlib.pyplot as plt
# Set the default figure size
plt.rcParams['figure.figsize'] = [10.0, 6.0]

class RiskModel(object):
    def __init__(self, returns, ann_factor, num_factor_exposures):
        
        self.num_factor_exposures = num_factor_exposures
        self.pca = PCA(n_components=num_factor_exposures, svd_solver=svd_solver)
        self.pca.fit(returns)
        
        self.factor_betas_ = self.factor_betas(self.pca, returns.columns.values, np.arange(num_factor_exposures))
        self.factor_returns_ = self.factor_returns(self.pca, returns, returns.index, np.arange(num_factor_exposures))
        self.factor_cov_matrix_ = self.factor_cov_matrix(self.factor_returns_, ann_factor)
        
        self.idiosyncratic_var_matrix_ = self.idiosyncratic_var_matrix(returns, 
                                            self.factor_returns_, self.factor_betas_, ann_factor)
        self.idiosyncratic_var_vector = pd.DataFrame(data=np.diag(self.idiosyncratic_var_matrix_),
                                                     index=returns.columns)
    
    # got new exposure expressed by pca model
    def factor_betas(self, pca, factor_beta_indices, factor_beta_columns):
        return pd.DataFrame(pca.components_.T, factor_beta_indices, factor_beta_columns)
    
    # got new factor returns expressed by pca model
    def factor_returns(self, pca, returns, factor_return_indices, factor_return_columns):
        return pd.DataFrame(pca.transform(returns), factor_return_indices, factor_return_columns)
    
    # got new factor covariance matirx by pca expressed returns
    def factor_cov_matrix(self, factor_returns, ann_factor):
        return np.diag(factor_returns.var(axis=0, ddof=1) * ann_factor)
    
    # calculate idiosyncratic need to got factor_returns, factor_betas which calculate by pca model first
    def idiosyncratic_var_matrix(self, returns, factor_returns, factor_betas, ann_factor):
        estimate_returns = pd.DataFrame(np.dot(factor_returns, factor_betas.T), returns.index, returns.columns)
        residuals = returns - estimate_returns
        return pd.DataFrame(np.diag(np.var(residuals))*ann_factor, returns.columns, returns.columns)
    
    def plot_principle_risk(self):
        # Make the bar plot
        plt.bar(np.arange(self.num_factor_exposures), self.pca.explained_variance_ratio_);
    

In [3]:
# got pivot dataframe index=time, columns=ticker values=pct_chg 
returns_df = universe.pivot(index='date', columns='ts_code', values='pct_chg').fillna(0)

# Set the annualized factor
ann_factor = 252

# Set the number of factor exposures (principal components) for the PCA algorithm
num_factor_exposures = 30

# Set the svd solver for the PCA algorithm
svd_solver = 'full'

# Create a RiskModel object
rm = RiskModel(returns_df, ann_factor, num_factor_exposures)

### view portfolio variance and idiosyncratic values

In [4]:
B = rm.factor_betas_
F = rm.factor_cov_matrix_
S = rm.idiosyncratic_var_matrix_
# temperaory set all equal weights
universe_tickers = universe.ts_code.unique()
X = pd.DataFrame(np.repeat(1/len(universe_tickers), len(universe_tickers)), universe_tickers)

variance = np.dot(X.T, (np.dot(B, F).dot(B.T) + S)).dot(X)
variance = np.sqrt(variance[0][0])

In [5]:
print(f'portfolio variance is: {variance}')
print(rm.idiosyncratic_var_vector)

portfolio variance is: 26.40259478460406
                     0
ts_code               
000008.SZ   435.514861
000009.SZ   793.770173
000012.SZ   656.311326
000016.SZ  1294.350040
000021.SZ  1208.279783
...                ...
603000.SH  1266.176547
603019.SH   786.615301
603077.SH   492.767952
603128.SH  1330.147180
603323.SH   348.248072

[500 rows x 1 columns]


In [6]:
print(rm.idiosyncratic_var_vector.loc[rm.idiosyncratic_var_vector.index=='603128.SH'])
universe[['date','ts_code','boll_ub','boll_lb','close','vol','amount','ma_v_10']].loc[universe.ts_code == '603128.SH']

                    0
ts_code              
603128.SH  1330.14718


Unnamed: 0,date,ts_code,boll_ub,boll_lb,close,vol,amount,ma_v_10
323,2018-07-02,603128.SH,4.472884,4.300916,4.3565,64033.00,37237.059,68209.581
656,2018-07-03,603128.SH,4.472884,4.300916,4.4173,55676.00,32095.672,60343.946
994,2018-07-04,603128.SH,4.445116,4.318551,4.3717,72523.02,42120.490,61331.048
1337,2018-07-05,603128.SH,4.500715,4.189585,4.2351,57909.00,32784.633,60308.347
1683,2018-07-06,603128.SH,4.511127,4.113913,4.1820,78270.82,43156.038,62564.992
...,...,...,...,...,...,...,...,...
292020,2020-12-25,603128.SH,9.257949,7.256051,8.9800,270456.09,241172.394,409585.801
292507,2020-12-28,603128.SH,9.395489,7.310511,9.3700,317396.69,293523.157,415008.137
292994,2020-12-29,603128.SH,9.430844,7.411156,8.9500,410048.60,371000.994,417293.015
293480,2020-12-30,603128.SH,9.516620,7.477380,9.2500,286479.92,262792.224,419339.498


### Based on Bollinger Factor
As a simple view, I guess the each stock residual value imply a magnitude of excess return. I will combine residuals and Bollinger indicators.

Note that, the residuals what we have calculated cross all the time. Indeed, we can't use it as a factor like that. Actually, we can't use any data as a factor which would not over pass the time we can get. For example, if we make up a factor in time T to predict T+1 return. we can't make up this factor by T+1 or further time.

But, I use it cross all the time just verify my hypotheses.

factor = (boll_ub + boll_lb - 2 * close) * residuals / 1000


In [7]:
from helper.factor_helper import BollingerAndResidual
br = BollingerAndResidual(universe, rm.idiosyncratic_var_vector).calculate()
universe = br.get_factor()
universe.head()

custom factor: 100%|██████████████████████████| 500/500 [00:15<00:00, 31.93it/s]


Unnamed: 0,ts_code,trade_date,open,high,low,close,pre_close,change,pct_chg,vol,...,boll_ub,boll_lb,industry,name,close_to_open,close_to_open_5_sma,close_to_open_25_sma,win_lose,skew_momentum,custom_factor
556,000009.SZ,20180702,4.047,4.0635,3.8983,3.9479,4.047,-0.0991,-2.4487,94709.98,...,4.106014,3.872386,综合类,中国宝安,0.0,0.0,0.0,-4.816958e-05,-0.00898,0.065565
1167,000012.SZ,20180702,4.4065,4.4243,4.309,4.3356,4.3888,-0.0532,-1.2122,73861.08,...,4.454473,4.278827,玻璃,南 玻Ａ,-0.002053,-0.002053,-0.002053,-1.180457e-05,-0.021403,0.040757
1778,000016.SZ,20180702,5.0745,5.2879,5.026,5.1036,5.0939,0.0097,0.1904,202694.27,...,5.174543,4.955057,家用电器,深康佳Ａ,0.009503,0.009503,0.009503,2.912297e-07,-0.152893,-0.100442
2389,000021.SZ,20180702,6.7204,6.7401,6.4839,6.5627,6.7204,-0.1577,-2.3466,78621.11,...,6.939991,6.382509,电脑设备,深科技,-0.006004,-0.006004,-0.006004,-4.423641e-05,-0.072754,0.238152
3000,000040.SZ,20180702,9.9984,9.9984,9.4842,9.8006,9.9885,-0.1879,-1.8812,70863.95,...,9.8006,9.8006,新型电力,东旭蓝天,-0.00302,-0.00302,-0.00302,-2.842966e-05,-0.025769,0.0


# Evalute Factor
Now, we can evalute these factors performence
### rank factor and zscore
First we group factors by industry, then rank and zscore

In [63]:
# calculate facors and turn to zscore
from tqdm import tqdm
from scipy.stats import zscore


factor_columns = ['close_to_open_5_sma', 'close_to_open_25_sma', 'win_lose', 'skew_momentum', 'custom_factor']
all_factor_df = pd.DataFrame()
for df_tuple in tqdm(universe.groupby('industry'), desc='industrt/industries'):
    df_group = df_tuple[1]
    code_list = df_group.ts_code.unique()
    factor_df = pd.DataFrame()
    for factor_name in factor_columns:
        tmp = df_group.pivot(index='date', columns='ts_code', values=factor_name).fillna(0)
        tmp = tmp.rank(axis=1).apply(zscore, axis=1)
        X = pd.DataFrame()
        for ts_code in code_list:
            x = pd.DataFrame(tmp[ts_code])
            x = x.rename(columns={ts_code:factor_name})
            x['ts_code'] = ts_code
            X = X.append(x)
        X = X.reset_index()
        if factor_df.empty:
            factor_df = X
        else:
            factor_df = factor_df.merge(X[["ts_code", "date", factor_name]], on=["ts_code", "date"], how="left")
    all_factor_df = all_factor_df.append(factor_df)

all_factor_df

industrt/industries: 100%|██████████████████████| 88/88 [03:54<00:00,  2.67s/it]


Unnamed: 0,date,close_to_open_5_sma,ts_code,close_to_open_25_sma,win_lose,skew_momentum,custom_factor
0,2018-07-02,1.379385,002204.SZ,1.379385,-0.638285,1.276569,0.319142
1,2018-07-03,1.276569,002204.SZ,1.276569,0.000000,1.276569,-0.319142
2,2018-07-04,1.276569,002204.SZ,1.276569,-1.595712,1.276569,1.276569
3,2018-07-05,-0.638285,002204.SZ,-0.638285,0.638285,1.276569,-0.319142
4,2018-07-06,0.638285,002204.SZ,0.638285,-1.294678,1.276569,0.638285
...,...,...,...,...,...,...,...
2439,2020-12-25,1.341641,600687.SH,0.447214,-0.447214,0.447214,0.447214
2440,2020-12-28,-0.447214,600687.SH,-1.341641,-1.341641,0.447214,0.447214
2441,2020-12-29,-1.341641,600687.SH,-1.341641,1.341641,0.447214,-1.341641
2442,2020-12-30,-1.341641,600687.SH,-1.341641,-1.341641,0.447214,-0.447214


In [64]:
# process all factors table to multi index table that fit to use in alphalens
all_factor_df = all_factor_df.set_index(['date','ts_code'])
all_factor_df = all_factor_df.sort_values(by=["date", "ts_code"])
all_factor_df

Unnamed: 0_level_0,Unnamed: 1_level_0,close_to_open_5_sma,close_to_open_25_sma,win_lose,skew_momentum,custom_factor
date,ts_code,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-07-02,000008.SZ,-1.000000,-1.000000,1.000000,1.000000,-1.000000
2018-07-02,000009.SZ,-0.447214,-0.447214,-1.732051,-1.039230,1.039230
2018-07-02,000012.SZ,1.000000,1.000000,1.000000,1.000000,1.000000
2018-07-02,000016.SZ,1.555428,1.555428,1.485221,-1.485221,-1.485221
2018-07-02,000021.SZ,-1.355049,-1.355049,-0.802887,-0.806226,0.535258
...,...,...,...,...,...,...
2020-12-31,603000.SH,0.108465,0.976187,0.976187,-1.193118,-0.759257
2020-12-31,603019.SH,0.267261,-0.267261,0.267261,-0.801784,-1.336306
2020-12-31,603077.SH,1.410048,1.410048,-0.542725,-1.628176,-0.325396
2020-12-31,603128.SH,0.707107,-1.414214,0.000000,1.450953,-1.414214


### process price
Process price table in order to fit using by alphalens. Index=date, columns=ts_code

In [66]:
prices = universe.pivot(index='date', columns='ts_code', values='close').fillna(0)
prices

ts_code,000008.SZ,000009.SZ,000012.SZ,000016.SZ,000021.SZ,000040.SZ,000050.SZ,000058.SZ,000060.SZ,000066.SZ,...,601880.SH,601890.SH,601918.SH,601929.SH,601952.SH,603000.SH,603019.SH,603077.SH,603128.SH,603323.SH
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-07-02,0.00,3.9479,4.3356,5.1036,6.5627,9.8006,13.6682,4.9923,4.5392,6.8446,...,1.8739,4.2572,3.6216,2.2208,6.3720,7.9707,23.3888,1.73,4.3565,5.1891
2018-07-03,0.00,4.0305,4.3977,5.0260,6.7598,9.8006,14.1725,5.0616,4.6163,7.0898,...,1.8933,4.4064,3.6612,2.2507,6.4388,8.1474,24.3687,1.74,4.4173,5.2669
2018-07-04,0.00,3.9562,4.3090,4.9774,6.6810,9.7215,13.8264,5.0715,4.4621,6.6877,...,1.8836,4.3168,3.7305,2.2208,6.6009,8.0099,23.7493,1.74,4.3717,5.2237
2018-07-05,0.00,3.8983,4.2115,4.8222,6.4051,9.2171,13.4703,4.8140,4.2501,6.4033,...,1.8545,4.2074,3.5227,2.1710,6.3911,7.9216,23.3838,1.70,4.2351,5.1112
2018-07-06,0.00,3.8983,4.1671,4.7640,6.6219,9.0688,13.6978,4.8635,4.2597,6.8250,...,1.8642,4.2472,3.5029,2.1909,6.9252,7.9020,23.7036,1.70,4.1820,5.0594
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-12-25,2.56,7.5900,7.8600,6.4400,18.7100,3.4300,14.7300,6.3600,4.8700,17.1900,...,1.9400,6.2300,3.1300,1.9800,13.9100,16.7600,32.1900,1.38,8.9800,4.7500
2020-12-28,2.51,7.3500,7.6000,6.2100,18.1200,3.4700,14.2300,6.2300,4.8700,17.2800,...,1.9200,6.4500,3.1700,2.0000,14.7900,16.5000,31.6100,1.52,9.3700,4.7700
2020-12-29,2.51,7.3300,7.0700,6.4600,18.4000,3.4000,14.2300,6.3300,4.6600,17.8500,...,1.9600,6.2000,3.0100,2.0000,13.7600,16.6500,32.7700,1.47,8.9500,4.8500
2020-12-30,2.52,7.5800,7.3400,6.4800,18.3800,3.4800,14.5900,6.4700,4.8400,17.7800,...,1.9400,6.0900,3.1900,2.0000,13.8000,16.6100,33.7100,1.41,9.2500,4.9000


In [67]:
prices.to_csv('prices.csv')
all_factor_df.to_csv('all_factors.csv')

In [73]:
import pandas as pd
import numpy as np
from tqdm import tqdm
prices = pd.read_csv('prices.csv')
prices = prices.set_index('date')
all_factor_df = pd.read_csv('all_factors.csv')
all_factor_df['date'] = pd.to_datetime(all_factor_df['date'])
all_factor_df = all_factor_df.set_index(['date','ts_code'])
all_factor_df = all_factor_df.sort_values(by=["date", "ts_code"])