# Load Data From Tushare
We load data from 2018 to 2021. Due to platform constraints, we download the data year by year and save each year separately. After that, we process the saved data.

In [None]:
import tushare as ts
import pandas as pd
import numpy as np

print(ts.__version__)

In [None]:
# Register the Tushare token and create the pro API client used by the cells below.
token = '' # your Tushare token goes here
ts.set_token(token)
pro = ts.pro_api()

In [None]:
# Date range for the download, as YYYYMMDD strings.
start_date = '20180101'
end_date = '20210101'

# SSE trading calendar for that range (is_open='1'), keeping only the date column.
calendar = pro.trade_cal(
    exchange='SSE',
    is_open='1',
    start_date=start_date,
    end_date=end_date,
    fields='cal_date',
)

# Stocks currently listed (list_status='L') on the selected board.
# The API takes the Chinese board name; the options, in English, are:
# Main Board (used here) / ChiNext / STAR Market / CDR / Beijing Stock Exchange.
stocks = pro.query('stock_basic', exchange='', list_status='L', market='主板')
ts_code_list = ','.join(stocks['ts_code'].values)

print(calendar.shape, stocks.shape)
calendar.tail()

In [None]:
# Fundamental snapshot for one trading day.
base_universe = pro.bak_daily(
    trade_date='20180102',
    fields='trade_date, ts_code, name, float_mv, total_mv, pe, turn_over, industry',
)

# Keep stocks whose total_mv lies in [50, 300], both ends included.
# Original note: market cap [5, 30] billion -- presumably total_mv is in units of
# 100 million; verify against the Tushare field description.
filte_stock = base_universe.loc[base_universe['total_mv'].between(50, 300)]
filte_stock

In [None]:
from helper import download_helper

In [None]:
# Fetch the daily data for every ticker that passed the market-cap filter,
# covering start_date..end_date.
ts_code_list = filte_stock['ts_code'].values
all_stocks = download_helper.get_Daily_All(
    ts,
    ts_code_list,
    start_date,
    end_date,
)
print(all_stocks.shape)
all_stocks

In [None]:
# Write the de-duplicated downloads to CSV so later sessions can start from disk.
universe = all_stocks.drop_duplicates()
universe.to_csv(f'{start_date}-{end_date}.csv')
filte_stock.drop_duplicates().to_csv(f'fundamental_{start_date}.csv')

# Load Data by File
Use this section to load the data from the previously saved files instead of downloading it again.

In [None]:
# load data from csv
import pandas as pd
import numpy as np
# Read both saved tables, then drop the first column by position
# (the files were written with the default `to_csv` settings, which store the index first).
universe = pd.read_csv('20180101-20210101.csv')
universe = universe.iloc[:, 1:]
fundamental = pd.read_csv('fundamental_20180101.csv')
fundamental = fundamental.iloc[:, 1:]

# Process Data
1. Keep the top 500 stocks ranked by `ma_v_120`.
2. Add a `date` column of datetime type and sort by time in descending order.
3. Add industry information and the Bollinger band indicators to each stock.

In [None]:
from helper.factor_helper import IndicatorHelper

ind_helper = IndicatorHelper(universe)

# Keep the top 500 tickers by `ma_v_120`
# (original note: average amount over 120 days).
universe = ind_helper.top(
    500,
    index='trade_date',
    ticker_column='ts_code',
    value_column='ma_v_120',
)

In [None]:
# Bollinger upper/lower bands, used later as inputs to a custom factor.
# Original note: the indicator is computed by the stockstats package, which
# expects a column named `close` by default.
tech_indicator_list = ['boll_ub', 'boll_lb']
universe = ind_helper.add_technical_indicator(tech_indicator_list)

# Attach industry and stock name from the fundamental table, keyed on ts_code.
universe = ind_helper.add_by_basetable(
    'ts_code',
    fundamental,
    ['industry', 'name'],
)

# Construct Factors
### Overnight Returns and Firm-Specific Investor Sentiment
The overnight return is calculated as $\frac{open_t - close_{t-1}}{close_{t-1}}$.

The paper computes the long factor from a 5-day sum/average; we simply use the 5-day average.

We use the 20-day average of the overnight return as the short factor.

In [None]:
from helper.factor_helper import CloseToOpen

# Build the close-to-open (overnight return) factors. Per the original note these are a
# 5-day moving average (long factor) and a 20-day moving average (short factor).
# NOTE(review): the factor columns used later are named `close_to_open_5_sma` and
# `close_to_open_25_sma` -- confirm whether the short window is 20 or 25 days.
cto = CloseToOpen(universe).calculate()
universe = cto.get_factors()

###  Winners and Losers in Momentum Investing
 The stock price trajectory can be expressed as $p = \mu \cdot time + \beta \cdot time^2$.
 
 We map time to a linear sequence of values and estimate $\mu$ and $\beta$ by regressing price on these time values.
 
 The final factor is $\beta \cdot \mu$.
 
 This factor expresses the relative convexity of each stock's trajectory: $\mu$ can be viewed as the direction of the return and $\beta$ as its velocity.

In [None]:
# Winners-and-losers factor. Per the original note, the regression uses the
# `statsmodels.formula.api` package (inside the helper, which is not shown here).
from helper.factor_helper import WinnerAndLoser
wl = WinnerAndLoser(universe).calculate()
universe = wl.get_factor()

In [None]:
# Show the raw ts_code column (one entry per row of the universe table).
universe['ts_code'].values

In [None]:
# Inspect every row of a single ticker.
universe.loc[universe['ts_code'] == '603000.SH']

###  Expected Skewness and Momentum
The skewness and the median of the return distribution over a window (20 trading days) can be combined into a factor.
 
factor = $skew * median$

In [None]:
from helper.factor_helper import SkewandMomentum
# Compute the skewness-and-momentum factor and take the updated universe table.
# (helper internals are not visible here; presumably it adds the `skew_momentum` column -- verify)
sm = SkewandMomentum(universe).calculate()
universe = sm.get_factor()

### Arbitrage Asymmetry and the Idiosyncratic Volatility Puzzle
Based on the last paper, we use the idiosyncratic variance matrix and the Bollinger indicator to construct a custom factor.

### PCA risk model
We use log returns to calculate the covariance matrix $F = \frac{1}{N-1} r r^T$ (note: the code below feeds the `pct_chg` column into the model).

In [None]:
from sklearn.decomposition import PCA
%matplotlib inline
import matplotlib.pyplot as plt
# Set the default figure size
plt.rcParams['figure.figsize'] = [10.0, 6.0]

class RiskModel(object):
    """PCA-based statistical risk model fitted on a returns table.

    The constructor fits a PCA on ``returns`` and stores:

    * ``pca`` -- the fitted PCA object,
    * ``factor_betas_`` -- DataFrame of ``pca.components_.T`` (rows: the columns
      of ``returns``, columns: factor numbers),
    * ``factor_returns_`` -- DataFrame of ``pca.transform(returns)`` (rows: the
      index of ``returns``, columns: factor numbers),
    * ``factor_cov_matrix_`` -- diagonal ndarray of annualized factor variances,
    * ``idiosyncratic_var_matrix_`` -- diagonal DataFrame of annualized residual
      variances,
    * ``idiosyncratic_var_vector`` -- the diagonal of that matrix as a
      one-column DataFrame indexed by the columns of ``returns``.
    """

    def __init__(self, returns, ann_factor, num_factor_exposures, svd_solver='full'):
        """Fit the PCA and derive all risk-model matrices.

        Parameters
        ----------
        returns : pandas.DataFrame
            Returns table; its index and columns label the outputs.
        ann_factor : number
            Multiplier applied to the variances to annualize them.
        num_factor_exposures : int
            Number of principal components to keep.
        svd_solver : str, optional
            Passed to ``PCA(svd_solver=...)``. It used to be read from a
            module-level variable; the default ``'full'`` is the value the
            notebook assigns to that variable.
        """
        self.num_factor_exposures = num_factor_exposures
        self.pca = PCA(n_components=num_factor_exposures, svd_solver=svd_solver)
        self.pca.fit(returns)

        factor_ids = np.arange(num_factor_exposures)
        self.factor_betas_ = self.factor_betas(self.pca, returns.columns.values, factor_ids)
        self.factor_returns_ = self.factor_returns(self.pca, returns, returns.index, factor_ids)
        self.factor_cov_matrix_ = self.factor_cov_matrix(self.factor_returns_, ann_factor)

        self.idiosyncratic_var_matrix_ = self.idiosyncratic_var_matrix(
            returns, self.factor_returns_, self.factor_betas_, ann_factor)
        self.idiosyncratic_var_vector = pd.DataFrame(
            data=np.diag(self.idiosyncratic_var_matrix_),
            index=returns.columns)

    def factor_betas(self, pca, factor_beta_indices, factor_beta_columns):
        """Return the exposures ``pca.components_.T`` as a labelled DataFrame."""
        return pd.DataFrame(pca.components_.T, factor_beta_indices, factor_beta_columns)

    def factor_returns(self, pca, returns, factor_return_indices, factor_return_columns):
        """Return ``pca.transform(returns)`` as a labelled DataFrame."""
        return pd.DataFrame(pca.transform(returns), factor_return_indices, factor_return_columns)

    def factor_cov_matrix(self, factor_returns, ann_factor):
        """Return a diagonal ndarray of per-factor sample variances (ddof=1) times ``ann_factor``."""
        return np.diag(factor_returns.var(axis=0, ddof=1) * ann_factor)

    def idiosyncratic_var_matrix(self, returns, factor_returns, factor_betas, ann_factor):
        """Return the diagonal matrix of annualized residual variances.

        The residuals are ``returns`` minus ``factor_returns @ factor_betas.T``,
        so the factor returns and betas must be computed first.
        """
        estimate_returns = pd.DataFrame(
            np.dot(factor_returns, factor_betas.T), returns.index, returns.columns)
        residuals = returns - estimate_returns
        # Explicit per-column population variance (ddof=0). This is what
        # np.var(residuals) produced, without depending on how numpy forwards
        # axis=None to DataFrame.var.
        residual_var = residuals.var(axis=0, ddof=0)
        return pd.DataFrame(
            np.diag(residual_var.to_numpy()) * ann_factor, returns.columns, returns.columns)

    def plot_principle_risk(self):
        """Bar-plot the explained variance ratio of each principal component."""
        plt.bar(np.arange(self.num_factor_exposures), self.pca.explained_variance_ratio_)
    

In [None]:
# Model settings.
ann_factor = 252            # annualization factor
num_factor_exposures = 30   # number of factor exposures (principal components) for PCA
svd_solver = 'full'         # svd solver for the PCA algorithm

# Wide returns table: index=date, columns=ts_code, values=pct_chg; missing entries become 0.
returns_df = universe.pivot(index='date', columns='ts_code', values='pct_chg')
returns_df = returns_df.fillna(0)

# Fit the risk model on the wide returns table.
rm = RiskModel(returns_df, ann_factor, num_factor_exposures)

### view portfolio variance and idiosyncratic values

In [None]:
# Risk-model pieces: factor exposures, factor covariance, idiosyncratic variance.
B = rm.factor_betas_
F = rm.factor_cov_matrix_
S = rm.idiosyncratic_var_matrix_

# Temporary equal weights: one row per ticker, each weight 1/N.
universe_tickers = universe.ts_code.unique()
X = pd.DataFrame(
    np.full(len(universe_tickers), 1/len(universe_tickers)),
    universe_tickers,
)

# x' (B F B' + S) x gives a 1x1 array; the stored value is its square root.
variance = np.dot(np.dot(X.T, np.dot(np.dot(B, F), B.T) + S), X)
variance = np.sqrt(variance[0, 0])

In [None]:
# `variance` holds the square root of x'(BFB' + S)x, i.e. a volatility, so it is
# labelled as such instead of as a variance.
print(f'portfolio volatility (sqrt of variance) is: {variance}')
print(rm.idiosyncratic_var_vector)

In [None]:
# Idiosyncratic variance of one ticker, followed by its price/volume/Bollinger columns.
print(rm.idiosyncratic_var_vector.loc[rm.idiosyncratic_var_vector.index == '603128.SH'])
universe.loc[
    universe.ts_code == '603128.SH',
    ['date', 'ts_code', 'boll_ub', 'boll_lb', 'close', 'vol', 'amount', 'ma_v_10'],
]

### Based on Bollinger Factor
As a simple hypothesis, I assume that each stock's residual value implies the magnitude of its excess return, so I combine the residuals with the Bollinger indicators.

Note that the residuals were calculated across the whole time range, so strictly speaking they cannot be used as a factor in this form. In general, a factor must never use data that would not yet be available at the time it is computed. For example, a factor built at time T to predict the return at T+1 must not use any data from T+1 or later.

Here I use the residuals over the whole range only to verify my hypothesis.

factor = (boll_ub + boll_lb - 2 * close) * residuals / 1000


In [None]:
from helper.factor_helper import BollingerAndResidual
# Combine the Bollinger bands with the per-stock idiosyncratic variance from the risk model.
# (helper internals are not visible here; presumably it adds the `custom_factor` column -- verify)
br = BollingerAndResidual(universe, rm.idiosyncratic_var_vector).calculate()
universe = br.get_factor()
universe

# Evaluate Factors
Now we can evaluate the performance of these factors.
### Rank factors and z-score
First we group the factors by industry, then rank them and convert the ranks to z-scores.

In [None]:
# Rank every factor across the stocks of each industry, date by date, and turn the
# ranks into z-scores. The result has one row per (date, ts_code) row of `universe`.
from tqdm import tqdm
from scipy.stats import zscore

# `win_lose` is currently left out; the full list would be:
# ['close_to_open_5_sma', 'close_to_open_25_sma', 'win_lose', 'skew_momentum', 'custom_factor']
factor_columns = ['close_to_open_5_sma', 'close_to_open_25_sma', 'skew_momentum', 'custom_factor']

# Collect one frame per industry and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on every call.
industry_frames = []
for _, df_group in tqdm(universe.groupby('industry'), desc='industrt/industries'):
    factor_df = df_group[['date', 'ts_code']]
    for factor_name in factor_columns:
        # Wide table (index=date, columns=ts_code); missing factor values count as 0.
        wide = df_group.pivot(index='date', columns='ts_code', values=factor_name).fillna(0)
        ranked = wide.rank(axis=1)
        # z-score each row on the raw array, so the result does not depend on whether
        # scipy returns a Series or an ndarray for Series input.
        scored = pd.DataFrame(
            zscore(ranked.to_numpy(), axis=1),
            index=ranked.index,
            columns=ranked.columns,
        )
        # Back to long form: one row per (date, ts_code) with the factor value.
        long_form = scored.reset_index().melt(
            id_vars='date', var_name='ts_code', value_name=factor_name)
        factor_df = factor_df.merge(
            long_form[['ts_code', 'date', factor_name]], on=['ts_code', 'date'], how='left')
    industry_frames.append(factor_df)

# pd.concat raises on an empty list; keep the original empty-frame result in that case.
all_factor_df = pd.concat(industry_frames) if industry_frames else pd.DataFrame()

all_factor_df

In [None]:
# process all factors table to multi index table that fit to use in alphalens
all_factor_df = all_factor_df.set_index(['date','ts_code'])
all_factor_df = all_factor_df.sort_values(by=["date", "ts_code"])
all_factor_df

### process price
Reshape the price table into the layout alphalens expects: index = date, columns = ts_code.

In [None]:
# Wide close-price table: one row per date, one column per ts_code.
prices = universe.pivot(index='date', columns='ts_code', values='close')
prices

In [None]:
# Save both tables with the default `to_csv` settings, so their indexes are written too.
prices.to_csv('prices.csv')
all_factor_df.to_csv('all_factors.csv')

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
# Reload the saved tables. `date` comes back from CSV as plain strings, so it is parsed
# to datetime in BOTH tables; otherwise the price index (strings) and the factor index
# (datetimes) could not be aligned on date.
prices = pd.read_csv('prices.csv')
prices['date'] = pd.to_datetime(prices['date'])
prices = prices.set_index('date')

all_factor_df = pd.read_csv('all_factors.csv')
all_factor_df['date'] = pd.to_datetime(all_factor_df['date'])
all_factor_df = all_factor_df.set_index(['date','ts_code'])
all_factor_df = all_factor_df.sort_values(by=["date", "ts_code"])