## Instruction
In this notebook, we will complete the backtest, which is the last step.

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm

import scipy
from scipy.optimize import fmin_l_bfgs_b
from sklearn.decomposition import PCA

from statistics import median
from scipy.stats import gaussian_kde
from statsmodels.formula.api import ols

from matplotlib import pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = [8, 4]

import warnings
warnings.filterwarnings('ignore')

### Load Factors
We have the factors stored in a CSV file, which were processed and combined in the previous steps.

In [2]:
# Load the z-scored training factors. The first CSV column is dropped by
# position (presumably a saved row index -- TODO confirm), `trade_date`
# (YYYYMMDD) is parsed into a `date` column, and that column becomes the
# sorted index.
universe = (
    pd.read_csv('zscore_train.csv')
    .iloc[:, 1:]
    .assign(date=lambda frame: pd.to_datetime(frame['trade_date'], format='%Y%m%d'))
    .set_index(['date'])
    .sort_values(by=['date'])
)
print(universe.shape)
universe.head()

(9833, 65)


Unnamed: 0_level_0,ts_code,trade_date,turnover_rate,amount,pe,pb,total_share,total_mv,volume,open,...,close_10_kama_5_30,close_2_kama,alpha_close2open_5_sma,alpha_close2open_25_sma,alpha_supertrend,alpha_kama,alpha_winlos,alpha_skew2sentiment,alpha_cci,alpha_fundamental
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-01-02,000676.SZ,20180102,-1.392399,20542.23,-0.159959,-0.818168,96571.0782,-0.484301,2402.802341,8.52638,...,8.54928,8.54928,-0.007976,0.007976,-1.193773,1.035419,-1.684637,0.63335,0.0,1.572341
2018-01-02,603567.SH,20180102,-0.708659,9238.25,-0.840791,-0.689717,84916.0,0.880949,682.35872,13.39639,...,13.5387,13.5387,-0.327946,0.327946,0.176926,1.085923,-1.501932,0.197525,0.0,-0.341562
2018-01-02,002287.SZ,20180102,-0.831211,26462.49,1.596732,1.958854,40600.0,1.387706,1076.396194,23.98126,...,24.58434,24.58434,1.761169,-1.761169,-1.899138,-1.184975,-0.030169,0.197525,0.0,0.591635
2018-01-02,002728.SZ,20180102,0.010089,11232.64,0.586987,-0.121361,20000.0,-1.32483,759.825694,14.73429,...,14.78318,14.78318,0.371362,-0.371362,0.795571,-0.704747,1.125887,-1.221502,0.0,-0.92897
2018-01-02,600056.SH,20180102,-1.0,135659.24,-1.0,1.0,106848.5534,1.0,8516.371476,15.98056,...,15.92923,15.92923,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,0.0,1.0


### Factor Exposures and Factor Returns
The factor values in each cross-section should be viewed as a type of exposure. We can calculate factor returns from the exposures of each ticker and its daily return (a cross-sectional regression). We also did this in the backtesting animate notebook.

In [3]:
## Shift returns 5 rows ahead, per ticker.
# `returns_5` is each ticker's `log-ret` taken 5 rows later in that ticker's own
# history. NOTE(review): this is the single-period log return 5 rows ahead, not
# a cumulative 5-period return -- confirm that is what the regression expects.
universe['returns_5'] = universe.groupby('ts_code')['log-ret'].shift(-5)

# The shift leaves NaN on the last 5 rows of every ticker. Forward-fill them
# *within each ticker*: a plain Series-level ffill would instead copy the value
# from whichever ticker happens to sit on the previous row of the date-sorted
# frame, leaking one stock's return into another.
universe['returns_5'] = universe.groupby('ts_code')['returns_5'].ffill()

universe = universe.sort_values(by=['date'])

# `all_factors` is the cleaned working copy: +/-inf become NaN, then every NaN
# (in any column) is replaced by 0. `universe` itself is left uncleaned.
all_factors = (
    universe.copy(deep=True)
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)
print(universe.shape, all_factors.shape)

(9833, 66) (9833, 66)


In [None]:
def wins(x,a,b):
    """Winsorize ``x`` to the interval bounded by ``a`` (floor) and ``b`` (cap).

    Elements ``<= a`` are replaced by ``a``; of the remaining elements, those
    ``>= b`` are replaced by ``b``; everything else (including NaN, which
    fails both comparisons) is passed through unchanged. The floor test takes
    precedence over the cap test.

    Returns the result of ``np.where``, i.e. a NumPy array rather than the
    input's original container type.
    """
    at_or_below_floor = x <= a
    at_or_above_cap = x >= b
    capped = np.where(at_or_above_cap, b, x)
    return np.where(at_or_below_floor, a, capped)

def density_plot(data):
    """Plot a Gaussian kernel density estimate of ``data`` with matplotlib.

    Parameters
    ----------
    data : array-like (the original comment says a Series)
        Sample to estimate the density of. The curve is evaluated on 200
        evenly spaced points between the sample minimum and maximum.

    Draws on the current pyplot figure and calls ``plt.show()``; returns None.
    """
    # A scalar `bw_method` is used directly as the KDE bandwidth factor. This
    # is the public equivalent of overriding `covariance_factor` and calling
    # the private `_compute_covariance()` afterwards.
    density = gaussian_kde(data, bw_method=0.2)
    xs = np.linspace(np.min(data), np.max(data), 200)
    plt.plot(xs, density(xs))
    plt.xlabel('times Returns')
    plt.ylabel('Density')
    plt.show()

def get_formula(factors, Y):
    """Build the regression formula string ``"<Y> ~ 0 + <f1> + <f2> + ..."``.

    ``factors`` is an iterable of term names and ``Y`` is the response name.
    The literal ``"0"`` is always the first right-hand-side term (in
    statsmodels/patsy formulas this suppresses the intercept).
    """
    terms = ["0", *factors]
    return Y + " ~ " + " + ".join(terms)

def factors_from_names(n, name):
    """Return, in order, the entries of ``n`` that contain ``name`` as a substring."""
    return [candidate for candidate in n if name in candidate]

def estimate_factor_returns(df, name='alpha_', lower=-0.2, upper=0.19):
    """Fit an OLS regression of ``returns_5`` on the factor columns of ``df``.

    Parameters
    ----------
    df : DataFrame
        Must contain a ``returns_5`` column plus the factor columns. It is
        not modified; the fit runs on a deep copy.
    name : str
        Every column of ``df`` whose name contains this substring is used as
        a regressor.
    lower, upper : float
        Winsorization bounds applied to ``returns_5`` before fitting. The
        defaults reproduce the previously hard-coded ``-0.2`` / ``0.19``.

    Returns
    -------
    The fitted statsmodels results object (callers read ``.params`` from it).
    """
    ## winsorize returns for fitting
    estu = df.copy(deep=True)
    estu['returns_5'] = wins(estu['returns_5'], lower, upper)
    # Named `factor_names` so it does not shadow the notebook-level
    # `all_factors` DataFrame.
    factor_names = factors_from_names(list(df), name)
    form = get_formula(factor_names, "returns_5")
    return ols(form, data=estu).fit()

# Spot check: fit the cross-section for trade_date 20220505 and display the
# estimated coefficients.
estimate_factor_returns(
    all_factors[all_factors['trade_date'].eq(20220505)]
).params

In [None]:
from scipy.stats import zscore

alpha_factors = ['alpha_close2open_5_sma', 'alpha_close2open_25_sma', 'alpha_supertrend',
                 'alpha_cci', 'alpha_kama', 'alpha_skew2sentiment', 'alpha_fundamental', 'alpha_AI']
date_and_code = ['trade_date', 'ts_code']
calendar = all_factors.trade_date.unique()  # int64

# Fail early with a readable message if an expected column (e.g. a factor that
# was never merged into the CSV) is absent, instead of a bare KeyError from the
# column selection below.
regression_cols = alpha_factors + date_and_code + ['returns_5']
missing_cols = [col for col in regression_cols if col not in all_factors.columns]
if missing_cols:
    raise KeyError(f"all_factors is missing expected columns: {missing_cols}")

# Take `returns_5` together with the factors from the cleaned `all_factors`
# frame (inf/NaN already replaced by 0) and make an explicit copy. This avoids
# assigning into a slice and avoids index-aligned assignment from `universe`,
# whose DatetimeIndex is not unique.
alpha_df_f = all_factors[regression_cols].copy()

# One cross-sectional regression per trading day; keep only the coefficients.
facret = {}
for dt in tqdm(calendar, desc='regression factor returns'):
    facret[dt] = estimate_factor_returns(alpha_df_f.loc[alpha_df_f['trade_date'] == dt]).params
facret[20220505]

#### View Factor Returns

In [None]:
date_list = all_factors.index.unique()

# Build the (date x factor) table in one step from the `facret` dict
# ({trade_date int -> Series of coefficients}) rather than writing it cell by
# cell. The dates are derived from the dict keys themselves, so nothing depends
# on two separately computed arrays lining up by position.
# `alpha_factors` is the list defined in the regression cell.
facret_df = pd.DataFrame(facret).T[alpha_factors].astype(float)
facret_df.index = pd.to_datetime(facret_df.index.astype(str), format='%Y%m%d')
facret_df.index.name = 'date'

for column in facret_df.columns:
    plt.plot(facret_df[column].cumsum(), label=column)
plt.legend(loc='upper left')
plt.xlabel('Date')
plt.ylabel('Cumulative Factor Returns')
plt.show()

### Pick Alpha Factors

In [None]:
# Keep only the factors chosen for combination, plus the date/ticker keys.
combine_factors = ['alpha_cci', 'alpha_skew2sentiment', 'alpha_fundamental', 'alpha_AI']
alpha_df = alpha_df_f[combine_factors + date_and_code]

# Visual check of the `alpha_AI` series for a single ticker.
alpha_df.loc[alpha_df['ts_code'] == '603538.SH', 'alpha_AI'].plot(grid=True)
