In [105]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import bottleneck
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing
from datetime import datetime
import gc
import os,sys,pdb,glob,math
import warnings
import statsmodels.api as sm

warnings.filterwarnings('ignore')


In [106]:
#######################################################parameter setting############################################################

In [107]:
v = 'factor'
ret = 'return'
start_date = '2014-01-01'
end_date = '2021-01-01'
ifrank = False

In [108]:
#######################################################loading data###########################################################################

In [109]:
data = pd.read_csv('data.csv')
data['return'] = data.groupby('ticker')['last'].apply(lambda x:(x/x.shift(1)-1).shift(-1))
# use volume/sum of volume in the past 20 days as factor 
data['factor'] = data.groupby('ticker')['volume'].apply(lambda x:x/x.rolling(20).sum()).shift(1,axis = 0) 

In [110]:
##########################################################data process####################################################################

In [111]:
# remove extreme values
def extreme_process_MAD(x): 
    median = x.median()
    MAD = abs(x - median).median()
    x[x>(median+3*1.4826*MAD)] = median+3*1.4826*MAD
    x[x<(median-3*1.4826*MAD)] = median-3*1.4826*MAD
    return x    

data['factor'] = data.groupby('date')['factor'].apply(extreme_process_MAD)
data[ret] = np.where(abs(data[ret])>0.5, np.nan, data[ret])

In [112]:
#if we had data, we should do neutralize 

In [113]:
# standardize 
train_data = data[(data['date']<start_date)]
test_data = data[(data['date']>=start_date) &  (data['date']<=end_date)]
factor_mean = train_data['factor'].mean()
factor_std = train_data['factor'].std()
test_data.loc[:,['factor']] = (test_data['factor']-factor_mean)/factor_std

In [114]:
##########################################################backtest####################################################################

In [115]:
df = test_data[['date', 'ticker', ret,v]].copy()
df['long'] = df[ret][df['factor'] > 0] * df['factor'][df['factor'] > 0] 
df['short'] = df[ret][df['factor'] < 0] * df['factor'][df['factor'] < 0] 

df_tmp = df[df['factor'] > 0]
long_sum = df_tmp.groupby(['date'])['factor'].sum() # sum of long factor
y_mean = df.groupby(['date'])[ret].mean() # mean of return of all stocks for each day and mintime
aa = df.groupby(['date'])['long'].sum() # get long return
longret = aa / long_sum - y_mean # return minus avg of market

df_tmp = df[df['factor'] < 0]
short_sum = df_tmp.groupby(['date'])['factor'].sum()
y_mean = df.groupby(['date'])[ret].mean()
aa = df.groupby(['date'])['short'].sum()
shortret = - aa / short_sum + y_mean

overall_ret = longret.add(shortret,fill_value = 0)/2 # all return (market basic included)

tmp_long = longret
tmp_all = overall_ret
#tmp_all.to_csv(v + "_pnl.csv")

In [116]:
pnl_file = pd.DataFrame(tmp_all)
pnl_file = pnl_file.stack()
pnl_file = pnl_file.reset_index()
pnl_file.columns= ['date', 'tme','pnl1']
pnl_file = pnl_file.loc[:,['date','pnl1']]

longNum = df[df['factor'] > 0].groupby(['date'])['factor'].count()
shortNum = df[df['factor'] < 0].groupby(['date'])['factor'].count()
longsum = df[df['factor'] > 0].groupby(['date'])['factor'].sum()
shortsum = df[df['factor'] < 0].groupby(['date'])['factor'].sum()

df_longshort = pd.DataFrame([longsum, shortsum, longNum, shortNum]).T.reset_index()

df_longshort.columns = ['date', 'long', 'short', 'longNum', 'shortNum']

pnl_file = pd.merge(pnl_file, df_longshort, on = ['date'], how = 'left')
pnl_file['pnl'] = pnl_file['pnl1']

pnl_file = pnl_file[['date', 'long', 'short', 'longNum', 'shortNum', 'pnl']]
pnl_file = pnl_file.set_index('date')

pnl_file.to_csv(v + "_pnl1.txt")

In [117]:
tmp_all = pd.DataFrame(tmp_all,columns = ['tmp_all'])
tmp_all['average']=tmp_all.mean(axis=1)

tmp_all['year'] = [int(x[:4]) for x in tmp_all.index.tolist()]
tmp_all['date'] = tmp_all.index.tolist()

In [118]:
def get_drawdown(x):
    drawdown = 0
    down = 0
    st = x['date'].tolist()[0]
    start = x['date'].tolist()[0]
    end = x['date'].tolist()[0]
    ret_list = x['average'].tolist()
    for i in range (0, len(x)):
        if ret_list[i] < 0:
            down = down + ret_list[i]
        else:
            if down < drawdown:
                drawdown = down
                end = x['date'].tolist()[i]
                start = st
                st = x['date'].tolist()[i]
                down = 0
            else:
                down = 0
                st = x['date'].tolist()[i]
    return (drawdown, start, end)

In [119]:
output = pd.DataFrame(index = list(set(tmp_all['year'].tolist())), columns = ['from', 'to', 'return', 'pnl_per_day', 'win_rate', 'sharpe', 'drawdown'])
output['return'] = tmp_all.groupby('year')['average'].sum()
output['pnl_per_day'] = tmp_all.groupby('year')['average'].mean()
output['from'] = tmp_all.groupby('year')['date'].min()
output['to'] = tmp_all.groupby('year')['date'].max()
output['win_rate'] = tmp_all[tmp_all['average'] > 0].groupby('year')['average'].count()/tmp_all.groupby('year')['average'].count()
output['drawdown'] = tmp_all.groupby('year')['average', 'date'].apply(get_drawdown)
output[['drawdown', 'dd_start', 'dd_end']] = output['drawdown'].apply(pd.Series)
#output['dd_start'] = tmp_all.groupby('year')['average', 'date'].apply(get_drawdown)
#output['dd_end'] = tmp_all.groupby('year')['average', 'date'].apply(get_drawdown)
output['sharpe'] = tmp_all.groupby('year')['average'].mean()/tmp_all.groupby('year')['average'].std()*np.sqrt(242)
output['win/loss'] = -tmp_all[tmp_all['average'] > 0].groupby('year')['average'].mean()/tmp_all[tmp_all['average'] <0].groupby('year')['average'].mean()

sum_ret = tmp_all['average'].mean()*242
sum_pnl_per_day = tmp_all['average'].mean()
sum_winrate = tmp_all[tmp_all['average'] > 0]['average'].count()/tmp_all['average'].count()
sum_drawdown = get_drawdown(tmp_all)
sumdd_start = sum_drawdown[1]
sumdd_end = sum_drawdown[2]
sum_dd = sum_drawdown[0]
sum_sharpe = tmp_all['average'].mean()/tmp_all['average'].std()*np.sqrt(250)
sum_win_ret = -tmp_all[tmp_all['average'] > 0]['average'].mean()/tmp_all[tmp_all['average'] <0]['average'].mean()
output.loc['summary'] = [min(tmp_all['date'].tolist()), max(tmp_all['date'].tolist()), sum_ret, sum_pnl_per_day, sum_winrate, sum_sharpe, sum_dd,sumdd_start,sumdd_end,sum_win_ret]

output.sort_values(by = ['to'])


Unnamed: 0,from,to,return,pnl_per_day,win_rate,sharpe,drawdown,dd_start,dd_end,win/loss
2014,2014-01-06,2014-12-30,0.012953,5.3e-05,0.540984,0.425418,-0.019671,2014-10-31,2014-11-06,0.917545
2015,2015-01-05,2015-12-30,0.048766,0.0002,0.536885,1.351748,-0.014702,2015-07-08,2015-07-17,1.08167
2016,2016-01-04,2016-12-30,-0.033971,-0.000139,0.481633,-0.76654,-0.022489,2016-06-23,2016-06-29,0.927935
2017,2017-01-04,2017-12-29,-0.005794,-2.3e-05,0.473684,-0.20661,-0.011442,2017-08-02,2017-08-15,1.070663
2018,2018-01-04,2018-12-28,-0.033791,-0.000138,0.485714,-1.152548,-0.010239,2018-02-26,2018-03-07,0.873492
2019,2019-01-04,2019-12-30,0.004947,2.1e-05,0.53527,0.15953,-0.009124,2019-08-05,2019-08-08,0.891891
2020,2020-01-06,2020-12-30,0.04771,0.000197,0.495868,0.830275,-0.01468,2020-04-17,2020-04-22,1.201577
summary,2014-01-06,2020-12-30,0.005784,2.4e-05,0.507026,0.155931,-0.022489,2016-06-23,2016-06-29,1.000765
