In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
import statsmodels.api as sm
import statsmodels.tsa as tsa
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.stattools import coint
import scipy.stats as stats
from hurst import compute_Hc
# import copulas as cop

warnings.filterwarnings('ignore')

## 데이터 로딩(ETF)

In [18]:
etf_df = pd.read_csv('./Data/us_etf_price.csv')
etf_df['Date'] = pd.to_datetime(etf_df['Date'])
etf_df = etf_df.set_index(['Date', 'tic'])
etf_df = etf_df.unstack()

price_df = etf_df['Close']
logret_df = np.log(price_df).diff()
volume_df = etf_df['Volume']
stock_names = price_df.columns
dateindex = price_df.index

## 페어링

In [19]:
def pairing(fs, fe, price_df, logret_df):
    pass

## 다양한 metric들

In [20]:
def logspread_metric(fs, fe, ts, te, tickers, logret_df):
    
    logret_p1 = logret_df[tickers[0]].loc[fs:te]
    logret_p2 = logret_df[tickers[1]].loc[fs:te]
    
    cum_logret_p1 = logret_p1.cumsum()
    cum_logret_p2 = logret_p2.cumsum()

    logspread_ts = cum_logret_p1 - cum_logret_p2

    #시작점 0으로 만들어주기(달러중립조건)    
    logspread_ts_form = logspread_ts.loc[fs:fe] - logspread_ts.loc[fs]
    #시작점 0으로 만들어주기(달러중립조건)
    logspread_ts_trd = logspread_ts.loc[ts:te]- logspread_ts.loc[ts]

    return logspread_ts_form, logspread_ts_trd

In [21]:
def priceratio_metric(fs, fe, ts, te, tickers, price_df):

    p1 = price_df[tickers[0]].loc[fs:te]
    p2 = price_df[tickers[1]].loc[fs:te]
   
    #헷지비율 활용해 가격비율 계산
    priceratio_ts = p1 / p2
    
    #시작점 1로 만들어주기(달러중립조건)
    priceratio_ts_form = priceratio_ts.loc[fs:fe] / priceratio_ts.loc[fs]
    #시작점 1로 만들어주기(달러중립조건)
    priceratio_ts_trd = priceratio_ts.loc[ts:te] / priceratio_ts.loc[ts]

    return priceratio_ts_form, priceratio_ts_trd

In [22]:
def metric_calc(fs, fe, ts, te, pairs, price_df, logret_df):
    pairs_metric = {}

    for tickers in pairs:
        logspread_form, logspread_trd = logspread_metric(fs, fe, ts, te, tickers, logret_df)
        p_ratio_form, p_ratio_trd = priceratio_metric(fs, fe, ts, te, tickers, price_df)
        pairs_metric[tickers] = [logspread_form, logspread_trd, p_ratio_form, p_ratio_trd]

    return pairs_metric


In [23]:
# def copula_metric(fs, fe, ts, te, tickers, logret_df):
#     p1, p2 = tickers[0], tickers[1]

#     logret_p1_form = logret_df[p1].loc[fs:fe]
#     logret_p2_form = logret_df[p2].loc[fs:fe]
#     logret_p1 = logret_df[p1].loc[fs:te]
#     logret_p2 = logret_df[p2].loc[fs:te]

#     # marginal distribution이 t분포를 따른다고 가정하고 피팅
#     params_p1 = stats.t.fit(logret_p1_form)
#     dist_p1 = stats.t(*params_p1)
#     u = dist_p1.cdf(logret_p1_form)

#     params_p2 = stats.t.fit(logret_p2_form)
#     dist_p2 = stats.t(*params_p2)    
#     v = dist_p2.cdf(logret_p2_form)

#     # 2개 marginal distribution에 대해 가장 잘맞는 copula 피팅
#     best_aic = np.inf
#     best_copula = None
#     copulas = [cop.GaussianCopula(), cop.ClaytonCopula(), cop.GumbelCopula(), cop.FrankCopula(), cop.JoeCopula()]
    
#     for copula in copulas:
#         copula.fit(u,v)
#         L = copula.log_likelihood(u,v)
#         aic = 2 * copula.num_params - 2 * L
#         if aic < best_aic:
#             best_aic = aic
#             best_copula = copula
            
#     # 피팅한 copula 이용해서 조건부확률 계산(trade 기간)
#     prob_p1 = []
#     prob_p2 = []

#     for u,v in zip(dist_p1.cdf(logret_p1), dist_p2.cdf(logret_p2)):
#         prob_p1.append(best_copula.cdf_u_given_v(u,v))
#         prob_p2.append(best_copula.cdf_v_given_u(u,v))
        
#     probs_ts = pd.DataFrame(np.vstack([prob_p1, prob_p2]).T, index=logret_p1.index, columns=tickers)
#     probs_ts_form = probs_ts.loc[fs:fe]
#     probs_ts_trd = probs_ts.loc[ts:te]

#     return probs_ts_form, probs_ts_trd

## 특성 연구

## 베팅사이즈 조절 + 손절선 결정

In [24]:
def Kelly_calc(fs, fe, tickers, price_df, logspread_form, enter, close):
    p1 = price_df[tickers[0]].loc[fs:fe]
    p2 = price_df[tickers[1]].loc[fs:fe]

    #로그가격의 cointegration 유의성 강도(p_value)
    log_p1 = np.log(p1)
    log_p2 = np.log(p2)
    _, p_value, _ = coint(log_p1, log_p2, method='aeg')
    
    #로그스프레드 표준편차
    SD = logspread_form.std()

    #로그스프레드 Hurst 지수(0~1사이, 0.5보다 작을수록 평균회귀 강함)
    Hurst_exponent, _, _ = compute_Hc(logspread_form)
    
    #손절선 계산
    abs_spread = np.abs(logspread_form)
    loss_cut = np.nanpercentile(abs_spread, 99)
    
    #승리시 수익률, 패배시 손실률(로그스프레드->실제수익률 전환)
    win = np.exp(enter[0]*SD - close[0]*SD)-1 #진입-청산
    loss = 1-np.exp(enter[0]*SD - loss_cut) # 진입-손절
    
    #승패확률 어림짐작
    if p_value <= 0.02 and Hurst_exponent<=0.3:
        win_rate = 0.9
    elif p_value<=0.02 or Hurst_exponent<=0.3:
        win_rate = 0.7
    else:
        win_rate = 0.5
    loss_rate = 1-win_rate
    
    #켈리공식
    Kelly = win_rate/loss - loss_rate/win

    return Kelly

In [25]:
def weight_calc(fs, fe, pairs, pairs_metric, price_df, enter, close):
    
    Kellys = []

    for tickers in pairs:
        #켈리공식
        logspread_form = pairs_metric[tickers][0]
        Kelly = Kelly_calc(fs, fe, tickers, price_df, logspread_form, enter, close)
        Kellys.append(Kelly)    

    # 켈리공식 결과값의 비율로 weight 조절
    # weight가 0.1보다 작은 경우(음수포함) 0으로 만들고 나머지 합쳐 1되게 조정
    
    Kellys = np.array(Kellys)
    weight = Kellys/Kellys.sum()    
    weight[weight<0.1] = 0
    weight = weight/weight.sum()

    pairs_weight = {}
    
    for i in range(len(pairs)):
        pairs_weight[pairs[i]] = weight[i]

    return pairs_weight

## 시그널 검색

In [26]:
def signal_calc(pairs, pairs_metric, enter, close):
    pairs_signal = {}
    
    for tickers in pairs:
        logspread_form, logspread_trd = pairs_metric[tickers][0], pairs_metric[tickers][1]
        SD = logspread_form.std()

        #진입, 청산
        open_cut = SD*enter[0]
        close_cut = SD*close[0]
        
        #손절
        abs_spread = np.abs(logspread_form)
        loss_cut = np.nanpercentile(abs_spread, 99)

        # open, close, losscut 상태
        abs_spread = np.abs(logspread_trd)
        open_state = (abs_spread <= open_cut) # 스프레드가 open_cut 아래로 내려갈 때 잡기
        close_state = (abs_spread <= close_cut) # 스프레드가 close_cut 아래로 내려갈 때 잡기
        losscut_state = (abs_spread >= loss_cut) # loss_cut 위로 올라갈 때 잡기
        
        # open, close, losscut에 진입하는 시점
        open_signal = (open_state-open_state.shift(1) == 1)
        close_signal = (close_state-close_state.shift(1) == 1)
        losscut_signal = (losscut_state-losscut_state.shift(1) == 1)
        close_signal[-1] = True #마지막날 무조건 종료

        #시그널(1,0)을 숫자 위치로 전환
        numidx = np.arange(len(open_signal))
        open_signal = numidx[open_signal]
        close_signal = numidx[close_signal]
        losscut_signal = numidx[losscut_signal]

        pairs_signal[tickers] = (open_signal, close_signal, losscut_signal)

    return pairs_signal

In [27]:
def catch_signal(pairs, pairs_metric, enter, close):  

    pairs_signal = signal_calc(pairs, pairs_metric, enter, close)  

    trade_dates={}

    for tickers in pairs:
        logspread_trd = pairs_metric[tickers][1]
        dateindex_trd = logspread_trd.index
        o_signal, c_signal, l_signal = pairs_signal[tickers]

        trades = []

        #open 2개 사이의 close, loss signal 중 가장 작은것 찾기
        for i in range(len(o_signal)-1):
            o_date = dateindex_trd[o_signal[i]]
            c_dates = c_signal[(c_signal>o_signal[i]) * (c_signal<o_signal[i+1])]
            l_dates = l_signal[(l_signal>o_signal[i]) * (l_signal<o_signal[i+1])]
            
            #open 2개 사이에 close, loss signal 있는지 확인
            if len(c_dates) == 0 and len(l_dates) == 0:
                continue
            elif len(c_dates) == 0:
                l_date = dateindex_trd[l_dates.min()]
                trade = [o_date, l_date, 'losscut']    
                trades.append(trade)
                continue

            elif len(l_dates) == 0:
                c_date = dateindex_trd[c_dates.min()]
                trade = [o_date, c_date, 'close']
                trades.append(trade)
                continue
            
            else:
                c_date = dateindex_trd[c_dates.min()]
                l_date = dateindex_trd[l_dates.min()]

                if c_dates.min()<l_dates.min():
                    trade = [o_date, c_date, 'close']
                else:
                    trade = [o_date, l_date, 'losscut']
            trades.append(trade)    #trading period 중 거래일 모으기
        trade_dates[tickers] = trades  #페어별 거래일 입력
    
    return trade_dates

## 거래실행 및 수익률 계산

1) 달러중립 or 베타중립

2) 거래비용 처리

3) 공매비용 처리

In [28]:
def pnl_calc(pairs, pairs_metric, pairs_weight, pairs_trade, price_df):
    pairs_pnl = {}
    #매 시점 총투자액은 페어 개수(2개->2달러, 3개->3달러, ..)
    total_invest = len(pairs)

    for tickers in pairs:
        logspread_trd = pairs_metric[tickers][1]
        weight = pairs_weight[tickers]
        pair_invest = total_invest*weight

        trades = pairs_trade[tickers]
        pnls = []

        for trade in trades:
            open, close = trade[0], trade[1]
            holding_days = (close-open).days
            
            if logspread_trd.loc[open]>0: 
                long, short = tickers[0], tickers[1] # 스프레드>0이면 앞에꺼 숏 뒤에꺼 롱
            else: 
                long, short = tickers[1], tickers[0] # 스프레드<0이면 앞에꺼 롱 뒤에꺼 숏

            #롱/숏 거래가격(진입/청산/손절선 지나야지 시그널 나오므로 시그널 있는 날 종가 사용)
            long_buy, long_sell = price_df.loc[open, long], price_df.loc[close, long]
            short_buy, short_sell = price_df.loc[open, short], price_df.loc[close, short]
            
            #롱/숏 주식 개수
            long_shares, short_shares = pair_invest/long_buy, pair_invest/short_buy

            # 롱/숏 포지션에 pair_invest 달러만큼 투자할 때(달러중립) 손익
            long_pnl = (long_sell-long_buy)*long_shares
            short_pnl = (short_buy-short_sell)*short_shares
            
            # 거래비용은 거래가격의 0.2%, 공매비용 연 5%
            trading_cost = (long_buy+long_sell)*0.002*long_shares *  + (short_buy+short_sell)*0.002*short_shares
            short_cost = holding_days/240*0.05

            # 전체 손익 = 롱 + 숏 - 거래비용 - 공매비용
            total_pnl = long_pnl + short_pnl - trading_cost - short_cost

            temp = [open, close, holding_days, long, short, long_shares, short_shares, total_pnl, long_pnl, short_pnl, -trading_cost, -short_cost]
            pnls.append(temp) # 거래관련 정보들 모으기

        pairs_pnl[tickers] = pnls # 페어별 거래정보 dict 입력
    
    result_df = pd.DataFrame([])
    cols = ['open', 'close', 'holding_days', 'long', 'short', 'long_shares', 'short_shares', 'pnl', 'long_pnl', 'short_pnl', 'trading_cost', 'short_cost']

    for tickers in pairs:
        df = pd.DataFrame(pairs_pnl[tickers], columns = cols)
        df['pair'] = str(tickers[0]) + '_' + str(tickers[1])
        result_df = pd.concat([result_df, df], axis=0)

    return result_df

In [29]:
# def total_account(pairs, pairs_metric, pairs_result, price_df):
#     logspread_trd = pairs_metric[pairs[0]][1]
#     dateindex_trd = logspread_trd.index #전체 거래기간
    
#     price_trd = price_df.loc[dateindex_trd]

#     longs = []
#     shorts = []

#     for tickers in pairs:
#         longs.append(tickers[0])
#         shorts.append(tickers[0])

#     cols = ['ticker', 'shares', 'price', 'position size', 't_cost', 'short_cost', 'PNL']
#     account_df = pd.DataFrame([], columns = cols, index = dateindex_trd)

#     for tickers in pairs:
#         tickers_result = pairs_result[pairs_result['pair'] == tickers]
        
#         for result in tickers_result:
#             open, close, h_days, long, short, long_shares, short_shares = result[['open', 'close', 'holding_days', 'long', 'short', 'long_shares', 'short_shares']].values
            
#             long_price = price_df.loc[open:close, long]
#             short_price = price_df.loc[open:close, short]
#             dateindex_trd_pair = long_price.index

#             long_shares = np.ones(h_days)*long_shares
#             short_shares = np.ones(h_days)*short_shares            
            
#             long_shares = pd.Series(long_shares, index = dateindex_trd_pair)
#             short_shares = pd.Series(-short_shares, index = dateindex_trd_pair)

#             t_cost = np.zeros(h_days)
#             t_cost[0] = 

#             short_cost = 
#             short_cost[0] = 

#     return account_result

## 백테스트

### 구간설정

In [30]:
def interval_split(dateindex, formation_period, trading_period, mode, interval_num):
    T = len(dateindex)
    number_index = np.arange(T)

    date_start = formation_period
    date_end = T - trading_period
    
    if mode == 'sliding': # sliding window
        train_cycle = int((T - formation_period - trading_period)/(interval_num-1))
        startpoint_list = number_index[date_start:date_end:train_cycle]
        
    if mode == 'random':
        startpoint_list = np.random.choice(number_index, size = interval_num, replace=False)

    intervals = []

    for num_idx in startpoint_list:
        form_start = dateindex[num_idx-formation_period]
        form_end = dateindex[num_idx-1]
        trd_start = dateindex[num_idx]
        trd_end = dateindex[num_idx+trading_period]
        interval = (form_start, form_end, trd_start, trd_end)
        intervals.append(interval)
        
    return intervals

In [31]:
def train_test_split(dateindex, train_test_ratio, formation_period, trading_period, mode='sliding', train_num=20, test_num=5):
    T = len(dateindex)
    T_train = int(T * train_test_ratio)

    dateindex_train = dateindex[:T_train] 
    dateindex_test = dateindex[T_train:]

    #훈련구간 앞뒤로 formation, trading 기간 떼어놓고 시작지점 선택
    train_intervals = interval_split(dateindex_train, formation_period, trading_period, mode, train_num)
    test_intervals = interval_split(dateindex_test, formation_period, trading_period, mode, test_num)

    return train_intervals, test_intervals

### 백테스트 실행

In [32]:
#입력변수들
dateindex = price_df.index[-1500:]
formation_period = 180
trading_period = 90
train_test_ratio = 0.8
train_num=10
test_num=5

#진입청산 포인트(로그스프레드, 가격비율)
enter=[1.5, 1.02]
close=[0.5, 1]

#train, test 인터벌나누기
train_intervals, test_intervals = train_test_split(dateindex, train_test_ratio, formation_period, trading_period, 'sliding', train_num, test_num)

interval_results_pair = pd.DataFrame()
interval_num = 1

#각 인터벌에 대한 페어구성, 스프레드계산, 포지션설정, 거래일결정
for interval in train_intervals:
    fs, fe, ts, te = interval
    interval_num += 1

    # 주어진 기간에서 페어 뽑아서 리스트 만들기, 페어 리스트는 [(a1, a2), (b1, b2), ...] 형태
    pairs = pairing(fs, fe, price_df, logret_df) 
    
    # 페어링 알고리즘 없어서 임시로 비슷한 섹터/스타일 3개 페어 뽑아보았음
    pairs = [('XLV', 'VHT'), ('USMV', 'SCHX'), ('IWF', 'IWM')]
    pairs_metric = metric_calc(fs, fe, ts, te, pairs, price_df, logret_df) #페어 기초통계량 계산
    pairs_weight = weight_calc(fs, fe, pairs, pairs_metric, price_df, enter, close) #페어별 포지션사이즈 결정
    
    #동일가중 평균 사용할 경우 equal weight
    pairs_equal_weight = {}
    for tickers in pairs:
        pairs_equal_weight[tickers] = 1/len(pairs)

    pairs_trade = catch_signal(pairs, pairs_metric, enter, close) #페어별 거래날짜(진입, 청산, 손절) 결정
    
    pairs_result = pnl_calc(pairs, pairs_metric, pairs_weight, pairs_trade, price_df) #페어별 손익 계산
    pairs_result['interval_num'] = interval_num

    interval_results_pair = pd.concat([interval_results_pair, pairs_result], axis=0) # 모든 인터벌에 대해
    

    # account_result = total_account(pairs, pairs_metric, pairs_result, price_df)
    # interval_results_account = pd.concat(interval_results_account, account_result, axis=0) # 모든 인터벌에 대해 


In [33]:
interval_results_pair

Unnamed: 0,open,close,holding_days,long,short,long_shares,short_shares,pnl,long_pnl,short_pnl,trading_cost,short_cost,pair,interval_num
0,2017-10-10,2017-10-25,15,VHT,XLV,0.00555,0.010333,-0.006026,0.001554,-0.004443,-1.2e-05,-0.003125,XLV_VHT,2
0,2017-08-30,2017-09-01,2,USMV,SCHX,0.042878,0.073073,-0.009516,0.008147,-0.017172,-7.4e-05,-0.000417,USMV_SCHX,2
1,2017-09-11,2017-09-13,2,USMV,SCHX,0.042261,0.072176,-0.016573,-0.006339,-0.009744,-7.4e-05,-0.000417,USMV_SCHX,2
2,2017-10-19,2017-11-06,18,SCHX,USMV,0.070251,0.041801,0.021818,0.025642,0.0,-7.4e-05,-0.00375,USMV_SCHX,2
0,2018-01-24,2018-01-26,2,VHT,XLV,0.0,0.0,-0.000417,0.0,-0.0,-0.0,-0.000417,XLV_VHT,3
0,2018-11-08,2018-11-09,1,SCHX,USMV,0.034963,0.020705,-0.010797,-0.011188,0.000621,-2.2e-05,-0.000208,USMV_SCHX,5
1,2018-11-26,2018-11-28,2,USMV,SCHX,0.021223,0.036619,-0.007121,0.023346,-0.030028,-2.2e-05,-0.000417,USMV_SCHX,5
0,2019-11-19,2019-11-29,10,XLV,VHT,0.01757,0.009414,-0.009542,0.029869,-0.03728,-4.8e-05,-0.002083,XLV_VHT,7
0,2020-02-21,2020-02-26,5,VHT,XLV,0.009612,0.018198,-0.006415,-0.120147,0.114827,-5.3e-05,-0.001042,XLV_VHT,8
1,2020-03-23,2020-03-27,4,XLV,VHT,0.025223,0.013448,-0.007483,0.261567,-0.268152,-6.5e-05,-0.000833,XLV_VHT,8
