In [1]:
import pandas as pd
import numpy as np
import warnings
import pyodbc
import random
import os
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from collections import Counter
import multiprocessing as mp
import ray
import datetime

warnings.filterwarnings(action = 'ignore')

2023-12-07 15:47:37,382	INFO util.py:159 -- Outdated packages:
  ipywidgets==7.6.3 found, needs ipywidgets>=8
Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


#### 유전 알고리즘 활용한 페어 트레이딩_섹터 확장_v1

* v8
 - 비중 제한 없애기
 - child 생성 시 볼린저 밴드 계산 기간과 비중 모두 BLX-alpha crossover 적용
 - mutation 시 랜덤넘버 생성을 통해 각 파라미터에 더하기
 - 변이 이후 수선 기능 추가. 1) 롱/숏 비중 각 1로 맞추기
 - train set 기간 0.7에 대해 시뮬레이션 해보기
 - 정상성 테스트 제외: rolling base 기반 표준화를 사용하기 때문

* v2
 - 시간청산 추가

In [2]:
conn_pcor = pyodbc.connect('driver={Oracle in OraClient18Home1};dbq=PCOR;uid=EF0SEL;pwd=EF0SEL#076')
conn_quant = pyodbc.connect('driver={SQL Server};server=46.2.90.172;database=quant;uid=index;pwd=samsung@00')
conn_wisefn = pyodbc.connect('driver={SQL Server};server=46.2.90.172;database=wisefn;uid=index;pwd=samsung@00')

In [3]:
os.chdir("..")
os.chdir("..")

os.chdir('00_data')

#df_const = pd.read_json('230927_index_constituent.json')
df_prc_raw = pd.read_json('230927_stk_prc_daily.json')
#df_turnover = pd.read_json('230927_stk_turnover.json')
#df_sector = pd.read_json('230927_stk_sector.json')
df_mktcap = pd.read_json('230927_stk_mktcap.json')
#df_turnover_daily = pd.read_json('231011_stk_turnover_daily.json')

os.chdir("..")
os.chdir('02_Trading Strategies')
os.chdir('231026_유전 알고리즘을 활용한 페어트레이딩')

In [4]:
sector_code = 'IKS013'

In [5]:
sql_dt = f'''
        SELECT TRD_DT, CLOSE_PRC
        FROM TS_IDX_DAILY
        WHERE 1=1
        AND SEC_CD = '{sector_code}'
        AND TRD_DT > '20091231'
        ORDER BY TRD_DT ASC
'''

df_dt = pd.read_sql(sql_dt, conn_wisefn)

In [6]:
df_dt['TRD_DT'] =  pd.to_datetime(df_dt['TRD_DT'])
df_dt['M'] = df_dt['TRD_DT'].dt.to_period('M')
dt_m = [max(df_dt[df_dt['M']==p]['TRD_DT']) for p in df_dt['M'].unique()]
dt_m = [int(x.strftime('%Y%m%d')) for x in dt_m]

In [27]:
def objective_func(df, params):
    
    #stk_list = sorted(list(set(df['COMP_CD'])))
    
    params_period = params[0]
    params_init = params[1]
    params_exit = params[2]
    params_wt = params[3:]

    df_sp = np.log(df) * params_wt
    df_sp = df_sp.sum(axis=1)
    
    df_sp_norm = (df_sp - df_sp.rolling(params_period).mean())/df_sp.rolling(params_period).std()
    df_sp_norm.dropna(axis=0, inplace=True)
    
    init_dt = []
    exit_dt = []
    init_yn = 0
    for i in range(len(df_sp_norm)):
        if init_yn == 0:
            if df_sp_norm.iloc[i] < params_init:
                init_dt.append(df_sp_norm.index[i])
                init_yn = 1
        
        elif init_yn == 1:
            if df_sp_norm.iloc[i] > params_exit:
                exit_dt.append(df_sp_norm.index[i])
                init_yn = 0
        
    if len(init_dt) > len(exit_dt):
        if len(init_dt) - len(exit_dt) > 1:
            print("error")
        else:
            init_dt = init_dt[:-1]
            
    
    ## 시간청산 기준
    for i in range(len(init_dt)):
        init_idx = list(df_sp_norm.index).index(init_dt[i])
        exit_idx = list(df_sp_norm.index).index(exit_dt[i])

        if exit_idx - init_idx > 250:
            exit_dt[i] = df_sp_norm.index[init_idx + 250]

            
    
    ret_list = []
    cum_ret = 1
    for i in range(len(init_dt)):
        dt_1 = init_dt[i]
        dt_2 = exit_dt[i]
        
        prc_1 = df.loc[dt_1]
        prc_2 = df.loc[dt_2]
        
        ret = prc_2/prc_1 - 1
        ret = (ret * params_wt).sum()
        ret_list.append(ret)
        cum_ret *= (1+ret)
    
    return cum_ret

In [28]:
def generate_weight(length):
    
    pos_len = random.randint(1,length - 1)
    neg_len = length - pos_len
    
    pos_wt = []
    for i in range(pos_len):
        p = random.uniform(0,1)
        pos_wt.append(p)
    
    pos_wt = [x/sum(pos_wt) for x in pos_wt]
    
    neg_wt = []
    for i in range(neg_len):
        n = random.uniform(0,1)
        neg_wt.append(n)
    
    neg_wt = [-x/sum(neg_wt) for x in neg_wt]
    
    wt = pos_wt + neg_wt
    
    random.shuffle(wt)
    
    return wt  
        

def generate_params(length):
    
    params_period = random.randint(20,20*36)
    params_init = random.uniform(-4,0)
    params_exit = random.uniform(0,4)
    params_wt = generate_weight(length)
    
    params = [params_period, params_init, params_exit, *params_wt]
            
    return params
    

In [29]:
def generate_pop(size, length):
    population = []
    
    for i in range(size):
        
        params = generate_params(length)
        population.append(params)
        
    return population

In [30]:
def compute_performance(df, population):
    
    obj_list = []
    for individual in population:
        obj = objective_func(df, individual)
        obj_list.append([individual,obj])
    
    pop_sorted = sorted(obj_list, key=lambda x: x[1], reverse=True)
    
    return pop_sorted

In [31]:
def select_survivors(population_sorted, best_sample, lucky_few, length):
    
    next_gen = []
    
    for i in range(best_sample):
        if population_sorted[i][1] > 0:
            next_gen.append(population_sorted[i][0])
    
    lucky_index = np.random.choice(list(range(len(population_sorted))), lucky_few, replace=False)
    for i in lucky_index:
        next_gen.append(population_sorted[i][0])
    
    next_gen_2 = []
    for item in next_gen:
        if item not in next_gen_2:
            next_gen_2.append(item)
    
    
    while len(next_gen_2) < best_sample + lucky_few:
        next_gen_2.append(generate_params(length))
        
    random.shuffle(next_gen_2)
    
    return next_gen_2

In [32]:
def create_child(param1, param2, alpha):
    
    child = param1.copy()
    
    for i in range(len(child)):
        
        dist = abs(param1[i]-param2[i])
        
        l = min(param1[i],param2[i]) - alpha * dist
        u = max(param1[i],param2[i]) + alpha * dist

        child[i] = l + random.random() * (u-l)
    
    child[0] = int(np.round(child[0]))
    child[0] = min(20*36, child[0])
    child[0] = max(20, child[0])
    
    return child


def create_children(df, population, n_child, alpha, r_param):
    
    k = 0
    children = []

    fitness = [objective_func(df, x) for x in population]
    roulette = [x - min(fitness) + (max(fitness) - min(fitness))/(r_param - 1) for x in fitness]
    #fitness_2 = [x if x >= 0 else 0 for x in fitness]
    
    prob = [x/sum(roulette) for x in roulette]
    
    while k < n_child:
        try:
            sample = np.random.choice(list(range(len(prob))), 2, replace=True, p = prob)
        except:
            sample = np.random.choice(list(range(len(prob))), 2, replace=True)
        parents = [population[sample[0]], population[sample[1]]]
        
        child = create_child(parents[0], parents[1], alpha)
        children.append(child)
        
        k += 1

    return children

In [33]:
def mutation1(param, prob):
    
    r1 = random.random()
    
    param_mutate = param.copy()
    
    if r1 < prob:
        m1 = np.random.normal(0, 20)
        new_period = m1 + param[0]
        new_period = max(20, new_period)
        new_period = min(20*36, new_period)
        
        param_mutate[0] = int(np.round(new_period))
        
        m2 = np.random.normal(0,0.02,len(param) - 1)
        param_mutate[1:] = [x+y for (x,y) in zip(param_mutate[1:], m2)]
    
    return param_mutate


################################################
def mutate_pop(population, prob1):
    
    for i in range(len(population)):
        population[i] = mutation1(population[i], prob1)
    
    return population

In [34]:
def fix_1(population, length):
    
    # 수선: 비중 조절
    
    fixed_pop = []
    
    for child in population:
    
        child_wt = child[3:]
        child_pos = [x if x >= 0 else 0 for x in child_wt]
        child_neg = [x if x < 0 else 0 for x in child_wt]

        try:
            child_pos_adj = [x/sum(child_pos) for x in child_pos]
            child_neg_adj = [-x/sum(child_neg) for x in child_neg]
            child_wt_adj = [x+y for (x,y) in zip(child_pos_adj, child_neg_adj)]
            adj_child = child[:3] + child_wt_adj
        
        except:
            new_wt = generate_params(length)[3:]
            adj_child = child[:3] + new_wt
        
        fixed_pop.append(adj_child)
    
    return fixed_pop

In [35]:
def simulation_func(df, params):
    
    #stk_list = sorted(list(set(df['COMP_CD'])))
    
    params_period = params[0]
    params_init = params[1]
    params_exit = params[2]
    params_wt = params[3:]

    df_sp = np.log(df) * params_wt
    df_sp = df_sp.sum(axis=1)
    
    df_sp_norm = (df_sp - df_sp.rolling(params_period).mean())/df_sp.rolling(params_period).std()
    df_sp_norm.dropna(axis=0, inplace=True)
    
    init_dt = []
    exit_dt = []
    init_yn = 0
    for i in range(len(df_sp_norm)):
        if init_yn == 0:
            if df_sp_norm.iloc[i] < params_init:
                init_dt.append(df_sp_norm.index[i])
                init_yn = 1
        
        elif init_yn == 1:
            if df_sp_norm.iloc[i] > params_exit:
                exit_dt.append(df_sp_norm.index[i])
                init_yn = 0
        
    if len(init_dt) > len(exit_dt):
        if len(init_dt) - len(exit_dt) > 1:
            print("error")
        else:
            init_dt = init_dt[:-1]
    
    settle_type_list = []
    ## 시간청산 기준

    for i in range(len(init_dt)):
        init_idx = list(df_sp_norm.index).index(init_dt[i])
        exit_idx = list(df_sp_norm.index).index(exit_dt[i])

        if exit_idx - init_idx > 250:
            exit_dt[i] = df_sp_norm.index[init_idx + 250]
            settle_type_list.append('time limit')
        else:
            settle_type_list.append('reversion')
    
    
    ret_list = []
    long_ret_list = []
    short_ret_list = []
    cum_ret = 1
    for i in range(len(init_dt)):
        dt_1 = init_dt[i]
        dt_2 = exit_dt[i]
        
        prc_1 = df.loc[dt_1]
        prc_2 = df.loc[dt_2]

        
        ret = prc_2/prc_1 - 1
        ret_ls = (ret * params_wt).sum()
        ret_list.append(ret_ls)        
        cum_ret *= (1+ret_ls)
        
        long_wt = [x if x >= 0 else 0 for x in params_wt]
        long_ret = (ret * long_wt).sum()
        long_ret_list.append(long_ret)
        
        short_wt = [x if x < 0 else 0 for x in params_wt]
        short_ret = (ret * short_wt).sum()
        short_ret_list.append(short_ret)
        
        
    
    return cum_ret, init_dt, exit_dt, ret_list, long_ret_list, short_ret_list, settle_type_list

In [17]:
sector_code_list = ['IKS005', 'IKS006', 'IKS007', 'IKS008', 'IKS009', 'IKS010', 'IKS011', 'IKS012', 
                    'IKS013', 'IKS014', 'IKS015', 'IKS016', 'IKS017', 'IKS018', 'IKS019', 'IKS020', 
                    'IKS024', 'IKS025', 'IKS026']

In [18]:
sector_name_list = ['음식료품', '섬유의복', '종이목재', '화학', '의약품', '비금속광물', '철강금속', 
                    '기계', '전기전자', '의료정밀', '운수장비', '유통업', '전기가스업', '건설업', 
                    '운수창고업', '통신업', '증권', '보험업', '서비스업']

In [19]:
dict_sector_name = dict(zip(sector_code_list, sector_name_list))

In [20]:
i = 0.70
cap_hurdle = 200 * 1000000000
max_stk_cnt = 10

base_d = dt_m[int(np.round(len(dt_m)*i,0))]
train_begin_dt = dt_m[0]
train_end_dt = base_d

cap_filter = df_mktcap.copy()
cap_filter = cap_filter[cap_filter['BASE_D'] == base_d]
cap_filter = cap_filter[cap_filter['MKTCAP'] > cap_hurdle]
cap_filter.sort_values('MKTCAP', ascending = False, inplace = True)

dict_train = {}
dict_test = {}
dict_stk = {}

for sector_code in sector_code_list:
    
    sql_const = f'''
                SELECT TRD_DT, CONCAT('A',STK_CD) COMP_CD, STK_NM_KOR COMP_NM
                FROM TS_STK_ISSUE
                WHERE 1=1
                AND KS200_TYP = 1
                AND TRD_DT = '{base_d}'
                AND KSC_CD = '{sector_code}'
    '''

    df_const = pd.read_sql(sql_const, conn_wisefn)

    df_train = df_prc_raw.copy()
    df_train = df_train[df_train['TRD_DT'] >= train_begin_dt]
    df_train = df_train[df_train['TRD_DT'] <= train_end_dt]
    df_train = df_train[df_train['COMP_CD'].isin(df_const['COMP_CD'])]
    df_train = df_train.sort_values(by = ['TRD_DT','COMP_CD'])
    #stk_list = sorted(list(set(df_train['COMP_CD'])))


    df_train = df_train.pivot(index = 'TRD_DT', columns = 'COMP_CD')
    df_train = df_train.droplevel(axis = 1, level = 0)
    #df_train = df_train[stk_list]
    df_train.dropna(axis=1, inplace=True)
    stk_list = sorted(list(df_train.columns))
    
    # 시가총액 필터링
    stk_list = cap_filter[cap_filter['COMP_CD'].isin(stk_list)]['COMP_CD'][:max_stk_cnt].to_list()

    df_train = df_train[stk_list]
    
    df_test = df_prc_raw.copy()
    df_test = df_test[df_test['TRD_DT'] >= train_end_dt]
    #df_test = df_prc[df_prc['TRD_DT'] <= train_end_dt]
    df_test = df_test[df_test['COMP_CD'].isin(df_const['COMP_CD'])]
    df_test = df_test.sort_values(by = ['TRD_DT','COMP_CD'])

    df_test = df_test.pivot(index = 'TRD_DT', columns = 'COMP_CD')
    df_test = df_test.droplevel(axis = 1, level = 0)
    df_test = df_test[stk_list]
    
    dict_train[sector_code] = df_train
    dict_test[sector_code] = df_test
    dict_stk[sector_code] = stk_list

#### 병렬처리

In [21]:
ray.init(num_cpus=8)

2023-12-07 15:53:12,397	INFO worker.py:1664 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m


0,1
Python version:,3.8.8
Ray version:,2.8.0
Dashboard:,http://127.0.0.1:8265


In [22]:
filtered_sector = []
for sector_code in dict_stk.keys():
    if len(dict_stk[sector_code]) < 2:
        pass
    else:
        filtered_sector.append(sector_code)

In [23]:
dict_train_2 = {k:dict_train[k] for k in filtered_sector if k in dict_train}
dict_test_2 = {k:dict_test[k] for k in filtered_sector if k in dict_test}
dict_stk_2 = {k:dict_stk[k] for k in filtered_sector if k in dict_stk}
dict_sector_name_2 = {k:dict_sector_name[k] for k in filtered_sector if k in dict_sector_name}

In [36]:
@ray.remote
def get_optimized_params(sector_code, dict_train, dict_test, dict_stk, dict_sector_name):
    
    print(f'섹터: {dict_sector_name[sector_code]}')
    #print(' ')
    
    df_train = dict_train[sector_code]
    df_test = dict_test[sector_code]
    stk_list = dict_stk[sector_code]
    
    
    #print(f'  Train set: {train_begin_dt} ~ {train_end_dt}')
    #print(f'  Test set: {train_end_dt} ~ {max(df_test.index)}')
    #print(' ')
    #print(f'  유전 알고리즘 시작')
    n_generation = 200
    population = 1000
    best_sample = int(population/4)
    lucky_few = int(population/4)
    prob1 = 0.01
    #prob2 = 0.01
    alpha = 0.10
    r_param = 4
    param_length = len(stk_list)

    pop = generate_pop(size = population, length = param_length)

    best_gene = []
    best_perf = []

    g = 0
    
    cut = int(np.round(len(df_train)/3))

    while True:
        
        r = random.randint(1,len(df_train) - cut)
        
        df_train_sub = df_train.iloc[r:r+cut]

        pop_sorted = compute_performance(df_train_sub, pop)

        survivors = select_survivors(pop_sorted, best_sample, lucky_few, param_length)

        children = create_children(df_train_sub, survivors, population, alpha, r_param)

        new_generation = mutate_pop(children, prob1)

        if len(new_generation) < population:
            #print(f'  세대수 부족: {len(new_generation)}')
            break

        fixed_generation = fix_1(new_generation, param_length)

        pop = fixed_generation
        best_gene.append(pop_sorted[0][0])
        best_perf.append(pop_sorted[0][1])


        #print(f'====== {g}th generation ends ======')
        #print(pop_sorted[0])


        pop_wt = [x[0][3:] for x in pop_sorted]
        dist_wt = [sum([(x-y)**2 for (x,y) in zip(pop_wt[0],z)]) for z in pop_wt]
        hurdle = 0.025
        dist_wt_hurdle = [1 for x in dist_wt if x < hurdle]

        #print(f'    최다 출현 개체수: {sum(dist_wt_hurdle)}')
        if sum(dist_wt_hurdle) > population * 0.7:
            break

        g += 1

        if g > n_generation:
            break

        print(' ')

    #print(f'  유전 알고리즘 종료')
    #print(' ')
    
    return pop_sorted[0][0], g

In [37]:
futures = [get_optimized_params.remote(sector_code, dict_train_2, dict_test_2, dict_stk_2, dict_sector_name_2) for sector_code in filtered_sector]

[36m(get_optimized_params pid=32672)[0m 섹터: 전기전자
[36m(get_optimized_params pid=4724)[0m 섹터: 음식료품
[36m(get_optimized_params pid=12456)[0m 섹터: 섬유의복
[36m(get_optimized_params pid=17736)[0m 섹터: 화학
[36m(get_optimized_params pid=32740)[0m 섹터: 비금속광물
[36m(get_optimized_params pid=20196)[0m 섹터: 의약품
[36m(get_optimized_params pid=31076)[0m 섹터: 철강금속
[36m(get_optimized_params pid=24276)[0m 섹터: 기계


In [38]:
t1 = datetime.datetime.today()
optimized_params = ray.get(futures)

t2 = datetime.datetime.today()

lt = (t2 - t1).seconds

[36m(get_optimized_params pid=24276)[0m  
[36m(get_optimized_params pid=24276)[0m  [32m [repeated 8x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/ray-logging.html#log-deduplication for more options.)[0m
[36m(get_optimized_params pid=24276)[0m  [32m [repeated 8x across cluster][0m
[36m(get_optimized_params pid=24276)[0m  [32m [repeated 8x across cluster][0m
[36m(get_optimized_params pid=24276)[0m  [32m [repeated 8x across cluster][0m
[36m(get_optimized_params pid=32740)[0m  [32m [repeated 8x across cluster][0m
[36m(get_optimized_params pid=32740)[0m  [32m [repeated 8x across cluster][0m
[36m(get_optimized_params pid=4724)[0m  [32m [repeated 6x across cluster][0m
[36m(get_optimized_params pid=32740)[0m  [32m [repeated 2x across cluster][0m
[36m(get_optimized_params pid=32672)[0m  [32m [repeated 4x across cluster][0m
[36m(get_optimized_para

[36m(get_optimized_params pid=4724)[0m  [32m [repeated 3x across cluster][0m
[36m(get_optimized_params pid=24276)[0m  [32m [repeated 5x across cluster][0m
[36m(get_optimized_params pid=12456)[0m  [32m [repeated 3x across cluster][0m
[36m(get_optimized_params pid=24276)[0m  [32m [repeated 5x across cluster][0m
[36m(get_optimized_params pid=20196)[0m  [32m [repeated 2x across cluster][0m
[36m(get_optimized_params pid=31076)[0m  [32m [repeated 7x across cluster][0m
[36m(get_optimized_params pid=20196)[0m  
[36m(get_optimized_params pid=12456)[0m  
[36m(get_optimized_params pid=31076)[0m  [32m [repeated 6x across cluster][0m
[36m(get_optimized_params pid=12456)[0m 섹터: 운수장비
[36m(get_optimized_params pid=24276)[0m  [32m [repeated 3x across cluster][0m
[36m(get_optimized_params pid=4724)[0m  [32m [repeated 3x across cluster][0m
[36m(get_optimized_params pid=24276)[0m  [32m [repeated 5x across cluster][0m
[36m(get_optimized_params pid=12456)[0m  

[36m(get_optimized_params pid=32672)[0m  [32m [repeated 3x across cluster][0m
[36m(get_optimized_params pid=12456)[0m 섹터: 건설업
[36m(get_optimized_params pid=17736)[0m  [32m [repeated 2x across cluster][0m
[36m(get_optimized_params pid=32672)[0m  [32m [repeated 5x across cluster][0m
[36m(get_optimized_params pid=20196)[0m  [32m [repeated 3x across cluster][0m
[36m(get_optimized_params pid=32672)[0m  [32m [repeated 5x across cluster][0m
[36m(get_optimized_params pid=12456)[0m  
[36m(get_optimized_params pid=20196)[0m  
[36m(get_optimized_params pid=17736)[0m  [32m [repeated 3x across cluster][0m
[36m(get_optimized_params pid=20196)[0m  [32m [repeated 4x across cluster][0m
[36m(get_optimized_params pid=32672)[0m  [32m [repeated 3x across cluster][0m
[36m(get_optimized_params pid=24276)[0m  [32m [repeated 6x across cluster][0m
[36m(get_optimized_params pid=31076)[0m  [32m [repeated 3x across cluster][0m
[36m(get_optimized_params pid=32740)[0m 

[36m(get_optimized_params pid=4724)[0m  [32m [repeated 4x across cluster][0m
[36m(get_optimized_params pid=32740)[0m  [32m [repeated 5x across cluster][0m
[36m(get_optimized_params pid=17736)[0m  [32m [repeated 2x across cluster][0m
[36m(get_optimized_params pid=32672)[0m  [32m [repeated 4x across cluster][0m
[36m(get_optimized_params pid=4724)[0m  [32m [repeated 4x across cluster][0m
[36m(get_optimized_params pid=12456)[0m  [32m [repeated 7x across cluster][0m
[36m(get_optimized_params pid=32672)[0m  [32m [repeated 2x across cluster][0m
[36m(get_optimized_params pid=12456)[0m  [32m [repeated 6x across cluster][0m
[36m(get_optimized_params pid=17736)[0m  [32m [repeated 7x across cluster][0m
[36m(get_optimized_params pid=4724)[0m  
[36m(get_optimized_params pid=32672)[0m  
[36m(get_optimized_params pid=24276)[0m  [32m [repeated 7x across cluster][0m
[36m(get_optimized_params pid=17736)[0m  [32m [repeated 6x across cluster][0m
[36m(get_opt

[36m(get_optimized_params pid=20196)[0m  [32m [repeated 3x across cluster][0m
[36m(get_optimized_params pid=24276)[0m  [32m [repeated 3x across cluster][0m
[36m(get_optimized_params pid=20196)[0m  [32m [repeated 4x across cluster][0m
[36m(get_optimized_params pid=32672)[0m  [32m [repeated 2x across cluster][0m
[36m(get_optimized_params pid=31076)[0m  [32m [repeated 4x across cluster][0m
[36m(get_optimized_params pid=24276)[0m  
[36m(get_optimized_params pid=4724)[0m  
[36m(get_optimized_params pid=4724)[0m  [32m [repeated 5x across cluster][0m
[36m(get_optimized_params pid=20196)[0m  [32m [repeated 2x across cluster][0m
[36m(get_optimized_params pid=24276)[0m  [32m [repeated 4x across cluster][0m
[36m(get_optimized_params pid=31076)[0m  [32m [repeated 5x across cluster][0m
[36m(get_optimized_params pid=4724)[0m  [32m [repeated 4x across cluster][0m
[36m(get_optimized_params pid=32672)[0m  [32m [repeated 2x across cluster][0m
[36m(get_opt

[36m(get_optimized_params pid=4724)[0m  [32m [repeated 2x across cluster][0m
[36m(get_optimized_params pid=20196)[0m  
[36m(get_optimized_params pid=4724)[0m  
[36m(get_optimized_params pid=20196)[0m  
[36m(get_optimized_params pid=4724)[0m  
[36m(get_optimized_params pid=4724)[0m  
[36m(get_optimized_params pid=20196)[0m  
[36m(get_optimized_params pid=4724)[0m  
[36m(get_optimized_params pid=20196)[0m  
[36m(get_optimized_params pid=4724)[0m  
[36m(get_optimized_params pid=20196)[0m  
[36m(get_optimized_params pid=20196)[0m  [32m [repeated 2x across cluster][0m
[36m(get_optimized_params pid=20196)[0m  [32m [repeated 2x across cluster][0m
[36m(get_optimized_params pid=4724)[0m  [32m [repeated 2x across cluster][0m
[36m(get_optimized_params pid=4724)[0m  [32m [repeated 2x across cluster][0m
[36m(get_optimized_params pid=4724)[0m  [32m [repeated 2x across cluster][0m
[36m(get_optimized_params pid=20196)[0m  
[36m(get_optimized_params pid=47

In [39]:
lt/60

59.416666666666664

In [40]:
optimized_params_1 = dict(zip(filtered_sector,[x[0] for x in optimized_params]))
optimized_params_2 = dict(zip(filtered_sector,[x[1] for x in optimized_params]))

In [99]:
result_train = {}
result_test = {}
list_summary = []

for sector_code in filtered_sector:
    
    if len(dict_stk[sector_code]) == 0:
        continue
        
    params = optimized_params_1[sector_code]
    df_train = dict_train[sector_code]
    df_test = dict_test[sector_code]
    
    temp_train = simulation_func(df_train, params)
    temp_test = simulation_func(df_test, params)
    
    ret_train = pd.DataFrame([temp_train[1], temp_train[2], temp_train[3],temp_train[4],temp_train[5]], 
                             index = ['init_dt','exit_dt','ls','long','short']).T
    ret_test = pd.DataFrame([temp_test[1], temp_test[2], temp_test[3],temp_test[4],temp_test[5]], 
                            index = ['init_dt','exit_dt','ls','long','short']).T
    
    ret_train['init_dt'] = pd.to_datetime([str(int(x)) for x in ret_train['init_dt']])
    ret_train['exit_dt'] = pd.to_datetime([str(int(x)) for x in ret_train['exit_dt']])
    ret_test['init_dt'] = pd.to_datetime([str(int(x)) for x in ret_test['init_dt']])
    ret_test['exit_dt'] = pd.to_datetime([str(int(x)) for x in ret_test['exit_dt']])
    
    ret_train['holding period'] = ret_train['exit_dt'] - ret_train['init_dt']
    ret_test['holding period'] = ret_test['exit_dt'] - ret_test['init_dt']
    
    ret_train['settle'] = temp_train[-1]
    ret_test['settle'] = temp_test[-1]
    
    result_train[sector_code] = ret_train
    result_test[sector_code] = ret_test
    
    
    case_train = len(ret_train)
    
    if case_train == 0:
        hr_train = 0
        settle_train = 0
    else:
        hr_train = len(ret_train[ret_train['ls'] >= 0])/len(ret_train)
        settle_train = len(ret_train[ret_train['settle'] == 'reversion'])/case_train
        
    cumret_train = temp_train[0]
    
    holding_period_train = ret_train['holding period'].median().days
    
    case_test = len(ret_test)
    
    if case_test == 0:
        hr_test = 0
        settle_test = 0
    else:
        hr_test = len(ret_test[ret_test['ls'] >= 0])/len(ret_test)
        settle_test = len(ret_test[ret_test['settle'] == 'reversion'])/case_test
    
    cumret_test = temp_test[0]
    
    holding_period_test = ret_test['holding period'].median().days
                   
    
    list_summary.append([case_train, hr_train, settle_train, cumret_train, holding_period_train, 
                         case_test, hr_test, settle_test, cumret_test, holding_period_test])
    

In [101]:
df_summary = pd.DataFrame(list_summary, columns = ['cnt_train','hr_train','settle_train','ret_train','hp_train',
                                                   'cnt_test','hr_test','settle_test','ret_test','hp_test'], 
                          index = [dict_sector_name[x] for x in result_train.keys()])
df_summary['keys'] = result_train.keys()

In [102]:
df_summary

Unnamed: 0,cnt_train,hr_train,settle_train,ret_train,hp_train,cnt_test,hr_test,settle_test,ret_test,hp_test,keys
음식료품,4,0.75,0.5,2.171016,264,1,0.0,0.0,0.895571,370.0,IKS005
섬유의복,3,1.0,0.666667,3.10977,199,1,1.0,1.0,1.174018,244.0,IKS006
화학,8,1.0,1.0,5.298796,125,1,1.0,1.0,1.01412,282.0,IKS008
의약품,3,1.0,0.333333,5.138455,368,0,0.0,0.0,1.0,,IKS009
비금속광물,30,0.633333,1.0,0.808642,65,8,0.625,0.875,0.806504,39.0,IKS010
철강금속,6,1.0,1.0,3.972472,155,1,0.0,1.0,0.79148,364.0,IKS011
기계,4,0.25,0.5,0.067708,352,1,1.0,1.0,1.182624,51.0,IKS012
전기전자,3,1.0,1.0,1.828753,263,1,1.0,1.0,1.191046,225.0,IKS013
운수장비,6,1.0,1.0,6.21713,181,0,0.0,0.0,1.0,,IKS015
유통업,7,1.0,1.0,6.527033,141,2,0.5,0.5,1.175575,308.0,IKS016


In [97]:
result_train['IKS025']

Unnamed: 0,init_dt,exit_dt,ls,long,short,holding period,settle
0,2010-04-20,2010-04-29,0.072805,0.056632,0.016173,9 days,reversion
1,2010-05-12,2010-07-05,0.019065,0.131884,-0.112819,54 days,reversion
2,2010-08-02,2010-08-11,0.058807,0.001387,0.057420,9 days,reversion
3,2010-08-23,2010-09-01,0.040140,0.008996,0.031144,9 days,reversion
4,2010-09-10,2010-09-14,0.034180,0.031250,0.002930,4 days,reversion
...,...,...,...,...,...,...,...
60,2018-10-18,2018-12-07,0.032749,-0.002874,0.035622,50 days,reversion
61,2019-03-13,2019-05-07,-0.001772,0.016058,-0.017831,55 days,reversion
62,2019-07-03,2019-08-07,0.043706,-0.041045,0.084751,35 days,reversion
63,2019-09-24,2019-10-08,0.022647,-0.013725,0.036372,14 days,reversion


In [98]:
result_test['IKS025']

Unnamed: 0,init_dt,exit_dt,ls,long,short,holding period,settle
0,2020-01-29,2020-04-23,-0.059411,-0.002315,-0.057096,85 days,reversion
1,2020-06-23,2020-07-01,0.087557,0.0125,0.075057,8 days,reversion
2,2020-08-24,2020-12-10,-0.030422,-0.016722,-0.013699,108 days,reversion
3,2020-12-16,2021-01-18,0.016948,-0.081967,0.098915,33 days,reversion
4,2021-01-20,2021-01-25,0.023641,0.002472,0.021169,5 days,reversion
5,2021-01-26,2021-02-25,0.01381,0.041457,-0.027647,30 days,reversion
6,2021-03-09,2021-03-19,0.030812,0.033937,-0.003125,10 days,reversion
7,2021-06-21,2021-06-25,0.041695,0.103378,-0.061683,4 days,reversion
8,2021-10-27,2021-11-30,0.010277,-0.17611,0.186387,34 days,reversion
9,2022-02-15,2022-03-28,0.039238,0.136808,-0.097569,41 days,reversion


In [53]:
ray.shutdown()

In [55]:
list(df_train.index).index(20100211)

9

In [62]:
optimized_params_1['IKS026']

[91,
 -1.1206119789182847,
 2.2750927497315887,
 0.473130320549313,
 0.11406661734146026,
 0.06567370810854335,
 -0.24184283416494107,
 -0.005994128957857785,
 -0.3449175025591483,
 -0.022830891781561915,
 0.3471293540006834,
 -0.015084541546682238,
 -0.3693301009898086]

In [77]:
df = df_test.copy()
params = optimized_params_1['IKS026']

In [87]:
params_period = params[0]
params_init = params[1]
params_exit = params[2]
params_wt = params[3:]

df_sp = np.log(df) * params_wt
df_sp = df_sp.sum(axis=1)

df_sp_norm = (df_sp - df_sp.rolling(params_period).mean())/df_sp.rolling(params_period).std()
df_sp_norm.dropna(axis=0, inplace=True)

init_dt = []
exit_dt = []
init_yn = 0
for i in range(len(df_sp_norm)):
    if init_yn == 0:
        if df_sp_norm.iloc[i] < params_init:
            init_dt.append(df_sp_norm.index[i])
            init_yn = 1

    elif init_yn == 1:
        if df_sp_norm.iloc[i] > params_exit:
            exit_dt.append(df_sp_norm.index[i])
            init_yn = 0

if len(init_dt) > len(exit_dt):
    if len(init_dt) - len(exit_dt) > 1:
        print("error")
    else:
        init_dt = init_dt[:-1]



In [88]:
init_dt

[20201111, 20210430]

In [89]:
exit_dt

[20210216, 20220624]

In [81]:
settle_type_list = []
for i in range(len(init_dt)):
    init_idx = list(df_sp_norm.index).index(init_dt[i])
    exit_idx = list(df_sp_norm.index).index(exit_dt[i])
    print(exit_idx - init_idx)
    if exit_idx - init_idx > 250:
        exit_dt[i] = df_sp_norm.index[init_idx + 250]
        settle_type_list.append('time limit')
    else:
        settle_type_list.append('reversion')


64
283


In [82]:
settle_type_list

['reversion', 'time limit']

In [83]:
exit_dt

[20210216, 20220506]

In [90]:
ret_list = []
long_ret_list = []
short_ret_list = []
cum_ret = 1
for i in range(len(init_dt)):
    dt_1 = init_dt[i]
    dt_2 = exit_dt[i]

    prc_1 = df.loc[dt_1]
    prc_2 = df.loc[dt_2]


    ret = prc_2/prc_1 - 1
    ret_ls = (ret * params_wt).sum()
    ret_list.append(ret_ls)        
    cum_ret *= (1+ret_ls)

    long_wt = [x if x >= 0 else 0 for x in params_wt]
    long_ret = (ret * long_wt).sum()
    long_ret_list.append(long_ret)

    short_wt = [x if x < 0 else 0 for x in params_wt]
    short_ret = (ret * short_wt).sum()
    short_ret_list.append(short_ret)

In [91]:
ret_list

[0.21824969042607179, -0.22334528666585185]

In [None]:
params_period = params[0]
params_init = params[1]
params_exit = params[2]
params_wt = params[3:]

df_sp = np.log(df) * params_wt
df_sp = df_sp.sum(axis=1)

df_sp_norm = (df_sp - df_sp.rolling(params_period).mean())/df_sp.rolling(params_period).std()
df_sp_norm.dropna(axis=0, inplace=True)

init_dt = []
exit_dt = []
init_yn = 0
for i in range(len(df_sp_norm)):
    if init_yn == 0:
        if df_sp_norm.iloc[i] < params_init:
            init_dt.append(df_sp_norm.index[i])
            init_yn = 1

    elif init_yn == 1:
        if df_sp_norm.iloc[i] > params_exit:
            exit_dt.append(df_sp_norm.index[i])
            init_yn = 0

if len(init_dt) > len(exit_dt):
    if len(init_dt) - len(exit_dt) > 1:
        print("error")
    else:
        init_dt = init_dt[:-1]

settle_type_list = []
## 시간청산 기준

for i in range(len(init_dt)):
    init_idx = list(df_sp_norm.index).index(init_dt[i])
    exit_idx = list(df_sp_norm.index).index(exit_dt[i])

    if exit_idx - init_idx > 250:
        exit_dt[i] = df_sp_norm.index[init_idx + 250]
        settle_type_list.append('time limit')
    else:
        settle_type_list.append('reversion')


ret_list = []
long_ret_list = []
short_ret_list = []
cum_ret = 1
for i in range(len(init_dt)):
    dt_1 = init_dt[i]
    dt_2 = exit_dt[i]

    prc_1 = df.loc[dt_1]
    prc_2 = df.loc[dt_2]


    ret = prc_2/prc_1 - 1
    ret_ls = (ret * params_wt).sum()
    ret_list.append(ret_ls)        
    cum_ret *= (1+ret_ls)

    long_wt = [x if x >= 0 else 0 for x in params_wt]
    long_ret = (ret * long_wt).sum()
    long_ret_list.append(long_ret)

    short_wt = [x if x < 0 else 0 for x in params_wt]
    short_ret = (ret * short_wt).sum()
    short_ret_list.append(short_ret)