In [1]:
import pandas as pd
import numpy as np
import warnings
import pyodbc
import random
import os
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans

warnings.filterwarnings(action = 'ignore')

#### 유전 알고리즘을 활용한 페어 트레이딩 v4

* v2
 - parameter 생성 시 공적분 검정 테스트를 추가하자 (train set 한도내에서)
 - child 생성 시 볼린저밴드 파라미터와 비중 2개로 나누어 crossover
 - mutation 시 볼린저밴드 파라미터 조정과 비중 shuffle

* v3
 - train set/test set 기간을 여러개로 나눠서 각 시뮬레이션 내 종목 비중과 결과 비교
 - 변이 이후 수선 기능을 추가. 각 개체의 비중으로 만든 스프레드에 정상성 테스트를 수행해 p-value가 0.05 미만인 것만 남기고, 통과하지 못한 개체는 비중을 새로 생성
 
* v4
 - v2 + v3, 단 train set 기간 0.7에 대해 먼저 시뮬레이션 해보기
 - Clustering 기법을 도입해서 각 클러스터 내 가장 score가 좋은 부모 쌍을 고르고 교배. 부모 + 자식 중 best fit을 고르고 변이를 적용해 클러스터 내 개체 수만큼 개체를 채우기
 - Clustering은 KMeans 활용

In [2]:
# Open the ODBC connections used by this notebook (one Oracle source, two SQL Server databases).
# NOTE(review): user names and passwords are hard-coded in these connection strings. They should be
# read from environment variables or a secrets store, and the exposed passwords rotated -- confirm
# with the data owner before changing.
# NOTE(review): only conn_wisefn is referenced in the cells visible here; conn_pcor and conn_quant
# may be used elsewhere -- verify before removing them.
conn_pcor = pyodbc.connect('driver={Oracle in OraClient18Home1};dbq=PCOR;uid=EF0SEL;pwd=EF0SEL#076')
conn_quant = pyodbc.connect('driver={SQL Server};server=46.2.90.172;database=quant;uid=index;pwd=samsung@00')
conn_wisefn = pyodbc.connect('driver={SQL Server};server=46.2.90.172;database=wisefn;uid=index;pwd=samsung@00')

In [3]:
os.chdir("..")
os.chdir("..")

os.chdir('00_data')

#df_const = pd.read_json('230927_index_constituent.json')
df_prc_raw = pd.read_json('230927_stk_prc_daily.json')
#df_turnover = pd.read_json('230927_stk_turnover.json')
#df_sector = pd.read_json('230927_stk_sector.json')
#df_mktcap = pd.read_json('230927_stk_mktcap.json')
#df_turnover_daily = pd.read_json('231011_stk_turnover_daily.json')

os.chdir("..")
os.chdir('02_Trading Strategies')
os.chdir('231026_유전 알고리즘을 활용한 페어트레이딩')

In [4]:
# Sector code interpolated into the SQL of this notebook: it filters the index history
# (SEC_CD) and the constituent list (KSC_CD).
sector_code = 'IKS013'

In [5]:
# Fetch the daily rows of the sector index with TRD_DT after 20091231, oldest first.
# In the cells visible here only TRD_DT is used afterwards (to derive month-end dates).
# NOTE(review): sector_code is interpolated straight into the SQL text. That is fine for a
# notebook constant, but switch to a bound parameter if the value ever comes from user input.
sql_dt = f'''
        SELECT TRD_DT, CLOSE_PRC
        FROM TS_IDX_DAILY
        WHERE 1=1
        AND SEC_CD = '{sector_code}'
        AND TRD_DT > '20091231'
        ORDER BY TRD_DT ASC
'''

df_dt = pd.read_sql(sql_dt, conn_wisefn)

In [6]:
# Build dt_m: the last trading date of every calendar month, as YYYYMMDD integers,
# in the order the months first appear in df_dt.
df_dt['TRD_DT'] = pd.to_datetime(df_dt['TRD_DT'])
df_dt['M'] = df_dt['TRD_DT'].dt.to_period('M')

# sort=False keeps first-appearance order, matching an iteration over df_dt['M'].unique().
_month_end_ts = df_dt.groupby('M', sort=False)['TRD_DT'].max()
dt_m = [int(ts.strftime('%Y%m%d')) for ts in _month_end_ts]

In [7]:
def objective_func(df, params):
    """Score one parameter vector by the cumulative return of its trades on ``df``.

    The weighted sum of the columns of ``df`` forms a spread, which is standardised
    with a rolling mean and rolling standard deviation. A position is opened on the
    first row whose z-score is below the entry threshold and closed on the next row
    whose z-score is above the exit threshold; this repeats until the data ends.

    Parameters
    ----------
    df : pandas.DataFrame
        One column per asset (prices in this notebook); rows are looked up by
        index label.
    params : sequence
        ``[period, init, exit, w_1, ..., w_k]``: rolling window length, entry
        threshold, exit threshold, then one weight per column of ``df``.

    Returns
    -------
    number
        Product of ``1 + trade_return`` over all completed trades, where a trade's
        return is the weight-weighted sum of the per-column simple returns between
        entry and exit. ``1`` when no trade completes.
    """
    params_period, params_init, params_exit = params[0], params[1], params[2]
    params_wt = params[3:]

    spread = (df * params_wt).sum(axis=1)
    window = spread.rolling(params_period)
    # Rows without a full window (or with an undefined z-score) are dropped.
    zscore = ((spread - window.mean()) / window.std()).dropna()

    init_dt = []
    exit_dt = []
    in_position = False
    for dt, z in zscore.items():
        if not in_position:
            if z < params_init:
                init_dt.append(dt)
                in_position = True
        elif z > params_exit:
            exit_dt.append(dt)
            in_position = False

    # Entries and exits strictly alternate, so there is at most one entry without an
    # exit (a position still open at the end of the data). zip() ignores it.
    cum_ret = 1
    for dt_in, dt_out in zip(init_dt, exit_dt):
        ret = df.loc[dt_out] / df.loc[dt_in] - 1
        cum_ret *= 1 + (ret * params_wt).sum()

    return cum_ret

In [8]:
def generate_weight(length):
    """Draw a random long/short weight vector of ``length`` entries.

    A random number of entries (at least one, at most ``length - 1``) is made
    positive and scaled to sum to 1; the remaining entries are made negative and
    scaled to sum to -1. The combined list is shuffled before it is returned.

    Raises ``ValueError`` (from ``random.randint``) when ``length`` is below 2.
    """
    n_long = random.randint(1, length - 1)
    n_short = length - n_long

    # Long side is drawn and normalised first, then the short side, then the shuffle.
    long_raw = [random.uniform(0, 1) for _ in range(n_long)]
    long_total = sum(long_raw)
    long_side = [w / long_total for w in long_raw]

    short_raw = [random.uniform(0, 1) for _ in range(n_short)]
    short_total = sum(short_raw)
    short_side = [-w / short_total for w in short_raw]

    weights = long_side + short_side
    random.shuffle(weights)
    return weights
        

def generate_params(df, length):
    """Create one random individual whose weighted spread passes the ADF test.

    The rolling window (20 to 720), the entry threshold (-4 to 0) and the exit
    threshold (0 to 4) are drawn once. Weight vectors are then drawn repeatedly with
    ``generate_weight(length)`` until the ADF p-value of ``(df * weights).sum(axis=1)``
    is below 0.05. The loop has no iteration cap.

    Returns ``[period, init, exit, w_1, ..., w_length]``.
    """
    # List elements are evaluated left to right: period, entry threshold, exit threshold.
    head = [random.randint(20, 20 * 36), random.uniform(-4, 0), random.uniform(0, 4)]

    while True:
        weights = generate_weight(length)
        spread = (df * weights).sum(axis=1)
        # adfuller(...)[1] is the p-value; keep sampling until it is below 0.05.
        if adfuller(spread)[1] < 0.05:
            return [*head, *weights]
    

In [9]:
def generate_pop(df, size, length):
    """Return a list of ``size`` individuals, each produced by ``generate_params(df, length)``."""
    return [generate_params(df, length) for _ in range(size)]

In [10]:
def compute_performance(df, population):
    """Score every individual with ``objective_func`` and rank them, best first.

    Returns a list of ``[individual, score]`` pairs sorted by score in descending
    order; individuals with equal scores keep their order from ``population``.
    """
    scored = [[genome, objective_func(df, genome)] for genome in population]
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored

In [34]:
def pop_clustering(pop_sorted):
    """Group scored individuals into KMeans clusters of their weight vectors.

    Parameters
    ----------
    pop_sorted : list
        ``[individual, score]`` pairs, where ``individual[3:]`` is the weight vector.

    Returns
    -------
    dict
        ``{cluster_label: [[individual, score], ...]}`` with one key for every label
        ``0 .. n_cluster - 1``. Members keep their order from ``pop_sorted``. An empty
        population gives an empty dict.

    The target number of clusters is ``2 * round(log2(population size))``. It is
    clamped to ``[1, population size]`` because KMeans needs at least one cluster and
    cannot fit more clusters than samples, which the raw formula would request for
    very small populations.
    """
    if not pop_sorted:
        return {}

    pop_wt = [entry[0][3:] for entry in pop_sorted]

    n_cluster = int(np.round(np.log(len(pop_wt)) / np.log(2))) * 2
    n_cluster = max(1, min(n_cluster, len(pop_wt)))

    labels = KMeans(n_cluster).fit(pop_wt).labels_

    # Pre-create every label so the keys are always the contiguous range 0..n_cluster-1.
    pop_dict = {label: [] for label in range(n_cluster)}
    for entry, label in zip(pop_sorted, labels):
        pop_dict[int(label)].append(entry)

    return pop_dict

In [35]:
def select_parents(pop_dict):
    """Pick two parents from every cluster.

    The members of a cluster are shuffled and split into two halves; the best-scoring
    member of each half becomes a parent.

    Parameters
    ----------
    pop_dict : dict
        ``{0: members, 1: members, ...}`` where each member is ``[individual, score]``.

    Returns
    -------
    dict
        ``{k: [parent_1, parent_2, cluster_size]}`` with contiguous keys starting at 0.

    Edge cases
    ----------
    * A cluster with a single member uses that member as both parents. Splitting it
      would leave the first half empty.
    * An empty cluster is skipped. The remaining clusters are renumbered so the keys
      stay contiguous for callers that iterate with ``range(len(...))``.
    * The shuffle works on a copy, so the lists inside ``pop_dict`` are not reordered.
    """
    def _score(member):
        return member[1]

    dict_parents = {}

    for cluster_id in range(len(pop_dict)):
        members = list(pop_dict[cluster_id])
        if not members:
            continue

        random.shuffle(members)

        half = len(members) // 2
        first_half, second_half = members[:half], members[half:]

        # second_half is never empty here; first_half is empty only for a singleton.
        parent_2 = max(second_half, key=_score)
        parent_1 = max(first_half, key=_score) if first_half else parent_2

        dict_parents[len(dict_parents)] = [parent_1, parent_2, len(members)]

    return dict_parents
        

In [36]:
def create_child(param1, param2):
    """Cross two individuals by swapping their two gene groups.

    The child takes its first three genes (window, entry threshold, exit threshold)
    from one parent and its weight genes (index 3 onward) from the other. Which parent
    supplies which group is decided by a single draw against 0.5.
    """
    # Only the first draw is used; the other two are made so the function consumes
    # the same amount of the random stream as before.
    pick, _unused_a, _unused_b = random.random(), random.random(), random.random()

    head_donor, weight_donor = (param1, param2) if pick < 0.5 else (param2, param1)
    return head_donor[:3] + weight_donor[3:]


def select_survivors(dict_parents, df):
    """Keep the best of ``{parent_1, parent_2, child}`` for every cluster.

    Parameters
    ----------
    dict_parents : dict
        ``{k: [[individual_1, score_1], [individual_2, score_2], cluster_size]}`` with
        keys ``0 .. n - 1``.
    df : pandas.DataFrame
        Data passed to ``objective_func`` to score the child.

    Returns
    -------
    dict
        ``{k: [best_individual, cluster_size]}``.

    The child is stored as an ``[individual, score]`` pair, the same shape as the
    parents, so the ranking compares real scores. Flattening the child's genes and
    score into one list would make index 1 its entry threshold rather than its score.
    On ties the earlier candidate (parent_1, parent_2, child) wins.
    """
    dict_survivors = {}

    for cluster_id in range(len(dict_parents)):
        parent_1, parent_2, cluster_size = dict_parents[cluster_id]

        child_genes = create_child(parent_1[0], parent_2[0])
        child_score = objective_func(df, child_genes)

        candidates = [parent_1, parent_2, [child_genes, child_score]]
        best_genes = max(candidates, key=lambda pair: pair[1])[0]

        dict_survivors[cluster_id] = [best_genes, cluster_size]

    return dict_survivors

In [42]:
def mutation1(param, prob):
    """Mutate the three leading genes of ``param`` independently; weights are untouched.

    Each of the window length, entry threshold and exit threshold is redrawn with
    probability ``prob`` from the same ranges used at creation (20 to 720, -4 to 0,
    0 to 4). Returns a new list; ``param`` is not modified.
    """
    # The three decisions are drawn first, in gene order, before any new value is sampled.
    redraw_period, redraw_init, redraw_exit = (random.random() < prob for _ in range(3))

    period = random.randint(20, 20 * 36) if redraw_period else param[0]
    entry = random.uniform(-4, 0) if redraw_init else param[1]
    leave = random.uniform(0, 4) if redraw_exit else param[2]

    return [period, entry, leave, *param[3:]]


def mutation2(param, prob):
    """With probability ``prob``, shuffle the weight genes (index 3 onward) of ``param``.

    The three leading genes are kept. Returns a new list; ``param`` is not modified
    because the shuffle works on a slice copy.
    """
    head, weights = param[:3], param[3:]

    if random.random() < prob:
        random.shuffle(weights)

    return head + weights



################################################
def mutate_pop(dict_survivors, prob1, prob2):
    """Rebuild a full generation from the surviving individual of each cluster.

    For every cluster ``k`` (keys ``0 .. n - 1``) the survivor is copied unchanged into
    the new generation, followed by ``cluster_size - 1`` mutated variants of it. Each
    variant gets both mutations with a 5% chance; otherwise it gets either
    ``mutation1`` or ``mutation2`` with equal chance.

    Parameters
    ----------
    dict_survivors : dict
        ``{k: [best_individual, cluster_size]}``.
    prob1, prob2 : float
        Probabilities forwarded to ``mutation1`` and ``mutation2``.

    Returns
    -------
    list
        The new generation, clusters in key order.
    """
    next_gen = []

    for cluster_id in range(len(dict_survivors)):
        survivor = dict_survivors[cluster_id][0]
        cluster_size = dict_survivors[cluster_id][1]

        next_gen.append(survivor)

        for _ in range(1, cluster_size):
            # Both draws are always made, even when only one of them is used.
            both_draw = random.random()
            which_draw = random.random()

            if both_draw < 0.05:
                variant = mutation2(mutation1(survivor, prob1), prob2)
            elif which_draw < 0.5:
                variant = mutation1(survivor, prob1)
            else:
                variant = mutation2(survivor, prob2)

            next_gen.append(variant)

    return next_gen

In [43]:
def mutation_fix(population, df, length):
    """Repair individuals whose weighted spread fails the ADF stationarity test.

    For every individual the spread ``(df * weights).sum(axis=1)`` is tested with
    ``adfuller``. If the p-value is below 0.05 the individual is kept as is. Otherwise
    its three leading genes are kept and its weights are replaced by the weights of a
    fresh ``generate_params(df, length)`` individual.

    Returns
    -------
    tuple
        ``(repaired_population, number_of_repaired_individuals)``.
    """
    repaired = []
    n_repaired = 0

    for genome in population:
        spread = (df * genome[3:]).sum(axis=1)
        p_value = adfuller(spread)[1]

        if p_value < 0.05:
            repaired.append(genome)
            continue

        # Only the weights are regenerated; window and thresholds survive the repair.
        fresh_weights = generate_params(df, length)[3:]
        repaired.append(genome[:3] + fresh_weights)
        n_repaired += 1

    return repaired, n_repaired


In [44]:
def simulation_func(df, params):
    """Replay the trading rule of one parameter vector and return its trade log.

    The weighted sum of the columns of ``df`` forms a spread, which is standardised
    with a rolling mean and rolling standard deviation. A position is opened on the
    first row whose z-score is below the entry threshold and closed on the next row
    whose z-score is above the exit threshold; this repeats until the data ends.

    Parameters
    ----------
    df : pandas.DataFrame
        One column per asset (prices in this notebook); rows are looked up by
        index label.
    params : sequence
        ``[period, init, exit, w_1, ..., w_k]``: rolling window length, entry
        threshold, exit threshold, then one weight per column of ``df``.

    Returns
    -------
    tuple
        ``(cum_ret, init_dt, exit_dt, ret_list, long_ret_list, short_ret_list)``:

        * ``cum_ret``: product of ``1 + trade_return`` over completed trades (``1`` if none).
        * ``init_dt`` / ``exit_dt``: entry and exit index labels, equal in length. An
          entry still open at the end of the data is dropped.
        * ``ret_list``: weighted return of each trade.
        * ``long_ret_list`` / ``short_ret_list``: contribution of the non-negative and
          of the negative weights to each trade's return.
    """
    params_period, params_init, params_exit = params[0], params[1], params[2]
    params_wt = params[3:]

    spread = (df * params_wt).sum(axis=1)
    window = spread.rolling(params_period)
    # Rows without a full window (or with an undefined z-score) are dropped.
    zscore = ((spread - window.mean()) / window.std()).dropna()

    init_dt = []
    exit_dt = []
    in_position = False
    for dt, z in zscore.items():
        if not in_position:
            if z < params_init:
                init_dt.append(dt)
                in_position = True
        elif z > params_exit:
            exit_dt.append(dt)
            in_position = False

    # Entries and exits strictly alternate, so at most one entry can lack an exit.
    if len(init_dt) > len(exit_dt):
        init_dt = init_dt[:-1]

    # The long/short masks depend only on the weights, so build them once.
    long_wt = [w if w >= 0 else 0 for w in params_wt]
    short_wt = [w if w < 0 else 0 for w in params_wt]

    ret_list = []
    long_ret_list = []
    short_ret_list = []
    cum_ret = 1
    for dt_in, dt_out in zip(init_dt, exit_dt):
        ret = df.loc[dt_out] / df.loc[dt_in] - 1

        ret_ls = (ret * params_wt).sum()
        ret_list.append(ret_ls)
        cum_ret *= (1 + ret_ls)

        long_ret_list.append((ret * long_wt).sum())
        short_ret_list.append((ret * short_wt).sum())

    return cum_ret, init_dt, exit_dt, ret_list, long_ret_list, short_ret_list

In [45]:
# Train/test split. The training window runs from the first month-end in dt_m to the
# month-end located at fraction `i` of the list; the test window starts on that same
# date (the boundary day belongs to both sets).
i = 0.7
base_d = dt_m[int(np.round(len(dt_m)*i,0))]
train_begin_dt = dt_m[0]
train_end_dt = base_d

# Constituents of the sector on the split date.
# NOTE(review): KS200_TYP = 1 presumably restricts to KOSPI 200 members -- confirm against the schema.
sql_const = f'''
            SELECT TRD_DT, CONCAT('A',STK_CD) COMP_CD, STK_NM_KOR COMP_NM
            FROM TS_STK_ISSUE
            WHERE 1=1
            AND KS200_TYP = 1
            AND TRD_DT = '{base_d}'
            AND KSC_CD = '{sector_code}'
'''

df_const = pd.read_sql(sql_const, conn_wisefn)

_is_constituent = df_prc_raw['COMP_CD'].isin(df_const['COMP_CD'])
_in_train_window = (df_prc_raw['TRD_DT'] >= train_begin_dt) & (df_prc_raw['TRD_DT'] <= train_end_dt)
_in_test_window = df_prc_raw['TRD_DT'] >= train_end_dt

# Training panel: dates x stocks. Stocks with any missing value in the window are dropped.
df_train = (
    df_prc_raw[_in_train_window & _is_constituent]
    .sort_values(by=['TRD_DT', 'COMP_CD'])
    .pivot(index='TRD_DT', columns='COMP_CD')
    .droplevel(axis=1, level=0)
    .dropna(axis=1)
)
stk_list = sorted(df_train.columns)

# Test panel: same stocks, in the same (sorted) column order as stk_list.
_df_test_wide = (
    df_prc_raw[_in_test_window & _is_constituent]
    .sort_values(by=['TRD_DT', 'COMP_CD'])
    .pivot(index='TRD_DT', columns='COMP_CD')
    .droplevel(axis=1, level=0)
)
df_test = _df_test_wide[stk_list]

In [None]:
# Run the genetic algorithm on the training panel.
# NOTE(review): no random seed is set anywhere in the visible cells, so a re-run will not
# reproduce the output stored below this cell.
print(f'Train set: {train_begin_dt} ~ {train_end_dt}')
print(f'Test set: {train_end_dt} ~ {max(df_test.index)}')

print(f'유전 알고리즘 시작')
n_generation = 200  # maximum number of generations
population = 300    # number of individuals per generation
#best_sample = int(population/3)
#lucky_few = int(population/3)
# Probabilities forwarded to mutation1 / mutation2 through mutate_pop. With a value of 1
# the `random.random() < prob` checks inside them always pass.
prob1 = 1
prob2 = 1
param_length = len(stk_list)  # one weight gene per stock column

# Initial population: every individual already passes the ADF check inside generate_params.
pop = generate_pop(df_train, size = population, length = param_length)

# Best individual and its score, one entry per completed generation.
best_gene = []
best_perf = []

for g in range(n_generation):

    # 1) Score and rank the current population (best first).
    pop_sorted = compute_performance(df_train, pop)
    
    # 2) Group individuals by their weight vectors.
    pop_clustered = pop_clustering(pop_sorted)
    
    # 3) Two parents per cluster, then keep the best of parents + child.
    dict_parents = select_parents(pop_clustered)
    
    survivors =  select_survivors(dict_parents, df_train)

    # 4) Refill every cluster with mutated variants of its survivor.
    new_generation = mutate_pop(survivors, prob1, prob2)

    # 5) Regenerate the weights of individuals that fail the stationarity test.
    fixed = mutation_fix(new_generation, df_train, param_length)
    fixed_generation = fixed[0]
    fixed_cnt = fixed[1]
    
    # Stop if the rebuilt generation is smaller than the target population size.
    # NOTE(review): this check runs after mutation_fix, so the repair cost is paid even
    # when the loop is about to stop.
    if len(new_generation) < population:
        print(f'  세대수 부족: {len(new_generation)}')
        break
        
    
    print(f'  수선 개체 수: {fixed_cnt}')

    pop = fixed_generation
    # pop_sorted is sorted in descending score order, so element 0 is the best individual
    # of the generation that was just evaluated (not of the newly built one).
    best_gene.append(pop_sorted[0][0])
    best_perf.append(pop_sorted[0][1])

    # ADF p-value of the best individual's spread on the training panel, for reporting only.
    temp_sp = df_train * pop_sorted[0][0][3:]
    temp_sp = temp_sp.sum(axis=1)
    result = adfuller(temp_sp)
    p_val = result[1]

    print(f'====== {g}th generation ends ======')
    print(pop_sorted[0])
    print(f'  정상성 체크: p value = {p_val}')
    print(' ')

  Train set: 20100129 ~ 20191031
  Test set: 20191031 ~ 20230927
  유전 알고리즘 시작
수선 개체 수: 130
[[147, -2.260519786953909, 0.8835411180407258, 0.037244778546216994, 0.1514687752842217, 0.004977812274811666, 0.10432129288463733, 0.06587205878408524, 0.12671414534585732, 0.1139580947042817, 0.14168933416318463, 0.09428103630306209, 0.039786373165448786, 0.11968629854419244, -1.0], 2.745041430791617]
정상성 체크: p value = 0.0280844772273439
수선 개체 수: 129
[[271, -0.8119219644626687, 2.0249346914699955, -0.18496093975619898, 0.21525449376035527, 0.13264215417576297, 0.10442906125547148, 0.14521128419252266, 0.14618113262591934, 0.07604708986240732, -0.3212914739828114, 0.1651200085537194, 0.015114775573841752, -0.20431114787564308, -0.28943643838534655], 3.286183126165919]
정상성 체크: p value = 0.03950272538571422
수선 개체 수: 141
[[346, -0.42294419424579166, 1.2492515260575785, 0.019403993444849867, 0.013331776141106306, -0.07192251033540327, 0.1649841991096739, 0.1559687570369475, 0.09543257608783169, 0.02

수선 개체 수: 137
[[261, -0.33072005242523606, 1.177347624286881, 0.019403993444849867, 0.013331776141106306, -0.07192251033540327, 0.1649841991096739, 0.1559687570369475, 0.09543257608783169, 0.024945767378961273, 0.10539808939921623, 0.2113273738406293, 0.028510918699096496, 0.18069654886168754, -0.9280774896645967], 5.3103243771840045]
정상성 체크: p value = 0.007440655556327635
수선 개체 수: 142
[[261, -0.33072005242523606, 1.177347624286881, 0.019403993444849867, 0.013331776141106306, -0.07192251033540327, 0.1649841991096739, 0.1559687570369475, 0.09543257608783169, 0.024945767378961273, 0.10539808939921623, 0.2113273738406293, 0.028510918699096496, 0.18069654886168754, -0.9280774896645967], 5.3103243771840045]
정상성 체크: p value = 0.007440655556327635
수선 개체 수: 135
[[261, -0.33072005242523606, 1.177347624286881, 0.019403993444849867, 0.013331776141106306, -0.07192251033540327, 0.1649841991096739, 0.1559687570369475, 0.09543257608783169, 0.024945767378961273, 0.10539808939921623, 0.2113273738406293,

수선 개체 수: 121
[[461, -0.14056446793212496, 0.46176518046904214, -0.06123070519415581, 0.3756671604433739, -0.06674838733575339, 0.0636078832737498, -0.0019501948505388166, -0.18227776910072477, -0.0630160194389638, -0.24012319676568433, 0.5607249562828764, -0.041110917776641706, -0.07632586966440802, -0.2672169398731293], 5.392782524689797]
정상성 체크: p value = 0.04053549494225215
수선 개체 수: 143
[[461, -0.14056446793212496, 0.46176518046904214, -0.06123070519415581, 0.3756671604433739, -0.06674838733575339, 0.0636078832737498, -0.0019501948505388166, -0.18227776910072477, -0.0630160194389638, -0.24012319676568433, 0.5607249562828764, -0.041110917776641706, -0.07632586966440802, -0.2672169398731293], 5.392782524689797]
정상성 체크: p value = 0.04053549494225215
수선 개체 수: 146
[[461, -0.14056446793212496, 0.46176518046904214, -0.06123070519415581, 0.3756671604433739, -0.06674838733575339, 0.0636078832737498, -0.0019501948505388166, -0.18227776910072477, -0.0630160194389638, -0.24012319676568433, 0.56

In [33]:
# Display the individual kept for cluster 0 by select_survivors in the most recently
# executed generation (survivors[k] is [best_individual, cluster_size]).
# NOTE(review): this cell's execution count (33) is lower than those of the cells above,
# so the stored output may come from an earlier run -- re-run after the main loop.
survivors[0][0]

[486,
 -1.7910192692596532,
 1.2667909527784236,
 0.013489517996537803,
 0.16273618268127066,
 -0.15028164179957684,
 -0.13485426038922732,
 0.12288262696086731,
 -0.40698558056989587,
 0.2052346642329582,
 0.09736076803333783,
 -0.3078785172413,
 0.23744166589066176,
 0.06062323162849292,
 0.10023134257587353]