In [None]:
# import

import matplotlib.pyplot as plt
import matplotlib
import itertools
from pykrx import stock
import pandas as pd
import numpy as np
import random
import os
from tqdm import tqdm
from statsmodels.tsa.arima.model import ARIMA
import warnings
from sklearn.linear_model import LinearRegression
# Shared estimator instance; ADR_analysis refits it for every stock code.
lr = LinearRegression()

In [None]:
# ADR (advance/decline ratio) function

def ADR():
    """Build per-date advance/decline counts and a 20-row ADR for KOSPI and KOSDAQ.

    Reads ``./train.csv``, takes the unique values of its ``일자`` column and,
    for each of them, queries ``stock.get_market_ohlcv`` once per market. The
    ``등락률`` column of each response is split into positive, negative and
    "other" (zero or NaN) counts.

    Returns:
        pandas.DataFrame with one row per date and the columns
        ``Date``, ``코스피 상승``, ``코스피 하락``, ``코스피 등가``,
        ``코스닥 상승``, ``코스닥 하락``, ``코스닥 등가``, ``kospi_adr`` and
        ``kosdaq_adr``. The two ADR columns are
        ``rolling(20) sum of advances / rolling(20) sum of declines * 100``,
        so their first 19 rows are NaN.
    """
    train = pd.read_csv('./train.csv')
    unique_dates = train['일자'].unique()

    rows = []
    for date in tqdm(unique_dates):
        df_kospi = stock.get_market_ohlcv(str(date), market="KOSPI")
        df_kosdaq = stock.get_market_ohlcv(str(date), market="KOSDAQ")
        # Fix: the original stored the KOSDAQ decline count twice, so the
        # "코스닥 등가" column never held the unchanged count.
        rows.append([
            date,
            *_adr_count_moves(df_kospi["등락률"]),
            *_adr_count_moves(df_kosdaq["등락률"]),
        ])

    result = pd.DataFrame(
        rows,
        columns=["Date", "코스피 상승", "코스피 하락", "코스피 등가",
                 "코스닥 상승", "코스닥 하락", "코스닥 등가"],
    )

    result["kospi_adr"] = (
        result["코스피 상승"].rolling(20).sum() / result["코스피 하락"].rolling(20).sum() * 100
    )
    result["kosdaq_adr"] = (
        result["코스닥 상승"].rolling(20).sum() / result["코스닥 하락"].rolling(20).sum() * 100
    )

    return result


def _adr_count_moves(changes):
    """Return (advancing, declining, other) counts for a Series of change rates."""
    advancing = int((changes > 0).sum())
    declining = int((changes < 0).sum())
    # Everything that is neither > 0 nor < 0 (zeros and NaN) lands here,
    # matching the else-branch of the original element-wise loop.
    return advancing, declining, len(changes) - advancing - declining




In [None]:
# ADR indicator analysis

def ADR_analysis():
    """Rank stocks by how strongly their 10-row price change follows the market ADR.

    For every stock code in ``./train.csv`` a linear regression of the
    10-row percentage change of ``종가`` on the market ADR is fitted twice:
    once with the last 15 rows of the stock held out (backtest) and once on
    all rows. The ADR column is ``kospi_adr`` when ``code[1:]`` is in the
    list returned by ``stock.get_index_portfolio_deposit_file("1001")``
    (presumably the KOSPI constituents — confirm) and ``kosdaq_adr`` otherwise.

    Returns:
        ``[result_test, result]`` — two DataFrames with the columns
        ``종목코드``, ``기울기`` (regression slope) and ``순위`` (1 = largest
        slope, ties broken by order), each sorted by ``순위``.
    """
    train = pd.read_csv('./train.csv')

    # Drop the 19 warm-up rows whose 20-row rolling ADR is NaN. Copy so the
    # column assignment below does not write into a slice of ADR()'s frame.
    adr_final = ADR().iloc[19:].copy()
    # Fix: ADR() never emits "Unnamed: 0", so the original unconditional drop
    # raised KeyError (and its result was discarded anyway).
    adr_final = adr_final.drop(columns=["Unnamed: 0"], errors="ignore")
    adr_final["Date"] = adr_final["Date"].astype(int)

    kospi_codes = set(stock.get_index_portfolio_deposit_file("1001"))

    codes = []
    slopes_test = []
    slopes = []
    for code in tqdm(train['종목코드'].unique()):
        prices = train[train['종목코드'] == code][['일자', '종가', '종목코드']]
        adr_col = 'kospi_adr' if code[1:] in kospi_codes else 'kosdaq_adr'

        codes.append(code)
        # Backtest fit: the last 15 rows are hidden from the regression.
        slopes_test.append(_adr_slope(prices.iloc[:-15], adr_final, adr_col))
        # Production fit: every available row.
        slopes.append(_adr_slope(prices, adr_final, adr_col))

    return [_adr_rank_slopes(codes, slopes_test), _adr_rank_slopes(codes, slopes)]


def _adr_slope(prices, adr_final, adr_col):
    """Return the slope of 10-row price change (%) regressed on ``adr_col``."""
    frame = prices.reset_index(drop=True)
    frame.columns = ['Date', '종가', '종목코드']
    frame["shift"] = frame["종가"].shift(10)
    # Percentage change versus the close 10 rows earlier.
    frame["증감"] = (frame["종가"] - frame["shift"]) / frame["shift"] * 100
    frame["Date"] = frame["Date"].astype(int)

    merged = pd.merge(left=frame, right=adr_final, how="inner", on="Date").iloc[10:]
    x = merged[adr_col].to_numpy(dtype=float)
    y = merged["증감"].to_numpy(dtype=float)

    # A zero divisor (close or decline count) yields inf/NaN, which
    # LinearRegression rejects; fit only on fully finite rows.
    usable = np.isfinite(x) & np.isfinite(y)
    if not usable.any():
        return np.nan
    # Fix: store the scalar slope, not the 1-element coef_ array, so the
    # slope column is numeric and can be ranked.
    return float(lr.fit(x[usable].reshape(-1, 1), y[usable]).coef_[0])


def _adr_rank_slopes(codes, slopes):
    """Build the ranked slope table (largest slope first, unique ranks)."""
    ranked = pd.DataFrame({'종목코드': codes, '기울기': slopes})
    # Codes without usable data get a neutral slope, mirroring the fillna(0)
    # used by the other indicator functions before ranking.
    ranked['기울기'] = ranked['기울기'].fillna(0)
    ranked['순위'] = ranked['기울기'].rank(method='first', ascending=False).astype('int')
    return ranked.sort_values(by='순위')

In [None]:
# Moving-average indicator analysis
def MA_analysis():
    """Rank stocks by an ARIMA forecast of a moving-average alignment score.

    For every stock code in ``./train.csv`` a score series is derived from the
    5/10/20/60-row moving averages of ``종가`` (see ``_ma_alignment_score``),
    an ARIMA(2, 1, 2) model is fitted to it and 15 steps are forecast. This
    is done twice per stock: once with the last 15 rows held out (backtest)
    and once on all rows.

    Returns:
        ``[results_df_test, results_df]`` — two DataFrames with the columns
        ``종목코드``, ``final_return`` and ``순위`` (1 = largest value, ties
        broken by order), each sorted by ``순위``.
    """
    train = pd.read_csv('./train.csv')
    unique_codes = train['종목코드'].unique()

    # Fix: the original re-created both result frames inside the loop, so only
    # the last stock survived, and relied on DataFrame.append, which pandas 2.0
    # removed. Rows are collected here and the frames are built once.
    rows_test = []
    rows = []
    for code in tqdm(unique_codes):
        close = train.loc[train['종목코드'] == code, '종가']

        # Backtest: the last 15 rows are hidden from the model.
        forecast_test = _ma_forecast(_ma_alignment_score(close.iloc[:-15]))
        # NOTE(review): the backtest keeps the FIRST forecast step while the
        # full run below keeps the LAST one. Preserved as found — confirm
        # whether both should use the same step.
        rows_test.append({'종목코드': code, 'final_return': forecast_test.iloc[0]})

        # Full run on every available row.
        forecast = _ma_forecast(_ma_alignment_score(close))
        rows.append({'종목코드': code, 'final_return': forecast.iloc[-1]})

    return [_ma_rank(rows_test), _ma_rank(rows)]


def _ma_alignment_score(close):
    """Return the moving-average alignment score series for a close-price Series."""
    close = close.reset_index(drop=True)
    ma5 = close.rolling(5).mean()
    ma10 = close.rolling(10).mean()
    ma20 = close.rolling(20).mean()
    ma60 = close.rolling(60).mean()

    # 1 where the averages are strictly ordered short > long ("regular") or
    # short < long ("reverse"); comparisons with NaN are False, so warm-up
    # rows count as 0 exactly like the original row-by-row loop.
    regular = ((ma5 > ma10) & (ma10 > ma20) & (ma20 > ma60)).astype(int)
    reverse = ((ma5 < ma10) & (ma10 < ma20) & (ma20 < ma60)).astype(int)

    spread = (ma5 - ma10) / ma10 * 100
    return regular.rolling(20).sum() - reverse.rolling(20).sum() + spread


def _ma_forecast(series, steps=15):
    """Fit ARIMA(2, 1, 2) to ``series`` and return a ``steps``-ahead forecast."""
    model = ARIMA(series, order=(2, 1, 2))
    model.initialize_approximate_diffuse()
    return model.fit().forecast(steps=steps)


def _ma_rank(rows):
    """Turn collected result rows into a table ranked by ``final_return``."""
    ranked = pd.DataFrame(rows, columns=['종목코드', 'final_return'])
    ranked['final_return'] = ranked['final_return'].fillna(0)
    # method='first' gives every stock a distinct rank.
    ranked['순위'] = ranked['final_return'].rank(method='first', ascending=False).astype('int')
    return ranked.sort_values(by='순위')




In [None]:
def Boll_analysis():
    """Rank stocks by an ARIMA forecast of their position inside the Bollinger band.

    For every stock code in ``./train.csv`` the position of ``종가`` within a
    20-row, 2-standard-deviation band is computed (see
    ``_boll_percent_b``), an ARIMA(2, 1, 2) model is fitted to that series and
    15 steps are forecast. The score kept per stock is
    ``last forecast - 2 * first forecast``. This is done twice per stock:
    once with the last 15 rows held out (backtest) and once on all rows.

    Returns:
        ``[results_df_test, results_df]`` — two DataFrames with the columns
        ``종목코드``, ``final_return`` and ``순위`` (1 = largest value, ties
        broken by order), each sorted by ``순위``.
    """
    train = pd.read_csv('./train.csv')
    unique_codes = train['종목코드'].unique()

    # Fix: the original re-created both result frames inside the loop, so only
    # the last stock survived, and relied on DataFrame.append, which pandas 2.0
    # removed. Rows are collected here and the frames are built once.
    rows_test = []
    rows = []
    for code in tqdm(unique_codes):
        close = train.loc[train['종목코드'] == code, '종가']

        # Backtest: the last 15 rows are hidden from the model.
        rows_test.append({
            '종목코드': code,
            'final_return': _boll_score(close.iloc[:-15]),
        })
        # Full run on every available row.
        rows.append({
            '종목코드': code,
            'final_return': _boll_score(close),
        })

    return [_boll_rank(rows_test), _boll_rank(rows)]


def _boll_percent_b(close, window=20, k=2):
    """Return the close price's position inside the band as a percentage.

    Rows where the band width is not strictly positive (including the NaN
    warm-up rows) are set to -1.
    """
    close = close.reset_index(drop=True)
    middle = close.rolling(window).mean()
    deviation = close.rolling(window).std()
    upper = middle + k * deviation
    lower = middle - k * deviation
    width = upper - lower
    return ((close - lower) / width * 100).where(width > 0, -1)


def _boll_score(close):
    """Forecast the band position 15 steps ahead and return last - 2 * first."""
    model = ARIMA(_boll_percent_b(close), order=(2, 1, 2))
    model.initialize_approximate_diffuse()
    predictions = model.fit().forecast(steps=15)
    # Subtracting twice the first forecast penalises series that start high.
    return predictions.iloc[-1] - 2 * predictions.iloc[0]


def _boll_rank(rows):
    """Turn collected result rows into a table ranked by ``final_return``."""
    ranked = pd.DataFrame(rows, columns=['종목코드', 'final_return'])
    ranked['final_return'] = ranked['final_return'].fillna(0)
    # method='first' gives every stock a distinct rank.
    ranked['순위'] = ranked['final_return'].rank(method='first', ascending=False).astype('int')
    return ranked.sort_values(by='순위')


In [None]:
# test function: measures how well each indicator predicts the last 15 days of the train data; the result is used as that indicator's weight

def test(Dataframe=None):
    """Score each indicator by how well its backtest ranking matches realised returns.

    Runs ``ADR_analysis``, ``Boll_analysis`` and ``MA_analysis``, computes each
    stock's realised return over the last 15 rows of ``./train.csv``
    (``(last 종가 - first 시가) / first 시가``; 0 when the first ``시가`` is 0),
    ranks the stocks by that return, and scores every indicator as
    ``40000 - sqrt(sum((predicted_rank - actual_rank) ** 2))``.

    Args:
        Dataframe: Unused. Kept (now optional) for backward compatibility.

    Returns:
        ``[[adr_df, adr_score], [boll_df, boll_score], [ma_df, ma_score]]``
        where each ``*_df`` is the full-data ranking of that indicator.

    Raises:
        KeyError: if a backtest ranking lacks one of the stock codes.
    """
    adr_frames = ADR_analysis()
    boll_frames = Boll_analysis()
    ma_frames = MA_analysis()
    backtests = [adr_frames[0], boll_frames[0], ma_frames[0]]

    train = pd.read_csv('./train.csv')
    unique_codes = train['종목코드'].unique()

    realised = []
    for code in tqdm(unique_codes):
        window = train[train['종목코드'] == code][['일자', '시가', '종가', '종목코드']].iloc[-15:]
        first_open = window.iloc[0]['시가']
        if first_open == 0:
            # Avoid dividing by zero; the return is recorded as 0.
            realised.append([code, first_open])
        else:
            realised.append([code, (window.iloc[-1]['종가'] - first_open) / first_open])

    result = pd.DataFrame(realised, columns=['종목코드', '수익률'])
    # Fix: the original never used the computed return; it compared predicted
    # ranks against a counter that merely followed file order. Rank by the
    # realised return instead, with the same tie-breaking as the predictions.
    result['순위'] = result['수익률'].fillna(0).rank(method='first', ascending=False).astype('int')

    sum_result = []
    for ranking in backtests:
        predicted = ranking.set_index('종목코드')['순위']
        # Fix: the original "- rank+1" parsed as "(pred - rank) + 1"; the
        # difference is now taken directly between the two 1-based ranks.
        gap = (
            predicted.loc[result['종목코드']].to_numpy(dtype=float)
            - result['순위'].to_numpy(dtype=float)
        )
        sum_result.append(40000 - np.sqrt((gap ** 2).sum()))

    return [
        [adr_frames[1], sum_result[0]],
        [boll_frames[1], sum_result[1]],
        [ma_frames[1], sum_result[2]],
    ]


In [None]:
# Weighting function

def Weight(Weight_list):
    """Blend several indicator rankings into one and write the submission CSV.

    Args:
        Weight_list: list of ``[ranking_dataframe, weight]`` pairs. Each
            DataFrame must have the columns ``종목코드`` and ``순위``.

    For every stock code in ``./train.csv`` the score is the sum over all
    pairs of ``2001 - rank * weight`` (2001 is presumably one more than the
    number of ranked stocks — confirm). Stocks are then ranked by descending
    score, right-merged onto the ``종목코드`` column of
    ``./sample_submission.csv`` and written to ``0728FINAL.csv``.

    Returns:
        None. The result is the CSV file written to disk.

    Raises:
        KeyError: if a ranking lacks one of the stock codes.
    """
    train = pd.read_csv('./train.csv')
    unique_codes = train['종목코드'].unique()

    # Index every ranking by stock code once so each lookup is a label access.
    lookups = [(frame.set_index("종목코드")["순위"], weight) for frame, weight in Weight_list]

    scores = []
    for code in tqdm(unique_codes):
        # Fix: the accumulator used to be initialised once outside this loop,
        # so every stock's score also contained all previous stocks' scores.
        rank_sum = 0
        for ranks, weight in lookups:
            rank_sum += 2001 - int(ranks.loc[code]) * weight
        scores.append([code, rank_sum])

    result = pd.DataFrame(scores, columns=["종목코드", "순위"])
    # Convert the blended score to a unique 1-based rank (largest score first).
    result['순위'] = result['순위'].rank(method='first', ascending=False).astype('int')
    result = result.sort_values(by='순위')

    sample_submission = pd.read_csv('./sample_submission.csv')
    baseline_submission = sample_submission[['종목코드']].merge(
        result[['종목코드', '순위']], on='종목코드', how='right'
    )
    baseline_submission.to_csv('0728FINAL.csv', index=False)

# Run the full pipeline: score each indicator on the held-out 15 days, then
# blend the rankings and write the submission file.
# Fix: test() declares a positional parameter (which it never reads), so the
# original bare call raised TypeError; pass an explicit placeholder.
Weight_list = test(None)
Weight(Weight_list)
    
