In [22]:
import matplotlib.pyplot as plt
import matplotlib
import itertools
import talib

import pandas as pd
import numpy as np
import random
import os
from tqdm import tqdm
from statsmodels.tsa.arima.model import ARIMA
import warnings
from sklearn.linear_model import LinearRegression
lr = LinearRegression()

warnings.filterwarnings("ignore")

def seed_everything(seed):
    """Seed every random source this notebook touches.

    Seeds Python's built-in ``random`` module and NumPy's legacy global RNG,
    and exports ``PYTHONHASHSEED`` into the process environment.

    Args:
        seed: Value passed to ``random.seed`` and ``np.random.seed``; its
            string form is stored in ``PYTHONHASHSEED``.

    Returns:
        None.
    """
    random.seed(seed)
    # NOTE(review): presumably this only influences child processes, since the
    # running interpreter has already fixed its hash seed — confirm if relied on.
    os.environ.update(PYTHONHASHSEED=str(seed))
    np.random.seed(seed)

seed_everything(42)  # fix the seeds before anything else runs

#################

train = pd.read_csv('./train.csv')

# Distinct ticker codes present in the training data, in order of first appearance.
unique_codes = train['종목코드'].unique()

# Tunables for the moving-average alignment score and the forecast.
MA_WINDOWS = (5, 10, 20, 60)  # rolling-mean windows applied to the closing price
ALIGNMENT_WINDOW = 20         # look-back used to count aligned days
ARIMA_ORDER = (2, 1, 2)       # (p, d, q) of the ARIMA model fitted to the score
FORECAST_STEPS = 15           # number of future trading days to forecast

# One {'종목코드', 'final_return'} record per ticker. The DataFrame is built once
# after the loop: DataFrame.append was removed in pandas 2.0 and appending row by
# row copies the whole frame on every iteration.
forecast_records = []

# Split the frame once instead of re-filtering all rows for every ticker.
# sort=False keeps the same first-appearance order as `unique_codes`.
per_ticker = train.groupby('종목코드', sort=False)

for code, ticker_rows in tqdm(per_ticker, total=per_ticker.ngroups):
    # Closing prices of this ticker on a fresh 0..n-1 index.
    train_close = (
        ticker_rows[['일자', '종가', '종목코드']]
        .rename(columns={'일자': 'Date'})
        .reset_index(drop=True)
    )

    # n-day moving averages of the closing price.
    for window in MA_WINDOWS:
        train_close[f"MA{window}_mean"] = train_close["종가"].rolling(window).mean()

    ma5 = train_close["MA5_mean"]
    ma10 = train_close["MA10_mean"]
    ma20 = train_close["MA20_mean"]
    ma60 = train_close["MA60_mean"]

    # "regular" = MA5 > MA10 > MA20 > MA60, "reverse" = the strictly opposite
    # ordering. Comparisons involving NaN (rows before a full window is
    # available) are False, so those rows get 0 for both flags.
    train_close["regular"] = ((ma5 > ma10) & (ma10 > ma20) & (ma20 > ma60)).astype("int64")
    train_close["reverse"] = ((ma5 < ma10) & (ma10 < ma20) & (ma20 < ma60)).astype("int64")

    # Number of regular / reverse days within the trailing look-back window.
    train_close["reg_sum"] = train_close["regular"].rolling(ALIGNMENT_WINDOW).sum()
    train_close["rev_sum"] = train_close["reverse"].rolling(ALIGNMENT_WINDOW).sum()

    # Gap between the 5-day and 20-day averages, in percent of the 20-day average.
    train_close["MA5-MA20"] = (ma5 - ma20) / ma20 * 100

    # Score = net count of aligned days plus the short-vs-mid MA gap.
    train_close["score"] = train_close["reg_sum"] - train_close["rev_sum"] + train_close["MA5-MA20"]

    # The score is NaN until the MA5-MA20 gap is defined and both rolling
    # counts have a full window.
    # NOTE(review): this relies on statsmodels treating NaN as missing observations — confirm.
    tc = train_close["score"]

    model = ARIMA(tc, order=ARIMA_ORDER)
    # NOTE(review): presumably chosen to avoid initialization problems of the
    # differenced model — confirm the rationale.
    model.initialize_approximate_diffuse()
    model_fit = model.fit()
    predictions = model_fit.forecast(steps=FORECAST_STEPS)  # forecast the next trading days

    # The score forecast for the last horizon step is what tickers are ranked by.
    final_return = predictions.iloc[-1]
    forecast_records.append({'종목코드': code, 'final_return': final_return})

# Per-ticker forecasts collected into the frame the ranking step consumes.
results_df = pd.DataFrame(forecast_records, columns=['종목코드', 'final_return'])


# +로 계속 유지되는것으로 예측되면 순위를 낮추기

# p = range(0, 5)
# d = range(0, 5)
# q = range(0, 5)
# pdq = list(itertools.product(p, d, q))

# AIC = []
# aic = {}
# for i in pdq :
#     model = ARIMA(tc, order=(i))
#     model_fit = model.fit()
#     print(f'ARIMA pdq : {i} >> AIC : {round(model_fit.aic, 2)}')
#     AIC.append(round(model_fit.aic, 2))
#     aic[round(model_fit.aic, 2)] = i

# min(aic)

# Missing forecasts are replaced by 0 before ranking.
filled_scores = results_df['final_return'].fillna(0)
results_df['final_return'] = filled_scores

# Largest forecast gets rank 1; method='first' breaks ties by row order so
# every rank is unique.
results_df['순위'] = filled_scores.rank(method='first', ascending=False).astype('int')
results_df = results_df.sort_values(by='순위')

sample_submission = pd.read_csv('./sample_submission.csv')
ranked_codes = results_df[['종목코드', '순위']]
# NOTE(review): how='right' keeps the keys and row order of results_df (sorted
# by rank), not those of sample_submission — confirm this is what the
# submission format expects.
baseline_submission = sample_submission[['종목코드']].merge(ranked_codes, on='종목코드', how='right')
baseline_submission.to_csv('0726_MA_test_submission.csv', index=False)

 68%|██████▊   | 1359/2000 [09:02<04:39,  2.29it/s]

In [20]:
# Inspect the last rows of `train_close`. It is reassigned on every iteration of
# the per-ticker loop, so this shows whichever ticker the loop handled last.
# NOTE(review): the saved output below has score == reg_sum - rev_sum - (MA5-MA20)
# (e.g. 5.0 and -6.50 give 11.50), while the loop adds the MA5-MA20 term; the
# output appears to come from an earlier version of the loop — TODO re-run.
train_close.tail(20)

Unnamed: 0,Date,종가,종목코드,MA5_mean,MA10_mean,MA20_mean,MA60_mean,regular,reverse,reg_sum,rev_sum,MA5-MA20,score
474,20230428,6400,A238490,6422.0,6621.0,6868.5,6559.833333,0,0,5.0,0.0,-6.500692,11.500692
475,20230502,6560,A238490,6426.0,6574.0,6841.0,6562.333333,0,0,5.0,0.0,-6.066365,11.066365
476,20230503,6470,A238490,6420.0,6530.0,6800.5,6560.833333,0,0,4.0,0.0,-5.595177,9.595177
477,20230504,6430,A238490,6434.0,6478.0,6761.0,6556.333333,0,0,3.0,0.0,-4.836563,7.836563
478,20230508,6630,A238490,6498.0,6471.0,6740.0,6555.166667,0,0,2.0,0.0,-3.590504,5.590504
479,20230509,6450,A238490,6508.0,6465.0,6699.5,6552.0,0,0,1.0,0.0,-2.858422,3.858422
480,20230510,6490,A238490,6494.0,6460.0,6679.5,6552.833333,0,0,0.0,0.0,-2.777154,2.777154
481,20230511,6490,A238490,6498.0,6459.0,6652.0,6553.5,0,0,0.0,0.0,-2.315093,2.315093
482,20230512,6230,A238490,6458.0,6446.0,6605.0,6548.166667,0,0,0.0,0.0,-2.225587,2.225587
483,20230515,6130,A238490,6358.0,6428.0,6555.5,6539.0,0,0,0.0,0.0,-3.012737,3.012737


In [21]:
# Display the forecast Series. `predictions` is assigned inside the per-ticker
# loop, so this shows the forecast of whichever ticker the loop handled last.
predictions

494    -9.930475
495   -11.960305
496   -13.409793
497   -14.357477
498   -14.877736
499   -15.039998
500   -14.908198
501   -14.540463
502   -13.988974
503   -13.299981
504   -12.513939
505   -11.665744
506   -10.785041
507    -9.896583
508    -9.020629
Name: predicted_mean, dtype: float64