## Import

In [1]:
import pandas as pd
import numpy as np
import random
import os

from tqdm import tqdm
from statsmodels.tsa.arima.model import ARIMA

import warnings
warnings.filterwarnings("ignore")

In [2]:
def seed_everything(seed):
    """Seed the random sources this notebook uses so runs are repeatable.

    Seeds Python's ``random`` module and NumPy's global generator with
    ``seed``, and stores ``str(seed)`` in the ``PYTHONHASHSEED`` environment
    variable.

    Args:
        seed: Seed value passed to both generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_rng in (random.seed, np.random.seed):
        seed_rng(seed)


seed_everything(42)  # fix the seed for the whole notebook

## Data Load

In [3]:
# Read the training data from ./open/train.csv into the `train` DataFrame.
train = pd.read_csv('./open/train.csv')

## Model Definition, Training, and Inference

In [4]:
# Plain-text preview of the loaded training frame (first/last rows and its shape).
print(train)

              일자     종목코드     종목명      거래량     시가     고가     저가     종가
0       20210601  A060310      3S   166690   2890   2970   2885   2920
1       20210601  A095570  AJ네트웍스    63836   5860   5940   5750   5780
2       20210601  A006840   AK홀딩스   103691  35500  35600  34150  34400
3       20210601  A054620     APS   462544  14600  14950  13800  14950
4       20210601  A265520   AP시스템   131987  29150  29150  28800  29050
...          ...      ...     ...      ...    ...    ...    ...    ...
987995  20230530  A189980  흥국에프엔비   272284   3005   3035   2955   2980
987996  20230530  A000540    흥국화재    50218   3250   3255   3195   3215
987997  20230530  A003280    흥아해운   130664   1344   1395   1340   1370
987998  20230530  A037440      희림   141932   9170   9260   9170   9200
987999  20230530  A238490      힘스  2611843   6410   8220   6300   8220

[988000 rows x 8 columns]


In [5]:
## UDF for ADF test
from statsmodels.tsa.stattools import adfuller
import pandas as pd
import itertools

def adf_test(timeseries):
    """Run an Augmented Dickey-Fuller test and return its results as a labelled Series.

    Args:
        timeseries: Series of observations handed to ``adfuller`` (lag
            selection uses ``autolag="AIC"``).

    Returns:
        pd.Series holding the test statistic, p-value, number of lags used and
        number of observations used, followed by one
        ``"Critical Value (<level>)"`` entry per critical value reported by
        ``adfuller``.
    """
    stat_labels = [
        "Test Statistic",
        "p-value",
        "#Lags Used",
        "Number of Observations Used",
    ]
    adf_result = adfuller(timeseries, autolag="AIC")

    # The first four entries of the result are the scalar statistics.
    summary = pd.Series(adf_result[:4], index=stat_labels)

    # The fifth entry maps each significance level to its critical value.
    critical_values = adf_result[4]
    for level in critical_values:
        summary["Critical Value (%s)" % level] = critical_values[level]
    return summary

def pdf_test(train_df):
    """Grid-search ARIMA (p, d, q) orders and return the one(s) with the lowest AIC.

    Every combination of p in 0..2, d = 1 and q in 0..2 is fitted on
    ``train_df``; each model's AIC is rounded to two decimals before comparison.

    Args:
        train_df: Series passed to ``ARIMA`` as the endogenous data.

    Returns:
        List of ``(order, aic)`` tuples for every order whose rounded AIC
        equals the minimum (more than one entry when rounded AICs tie).
    """
    candidate_orders = list(itertools.product(range(0, 3), range(1, 2), range(0, 3)))

    # Fit the candidates in grid order and keep each rounded AIC.
    scores = [
        round(ARIMA(train_df, order=order).fit().aic, 2)
        for order in candidate_orders
    ]

    # Select every order that attains the lowest rounded AIC.
    best_score = min(scores)
    return [
        (order, score)
        for order, score in zip(candidate_orders, scores)
        if score == best_score
    ]

In [126]:
# Fit one ARIMA model per stock code on its closing prices and score each code by the
# forecast price change over the horizon. The result is `results_df` with one row per
# code and the columns '종목코드' and 'final_return'.
ARIMA_ORDER = (2, 1, 2)
FORECAST_STEPS = 15  # number of future trading days to forecast

# Distinct stock codes present in the training data, in order of first appearance.
unique_codes = train['종목코드'].unique()

# Collect one record per code and build the frame once after the loop;
# concatenating inside the loop re-copies every earlier row on each iteration.
records = []

# Train and forecast once per stock code.
for code in tqdm(unique_codes):
    # Closing-price series for this code, indexed by trading date.
    code_rows = train.loc[train['종목코드'] == code, ['일자', '종가']]
    trade_dates = pd.to_datetime(code_rows['일자'], format='%Y%m%d')
    tc = code_rows.set_index(trade_dates)['종가'].dropna()

    model = ARIMA(tc, order=ARIMA_ORDER)
    model_fit = model.fit()
    predictions = model_fit.forecast(steps=FORECAST_STEPS)

    # Score = last forecast minus last observed close. This is the telescoped sum of the
    # step-to-step differences of [last close, forecast 1, ..., forecast N], computed
    # positionally so it does not depend on the labels of the forecast index or on how
    # many observations the series has.
    # NOTE(review): this is an absolute price change, not a percentage return, so the
    # ranking is sensitive to each stock's price level — confirm this is intended.
    last_close = tc.iloc[-1]
    final_return = predictions.iloc[-1] - last_close

    records.append({'종목코드': code, 'final_return': final_return})

# Frame holding the inference results.
results_df = pd.DataFrame(records, columns=['종목코드', 'final_return'])

100%|██████████| 2000/2000 [16:13<00:00,  2.05it/s]


In [None]:
# Experimental variant of the forecasting loop: ARIMA(2, 1, 0), scored by the relative
# change between the first and the last forecast step, run on a tail slice of the codes.
# NOTE(review): the start position 1354 is not explained anywhere in the notebook —
# presumably a manual resume/debug offset; confirm before relying on this cell.
START_POSITION = 1354
EXPERIMENT_ORDER = (2, 1, 0)
EXPERIMENT_STEPS = 15  # number of future trading days to forecast

# Stock codes from START_POSITION onward, in order of first appearance in the training data.
unique_codes = train['종목코드'].unique()[START_POSITION:]

# Results are kept in their own frame. Appending them to `results_df` would add a second
# row for codes the main loop already scored, on a different scale (ratio here, price
# difference there), which corrupts the ranking and the merge into the submission.
relative_records = []

# Train and forecast once per stock code.
for code in tqdm(unique_codes):
    # Closing-price series for this code, indexed by trading date.
    code_rows = train.loc[train['종목코드'] == code, ['일자', '종가']]
    trade_dates = pd.to_datetime(code_rows['일자'], format='%Y%m%d')
    tc = code_rows.set_index(trade_dates)['종가'].dropna()

    model = ARIMA(tc, order=EXPERIMENT_ORDER)
    model_fit = model.fit()
    predictions = model_fit.forecast(steps=EXPERIMENT_STEPS)

    # Relative change from the first forecast step to the last one.
    final_return = (predictions.iloc[-1] - predictions.iloc[0]) / predictions.iloc[0]

    relative_records.append({'종목코드': code, 'final_return': final_return})
    print(predictions)
    print(final_return)

relative_results_df = pd.DataFrame(relative_records, columns=['종목코드', 'final_return'])

# AIC of the most recently fitted model.
print(model_fit.aic)

In [127]:
# Rank codes by final_return, highest first. method='first' breaks ties by row order,
# so every code receives a distinct integer rank.
results_df['순위'] = (
    results_df['final_return']
    .rank(method='first', ascending=False)
    .astype('int')
)
results_df

Unnamed: 0,종목코드,final_return,순위
0,A060310,0.000000,758
1,A095570,8.851174,489
2,A006840,-85.259245,1901
3,A054620,0.000000,759
4,A265520,-45.328789,1842
...,...,...,...
1995,A189980,-3.660498,1489
1996,A000540,-1.462707,1412
1997,A003280,6.708146,530
1998,A037440,204.973404,49


## Submit

In [128]:
# Load the sample submission file from ./open/sample_submission.csv and display it.
sample_submission = pd.read_csv('./open/sample_submission.csv')
sample_submission

Unnamed: 0,종목코드,순위
0,A000020,1
1,A000040,2
2,A000050,3
3,A000070,4
4,A000080,5
...,...,...
1995,A375500,1996
1996,A378850,1997
1997,A383220,1998
1998,A383310,1999


In [129]:
# Attach each code's rank to the submission template; a left merge keeps the template's
# codes and their order. validate='many_to_one' makes pandas raise MergeError when
# results_df holds a code more than once, instead of silently duplicating submission rows.
baseline_submission = sample_submission[['종목코드']].merge(
    results_df[['종목코드', '순위']],
    on='종목코드',
    how='left',
    validate='many_to_one',
)

# A left merge leaves NaN (and a float rank column) for template codes that have no
# rank; fail loudly rather than write an incomplete submission.
missing_codes = baseline_submission.loc[baseline_submission['순위'].isna(), '종목코드']
if not missing_codes.empty:
    raise ValueError(
        f"{len(missing_codes)} submission codes have no rank, e.g. {missing_codes.head().tolist()}"
    )

baseline_submission

Unnamed: 0,종목코드,순위
0,A000020,347
1,A000040,1351
2,A000050,1507
3,A000070,414
4,A000080,1561
...,...,...
1995,A375500,773
1996,A378850,426
1997,A383220,776
1998,A383310,1880


In [130]:
# Write the ranked submission to CSV, omitting the DataFrame index column.
baseline_submission.to_csv('ARIMA_best_return_diff_submission.csv', index=False)