### Import modules

In [None]:
import pandas as pd
import numpy as np
import random
import os

from tqdm import tqdm
from statsmodels.tsa.arima.model import ARIMA

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Import the experimental flag BEFORE the estimator. On scikit-learn releases
# where HistGradientBoostingRegressor is still experimental, importing it from
# sklearn.ensemble first raises ImportError; on newer releases the order is
# harmless.
from sklearn.experimental import enable_hist_gradient_boosting  # noqa: F401
from sklearn.ensemble import HistGradientBoostingRegressor

In [None]:
def seed_everything(seed):
    """Make the run repeatable by seeding every global random source used here.

    Sets the PYTHONHASHSEED environment variable to ``str(seed)`` and seeds
    both the standard-library ``random`` module and NumPy's global generator.

    Args:
        seed: Seed value handed to each generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for reseed in (random.seed, np.random.seed):
        reseed(seed)


seed_everything(42)  # fix the seed

In [None]:
# Read the training CSV into a DataFrame and display it.
train = pd.read_csv('/content/drive/MyDrive/krx/train.csv')
train

Unnamed: 0,일자,종목코드,종목명,거래량,시가,고가,저가,종가
0,20210601,A060310,3S,166690,2890,2970,2885,2920
1,20210601,A095570,AJ네트웍스,63836,5860,5940,5750,5780
2,20210601,A006840,AK홀딩스,103691,35500,35600,34150,34400
3,20210601,A054620,APS,462544,14600,14950,13800,14950
4,20210601,A265520,AP시스템,131987,29150,29150,28800,29050
...,...,...,...,...,...,...,...,...
987995,20230530,A189980,흥국에프엔비,272284,3005,3035,2955,2980
987996,20230530,A000540,흥국화재,50218,3250,3255,3195,3215
987997,20230530,A003280,흥아해운,130664,1344,1395,1340,1370
987998,20230530,A037440,희림,141932,9170,9260,9170,9200


In [None]:

# Distinct stock codes (종목코드) present in the training data.
unique_codes = train['종목코드'].unique()


 21%|██▏       | 425/2000 [00:33<02:03, 12.78it/s]


KeyboardInterrupt: ignored

In [None]:
# Build the per-stock closing-price frame: rows are grouped stock by stock (in
# `unique_codes` order), indexed by trading date, with columns 종목코드 / 종가.
#
# Each stock's slice is collected in a list and concatenated once at the end.
# Growing `results_df` with pd.concat inside the loop re-copied everything
# gathered so far on every iteration, and seeding it with an empty frame made
# the numeric columns come out as dtype object.
close_frames = []

for code in tqdm(unique_codes):
    # Explicit copy so the edits below never touch a slice of `train`.
    stock_close = train.loc[train['종목코드'] == code, ['종목코드', '일자', '종가']].copy()
    stock_close['일자'] = pd.to_datetime(stock_close['일자'], format='%Y%m%d')
    stock_close = stock_close.set_index('일자').dropna(axis=0)
    close_frames.append(stock_close)

if close_frames:
    results_df = pd.concat(close_frames, axis=0)
else:
    # pd.concat raises on an empty list; keep the expected (empty) schema instead.
    results_df = pd.DataFrame(columns=['종목코드', '종가'])

results_df

100%|██████████| 2000/2000 [03:03<00:00, 10.91it/s]


Unnamed: 0,종목코드,종가
2021-06-01,A060310,2920
2021-06-02,A060310,2900
2021-06-03,A060310,2900
2021-06-04,A060310,2950
2021-06-07,A060310,3150
...,...,...
2023-05-23,A238490,6430
2023-05-24,A238490,6320
2023-05-25,A238490,6330
2023-05-26,A238490,6330


In [None]:
# Count missing values per column.
results_df.isna().sum()

종목코드    0
종가      0
dtype: int64

In [None]:
# Bollinger-band features, computed separately for every stock.
#
# `results_df` stacks all stocks one after another, so a plain `.rolling(...)`
# or `.shift(1)` over the whole column lets the first rows of each stock use the
# previous stock's prices. Grouping by 종목코드 keeps every window and every lag
# inside a single stock.
BB_WINDOW = 20   # rows (trading days) per rolling window
BB_NUM_STD = 2   # band half-width, in rolling standard deviations

close = results_df['종가'].astype(float)
# Group by the raw code array: the date index repeats across stocks, so
# positional grouping avoids any index alignment.
close_by_code = close.groupby(results_df['종목코드'].to_numpy(), sort=False)

rolling_mean = close_by_code.transform(lambda prices: prices.rolling(window=BB_WINDOW).mean())
rolling_std = close_by_code.transform(lambda prices: prices.rolling(window=BB_WINDOW).std())
prev_close = close_by_code.shift(1)

# Assign raw arrays (row order is unchanged) to sidestep alignment on the
# duplicated date index.
results_df['ma'] = rolling_mean.to_numpy()
results_df['std'] = rolling_std.to_numpy()
results_df['ubb'] = results_df['ma'] + (results_df['std'] * BB_NUM_STD)
results_df['lbb'] = results_df['ma'] - (results_df['std'] * BB_NUM_STD)
# Daily rate of change relative to the same stock's previous close.
results_df['등락률'] = ((close - prev_close) / prev_close).to_numpy()

In [None]:
# Number of rows whose rolling standard deviation is exactly zero.
(results_df['std'] == 0).sum()

17764

In [None]:
# Drop rows with zero rolling std: there ubb == lbb, so the %B denominator
# (ubb - lbb) would be zero. Take an explicit copy so later column assignments
# operate on an independent frame rather than on a slice of the old one.
results_df = results_df[results_df['std'] != 0].copy()

In [None]:
# %B: position of the close inside the band (0 at the lower band, 1 at the upper band).
results_df['per_b'] = (
    results_df['종가'].sub(results_df['lbb'])
    .div(results_df['ubb'].sub(results_df['lbb']))
)

In [None]:
# Inspect the per_b column.
results_df['per_b']

2021-06-01         NaN
2021-06-02         NaN
2021-06-03         NaN
2021-06-04         NaN
2021-06-07         NaN
                ...   
2023-05-23     0.48427
2023-05-24    0.336295
2023-05-25    0.364397
2023-05-26    0.367191
2023-05-30    1.481056
Name: per_b, Length: 970236, dtype: object

In [None]:
# Inspect the daily rate-of-change (등락률) column.
results_df['등락률']

2021-06-01         NaN
2021-06-02   -0.006849
2021-06-03         0.0
2021-06-04    0.017241
2021-06-07    0.067797
                ...   
2023-05-23   -0.041729
2023-05-24   -0.017107
2023-05-25    0.001582
2023-05-26         0.0
2023-05-30    0.298578
Name: 등락률, Length: 970236, dtype: object

In [None]:
# Display the feature frame.
results_df

Unnamed: 0,종목코드,종가,ma,std,ubb,lbb,등락률,per_b
2021-06-01,A060310,2920,,,,,,
2021-06-02,A060310,2900,,,,,-0.006849,
2021-06-03,A060310,2900,,,,,0.0,
2021-06-04,A060310,2950,,,,,0.017241,
2021-06-07,A060310,3150,,,,,0.067797,
...,...,...,...,...,...,...,...,...
2023-05-23,A238490,6430,6440.5,166.874518,6774.249035,6106.750965,-0.041729,0.48427
2023-05-24,A238490,6320,6429.5,167.221093,6763.942187,6095.057813,-0.017107,0.336295
2023-05-25,A238490,6330,6421.0,167.768636,6756.537271,6085.462729,0.001582,0.364397
2023-05-26,A238490,6330,6419.5,168.475359,6756.450719,6082.549281,0.0,0.367191


In [None]:
# Keep only rows where every column is populated (drops the rows whose
# rolling-window features are still NaN), then display the result.
df= results_df.dropna()
df

Unnamed: 0,종목코드,종가,ma,std,ubb,lbb,등락률,per_b
2021-06-28,A060310,3095,3099.75,106.998217,3313.746434,2885.753566,-0.008013,0.488902
2021-06-29,A060310,3070,3107.25,98.668437,3304.586875,2909.913125,-0.008078,0.405618
2021-06-30,A060310,3065,3115.50,86.585826,3288.671652,2942.328348,-0.001629,0.354191
2021-07-01,A060310,3070,3124.00,71.314721,3266.629443,2981.370557,0.001631,0.310698
2021-07-02,A060310,3155,3134.25,58.585857,3251.421714,3017.078286,0.027687,0.588545
...,...,...,...,...,...,...,...,...
2023-05-23,A238490,6430,6440.50,166.874518,6774.249035,6106.750965,-0.041729,0.48427
2023-05-24,A238490,6320,6429.50,167.221093,6763.942187,6095.057813,-0.017107,0.336295
2023-05-25,A238490,6330,6421.00,167.768636,6756.537271,6085.462729,0.001582,0.364397
2023-05-26,A238490,6330,6419.50,168.475359,6756.450719,6082.549281,0.0,0.367191


In [None]:
# Debug check: container type of the per_b column (not its dtype).
type(df['per_b'])

pandas.core.series.Series

In [None]:
# Cast the ratio columns to real floats. errors='ignore' would silently leave a
# column as dtype object when conversion fails; a plain astype raises instead,
# so bad values surface here rather than inside the model. 등락률 is cast as well
# because it is used numerically further down.
df = df.astype({'per_b': float, '등락률': float})

In [None]:
# Stock codes that still have rows after cleaning, and how many there are.
uni_codes = df['종목코드'].unique()
len(uni_codes)

2000

In [None]:
# Model: ARIMA

In [None]:
from statsmodels.tsa.arima.model import ARIMA

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Chronological hold-out: the last 20% of trading days become the test set.
#
# `df` stacks the stocks one after another, so an unshuffled row-wise split
# (train_test_split(..., shuffle=False)) cuts the data by stock: whole stocks
# land only in train or only in test. Splitting on the date index sends each
# stock's earlier rows to train and its later rows to test.
TEST_FRACTION = 0.2

trading_days = df.index.unique().sort_values()
# Same rounding as train_test_split: the test share is rounded up.
n_test_days = int(np.ceil(len(trading_days) * TEST_FRACTION))
if not 0 < n_test_days < len(trading_days):
    raise ValueError('need at least two distinct trading days to make a train/test split')

first_test_day = trading_days[-n_test_days]
train_df = df[df.index < first_test_day]
test_df = df[df.index >= first_test_day]

In [None]:
# Features: moving average and both Bollinger bands. Target: closing price (종가).
X = train_df[['ma', 'ubb', 'lbb']]
y = train_df['종가']

In [None]:
# Gradient-boosting regressor with default hyper-parameters.
model = HistGradientBoostingRegressor()

In [None]:
# Fit the regressor on the training features and target.
model.fit(X, y)

In [None]:
# How many distinct stock codes ended up in the training split.
train_df['종목코드'].nunique()

1601

In [None]:
# Recompute the distinct stock codes in `df` and display them.
uni_codes = df['종목코드'].unique()
uni_codes

array(['A060310', 'A095570', 'A006840', ..., 'A003280', 'A037440',
       'A238490'], dtype=object)

In [None]:
# Per-stock ARIMA forecast of %B (per_b), converted back to a price path, from
# which the return over the forecast horizon is computed.
ARIMA_ORDER = (2, 1, 2)
FORECAST_STEPS = 15  # forecast the next 15 trading days

uni_codes = df['종목코드'].unique()

# Collect one record per stock and build the frame once at the end;
# DataFrame.append copied the whole frame per row and no longer exists in
# pandas 2.x.
return_records = []

for code in tqdm(uni_codes):

    # Training data: this stock's own history
    stock_history = df[df['종목코드'] == code]
    tc = stock_history['per_b'].astype(float)

    # Declare the model, fit it and forecast %B
    model = ARIMA(tc, order=ARIMA_ORDER)
    model_fit = model.fit()
    per_b_forecast = model_fit.forecast(steps=FORECAST_STEPS)

    # Convert %B back to a price: per_b = (close - lbb) / (ubb - lbb)
    #   =>  close = per_b * (ubb - lbb) + lbb
    # The conversion applies to the forecast values; the fitted-results object
    # itself is not numeric. Future bands are unknown, so this ASSUMES they stay
    # at the stock's last observed values. Revisit if bands are forecast too.
    last_ubb = float(stock_history['ubb'].iloc[-1])
    last_lbb = float(stock_history['lbb'].iloc[-1])
    predictions = per_b_forecast * (last_ubb - last_lbb) + last_lbb

    # Final return over the forecast horizon
    final_return = (predictions.iloc[-1] - predictions.iloc[0]) / predictions.iloc[0]

    # Store the result
    return_records.append({'종목코드': code, 'final_return': final_return})

results_df = pd.DataFrame(return_records, columns=['종목코드', 'final_return'])

In [None]:
# Per-stock return from the gradient-boosting price predictions on the test split.
#
# Differences from the earlier version of this cell:
#   * the model is fitted once; it was refitted on identical data for every stock;
#   * the return is computed from each stock's own predicted prices; it used the
#     actual closes of the whole test set, giving every stock the same value
#     and leaving `predictions` unused;
#   * rows are collected in a list instead of DataFrame.append (removed in pandas 2.x).
FEATURE_COLS = ['ma', 'ubb', 'lbb']

uni_codes = df['종목코드'].unique()

# Declare and fit the model once, since the training data is loop-invariant
X = train_df[FEATURE_COLS]
y = train_df['종가']
model = HistGradientBoostingRegressor()
model.fit(X, y)

# Test features per stock, looked up by code inside the loop
test_features_by_code = {
    code: rows[FEATURE_COLS]
    for code, rows in test_df.groupby('종목코드', sort=False)
}

return_records = []

for code in tqdm(uni_codes):
    test_data = test_features_by_code.get(code)
    if test_data is None:
        # Stock has no rows in the test split: nothing to predict from.
        final_return = np.nan
    else:
        predictions = model.predict(test_data)
        final_return = (predictions[-1] - predictions[0]) / predictions[0]

    # Store the result
    return_records.append({'종목코드': code, 'final_return': final_return})

results_df = pd.DataFrame(return_records, columns=['종목코드', 'final_return'])

  7%|▋         | 132/2000 [07:55<1:57:04,  3.76s/it]

In [None]:
# tc: close -> per_b (the series stored in tc is switched from the close price to per_b)

In [None]:
# Display the per-stock return table.
results_df

Unnamed: 0,종목코드,final_return
0,A060310,-0.496134
1,A095570,1.273727
2,A006840,0.141406
3,A054620,-0.152348
4,A265520,-0.321520
...,...,...
1995,A189980,-0.274611
1996,A000540,0.368245
1997,A003280,-0.342482
1998,A037440,0.491103


In [None]:
# Order stocks from highest to lowest final_return and display the result.
results_per_b = results_df.sort_values('final_return', ascending=False)
results_per_b

Unnamed: 0,종목코드,final_return
205,A100840,35.092412
975,A025980,19.279913
488,A088910,14.727571
1938,A005440,12.921439
979,A090430,12.709711
...,...,...
1706,A037070,-20.247426
641,A025560,-23.252144
530,A039840,-32.446746
1178,A003520,-61.341960


In [None]:
# Rank (순위) 1 = highest final_return; the frame is already sorted that way.
# Size the ranks from the frame itself: a hard-coded range(1, 2001) raises a
# length-mismatch error as soon as the number of stocks is not exactly 2000.
results_per_b['순위'] = np.arange(1, len(results_per_b) + 1)
# Submission view: stock code and rank only.
copy = results_per_b.drop('final_return', axis=1)


In [None]:
# Display the code/rank table that will be written out.
copy

Unnamed: 0,종목코드,순위
205,A100840,1
975,A025980,2
488,A088910,3
1938,A005440,4
979,A090430,5
...,...,...
1706,A037070,1996
641,A025560,1997
530,A039840,1998
1178,A003520,1999


In [None]:
# Write the code/rank table to CSV without the index column.
# NOTE(review): to_csv is given a path here, so it presumably returns None and
# `sample_submission` does not hold the data — confirm nothing reads it.
sample_submission = copy.to_csv('/content/drive/MyDrive/krx/sample_submission.csv', index=False)

In [None]:
# Population covariance between %B and the daily rate of change:
#   cov = mean((x - mean(x)) * (y - mean(y)))
# Vectorised and in deviation form: the built-in sum() iterated the Series
# element by element in Python, and E[xy] - E[x]E[y] loses precision when the
# two terms are close.
per_b_values = df['per_b'].astype(float)
change_rate_values = df['등락률'].astype(float)
cov = float(
    ((per_b_values - per_b_values.mean())
     * (change_rate_values - change_rate_values.mean())).mean()
)

In [None]:
# Display the covariance computed above in `cov`.
cov

0.01928874111311221

In [None]:
# Average per_b over all rows of `df`.
np.mean(df['per_b'])

0.46422211812870695

In [None]:
import seaborn as sns
from matplotlib import pyplot as plt