In [1]:
 # 주가 정보를 쉽게 가져올 수 있게 도와주는 FinanceDataReader 패키지 설치
!pip install -U finance-datareader

# 라이브러리 임포트
import pandas as pd
import numpy as np
import os
import FinanceDataReader as fdr

from tqdm import tqdm

Requirement already up-to-date: finance-datareader in /opt/conda/lib/python3.8/site-packages (0.9.31)


In [2]:
# Input files: the per-ticker code table (provided by Dacon) and the submission template.
list_name = 'stock_list.csv'
sample_name = 'sample_submission.csv'

# Read the code table from the working directory.
stock_list_path = os.path.join('./', list_name)
stock_list = pd.read_csv(stock_list_path)

# Left-pad every code with zeros to six characters, the form later passed to
# FinanceDataReader.
stock_list['종목코드'] = [str(raw_code).zfill(6) for raw_code in stock_list['종목코드']]

stock_list  # rendered as the cell output; roughly 370 tickers

Unnamed: 0,종목명,종목코드,상장시장
0,삼성전자,005930,KOSPI
1,SK하이닉스,000660,KOSPI
2,NAVER,035420,KOSPI
3,카카오,035720,KOSPI
4,삼성바이오로직스,207940,KOSPI
...,...,...,...
365,맘스터치,220630,KOSDAQ
366,다날,064260,KOSDAQ
367,제이시스메디칼,287410,KOSDAQ
368,크리스에프앤씨,110790,KOSDAQ


In [3]:
# Data window used by the rest of the notebook (YYYYMMDD strings).
start_date = '20210104'
end_date = '20211105'

first_day = pd.to_datetime(start_date)
last_day = pd.to_datetime(end_date)

# Day-of-week index of the first day (Monday=0 ... Sunday=6).
start_weekday = first_day.weekday()
# ISO week number of the last day, kept as the string that strftime returns.
max_weeknum = last_day.strftime('%V')

# Monday-Friday calendar spanning the window. freq='B' skips weekends only, so
# public holidays are still listed: 220 rows here, while the price download
# further down returns fewer rows for the same window.
weekday_calendar = pd.date_range(start_date, end_date, freq='B')
Business_days = pd.DataFrame({'Date': weekday_calendar})

print(f'WEEKDAY of "start_date" : {start_weekday}')
print(f'NUM of WEEKS to "end_date" : {max_weeknum}')
print(f'HOW MANY "Business_days" : {Business_days.shape}')
display(Business_days.head())

WEEKDAY of "start_date" : 0
NUM of WEEKS to "end_date" : 44
HOW MANY "Business_days" : (220, 1)


Unnamed: 0,Date
0,2021-01-04
1,2021-01-05
2,2021-01-06
3,2021-01-07
4,2021-01-08


In [4]:
# Show the last five calendar rows to confirm the window ends on end_date.
display(Business_days.tail(5))

Unnamed: 0,Date
215,2021-11-01
216,2021-11-02
217,2021-11-03
218,2021-11-04
219,2021-11-05


## Baseline 모델의 구성 소개 ( Sample )

X : (월 ~ 금) * 42주간 (1 ~ 42주차)

y : (다음주 월 ~ 금) * 42주간 (2 ~ 43주차)

y_0 : 다음주 월요일

y_1 : 다음주 화요일

y_2 : 다음주 수요일

y_3 : 다음주 목요일

y_4 : 다음주 금요일

이번주 월~금요일의 패턴을 학습해 다음주 월요일 ~ 금요일을 각각 예측하는 모델을 생성

이 과정을 모든 종목(370개)에 적용

In [5]:
# Walk through the pipeline on one ticker first: the code in row 0 of the code
# table (Samsung Electronics).
sample_code = stock_list.at[0, '종목코드']

# Download that ticker's daily price table for the analysis window. The preview
# shows the columns Open, High, Low, Close, Volume and Change.
sample = fdr.DataReader(sample_code, start=start_date, end=end_date)
sample.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-01-04,81000,84400,80200,83000,38655276,0.024691
2021-01-05,81600,83900,81600,83900,35335669,0.010843
2021-01-06,83300,84500,82100,82200,42089013,-0.020262
2021-01-07,82800,84200,82700,82900,32644642,0.008516
2021-01-08,83300,90000,83000,88800,59013307,0.07117


In [6]:
# Download the same ticker again, keep only the closing price, and turn the date
# index into an ordinary 'Date' column.
price_history = fdr.DataReader(sample_code, start=start_date, end=end_date)
sample = price_history[['Close']].reset_index()
print(len(sample))
sample.head()

209


Unnamed: 0,Date,Close
0,2021-01-04,83000
1,2021-01-05,83900
2,2021-01-06,82200
3,2021-01-07,82900
4,2021-01-08,88800


In [7]:
# Align the closes with the weekday calendar. The outer join keeps every
# calendar date, so a date without a quote becomes a row whose Close is NaN.
sample = Business_days.merge(sample, how='outer')
print(len(sample))
sample.head()

220


Unnamed: 0,Date,Close
0,2021-01-04,83000.0
1,2021-01-05,83900.0
2,2021-01-06,82200.0
3,2021-01-07,82900.0
4,2021-01-08,88800.0


In [8]:
# Tag every row with its day of week (Monday=0 ... Sunday=6) and with its ISO
# week number as a zero-padded string ('01' ... '53').
sample['weekday'] = [day.weekday() for day in sample['Date']]
sample['weeknum'] = [day.strftime('%V') for day in sample['Date']]
sample.head()

Unnamed: 0,Date,Close,weekday,weeknum
0,2021-01-04,83000.0,0,1
1,2021-01-05,83900.0,1,1
2,2021-01-06,82200.0,2,1
3,2021-01-07,82900.0,3,1
4,2021-01-08,88800.0,4,1


In [9]:
# Fill each missing close with the value from the nearest preceding row.
sample['Close'] = sample['Close'].ffill()

# Reshape to a weekly table: one row per week number, one column per weekday.
sample = sample.pivot_table(values='Close', index='weeknum', columns='weekday')
print(len(sample))  # 44 rows: weeks 1 through 44
sample.head()

44


weekday,0,1,2,3,4
weeknum,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,83000.0,83900.0,82200.0,82900.0,88800.0
2,91000.0,90600.0,89700.0,89700.0,88000.0
3,85000.0,87000.0,87200.0,88100.0,86800.0
4,89400.0,86700.0,85600.0,83700.0,82000.0
5,83000.0,84400.0,84600.0,82500.0,83500.0


In [10]:
# Model inputs: every weekly row except the last two (weeks 1-42, Mon-Fri closes).
x = sample.iloc[:-2].to_numpy()
x.shape

(42, 5)

In [11]:
# Targets: the weekly rows shifted one week later than x (weeks 2-43).
y = sample.iloc[1:-1].to_numpy()

# One target vector per weekday column: y_0 is Monday ... y_4 is Friday.
y_0, y_1, y_2, y_3, y_4 = (y[:, weekday] for weekday in range(5))

y_values = [y_0, y_1, y_2, y_3, y_4]

In [12]:
# Input for the public forecast: the second-to-last weekly row, i.e. the week
# just before the public period (11/01 - 11/05).
last_observed_week = sample.iloc[-2]
x_public = last_observed_week.to_numpy()
x_public

array([70200., 71100., 70100., 70700., 69800.])

# 예측

In [16]:
import re

import sklearn
from sklearn.ensemble import ExtraTreesRegressor

# scikit-learn renamed the mean-absolute-error split criterion from "mae" to
# "absolute_error" in release 1.0 and removed the old spelling in 1.2. Pick the
# spelling the installed release accepts so the cell runs on both.
_sk_version = tuple(
    int(part) for part in re.match(r"(\d+)\.(\d+)", sklearn.__version__).groups()
)
_mae_criterion = "absolute_error" if _sk_version >= (1, 0) else "mae"

# Single estimator instance; the cells below refit it once per target weekday.
model = ExtraTreesRegressor(
    n_estimators=1000,
    criterion=_mae_criterion,
    n_jobs=-1,        # use every available core
    random_state=42,  # fixed seed so refits are repeatable
    verbose=0,
)

In [17]:
# Disabled alternative: an ElasticNet definition of `model`, kept commented out.
# from sklearn.linear_model import ElasticNet
# model = ElasticNet(
#     alpha=1.0,
#     l1_ratio=0.5,
#     normalize=False,
#     max_iter=1000,
#     random_state=42,
# )

In [18]:
# Refit the regressor once per target weekday and forecast that weekday from
# the held-out input week.
public_features = np.expand_dims(x_public, 0)  # predict() takes a 2-D array: one row
predictions = [
    model.fit(x, weekday_target).predict(public_features)[0]
    for weekday_target in y_values
]
predictions

[70367.7, 70755.5, 69996.5, 70447.0, 69910.1]

In [15]:
# Actual closes of the final weekly row (the public period), for comparison
# with the forecast.
sample.iloc[-1].to_numpy()

array([69900., 71500., 70400., 70600., 70200.])

## 전체 모델링

In [19]:
# Load the submission template: a 'Day' column plus one column per ticker code.
sample_name = 'sample_submission.csv'
submission_path = os.path.join('./', sample_name)
sample_submission = pd.read_csv(submission_path)
sample_submission

Unnamed: 0,Day,000060,000080,000100,000120,000150,000240,000250,000270,000660,...,330860,336260,336370,347860,348150,348210,352820,357780,363280,950130
0,2021-11-01,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2021-11-02,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2021-11-03,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2021-11-04,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2021-11-05,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,2021-11-29,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,2021-11-30,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,2021-12-01,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,2021-12-02,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,2021-12-03,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
# Baseline: forecast the public week (2021-11-01 .. 2021-11-05) for every ticker
# and reuse those same five values for the private-period rows of the template.
# TODO: run a separate forecast for the private period.

for code in tqdm(stock_list['종목코드'].values):
    # Daily closes for this ticker, aligned to the weekday calendar; dates
    # without a quote take the previous close.
    data = fdr.DataReader(code, start=start_date, end=end_date)[['Close']].reset_index()
    data = pd.merge(Business_days, data, how='outer')
    data['weekday'] = data['Date'].dt.weekday
    data['weeknum'] = data['Date'].dt.strftime('%V')
    data['Close'] = data['Close'].ffill()
    # Weekly table: one row per ISO week, one column per weekday.
    data = pd.pivot_table(data=data, values='Close', columns='weekday', index='weeknum')

    x = data.iloc[0:-2].to_numpy()  # inputs: weeks of 2021-01-04 .. 2021-10-22
    y = data.iloc[1:-1].to_numpy()  # targets: the following weeks, 2021-01-11 .. 2021-10-29
    y_values = [y[:, weekday] for weekday in range(5)]  # Monday ... Friday targets

    # Features for the forecast: the last fully observed week before 2021-11-01.
    x_public = data.iloc[-2].to_numpy()
    x_public_row = np.expand_dims(x_public, 0)  # predict() takes a 2-D array

    predictions = []
    for y_value in y_values:
        model.fit(x, y_value)
        predictions.append(model.predict(x_public_row)[0])

    # Repeat the five-day forecast to fill every row of the template (twice for
    # the 10-row file). Assign the column as a whole rather than writing through
    # .loc, so float predictions replace the integer placeholder column instead
    # of being set in place into an incompatible dtype.
    n_repeats = len(sample_submission) // len(predictions)
    sample_submission[code] = predictions * n_repeats
sample_submission.isna().sum().sum()

100% 370/370 [5:49:44<00:00, 56.72s/it]  


0

In [21]:
sample_submission

Unnamed: 0,Day,000060,000080,000100,000120,000150,000240,000250,000270,000660,...,330860,336260,336370,347860,348150,348210,352820,357780,363280,950130
0,2021-11-01,28691.5,35023.6,60599.7,147111.0,103915.8,16841.4,48994.9,85120.1,102629.0,...,50549.55,51457.95,85162.4,37236.65,26535.05,53470.7,323712.5,261340.0,27175.1,17225.0
1,2021-11-02,29231.35,35303.95,60389.5,145970.0,103818.5,16999.8,47708.7,84817.5,103052.8,...,50369.85,52076.2,85251.0,38234.1,27433.2,53830.8,332624.0,261282.4,27049.9,17417.35
2,2021-11-03,29055.1,35330.8,61145.6,149237.0,103114.9,17200.2,46957.2,84238.3,102531.0,...,48852.75,51077.55,85386.8,39244.2,27193.1,53202.8,339802.0,261393.3,26634.2,17265.15
3,2021-11-04,29182.1,35579.2,60641.0,150013.0,104303.7,17144.75,48020.7,84259.5,102322.5,...,48887.45,51564.55,82001.2,38985.9,26840.35,55249.0,327463.0,261088.1,26448.6,17549.7
4,2021-11-05,28796.1,35479.2,60849.3,148203.0,102486.6,17313.85,47518.15,83376.2,102210.1,...,49909.5,51863.4,82106.6,38730.75,26660.85,55083.0,329267.0,260691.0,26338.55,17458.675
5,2021-11-29,28691.5,35023.6,60599.7,147111.0,103915.8,16841.4,48994.9,85120.1,102629.0,...,50549.55,51457.95,85162.4,37236.65,26535.05,53470.7,323712.5,261340.0,27175.1,17225.0
6,2021-11-30,29231.35,35303.95,60389.5,145970.0,103818.5,16999.8,47708.7,84817.5,103052.8,...,50369.85,52076.2,85251.0,38234.1,27433.2,53830.8,332624.0,261282.4,27049.9,17417.35
7,2021-12-01,29055.1,35330.8,61145.6,149237.0,103114.9,17200.2,46957.2,84238.3,102531.0,...,48852.75,51077.55,85386.8,39244.2,27193.1,53202.8,339802.0,261393.3,26634.2,17265.15
8,2021-12-02,29182.1,35579.2,60641.0,150013.0,104303.7,17144.75,48020.7,84259.5,102322.5,...,48887.45,51564.55,82001.2,38985.9,26840.35,55249.0,327463.0,261088.1,26448.6,17549.7
9,2021-12-03,28796.1,35479.2,60849.3,148203.0,102486.6,17313.85,47518.15,83376.2,102210.1,...,49909.5,51863.4,82106.6,38730.75,26660.85,55083.0,329267.0,260691.0,26338.55,17458.675


In [22]:
# Rebuild the header: keep 'Day' first and left-pad every ticker column name
# with zeros to six characters.
padded_codes = [str(column).zfill(6) for column in sample_submission.columns[1:]]
columns = ['Day'] + padded_codes

sample_submission.columns = columns

In [23]:
# Write the filled-in template without the row index. Note that the file name
# says "random-forest" while the model defined in this notebook is an
# ExtraTreesRegressor.
sample_submission.to_csv(path_or_buf='random-forest.csv', index=False)