In [1]:
 # 주가 정보를 쉽게 가져올 수 있게 도와주는 FinanceDataReader 패키지 설치
# !pip install -U finance-datareader

# 라이브러리 임포트
import pandas as pd
import numpy as np
import os
import FinanceDataReader as fdr

from tqdm import tqdm

In [2]:
# File names of the competition inputs (the ticker list is provided by Dacon).
list_name = 'stock_list.csv'
sample_name = 'sample_submission.csv'

# Load the ticker table from the working directory.
stock_list = pd.read_csv(os.path.join('./', list_name))
# Left-pad every ticker code with zeros to six characters, the format the
# FinanceDataReader package expects.
stock_list['종목코드'] = [str(code).zfill(6) for code in stock_list['종목코드']]
stock_list  # roughly 370 tickers

Unnamed: 0,종목명,종목코드,상장시장
0,삼성전자,005930,KOSPI
1,SK하이닉스,000660,KOSPI
2,NAVER,035420,KOSPI
3,카카오,035720,KOSPI
4,삼성바이오로직스,207940,KOSPI
...,...,...,...
365,맘스터치,220630,KOSDAQ
366,다날,064260,KOSDAQ
367,제이시스메디칼,287410,KOSDAQ
368,크리스에프앤씨,110790,KOSDAQ


In [3]:
# Analysis window as YYYYMMDD strings.
start_date = '20210104'
end_date = '20211105'

# Weekday of the first day in the window (Monday = 0 ... Sunday = 6).
start_weekday = pd.to_datetime(start_date).weekday()
# ISO-8601 week number of the last day, kept as the string strftime returns.
max_weeknum = pd.to_datetime(end_date).strftime('%V')
# Monday-to-Friday calendar for the window. freq='B' only skips weekends; it
# does not remove exchange holidays.
Business_days = pd.DataFrame({'Date': pd.date_range(start=start_date, end=end_date, freq='B')})

print(f'WEEKDAY of "start_date" : {start_weekday}')
print(f'NUM of WEEKS to "end_date" : {max_weeknum}')
print(f'HOW MANY "Business_days" : {Business_days.shape}')
display(Business_days.head())

WEEKDAY of "start_date" : 0
NUM of WEEKS to "end_date" : 44
HOW MANY "Business_days" : (220, 1)


Unnamed: 0,Date
0,2021-01-04
1,2021-01-05
2,2021-01-06
3,2021-01-07
4,2021-01-08


In [4]:
# Show the last five rows of the Monday-to-Friday calendar.
display(Business_days.tail())

Unnamed: 0,Date
215,2021-11-01
216,2021-11-02
217,2021-11-03
218,2021-11-04
219,2021-11-05


## Baseline 모델의 구성 소개 ( Sample )

X : (월 ~ 금) * 42주간 (1~42주차)

y : (다음주 월 ~ 금) * 42주간 (2~43주차)

y_0 : 다음주 월요일

y_1 : 다음주 화요일

y_2 : 다음주 수요일

y_3 : 다음주 목요일

y_4 : 다음주 금요일

이번주 월~금요일의 패턴을 학습해 다음주 월요일 ~ 금요일을 각각 예측하는 모델을 생성

이 과정을 모든 종목(370개)에 적용

In [5]:
# Ticker code in the first row of the ticker list (Samsung Electronics).
sample_code = stock_list.at[0, '종목코드']

# Download the daily quotes for that ticker over the analysis window.
# Columns: Open, High, Low, Close (closing price), Volume, Change.
sample = fdr.DataReader(
    sample_code,
    start=start_date,
    end=end_date,
)
sample.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-01-04,81000,84400,80200,83000,38655276,0.024691
2021-01-05,81600,83900,81600,83900,35335669,0.010843
2021-01-06,83300,84500,82100,82200,42089013,-0.020262
2021-01-07,82800,84200,82700,82900,32644642,0.008516
2021-01-08,83300,90000,83000,88800,59013307,0.07117


In [6]:
# Download the quotes again, keep only the closing price, and move the Date
# index into a regular column.
sample = (
    fdr.DataReader(sample_code, start=start_date, end=end_date)
    .loc[:, ['Close']]
    .reset_index()
)
print(len(sample))
sample.head()

209


Unnamed: 0,Date,Close
0,2021-01-04,83000
1,2021-01-05,83900
2,2021-01-06,82200
3,2021-01-07,82900
4,2021-01-08,88800


In [7]:
# Outer-join the quotes onto the Monday-to-Friday calendar so that weekdays
# without a quote show up as rows with a missing Close.
sample = Business_days.merge(sample, how='outer')
print(len(sample))
sample.head()

220


Unnamed: 0,Date,Close
0,2021-01-04,83000.0
1,2021-01-05,83900.0
2,2021-01-06,82200.0
3,2021-01-07,82900.0
4,2021-01-08,88800.0


In [8]:
# Calendar features used to reshape the daily series into one row per week.
# The vectorised .dt accessors replace the row-by-row apply(lambda ...) calls.
sample['weekday'] = sample['Date'].dt.weekday  # Monday = 0 ... Sunday = 6
sample['weeknum'] = sample['Date'].dt.strftime('%V')  # ISO-8601 week of the year as a zero-padded string ('01' ... '53')
sample.head()

Unnamed: 0,Date,Close,weekday,weeknum
0,2021-01-04,83000.0,0,1
1,2021-01-05,83900.0,1,1
2,2021-01-06,82200.0,2,1
3,2021-01-07,82900.0,3,1
4,2021-01-08,88800.0,4,1


In [9]:
# Count missing values per column; Close is missing on calendar days that have no quote.
sample.isna().sum()

Date        0
Close      11
weekday     0
weeknum     0
dtype: int64

In [10]:
# Fill the missing closes by interpolation between the neighbouring quotes.
# NOTE(review): an interpolated day depends on the *following* quote as well;
# the per-ticker loop further down uses ffill() (previous quote only) instead.
sample['Close'] = sample['Close'].interpolate()
# Reshape to one row per week number and one column per weekday.
sample = sample.pivot_table(values='Close', index='weeknum', columns='weekday')
print(len(sample))  # weeks 1 through 44
sample.head()

44


weekday,0,1,2,3,4
weeknum,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,83000.0,83900.0,82200.0,82900.0,88800.0
2,91000.0,90600.0,89700.0,89700.0,88000.0
3,85000.0,87000.0,87200.0,88100.0,86800.0
4,89400.0,86700.0,85600.0,83700.0,82000.0
5,83000.0,84400.0,84600.0,82500.0,83500.0


In [11]:
# Show the last five weekly rows of the pivoted table.
sample.tail(5)

weekday,0,1,2,3,4
weeknum,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
40,72700.0,72200.0,71300.0,71600.0,71500.0
41,70250.0,69000.0,68800.0,69400.0,70100.0
42,70200.0,70600.0,70300.0,70200.0,70400.0
43,70200.0,71100.0,70100.0,70700.0,69800.0
44,69900.0,71500.0,70400.0,70600.0,70200.0


In [12]:
# Features: every weekly row except the last two, as a NumPy array
# (Monday-to-Friday closes of weeks 1 through 42).
x = sample.to_numpy()[:-2]
x.shape

(42, 5)

In [13]:
# Targets: the weekly rows shifted one week forward relative to x
# (weeks 2 through 43).
y = sample.iloc[1:-1].to_numpy()

# One target vector per weekday column (0 = Monday ... 4 = Friday).
y_0, y_1, y_2, y_3, y_4 = (y[:, day] for day in range(5))

y_values = [y_0, y_1, y_2, y_3, y_4]

In [14]:
# Model input for the public forecast: the closes of the second-to-last week,
# i.e. the week right before the public period (11/01 ~ 11/05).
x_public = sample.to_numpy()[-2]
x_public

array([70200., 71100., 70100., 70700., 69800.])

# 예측

In [46]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.linear_model import ElasticNet

# Weak learner for the boosting ensemble: an elastic-net linear model with an
# even L1/L2 mix and a loose convergence tolerance.
base_model = ElasticNet(alpha=1.0, l1_ratio=0.5, tol=0.1, random_state=42)

In [47]:
# Boosting ensemble built on top of base_model.
# The weak learner is passed positionally on purpose: the keyword was called
# `base_estimator` up to scikit-learn 1.1, was renamed to `estimator` in 1.2,
# and the old name was removed in 1.4. It is the first positional parameter in
# all of those versions, so this call works on each of them.
model = AdaBoostRegressor(base_model, n_estimators=70, learning_rate=1e-2, random_state=42)

In [48]:
# Fit the ensemble once per weekday target and forecast that weekday of the
# public week from the previous week's closes.
predictions = []
for y_value in y_values:
    model.fit(x, y_value)
    # predict() expects a 2-D array, so present x_public as a single row.
    prediction = model.predict(x_public.reshape(1, -1))
    predictions.append(prediction[0])
predictions

[70254.194005482,
 70443.39857716451,
 70075.44717211966,
 69795.75890676904,
 69839.95504192259]

In [49]:
# Actual closes of the public week (last weekly row), for comparison with the forecast.
sample.iloc[-1].to_numpy()

array([69900., 71500., 70400., 70600., 70200.])

## 전체 모델링

In [50]:
# Load the submission template: one row per forecast day, one column per ticker.
sample_name = 'sample_submission.csv'
submission_path = os.path.join('./', sample_name)
sample_submission = pd.read_csv(submission_path)
sample_submission

Unnamed: 0,Day,000060,000080,000100,000120,000150,000240,000250,000270,000660,...,330860,336260,336370,347860,348150,348210,352820,357780,363280,950130
0,2021-11-01,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2021-11-02,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2021-11-03,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2021-11-04,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2021-11-05,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,2021-11-29,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,2021-11-30,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,2021-12-01,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,2021-12-02,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,2021-12-03,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [51]:
# The baseline reuses the public-week forecast (11/01 ~ 11/05) for the private
# rows of the submission as well; a dedicated forecast for the private period
# still has to be produced.

N_WEEKDAYS = 5  # Monday ... Friday columns of the weekly table


def _weekly_close_table(code):
    """Return the closing prices of one ticker as a (week x weekday) table.

    Quotes are downloaded for start_date..end_date, aligned to the
    Monday-to-Friday calendar in Business_days, and calendar days without a
    quote take the previous quote (forward fill). Rows that still contain
    missing values afterwards (days before the first available quote) are
    dropped so the regressor never receives NaN.
    """
    quotes = fdr.DataReader(code, start=start_date, end=end_date)[['Close']].reset_index()
    daily = pd.merge(Business_days, quotes, how='outer')
    daily['weekday'] = daily['Date'].dt.weekday
    daily['weeknum'] = daily['Date'].dt.strftime('%V')
    daily['Close'] = daily['Close'].ffill()
    weekly = pd.pivot_table(data=daily, values='Close', columns='weekday', index='weeknum')
    return weekly.dropna()


def _predict_next_week(weekly):
    """Fit `model` once per weekday and forecast the week after the second-to-last row.

    Training pairs are (week k, week k+1) for every row except the last two as
    input, i.e. 2021-01-04 ~ 2021-10-22 predicting 2021-01-11 ~ 2021-10-29.
    The second-to-last row is then used as input to forecast 11/01 ~ 11/05.
    Returns a list with one forecast per weekday.
    """
    features = weekly.iloc[:-2].to_numpy()
    targets = weekly.iloc[1:-1].to_numpy()
    latest_week = weekly.iloc[-2].to_numpy().reshape(1, -1)

    forecast = []
    for day in range(N_WEEKDAYS):
        model.fit(features, targets[:, day])
        forecast.append(model.predict(latest_week)[0])
    return forecast


for code in tqdm(stock_list['종목코드'].values):
    predictions = _predict_next_week(_weekly_close_table(code))
    # Repeat the weekly forecast to fill every block of rows in the template
    # (public week, then private week) instead of hard-coding the factor 2.
    # Replacing the whole column avoids writing floats into the template's
    # integer column through .loc.
    repeats = len(sample_submission) // len(predictions)
    sample_submission[code] = np.tile(predictions, repeats)
sample_submission.isna().sum().sum()

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

0

In [52]:
# Display the submission table after the per-ticker loop has filled it in.
sample_submission

Unnamed: 0,Day,000060,000080,000100,000120,000150,000240,000250,000270,000660,...,330860,336260,336370,347860,348150,348210,352820,357780,363280,950130
0,2021-11-01,29473.530064,34999.08444,61283.613246,142914.18756,107796.61648,17028.227564,49231.461768,85126.183864,103501.319036,...,49541.991115,53292.321722,84487.330096,37735.166448,26912.503263,54370.019417,338019.473848,263891.095134,27360.745558,17773.039502
1,2021-11-02,29497.833794,35348.386006,61104.200705,143625.687254,108462.790293,17245.779078,48864.983289,85428.541954,102327.267752,...,48589.493989,53227.105655,85199.208719,35713.507553,27033.65382,54737.122019,336604.506106,266974.415917,27513.233028,17725.827328
2,2021-11-03,29797.283808,35208.405901,60936.165786,146179.732164,109358.54523,17522.023885,47457.353422,85627.363814,102930.306905,...,48650.761536,50639.03749,85584.844903,35822.069956,27210.805082,55830.762592,326680.904961,266600.816496,27488.726312,17510.522323
3,2021-11-04,30018.081413,34860.591875,60586.627765,144963.049197,108286.721695,17663.291575,48677.686887,85297.73888,101989.011885,...,48837.694522,49880.378216,83886.1941,35033.444256,27153.448014,56356.014571,329391.420146,265116.408557,27605.465205,17572.706167
4,2021-11-05,29619.838615,34847.138396,60319.149334,143275.660168,107022.82248,17855.80161,47710.167115,84448.901828,101644.167505,...,49527.554046,48037.580634,85514.627719,34801.336271,27447.685766,55519.477744,330702.699819,266096.169647,27435.626949,17719.300942
5,2021-11-29,29473.530064,34999.08444,61283.613246,142914.18756,107796.61648,17028.227564,49231.461768,85126.183864,103501.319036,...,49541.991115,53292.321722,84487.330096,37735.166448,26912.503263,54370.019417,338019.473848,263891.095134,27360.745558,17773.039502
6,2021-11-30,29497.833794,35348.386006,61104.200705,143625.687254,108462.790293,17245.779078,48864.983289,85428.541954,102327.267752,...,48589.493989,53227.105655,85199.208719,35713.507553,27033.65382,54737.122019,336604.506106,266974.415917,27513.233028,17725.827328
7,2021-12-01,29797.283808,35208.405901,60936.165786,146179.732164,109358.54523,17522.023885,47457.353422,85627.363814,102930.306905,...,48650.761536,50639.03749,85584.844903,35822.069956,27210.805082,55830.762592,326680.904961,266600.816496,27488.726312,17510.522323
8,2021-12-02,30018.081413,34860.591875,60586.627765,144963.049197,108286.721695,17663.291575,48677.686887,85297.73888,101989.011885,...,48837.694522,49880.378216,83886.1941,35033.444256,27153.448014,56356.014571,329391.420146,265116.408557,27605.465205,17572.706167
9,2021-12-03,29619.838615,34847.138396,60319.149334,143275.660168,107022.82248,17855.80161,47710.167115,84448.901828,101644.167505,...,49527.554046,48037.580634,85514.627719,34801.336271,27447.685766,55519.477744,330702.699819,266096.169647,27435.626949,17719.300942


In [53]:
# Rebuild the header: the first column is named 'Day', and every remaining
# column name is rendered as a string left-padded with zeros to six characters.
columns = ['Day']
columns.extend(str(ticker).zfill(6) for ticker in sample_submission.columns[1:])

sample_submission.columns = columns

In [54]:
# Write the submission table to CSV without the row index.
sample_submission.to_csv('adaboost_elastic.csv', index=False)