## Reference

[파이썬 코드 한줄로 종목코드 한번에 다운받기](https://minjejeon.github.io/learningstock/2017/09/07/download-krx-ticker-symbols-at-once.html)

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm import tqdm
from multiprocessing.pool import ThreadPool

from data_io import download_stock_data, load_stock_data
from data_io import download_stock_data_from_naver

In [2]:
# Load the KOSPI metadata table, skipping malformed rows instead of aborting.
# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0;
# `on_bad_lines='warn'` is its replacement and keeps the skip-and-report
# behaviour this cell relies on (bad rows are dropped, each one is reported).
try:
    kospi = pd.read_csv('data/metadata/kospi.csv', on_bad_lines='warn')
except TypeError:
    # pandas older than 1.3 does not accept `on_bad_lines`; use the legacy flag.
    kospi = pd.read_csv('data/metadata/kospi.csv', error_bad_lines=False)
kospi[0:2]

b'Skipping line 437: expected 12 fields, saw 13\n'


Unnamed: 0,번호,종목코드,기업명,업종코드,업종,상장주식수(주),자본금(원),액면가(원),통화구분,대표전화,주소,총카운트
0,1,95570,AJ네트웍스,147603.0,산업용 기계 및 장비 임대업,46822295,46822295000,1000,원(KRW),02-6363-9999,"서울특별시 송파구 정의로8길 9 (문정동,AJ빌딩)",789.0
1,2,68400,AJ렌터카,147601.0,운송장비 임대업,22146300,11073150000,500,원(KRW),1544-1600,서울특별시 구로구 서부샛길 822,789.0


In [3]:
# Fetch the KRX company list: take the first HTML table on the download page,
# using its first row as the header.
df_from_krx = pd.read_html(
    'http://kind.krx.co.kr/corpgeneral/corpList.do?method=download&searchType=13',
    header=0,
)[0]
# Normalise ticker codes to 6-character, zero-padded strings.
df_from_krx['종목코드'] = df_from_krx['종목코드'].map(
    lambda code: '{:06d}'.format(int(code))
)
display(df_from_krx.head(2))
print(df_from_krx.shape)
# Keep a local copy of the listing table.
df_from_krx.to_csv('data/metadata/df_from_krx.csv')

Unnamed: 0,회사명,종목코드,업종,주요제품,상장일,결산월,대표자명,홈페이지,지역
0,DSR,155660,1차 비철금속 제조업,합섬섬유로프,2013-05-15,12월,홍석빈,http://www.dsr.com,부산광역시
1,GS글로벌,1250,상품 종합 도매업,"수출입업(시멘트,철강금속,전기전자,섬유,기계화학),상품중개,광업,채석업/하수처리 서...",1976-06-26,12월,김태형,http://www.gsgcorp.com,서울특별시


(2323, 9)


In [4]:
# Load the company list; pick the file that matches the OS the CSV was saved on.
stocklist = pd.read_csv('data/metadata/companylist_mac.csv')  # macOS
# stocklist = pd.read_csv('data/metadata/companylist_windows.csv')  # Windows

# Replace '코드' with '종목코드' inside every column name
# (presumably the raw header is just '코드' -- verify against the CSV).
stocklist.columns = stocklist.columns.str.replace('코드', '종목코드')

# Normalise ticker codes to 6-character, zero-padded strings.
stocklist['종목코드'] = stocklist['종목코드'].map(
    lambda code: '{:06d}'.format(int(code))
)
display(stocklist.head(5))
# Number of companies per '종목1' category.
stocklist['종목1'].value_counts()

Unnamed: 0,종목1,종목코드,기업명
0,공기청정기,71840,하이마트
1,공기청정기,45520,크린앤사이언스
2,공기청정기,284740,쿠쿠홈시스
3,공기청정기,192400,쿠쿠홀딩스
4,공기청정기,44340,위닉스


제약        16
전기차       15
공기청정기     13
수소차       10
대기환경설비     8
천연가스       6
여과설비       5
헬스케어       3
마스크        3
Name: 종목1, dtype: int64

In [5]:
# Start from an empty frame with the three target columns, then copy the
# company name and ticker over from `stocklist`; '상장일' is left unfilled here.
df_to_download_metadata = pd.DataFrame(columns=['기업명', '종목코드', '상장일'])
for column in ('기업명', '종목코드'):
    df_to_download_metadata[column] = stocklist[column]
display(df_to_download_metadata)

Unnamed: 0,기업명,종목코드,상장일
0,하이마트,071840,
1,크린앤사이언스,045520,
2,쿠쿠홈시스,284740,
3,쿠쿠홀딩스,192400,
4,위닉스,044340,
...,...,...,...
74,동성화인텍,033500,
75,대창솔루션,096350,
76,인피니트헬스케어,071200,
77,솔고바이오,043100,


In [6]:
def get_ipo_date(ticker):
    """Return the '상장일' value recorded for ``ticker`` in ``df_from_krx``.

    The first matching row wins. When no row matches, the integer 0 is
    returned as a "not found" marker; the same marker is returned (after
    printing the error) if reading the matched value raises.
    """
    listing_dates = df_from_krx.loc[df_from_krx['종목코드'] == ticker, '상장일']
    try:
        # Guard clause: nothing matched this ticker.
        if listing_dates.shape[0] == 0:
            return 0
        return listing_dates.values[0]
    except Exception as e:
        print(e)
        return 0

# Fill '상장일' by looking each ticker up with get_ipo_date (0 marks "not found").
df_to_download_metadata['상장일'] = df_to_download_metadata['종목코드'].apply(get_ipo_date)

In [7]:
# Rows whose '상장일' is the integer 0 are discarded before the column is
# converted to datetimes.
wrong_idx = df_to_download_metadata.index[df_to_download_metadata['상장일'].eq(0)]
df_to_download_metadata = df_to_download_metadata.drop(wrong_idx, axis='index')
df_to_download_metadata['상장일'] = pd.to_datetime(df_to_download_metadata['상장일'])
display(df_to_download_metadata)

Unnamed: 0,기업명,종목코드,상장일
0,하이마트,071840,2011-06-29
1,크린앤사이언스,045520,2000-12-05
2,쿠쿠홈시스,284740,2018-01-11
3,쿠쿠홀딩스,192400,2014-08-06
4,위닉스,044340,2000-10-24
...,...,...,...
74,동성화인텍,033500,1997-12-19
75,대창솔루션,096350,2007-12-17
76,인피니트헬스케어,071200,2010-05-26
77,솔고바이오,043100,2000-08-08


# Process with a single core

In [8]:
def download_stock_data(df_metadata, folder_name):
    """Download price history for every ticker in ``df_metadata``, one by one.

    For each row the ``data_io.download_stock_data`` helper is tried first
    (outcome recorded as ``'Yahoo'``). If it raises, the Naver downloader is
    tried (outcome ``'Naver'``). If that raises too, the exception object
    itself is stored as the outcome so the rest of the batch still runs.

    NOTE: this function has the same name as the helper imported from
    ``data_io``. The helper is therefore re-imported under a private alias
    below. Without that, the call inside the loop resolves to this
    two-parameter function, fails with ``TypeError`` (eight arguments passed),
    and the broad ``except`` silently sends every ticker to Naver.

    Args:
        df_metadata: DataFrame with a '종목코드' column and a '상장일' column
            holding datetime values (year/month/day are read from it).
        folder_name: Sub-folder of ``data/`` where ``<ticker>.csv`` is written.

    Returns:
        DataFrame with columns '종목코드', '상장일', '결과', one row per input
        row, labelled by the row's position in ``df_metadata``.
    """
    # Function-scope import so the module-level name being shadowed by this
    # definition cannot affect which downloader is called.
    from data_io import download_stock_data as _download_primary

    df_result = pd.DataFrame(columns=['종목코드', '상장일', '결과'])
    tickers = df_metadata['종목코드']
    listing_dates = df_metadata['상장일']
    # Read the clock once so year/month/day always belong to the same day.
    today = pd.Timestamp.today()

    # The context manager closes the progress bar even if a row fails badly.
    with tqdm(total=df_metadata.shape[0], desc='Epoch', position=0) as outer:
        for idx in range(df_metadata.shape[0]):
            ticker = tickers.iloc[idx]
            listed_on = listing_dates.iloc[idx]
            target_path = 'data/%s/%s.csv' % (folder_name, ticker)
            try:
                _download_primary(target_path,
                                  ticker,
                                  int(listed_on.year),
                                  int(listed_on.month),
                                  int(listed_on.day),
                                  today.year,
                                  today.month,
                                  today.day)
                outcome = 'Yahoo'
            except Exception:
                # Best-effort fallback: any failure of the primary source
                # (including an unusable listing date) moves on to Naver.
                try:
                    download_stock_data_from_naver(target_path,
                                                   ticker,
                                                   'day',
                                                   '100000')
                    outcome = 'Naver'
                except Exception as e:
                    # Keep the error as the recorded outcome; do not abort.
                    outcome = e
            df_result.loc[idx] = [ticker, listed_on, outcome]
            outer.update(1)
    return df_result
    
# Download every ticker into data/stock/ and save the per-ticker outcome log.
result = download_stock_data(df_to_download_metadata, folder_name='stock')
result.to_csv('data/history/download_result.csv')

Epoch: 100%|██████████| 79/79 [01:35<00:00,  1.07s/it]

In [9]:
# Show the per-ticker download outcome table returned by download_stock_data.
result

Unnamed: 0,종목코드,상장일,결과
0,071840,2011-06-29,Naver
1,045520,2000-12-05,Naver
2,284740,2018-01-11,Naver
3,192400,2014-08-06,Naver
4,044340,2000-10-24,Naver
...,...,...,...
74,033500,1997-12-19,Naver
75,096350,2007-12-17,Naver
76,071200,2010-05-26,Naver
77,043100,2000-08-08,Naver
