# OpenDart API 
- 회계년도와 공시년도 정확한 조정이 필요하다. 
- 전에는 회계년도에서 4개월을 미루는 것으로 했었는데 정확한 데이터 클리닝을 위해서는 OpenDart API를 이용해서 정확한 공시년도를 가져올 필요가 있다.

## OpenDart API 발급
- [Dart 사이트](https://opendart.fss.or.kr/)에서 회원가입후 발급 받을 수 있다.

In [74]:
import OpenDartReader
import pandas as pd 
from pathlib import Path

In [3]:
api_key = '507067cac0daee90d45db48476241a0f47665afc'

In [4]:
dart = OpenDartReader(api_key)

## 예시로 삼성전자 정기보고서 가져오기

In [5]:
data = dart.list('005930', start='1990-01-01', kind='A')

In [6]:
data 

Unnamed: 0,corp_code,corp_name,stock_code,corp_cls,report_nm,rcept_no,flr_nm,rcept_dt,rm
0,00126380,삼성전자,005930,Y,분기보고서 (2021.03),20210517001185,삼성전자,20210517,
1,00126380,삼성전자,005930,Y,사업보고서 (2020.12),20210309000744,삼성전자,20210309,연
2,00126380,삼성전자,005930,Y,분기보고서 (2020.09),20201116001248,삼성전자,20201116,
3,00126380,삼성전자,005930,Y,반기보고서 (2020.06),20200814001766,삼성전자,20200814,
4,00126380,삼성전자,005930,Y,분기보고서 (2020.03),20200515001451,삼성전자,20200515,
...,...,...,...,...,...,...,...,...,...
83,00126380,삼성전자,005930,Y,반기보고서 (2000.06),20000814000482,삼성전자,20000814,
84,00126380,삼성전자,005930,Y,[첨부추가]사업보고서 (1999.12),20000330000796,삼성전자,20000330,연
85,00126380,삼성전자,005930,Y,분기보고서 (2000.03),20000515000739,삼성전자,20000515,
86,00126380,삼성전자,005930,Y,[기재정정]반기보고서 (1999.06),19990824000003,삼성전자,19990824,


## 정규 표현식으로 회계년도 가져오기

In [7]:
import re 

In [14]:
pattern = r"\d{4}.\d{2}"

In [15]:
text = '반기보고서 (2000.06)'

In [44]:
tmp = re.findall(pattern, text)[0]

In [46]:
report_name = data['report_nm']

In [50]:
data['account_date'] = report_name.apply(lambda x: re.findall(pattern, x)[0])

In [51]:
data 

Unnamed: 0,corp_code,corp_name,stock_code,corp_cls,report_nm,rcept_no,flr_nm,rcept_dt,rm,account_date
0,00126380,삼성전자,005930,Y,분기보고서 (2021.03),20210517001185,삼성전자,20210517,,2021.03
1,00126380,삼성전자,005930,Y,사업보고서 (2020.12),20210309000744,삼성전자,20210309,연,2020.12
2,00126380,삼성전자,005930,Y,분기보고서 (2020.09),20201116001248,삼성전자,20201116,,2020.09
3,00126380,삼성전자,005930,Y,반기보고서 (2020.06),20200814001766,삼성전자,20200814,,2020.06
4,00126380,삼성전자,005930,Y,분기보고서 (2020.03),20200515001451,삼성전자,20200515,,2020.03
...,...,...,...,...,...,...,...,...,...,...
83,00126380,삼성전자,005930,Y,반기보고서 (2000.06),20000814000482,삼성전자,20000814,,2000.06
84,00126380,삼성전자,005930,Y,[첨부추가]사업보고서 (1999.12),20000330000796,삼성전자,20000330,연,1999.12
85,00126380,삼성전자,005930,Y,분기보고서 (2000.03),20000515000739,삼성전자,20000515,,2000.03
86,00126380,삼성전자,005930,Y,[기재정정]반기보고서 (1999.06),19990824000003,삼성전자,19990824,,1999.06


In [54]:
data['account_date'] = pd.to_datetime(data['account_date'])

In [58]:
data['rcept_dt'] = pd.to_datetime(data['rcept_dt'])

In [60]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88 entries, 0 to 87
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   corp_code     88 non-null     object        
 1   corp_name     88 non-null     object        
 2   stock_code    88 non-null     object        
 3   corp_cls      88 non-null     object        
 4   report_nm     88 non-null     object        
 5   rcept_no      88 non-null     object        
 6   flr_nm        88 non-null     object        
 7   rcept_dt      88 non-null     datetime64[ns]
 8   rm            88 non-null     object        
 9   account_date  88 non-null     datetime64[ns]
dtypes: datetime64[ns](2), object(8)
memory usage: 7.0+ KB


## 데이터를 보면 성공적으로 회계년도와 공시년도가 datetime으로 타입이 변경된 것을 볼 수 있다.

In [64]:
data = data[['stock_code', 'report_nm', 'rcept_dt', 'account_date']]

## 멀티 인덱스 먹이기

In [67]:
data.set_index(['account_date', 'stock_code'], inplace=True)

In [69]:
data.columns = ['report_name', 'receipt_date']

In [70]:
data 

Unnamed: 0_level_0,Unnamed: 1_level_0,report_name,receipt_date
account_date,stock_code,Unnamed: 2_level_1,Unnamed: 3_level_1
2021-03-01,005930,분기보고서 (2021.03),2021-05-17
2020-12-01,005930,사업보고서 (2020.12),2021-03-09
2020-09-01,005930,분기보고서 (2020.09),2020-11-16
2020-06-01,005930,반기보고서 (2020.06),2020-08-14
2020-03-01,005930,분기보고서 (2020.03),2020-05-15
...,...,...,...
2000-06-01,005930,반기보고서 (2000.06),2000-08-14
1999-12-01,005930,[첨부추가]사업보고서 (1999.12),2000-03-30
2000-03-01,005930,분기보고서 (2000.03),2000-05-15
1999-06-01,005930,[기재정정]반기보고서 (1999.06),1999-08-24


## 클리닝 코드 함수화 하기

In [71]:
def cleaning_report(data: pd.DataFrame):
    report_name = data['report_nm']
    data['account_date'] = report_name.apply(lambda x: re.findall(pattern, x)[0])
    data['account_date'] = pd.to_datetime(data['account_date'])
    data['rcept_dt'] = pd.to_datetime(data['rcept_dt'])
    data = data[['stock_code', 'report_nm', 'rcept_dt', 'account_date']]
    data.set_index(['account_date', 'stock_code'], inplace=True)
    data.columns = ['report_name', 'receipt_date']
    data.index.names = ['date', 'ticker']
    return data 

In [72]:
data = dart.list('005930', start='1990-01-01', kind='A')

In [73]:
cleaning_report(data)

Unnamed: 0_level_0,Unnamed: 1_level_0,report_name,receipt_date
account_date,stock_code,Unnamed: 2_level_1,Unnamed: 3_level_1
2021-03-01,005930,분기보고서 (2021.03),2021-05-17
2020-12-01,005930,사업보고서 (2020.12),2021-03-09
2020-09-01,005930,분기보고서 (2020.09),2020-11-16
2020-06-01,005930,반기보고서 (2020.06),2020-08-14
2020-03-01,005930,분기보고서 (2020.03),2020-05-15
...,...,...,...
2000-06-01,005930,반기보고서 (2000.06),2000-08-14
1999-12-01,005930,[첨부추가]사업보고서 (1999.12),2000-03-30
2000-03-01,005930,분기보고서 (2000.03),2000-05-15
1999-06-01,005930,[기재정정]반기보고서 (1999.06),1999-08-24


## 상장폐지 포함 코스피 전 종목 ticker들 불러오기

In [75]:
DATA_STORE = Path('fnguide_data.h5')

In [76]:
with pd.HDFStore(DATA_STORE) as store:
    stocks = store['fnguide/stocks']

In [78]:
tickers = stocks['ticker']

In [79]:
tickers 

0       000010
1       000020
2       000030
3       000040
4       000050
         ...  
1156    900140
1157    950010
1158    950070
1159    950100
1160    950210
Name: ticker, Length: 1161, dtype: object

In [80]:
from concurrent import futures
import concurrent

In [81]:
data_list = []

def get_data(ticker: str):
    try:
        data = dart.list(ticker, start='1990-01-01', kind='A')
        data = cleaning_report(data)
        data_list.append(data)
    except Exception as e:
        print(f"Error: {e}, Ticker: {ticker}")
        
with futures.ThreadPoolExecutor(50) as executor:
    executor.map(get_data, tickers)

Error: name 'code' is not defined, Ticker: 000090
Error: name 'code' is not defined, Ticker: 000130
Error: name 'code' is not defined, Ticker: 000160
Error: name 'code' is not defined, Ticker: 000170
Error: name 'code' is not defined, Ticker: 000200
Error: name 'code' is not defined, Ticker: 000280
Error: name 'code' is not defined, Ticker: 000310
Error: name 'code' is not defined, Ticker: 000330
Error: name 'code' is not defined, Ticker: 000450
Error: name 'code' is not defined, Ticker: 000510
Error: name 'code' is not defined, Ticker: 000570
Error: name 'code' is not defined, Ticker: 000600Error: list index out of range, Ticker: 000070

Error: list index out of range, Ticker: 000240
Error: name 'code' is not defined, Ticker: 000730
Error: name 'code' is not defined, Ticker: 000840
Error: name 'code' is not defined, Ticker: 000870
Error: name 'code' is not defined, Ticker: 000900
Error: name 'code' is not defined, Ticker: 000930
Error: name 'code' is not defined, Ticker: 001030
Error:

Error: name 'code' is not defined, Ticker: 014150

Error: name 'code' is not defined, Ticker: 014390
Error: name 'code' is not defined, Ticker: 014400
Error: name 'code' is not defined, Ticker: 014430
Error: name 'code' is not defined, Ticker: 014450
Error: name 'code' is not defined, Ticker: 014490
Error: name 'code' is not defined, Ticker: 014740
Error: name 'code' is not defined, Ticker: 014980
Error: list index out of range, Ticker: 012030
Error: name 'code' is not defined, Ticker: 015080
Error: name 'code' is not defined, Ticker: 015340
Error: name 'code' is not defined, Ticker: 015580Error: name 'code' is not defined, Ticker: 015620
Error: list index out of range, Ticker: 012400
Error: name 'code' is not defined, Ticker: 015650
Error: name 'code' is not defined, Ticker: 015730Error: name 'code' is not defined, Ticker: 015780Error: name 'code' is not defined, Ticker: 015880Error: name 'code' is not defined, Ticker: 015980



Error: name 'code' is not defined, Ticker: 016070
Error:

In [86]:
report_data = pd.concat(data_list).sort_index()

In [101]:
report_data

Unnamed: 0_level_0,Unnamed: 1_level_0,report_name,receipt_date
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1
1998-01-01,008540,[기재정정]사업보고서 (1998.01),1999-07-02
1998-03-01,000020,사업보고서 (1998.03),1999-06-28
1998-03-01,000060,사업보고서 (1998.03),1999-06-29
1998-03-01,000220,사업보고서 (1998.03),1999-06-28
1998-03-01,000230,사업보고서 (1998.03),1999-06-28
...,...,...,...
2021-03-01,375500,분기보고서 (2021.03),2021-05-14
2021-03-01,378850,분기보고서 (2021.03),2021-05-17
2021-03-01,900140,분기보고서 (2021.03),2021-05-28
2021-03-01,950210,분기보고서 (2021.03),2021-05-28


## 데이터 저장

In [102]:
OPENDART_DATA = Path('opendart.h5')

In [103]:
with pd.HDFStore(OPENDART_DATA) as store:
    store.put('opendart/report', report_data)

In [104]:
with pd.HDFStore(OPENDART_DATA) as store:
    print(store.info())

<class 'pandas.io.pytables.HDFStore'>
File path: opendart.h5
/opendart/report            frame        (shape->[1,2])


In [107]:
stocks[stocks['ticker'] == '000090']

Unnamed: 0,ticker,name
8,90,두산상사
