In [19]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import numpy as np
import traceback
from tqdm import tqdm
import time
import json

In [37]:
corp_info = pd.read_csv(r'corporation_information_2016.csv')

#### JSON File Dictionary로 불러오기 & key는 list로 불러오기

In [38]:
with open('doc_info.json', 'r') as f:
     doc_dict = json.load(f)

corp_keys = list(doc_dict.keys())
corp_keys.sort()
bsn_tp = list(doc_dict[corp_keys[0]].keys())
bsn_tp.sort()

### 최신 공시 추출

In [39]:
def latestDisclosures(report_type_list):
    latest_disclosures = pd.DataFrame(columns=['종목코드', '기업명','보고서명','접수번호','접수일자'])
    index = 0
    for key in corp_keys:
        df_list = [pd.read_json(doc_dict[key][tp]) for tp in report_type_list]
        df = pd.concat(df_list, ignore_index=True)
        # 비어있는 Dataframe은 일단 무시 (일일요청횟수추가로 다 수집이 안 되어있음)
        if len(df) == 0:
            continue
        df = df.sort_values(by=['접수일자'], ascending=False)
        df = df.reset_index(drop=True)
        
        for ix, rw in df.iterrows():
            if '정정' in rw.보고서명 or '추가'  in rw.보고서명:
                continue
            else:
                break
        
        종목코드 = '_'+key.split('_')[1]
        기업명 = key.split('_')[0]
        보고서명 = rw.보고서명
        접수번호 = str(rw.접수번호)
        접수일자 = rw.접수일자
        
        latest_disclosures.loc[index] = [종목코드, 기업명, 보고서명, 접수번호, 접수일자]
        index += 1
    
    return latest_disclosures

In [40]:
def missingCorp(disclosure_df):
    global corp_keys
    
    기업명 = list(disclosure_df.기업명)
    종목코드 = list(disclosure_df.종목코드)
    existing_keys = [c1+c2 for c1,c2 in zip(종목코드)]
    
    missing_corp = list(set(corp_keys) - set(existing_keys))
    missing_corp.sort()
    
    return [c.split('_')[0] for c in missing_corp]

In [41]:
latest_disclosures = latestDisclosures(['A001'])

#### 정기공시 없는 리스트

In [42]:
corp_info[corp_info.종목명.isin(missingCorp(latest_disclosures))]

ValueError: not enough values to unpack (expected 2, got 1)

### Download Link Column으로 추가

In [44]:
latest_disclosures.head()

Unnamed: 0,종목코드,기업명,보고서명,접수번호,접수일자
0,_060310,3S,사업보고서 (2018.03),20180628000160,20180628
1,_095570,AJ네트웍스,사업보고서 (2017.12),20180330003492,20180330
2,_068400,AJ렌터카,사업보고서 (2017.12),20180329000766,20180329
3,_006840,AK홀딩스,사업보고서 (2017.12),20180330002417,20180330
4,_054620,AP시스템,사업보고서 (2017.12),20180402004758,20180402


In [45]:
def getDocumentNumber(disclosure_df):
    base = 'http://dart.fss.or.kr/dsaf001/main.do?rcpNo='
    dcm_no = []
    for index, row in tqdm(disclosure_df.iterrows()):
        url = base + row.접수번호
        r = requests.post(url)
        time.sleep(np.random.randint(1,300)/100)
        soup = BeautifulSoup(r.text,'lxml')
        time.sleep(np.random.randint(1,300)/100)
        dn = str(soup.find_all(href='#download')).split('openPdfDownload')[1].split(';')[0].split(',')[1].split(')')[0].split()[0][1:-1]
        dcm_no.append(dn)
    disclosure_df['문서번호'] = dcm_no
    return

In [46]:
getDocumentNumber(latest_disclosures)


0it [00:00, ?it/s][A
1it [00:01,  1.34s/it][A
2it [00:02,  1.18s/it][A
Exception in thread Thread-6:
Traceback (most recent call last):
  File "/home/jin/anaconda3/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/home/jin/anaconda3/lib/python3.6/site-packages/tqdm/_monitor.py", line 62, in run
    for instance in self.tqdm_cls._instances:
  File "/home/jin/anaconda3/lib/python3.6/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration

2119it [1:48:53,  3.08s/it]


In [47]:
latest_disclosures.head()

Unnamed: 0,종목코드,기업명,보고서명,접수번호,접수일자,문서번호
0,_060310,3S,사업보고서 (2018.03),20180628000160,20180628,6231042
1,_095570,AJ네트웍스,사업보고서 (2017.12),20180330003492,20180330,6040642
2,_068400,AJ렌터카,사업보고서 (2017.12),20180329000766,20180329,6025677
3,_006840,AK홀딩스,사업보고서 (2017.12),20180330002417,20180330,6036212
4,_054620,AP시스템,사업보고서 (2017.12),20180402004758,20180402,6059222


In [48]:
def getDownloadLink(disclosure_df):
    base_pdf = 'http://dart.fss.or.kr/pdf/download/pdf.do?'
    base_xls = 'http://dart.fss.or.kr/pdf/download/excel.do?'

    pdf = []
    excel = []
    for index, row in tqdm(disclosure_df.iterrows()):
        rcp_no = row.접수번호
        dcm_no = row.문서번호
        
        pdf_link = base_pdf + 'rcp_no=' + rcp_no + '&dcm_no=' + dcm_no
        xls_link = base_xls + 'rcp_no=' + rcp_no + '&dcm_no=' + dcm_no
        
        pdf.append(pdf_link)
        excel.append(xls_link)
    
    disclosure_df['보고서링크'] = pdf
    disclosure_df['재무제표링크'] = excel
    return

In [49]:
getDownloadLink(latest_disclosures)

2119it [00:00, 7976.06it/s]


In [50]:
latest_disclosures.to_csv(r'latest_disclosures_KOSPI_170102.csv',index=False, encoding='utf8')

In [51]:
latest_disclosures.to_csv(r'latest_disclosures_KOSPI_170102.csv',index=False, encoding='utf8')