In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import re
import random
import tqdm
from tqdm.contrib import tzip

In [None]:
# Helper function used for text preprocessing
def replacer(text, input):
    """Return *text* with every substring listed in *input* removed.

    The substrings are removed one after another, in the order given, so an
    earlier removal can affect what a later one matches.
    """
    cleaned = text
    for unwanted in input:
        cleaned = cleaned.replace(unwanted, "")
    return cleaned

In [None]:
class KIND_REPORTS:
    # Endpoint that receives the disclosure search form (POST).
    __kind_url = "https://kind.krx.co.kr/disclosure/details.do"
    # Request headers sent with every search POST.
    __kind_header = {
        'authority': 'kind.krx.co.kr',
        'method': 'POST',
        'path': '/disclosure/details.do',
        'scheme': 'https',
        'accept': 'text/html, */*; q=0.01',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7',
        'content-type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'origin': 'https://kind.krx.co.kr',
        'referer': 'https://kind.krx.co.kr/disclosure/details.do?method=searchDetailsMain',
        # Single-quoted so the embedded double quotes survive intact; the former
        # triple-quoted literal silently dropped the closing '"' after v="105.
        'sec-ch-ua': '"Google Chrome";v="105", "Not)A;Brand";v="8", "Chromium";v="105"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': "Windows",
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'user-agent': '',   # fill in your own browser's user-agent string
        'x-requested-with': 'XMLHttpRequest'
    }
    # Template of the form fields posted to the search endpoint.
    # The methods of this class overwrite 'reportNm', 'reportNmTemp',
    # 'fromDate', 'toDate', 'marketType', 'submitOblgNm' and 'pageIndex' per
    # query; every other field is sent exactly as written here.
    # 'currentPageSize' asks for 100 rows per result page.
    __kind_params ={
        'method': 'searchDetailsSub',
        'currentPageSize': '100',
        'pageIndex': '1',
        'orderMode': '1',
        'orderStat': 'D',
        'forward': 'details_sub',
        'disclosureType01': '',
        'disclosureType02': '',
        'disclosureType03': '',
        'disclosureType04': '',
        'disclosureType05': '',
        'disclosureType06': '',
        'disclosureType07': '',
        'disclosureType08': '',
        'disclosureType09': '',
        'disclosureType10': '',
        'disclosureType11': '',
        'disclosureType13': '',
        'disclosureType14': '',
        'disclosureType20': '',
        'pDisclosureType01': '',
        'pDisclosureType02': '',
        'pDisclosureType03': '',
        'pDisclosureType04': '',
        'pDisclosureType05': '',
        'pDisclosureType06': '',
        'pDisclosureType07': '',
        'pDisclosureType08': '',
        'pDisclosureType09': '',
        'pDisclosureType10': '',
        'pDisclosureType11': '',
        'pDisclosureType13': '',
        'pDisclosureType14': '',
        'pDisclosureType20': '',
        'searchCodeType': '',
        'repIsuSrtCd': '',
        'allRepIsuSrtCd': '',
        'oldSearchCorpName': '',
        'disclosureType': '',
        'disTypevalue': '',
        'reportNm': '',
        'reportCd': '',
        'searchCorpName': '',
        'business': '',
        'marketType': '',
        'settlementMonth': '',
        'securities': '',
        'submitOblgNm': '',
        'enterprise': '',
        # NOTE(review): '2' is only a placeholder here; get_list always replaces it.
        'fromDate': '2',
        'toDate': '',
        'reportNmTemp': '',
        'reportNmPop': '',
        'bfrDsclsType': 'on'
    }

    def __init__(self):
        self.__sleep_min = 0.2
        self.__sleep_max = 0.5
        self.report_list = []
        self.report_content = []
        pass
    
    def __isResultExist(self, what_result):
        if len(what_result) != 0:
            print("Warning! Previous "+ str(what_result)+" remains")
            answer = input("Do you want to continue and replace the result? [y/n] :")
            if answer == 'y':
                pass
            else:
                raise Exception("Process closed")
        else:
            pass

    def set_sleep(self, min=0.2, max=0.5):
        self.__sleep_min = min
        self.__sleep_max = max
    
    def get_list(self, report, start, end, market_type, submit_by=""):
        '''
        Search KIND for disclosures and store the hits in ``self.report_list``.

        The result is a DataFrame with the columns
        공시일 (date), 회사명 (company), 코드 (receipt number), 보고서명 (title)
        and 정정신고 ('YES'/'NO', whether the link markup contains "정정").
        Rows whose title contains "자회사" (subsidiary) are dropped.

        Args:
            report: search keyword for the report title.
            start: first day of the search window, e.g. '2012-01-01'.
            end: last day of the search window, e.g. '2013-12-31'.
            market_type: '1' for KOSPI, '2' for KOSDAQ.
            submit_by: submitter filter; the shortcut 'krx' is expanded to
                "유가증권시장본부".

        Raises:
            Exception: a previous result exists and the user declined to
                replace it.
            ValueError: the page count could not be read from the response.
        '''
        self.__isResultExist(self.report_list)
        self.market_type = market_type

        # Work on a private copy. Mutating the class-level template would leak
        # this query (notably 'pageIndex') into every later request made by
        # this or any other instance.
        params = dict(self.__kind_params)
        params['reportNm'] = report
        params['fromDate'] = start
        params['toDate'] = end
        params['marketType'] = market_type
        params['reportNmTemp'] = report

        if submit_by == 'krx':
            submit_by = "유가증권시장본부"
        params['submitOblgNm'] = submit_by

        # The first request is only used to learn how many result pages exist.
        res = requests.post(self.__kind_url, headers=self.__kind_header, data=params)
        first_page = str(BeautifulSoup(res.text, 'html.parser'))

        # The pager is rendered as "...</strong>/<total pages>\xa0...".
        marker = '</strong>/'
        marker_at = first_page.find(marker)
        number_end = first_page.find('\xa0', marker_at) if marker_at != -1 else -1
        if number_end == -1:
            raise ValueError("could not find the page count in the KIND response")
        page_length = int(first_page[marker_at + len(marker):number_end])

        report_date = []
        company_name = []
        report_code = []
        report_name = []
        isCorrection = []

        for i in tqdm.tqdm(range(page_length)):
            time.sleep(random.uniform(self.__sleep_min, self.__sleep_max))
            params['pageIndex'] = str(i+1)
            res = requests.post(self.__kind_url, headers=self.__kind_header, data=params)
            bs = BeautifulSoup(res.text, 'html.parser')

            for tag in bs.find_all("a"):
                href = tag.get('href')
                if href == "#companysum":
                    company_name.append(tag.get('title'))

                elif href == "#viewer":
                    # The first number in the onclick handler is the receipt
                    # number; its first eight digits are the date (YYYYMMDD).
                    tmp_code = re.findall(r'\d+', str(tag.get('onclick')))[0]
                    report_code.append(tmp_code)
                    report_date.append(tmp_code[:4]+'-'+tmp_code[4:6]+'-'+tmp_code[6:8])
                    report_name.append(tag.get('title'))
                    isCorrection.append('YES' if '정정' in str(tag) else 'NO')

        self.report_list = pd.DataFrame([report_date,company_name,report_code,report_name,isCorrection]).T
        self.report_list.columns = ['공시일','회사명','코드','보고서명','정정신고']
        # Drop subsidiary disclosures. na=False keeps rows with a missing title
        # instead of breaking the boolean negation.
        is_subsidiary = self.report_list["보고서명"].str.contains("자회사", na=False)
        self.report_list = self.report_list.loc[~is_subsidiary, :].reset_index(drop=True)

        # Some rows come back without a company name; look each one up again
        # with a single-day query.
        missing = self.report_list['회사명'].isnull()
        if missing.any():
            df_null = self.report_list.loc[missing]
            lookups = zip(df_null.index, df_null['공시일'].values, df_null['보고서명'].values)
            for indx, date, title in lookups:
                name = self.__find_none(report=title, start=date, end=date, market_type=market_type)
                # Single .loc call: chained ['col'].loc[i] = ... may write to a
                # temporary copy and leave the frame unchanged.
                self.report_list.loc[indx, '회사명'] = name
        print()
        print('Jobs Done')
        print('check the result with .report_list')
    
    # Added in v6
    def __find_none(self, report, start, end, market_type, submit_by=""):
        """Re-query KIND to recover a company name that was missing from a listing row.

        Args:
            report: report title of the row; titles longer than six characters
                are cut to their first five before searching.
            start: first day of the search window.
            end: last day of the search window.
            market_type: '1' for KOSPI, '2' for KOSDAQ.
            submit_by: submitter filter; 'krx' is expanded to "유가증권시장본부".

        Returns:
            The text of the first ``<a>`` element of the response with spaces
            removed, or ``None`` when the response contains no link at all.
        """
        if len(report) > 6:
            # Titles containing spaces cannot be searched, so query by prefix.
            report = report[:5]

        # Copy the template so the shared class-level dict is never mutated,
        # and always ask for the first page rather than whatever page index a
        # previous listing run left behind.
        params = dict(self.__kind_params)
        params['pageIndex'] = '1'
        params['reportNm'] = report
        params['fromDate'] = start
        params['toDate'] = end
        params['marketType'] = market_type
        params['reportNmTemp'] = report

        if submit_by == 'krx':
            submit_by = "유가증권시장본부"
        params['submitOblgNm'] = submit_by
        res = requests.post(self.__kind_url, headers=self.__kind_header, data=params)
        bs = BeautifulSoup(res.text, 'html.parser')

        first_link = bs.a
        if first_link is None:
            # Nothing found: leave the name empty instead of raising AttributeError.
            return None
        return first_link.text.replace(' ', '')

    def __engine_read_report(self, doc):
        """Download one disclosure through the KIND viewer and extract its text.

        Reads ``self.read_what`` and ``self.market_type`` to choose the parser.

        Args:
            doc: receipt number (``acptno``) of the disclosure, as a string.

        Returns:
            A 4-tuple ``(text, after_list, doc_code, company_code)``.

            * ``text``: for ``read_what == 'right_issue'`` on market '1', the
              ``<pre>`` body split at the circled numbers ①-⑥; otherwise a list
              of cleaned cell/field strings.
            * ``after_list``: the remaining 6+-digit numbers found in the
              viewer's ``<option>`` tags after the first one (presumably the
              later corrections - confirm), or '없음' when there are none
              (the right_issue/market '1' path returns the raw list).
            * ``doc_code``: the document number that was actually read.
            * ``company_code``: first six-digit number in the page's first ``<h1>``.

            ``('No', 'No', 'No', 'No')`` is returned when the selected option is
            a correction ("[정정]"), when no option is selected, or when the
            exchange-notice layout has too few fields.
        """
        skipped = ('No', 'No', 'No', 'No')

        url = "https://kind.krx.co.kr/common/disclsviewer.do?method=search&acptno=" + doc + "&docno=&viewerhost=&viewerport="
        res = requests.get(url)
        bs = BeautifulSoup(res.text, 'html.parser')

        # Company code
        h1_tag = bs.find_all("h1")[0]
        company_code = re.findall(r"\d{6,6}", str(h1_tag))[0]

        # Only the first selected <option> decides what happens.
        option_tag = bs.find_all("option")
        selected = next(
            (str(tag) for tag in option_tag
             if str(tag).startswith('<option selected="selected"')),
            None,
        )
        if selected is None:
            # Previously fell off the end and returned None, which crashed the
            # caller's 4-way unpacking; report it as "skip" instead.
            return skipped
        if "[정정]" in selected:
            # Corrections are not read on their own; they are reached through
            # the original disclosure.
            return skipped

        new_doc = re.findall(r'\d{6,}', str(option_tag))
        doc_code = new_doc[0]      # the earliest document is the one that is read
        after_list = new_doc[1:]   # everything after it

        url = "https://kind.krx.co.kr/common/disclsviewer.do?method=searchContents&docNo=" + str(doc_code)
        res = requests.get(url)
        wrapper = str(BeautifulSoup(res.text, 'html.parser'))

        # The wrapper page embeds the URL of the real document (raw string:
        # '\.' is an invalid escape in a normal literal).
        inner_url = re.findall(r'https://[a-z0-9/.]+\.htm', wrapper)[0]
        res = requests.get(inner_url)
        html = res.content.decode('utf-8','replace')
        bs = BeautifulSoup(html, 'html.parser')

        if (self.read_what == 'right_issue') and (self.market_type =='1'):
            body = bs.find_all("pre")[0].get_text()
            body = body.replace('\n',' : ')
            text = re.split(r"[①②③④⑤⑥]", body)
            return text, after_list, doc_code, company_code

        text_only_list = []
        if '▶ 업종코드 :' not in str(bs):
            for cell in bs.find_all('td'):
                text = cell.get_text()
                text = text.replace('\r\n', ', ')
                text = replacer(text, [", \n","\r",'\n'])
                text_only_list.append(text)
        else:
            # '변경상장(상호변경)' notices published by the exchange itself.
            text = replacer(bs.get_text(), [", \n","\r",'\n','   → ','- '])
            text = text.split('    ')

            # Fields 3 and 5 are read below, so at least six are required
            # (the former "<= 4" check let a five-field page raise IndexError).
            if len(text) <= 5:
                return skipped

            for field in (text[3], text[5]):
                text_only_list = text_only_list + replacer(field, ['(영문명',')','보통주','우선주']).split(': ')

        if len(after_list) <=0:
            after_list = '없음'

        return text_only_list, after_list, doc_code, company_code
    
    # Added in v7
    def __doc_filter(self, doc_list):
        '''Return only the rows of doc_list whose viewer page is not a correction.

        Each row's viewer page is fetched; the row is kept once for every
        selected <option> whose markup does not contain "[정정]".

        TODO: rework this filter so that it looks up the company code.
        '''
        self.__isResultExist(self.report_content)
        self.read_what = 'filter'

        dates = doc_list['공시일'].values
        names = doc_list['회사명'].values
        codes = doc_list['코드'].values
        titles = doc_list['보고서명'].values
        corrections = doc_list['정정신고'].values

        kept_rows = []
        for code, name, date, report, rereport in tzip(codes, names, dates, titles, corrections):
            time.sleep(random.uniform(self.__sleep_min, self.__sleep_max))
            url = "https://kind.krx.co.kr/common/disclsviewer.do?method=search&acptno=" + code + "&docno=&viewerhost=&viewerport="
            page = BeautifulSoup(requests.get(url).text, 'html.parser')

            for option in page.find_all("option"):
                markup = str(option)
                # Corrections are skipped here; they are read as part of the
                # original disclosure.
                if markup.startswith('<option selected="selected"') and "[정정]" not in markup:
                    kept_rows.append([date, name, code, report, rereport])

        return pd.DataFrame(kept_rows, columns=["공시일","회사명","코드","보고서명","정정신고"])
    
    def __preprocess_return(input_str):
        input_str = input_str.replace(":","").replace("주식의 종류와 수", "")
        input_str = re.sub("\(제[0-9]*회\)","", string=input_str)
        input_str = re.sub("제[0-9]*회","", string=input_str)
        input_str = input_str.replace(",","").replace("(신형)","").replace("-","").strip()
        return_lst = input_str.split("주 ")
        return_lst = [x.strip() for x in return_lst]
        return return_lst

    def __preprocess_price(input_str):
        input_str = re.sub("\(1주당 자본금:[0-9]+원\)","", input_str)
        input_str = re.sub("\(무액면\)","",input_str)
        input_str = input_str.replace(":","").replace("1주의 발행가액","")
        input_str = input_str.replace(",","")
        input_str = re.sub("\(액면가\d+원\)", "", string=input_str)
        input_str = re.sub("\(액면가 \d+원\)", "", string=input_str)
        input_str = re.sub("\(액면가  \d+원\)", "", string=input_str)
        input_str = input_str.replace("(1주당 자본금  무액면)", "")
        input_str = input_str.replace("(1주당 자본금0, )", "")
        input_str = input_str.replace("(액면가 USD 0.25)", "")
        input_str = input_str.replace("(액면금  무액면)", "")
        input_str = re.sub("기명식 보통주 [0-9]+주","", input_str)
        input_str = re.sub("\(제[0-9]+회\)","", string=input_str)
        input_str = re.sub("제[0-9]+회","", string=input_str).replace("-","").strip()
        return_lst = input_str.split("원")
        return return_lst

    def __preprocess_how(input_str):
        input_str = input_str.replace(":","").replace("증자방법","")
        input_str = re.sub("제[0-9]+회","",input_str)
        input_str = re.sub("및","",input_str)
        input_str = re.sub(",","",input_str)
        input_str = re.sub("[0-9]+주","",input_str).replace(" ","")
        return input_str
    

    def read_right_issue(self, doc_list):
        '''Read every rights-offering listing in doc_list into ``self.report_content``.

        (v7 change) doc_list should come from a precise search keyword such as
        "추가상장 유상증자".

        Uses ``self.market_type`` as set by ``get_list``:

        * '1' (KOSPI): columns 회사명, 기업공시코드, 공시일, 발행가(원),
          발행방법, 주식종류, 발행주식수.
        * '2' (KOSDAQ): columns 회사명, 기업공시코드, 공시일, 발행방법,
          주식종류, 발행주식수, 발행금액.

        Any other market type leaves ``self.report_content`` untouched.
        Corrections, empty documents and documents with fewer fields than the
        fixed positions read here are skipped.

        Args:
            doc_list: DataFrame with the columns 코드, 회사명, 공시일, 보고서명.

        Raises:
            Exception: a previous result exists and the user declined to
                replace it.
        '''
        self.__isResultExist(self.report_content)
        self.read_what = 'right_issue'

        preprocessed_data = []

        if self.market_type == '1': # KOSPI
            for name, date, company_code, text in self.__iter_right_issue_docs(doc_list):
                if len(text) < 6:   # positions 1, 2 and 5 are read below
                    print(name, date, "skipped: unexpected document layout")
                    continue
                preprocessed_data.append([name, company_code, date, text[1], text[2], text[5]])

            sample = pd.DataFrame(preprocessed_data, columns=['회사명','기업공시코드','공시일',"임시리턴","발행가(원)","발행방법"])
            # Resolve the cleaners on the class: they take no ``self``, so the
            # bound form ``self.__preprocess_*`` would receive an extra argument.
            cls = type(self)
            sample["임시리턴"] = sample["임시리턴"].apply(cls.__preprocess_return)
            sample["발행가(원)"] = sample["발행가(원)"].apply(cls.__preprocess_price)
            sample["발행방법"] = sample["발행방법"].apply(cls.__preprocess_how)
            sample["주식종류"] = sample["임시리턴"].apply(lambda x: x[0])
            sample["발행주식수"] = sample["임시리턴"].apply(lambda x: x[1])
            sample["발행가(원)"] = sample["발행가(원)"].apply(lambda x: x[0])
            sample = sample.drop("임시리턴", axis=1)
            self.report_content = sample

        elif self.market_type=='2': # KOSDAQ
            for name, date, company_code, text in self.__iter_right_issue_docs(doc_list):
                if len(text) < 7:   # positions 6, -4, -3 and -2 are read below
                    print(name, date, "skipped: unexpected document layout")
                    continue
                preprocessed_data.append([name, company_code, date, text[-4], text[6], text[-3], text[-2]])

            self.report_content = pd.DataFrame(preprocessed_data,columns=["회사명","기업공시코드","공시일","발행방법","주식종류","발행주식수","발행금액"])
        print()
        print('Jobs Done')
        print('check the result with .report_content')

    def __iter_right_issue_docs(self, doc_list):
        """Yield (name, date, company_code, text) for each readable row of doc_list."""
        columns = (
            doc_list['코드'].values,
            doc_list["회사명"].values,
            doc_list["공시일"].values,
            doc_list["보고서명"].values,
        )
        for code, name, date, report_n in tzip(*columns):
            time.sleep(random.uniform(self.__sleep_min, self.__sleep_max))
            # Two HTTP round trips happen inside the reader.
            text_only_list, _after_list, _now_code, company_code = self.__engine_read_report(code)

            # 'No' marks a skipped document (e.g. a correction); [] is an empty one.
            if (text_only_list == 'No') or (text_only_list == []):
                continue
            print(name,date,report_n,"진행중.....")
            yield name, date, company_code, text_only_list

In [40]:
# Search keywords: [단일판매ㆍ공급계약체결 / 업종변경 / 상호변경 / 추가상장 유상증자 /전환사채권발행결정 / 신주인수권부사채권발행결정/투자주의환기종목지정 / 최대주주변경 / 불성실공시법인지정/조회공시요구/타법인주식및출자증권취득결정]
test20 = KIND_REPORTS()

# List KOSDAQ (market_type='2') disclosures matching the keyword for 2012-2013.
test20.get_list('추가상장 유상증자', start='2012-01-01', end='2013-12-31', market_type='2')

100%|██████████| 1/1 [00:01<00:00,  1.33s/it]


Jobs Done
check the result with .report_list





In [41]:
# Preview the first rows of the listing collected by get_list.
test20.report_list.head()#.loc[test20.report_list["보고서명"] == "유상증자결정(제3자배정-철회)"]

Unnamed: 0,공시일,회사명,코드,보고서명,정정신고
0,2013-12-16,팬스타엔터프라이즈,20131216000561,추가상장(유상증자(일반공모),NO
1,2013-12-13,THE E&M,20131213000650,추가상장(유상증자),NO
2,2013-12-05,안국약품,20131205000458,추가상장(유상증자),NO
3,2013-11-21,엘컴텍,20131121000384,추가상장(유상증자(제3자배정)),NO
4,2013-11-21,엘컴텍,20131121000382,추가상장(유상증자(출자전환)),NO


In [42]:
# Read every listed disclosure; the parsed rows end up in test20.report_content.
test20.read_right_issue(test20.report_list)

  0%|          | 0/90 [00:00<?, ?it/s]

팬스타엔터프라이즈 2013-12-16 추가상장(유상증자(일반공모) 진행중.....
THE E&M 2013-12-13 추가상장(유상증자) 진행중.....
안국약품 2013-12-05 추가상장(유상증자) 진행중.....
엘컴텍 2013-11-21 추가상장(유상증자(제3자배정)) 진행중.....
엘컴텍 2013-11-21 추가상장(유상증자(출자전환)) 진행중.....
케스피온 2013-11-13 추가상장(유상증자) 진행중.....
골드퍼시픽 2013-11-05 추가상장(유상증자) 진행중.....
바이온 2013-10-28 추가상장(유상증자) 진행중.....
KD 2013-10-14 추가상장(유상증자(일반공모)) 진행중.....
휴먼엔 2013-10-08 추가상장(유상증자(구주주배정)) 진행중.....
디에스티 2013-10-04 추가상장(유상증자(일반공모)) 진행중.....
큐로컴 2013-09-17 추가상장(유상증자) 진행중.....
THE E&M 2013-09-13 추가상장(유상증자) 진행중.....
KD 2013-09-10 추가상장(유상증자 제3자배정) 진행중.....
HLB생명과학 2013-08-29 추가상장(유상증자) 진행중.....
에이스하이텍 2013-08-27 추가상장(유상증자) 진행중.....
SM C&C 2013-08-26 추가상장(유상증자(제3자배정)) 진행중.....
동방선기 2013-08-22 추가상장(유상증자) 진행중.....
유니드코리아 2013-08-19 추가상장(유상증자(제3자 배정)) 진행중.....
대성엘텍 2013-08-13 추가상장(유상증자) 진행중.....
휴먼엔 2013-07-24 추가상장(유상증자(제3자배정)) 진행중.....
컴투스홀딩스 2013-07-12 추가상장(유상증자 일반공모) 진행중.....
SBW생명과학 2013-07-04 추가상장(유상증자(주주배정 후 실권주 일반공모)) 진행중.....
승화프리텍 2013-06-18 추가상장(유상증자 제3자배정) 진행중.....
사람인에이치알 2013-06-13 추가상장(유상

In [44]:
# Let pandas display up to 500 rows, then save the parsed result as CSV.
pd.set_option("display.max_rows", 500)
test20.report_content.to_csv("유상증자_코스닥_2012-2013.csv")

In [None]:
# Work on a copy so the scraped result in test20.report_content stays intact.
sample = test20.report_content.copy()
sample.head()

In [None]:
#print(test20.report_content.iloc[245,:]["임시리턴"])
#test20.report_content.iloc[245,:]["발행가(원)"]

In [None]:
#sample["발행가(원)"].apply(preprocess_price)

In [None]:
# Preview the first rows of the working copy.
sample.head()

In [None]:
#sample.drop(8, inplace=True)

In [None]:
#sample.loc[129,"발행가(원)"] = "2414"
#sample.loc[129]

In [None]:
# Convert price and share count to integers and derive the total issue amount.
# NOTE(review): the "발행가(원)" column is only produced by read_right_issue for
# market_type '1'; the market_type '2' frame has no such column and already
# carries "발행금액" - confirm which result this cell is meant to run on.
sample["발행가(원)"] = sample["발행가(원)"].astype("int64")
# Drop the trailing "주" (shares) unit before casting.
sample["발행주식수"] = sample["발행주식수"].str.replace("주","").astype("int64")
sample["발행금액"] = sample["발행가(원)"] * sample["발행주식수"]
sample.head()

In [None]:
# Display the whole working frame.
sample

In [None]:
# Manual value-correction step (kept as an example of overwriting one row).
#sample.loc[256,:] = ["진흥기업",	"002780",	"2018-12-06"	,850,	"유상증자(제3자배정)",	"기명식 보통",	91476,	77754600]
sample.tail()

In [None]:
# Output file name for the cleaned frame.
# NOTE(review): this says KOSPI 2020-2022, while the query earlier in this
# notebook was KOSDAQ (market_type='2') for 2012-2013 - confirm before saving.
name = "유상증자_코스피_2020_2022.csv"

In [None]:
# Write the cleaned frame to the file named above.
sample.to_csv("{}".format(name))