# 2. 종목별 기업 정보, 펀더멘털, 재무제표 크롤링하기

#### 아래 코드를 사용하려면 Selenium 패키지 다운 필요 및 FireFox 설치 필요합니다.

In [1]:
#Package for Crawling
import urllib.request
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import WebDriverException

In [2]:
#Package for data processing
import pandas as pd
import re
from time import sleep

### 1. 종목 코드 불러오기

In [3]:
#코드 불러오기
with open('./data/kosdaq_code.txt') as f:
    kosdaq = f.read().splitlines() 

with open('./data/kospi_code.txt') as f:
    kospi = f.read().splitlines() 

In [4]:
print(len(kosdaq), len(kospi))

2667 883


#### 주의) kosdaq은 Daum Finance에서 여러 번 중복되어 있음

In [5]:
kosdaq = list(set(kosdaq))
kospi = list(set(kospi))

In [6]:
print(len(kosdaq), len(kospi))

1264 883


### 2.  Naver에서 각 종목별 페이지 크롤링

In [7]:
basic_url = 'http://companyinfo.stock.naver.com/v1/company/c1010001.aspx?cmp_cd='
ajax_url = 'http://companyinfo.stock.naver.com/v1/company/ajax/cF1001.aspx?cmp_cd='

sheet_a = '&fin_typ=0&freq_typ=A'
sheet_y = '&fin_typ=0&freq_typ=Y'
sheet_q = '&fin_typ=0&freq_typ=Q'

Firefox에서 제공하는 geckodriver 사용
-> https://github.com/mozilla/geckodriver/releases 에서 자신의 운영체제에 맞는 파일 다운받으면 된다

In [9]:
driver = webdriver.Firefox(executable_path='/Users/sailyourlife/Dropbox/Project/Investment/geckodriver')

In [10]:
#데이터 처리 함수 정의
def clean_string(string):
    string = string.replace('\t', '')
    string = string.replace('\n', '')
    string = string.replace(' ', '')
    string = string.replace(',', '')
    
    return string

In [13]:
#정보 저장(kospi)
company_info = []
company_fundamental = []
company_sheet_year = []
company_sheet_quarter = []

In [14]:
for code in range(len(kospi)):
    
    #1. 기업 정보
    sleep(0.5)

    driver.get(basic_url + kospi[code])
    html = driver.page_source
    soup = BeautifulSoup(html)
        
    #1) 이름
    name = soup.find_all("span", {"class": "name"})[0]
    name = name.get_text()

    #2) 주가
    price = soup.find_all("td", {"class": "num"})[0]
    price = price.get_text()
    price = clean_string(price)
    
    price.split('/')[0]
    price = re.findall('\d+', price)[0]
        
    #3) 52주 상한가 / 하한가
    price52 = soup.find_all("td", {"class": "num"})[1]
    price52 = price52.get_text()
    price52 = clean_string(price52)
    
    max52 = price52.split('/')[0]
    max52 = re.findall('\d+', max52)[0]
    min52 = price52.split('/')[1]
    min52 = re.findall('\d+', min52)[0]    
    
    #4) 시가총액
    total_price = soup.find_all("td", {"class": "num"})[4]
    total_price = total_price.get_text()
    total_price = clean_string(total_price)
    
    total_price.split('/')[0]
    total_price = re.findall('\d+', total_price)[0]

    #5) 수익률(1M / 3M / 6M / 1Y)
    up_ratio = soup.find_all("td", {"class": "num"})[8]
    up_ratio = up_ratio.get_text()
    up_ratio = clean_string(up_ratio)
    
    price = price.split('/')
    price_list = [i.split('%')[0] for i in price]
    
    company_info.append({kospi[code]: [name, price, total_price, max52, min52, up_ratio]})

    #2. 펀더멘탈
    fundamental = soup.find_all("tbody")[5]
    fundamental = fundamental.get_text()
    fundamental.split('\n')
    fundamental_list = fundamental.split('\n')
    fundamental_info = [elem for elem in fundamental_list if elem.strip()]
    fundamental_info = fundamental_info[:-3]
    
    company_fundamental.append({kospi[code]: fundamental_info})
    
    
    #3. 재무제표
    #A. 연간
    sleep(0.5)

    driver.get(ajax_url+ kospi[code] + sheet_y)
    html = driver.page_source
    soup = BeautifulSoup(html)
    
    #1) 년월일
    time = soup.find_all('thead')[0]
    time = time.get_text()
    time = time.replace('\n', '')
    time = time.replace('\t', '')
    time = re.findall('\d+', time)
    
    #2) 내용
    ybody = soup.find_all('tbody')[0]
    ybody = ybody.get_text()
    ybody = ybody.replace('\n', ' ')
    ybody = ybody.replace('\t', ' ')
    ybody = ybody.replace('\xa0', '')
    ybody_list = ybody.split(' ')
    ybody_info = [elem for elem in ybody_list if elem.strip()]
    
    company_sheet_year.append({kospi[code]: ybody_info})
    
    """#B. 분기
    
    #1) 분기
    sleep(0.5)

    driver.get(ajax_url+ kospi[code] + sheet_q)
    html = driver.page_source
    soup = BeautifulSoup(html)
    
    quarter = soup.find_all('thead')[0]
    quarter = quarter.get_text()
    quarter = quarter.replace('\n', '')
    quarter = quarter.replace('\t', '')
    quarter = re.findall('\d+', quarter)
    
    #2) 내용
    qbody = soup.find_all('tbody')[0]
    qbody = qbody.get_text()
    qbody = qbody.replace('\n', ' ')
    qbody = qbody.replace('\t', ' ')
    qbody = qbody.replace('\xa0', '')
    qbody_list = qbody.split(' ')
    qbody_info = [elem for elem in qbody_list if elem.strip()]
    
    company_sheet_quarter.append({kospi[code]: qbody_info})"""
    
    if code % 50 == 0:
        print([code, name])



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


[0, '대한유화']
[50, '에스원']
[100, '한솔홈데코']
[150, 'LG하우시스']
[200, '부국철강']
[250, '한국콜마']
[300, '금호에이치티']
[350, '제일약품']
[400, '두산']
[450, '흥아해운']
[500, '이아이디']
[550, '다우기술']
[600, '두산밥캣']
[650, '디아이']
[700, '엔에스쇼핑']
[750, '지투알']
[800, '오뚜기']
[850, '웅진']


#### 빠른 저장 및 불러오기를 위해서 pickle package 다운로드

In [12]:
import pickle

In [15]:
#저장하기(KOSPI)
with open("./data/kospi_company_info.txt", "wb") as fp:   #Pickling
    pickle.dump(company_info, fp)
with open("./data/kospi_company_fundamental.txt", "wb") as fp:   #Pickling
    pickle.dump(company_fundamental, fp)
with open("./data/kospi_company_year_sheet.txt", "wb") as fp:   #Pickling
    pickle.dump(company_sheet_year, fp)
"""with open("./data/kospi_company_quarter_sheet.txt", "wb") as fp:   #Pickling
    pickle.dump(company_sheet_quarter, fp)"""

'with open("./data/kospi_company_quarter_sheet.txt", "wb") as fp:   #Pickling\n    pickle.dump(company_sheet_quarter, fp)'

In [18]:
#정보 저장(kosdaq)
company_info = []
company_fundamental = []
company_sheet_year = []
company_sheet_quarter = []

In [19]:
for code in range(len(kosdaq)):
     
    #1. 기업 정보
    sleep(0.5)
    
    driver.get(basic_url + kosdaq[code])
    html = driver.page_source
    soup = BeautifulSoup(html)
    
    #1) 이름
    name = soup.find_all("span", {"class": "name"})[0]
    name = name.get_text()

    #2) 주가
    price = soup.find_all("td", {"class": "num"})[0]
    price = price.get_text()
    price = clean_string(price)
    
    price.split('/')[0]
    price = re.findall('\d+', price)[0]
        
    #3) 52주 상한가 / 하한가
    price52 = soup.find_all("td", {"class": "num"})[1]
    price52 = price52.get_text()
    price52 = clean_string(price52)
    
    max52 = price52.split('/')[0]
    max52 = re.findall('\d+', max52)[0]
    min52 = price52.split('/')[1]
    min52 = re.findall('\d+', min52)[0]    
    
    #4) 시가총액
    total_price = soup.find_all("td", {"class": "num"})[4]
    total_price = total_price.get_text()
    total_price = clean_string(total_price)
    
    total_price.split('/')[0]
    total_price = re.findall('\d+', total_price)[0]

    #5) 수익률(1M / 3M / 6M / 1Y)
    up_ratio = soup.find_all("td", {"class": "num"})[8]
    up_ratio = up_ratio.get_text()
    up_ratio = clean_string(up_ratio)
    
    price = price.split('/')
    price_list = [i.split('%')[0] for i in price]
    
    company_info.append({kosdaq[code]: [name, price, total_price, max52, min52, up_ratio]})

    #2. 펀더멘탈
    fundamental = soup.find_all("tbody")[5]
    fundamental = fundamental.get_text()
    fundamental.split('\n')
    fundamental_list = fundamental.split('\n')
    fundamental_info = [elem for elem in fundamental_list if elem.strip()]
    fundamental_info = fundamental_info[:-3]
    
    company_fundamental.append({kosdaq[code]: fundamental_info})
    
    
    #3. 재무제표
    #A. 연간
    sleep(0.5)

    driver.get(ajax_url+ kosdaq[code] + sheet_y)
    html = driver.page_source
    soup = BeautifulSoup(html)
    
    #1) 년월일
    time = soup.find_all('thead')[0]
    time = time.get_text()
    time = time.replace('\n', '')
    time = time.replace('\t', '')
    time = re.findall('\d+', time)
    
    #2) 내용
    ybody = soup.find_all('tbody')[0]
    ybody = ybody.get_text()
    ybody = ybody.replace('\n', ' ')
    ybody = ybody.replace('\t', ' ')
    ybody = ybody.replace('\xa0', '')
    ybody_list = ybody.split(' ')
    ybody_info = [elem for elem in ybody_list if elem.strip()]
    
    company_sheet_year.append({kosdaq[code]: ybody_info})
    
    """#B. 분기
    
    #1) 분기
    sleep(0.5)

    driver.get(ajax_url+ kosdaq[code] + sheet_q)
    html = driver.page_source
    soup = BeautifulSoup(html)
    
    quarter = soup.find_all('thead')[0]
    quarter = quarter.get_text()
    quarter = quarter.replace('\n', '')
    quarter = quarter.replace('\t', '')
    quarter = re.findall('\d+', quarter)
    
    #2) 내용
    qbody = soup.find_all('tbody')[0]
    qbody = qbody.get_text()
    qbody = qbody.replace('\n', ' ')
    qbody = qbody.replace('\t', ' ')
    qbody = qbody.replace('\xa0', '')
    qbody_list = qbody.split(' ')
    qbody_info = [elem for elem in qbody_list if elem.strip()]
    
    company_sheet_quarter.append({kosdaq[code]: qbody_info})"""
    
    if code % 50 == 0:
        print([code, name])



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


[0, '장원테크']
[50, '코렌']
[100, '대호피앤씨']
[150, '바른손이앤에이']
[200, '제이엔케이히터']
[250, '엑시콘']
[300, '엔터메이트']
[350, '한프']
[400, '리드코프']
[450, '에이치시티']
[500, '피에스케이']
[550, '리더스코스메틱']
[600, '포비스티앤씨']
[650, '글로본']
[700, '게임빌']
[750, '쎄노텍']
[800, '삼진엘앤디']
[850, 'SK머티리얼즈']
[900, '엠케이전자']
[950, '부방']
[1000, '대한과학']
[1050, '인터파크']
[1100, '아우딘퓨쳐스']
[1150, '아이엠']
[1200, '포스코켐텍']
[1250, '인터불스']


In [20]:
#저장하기(KOSDAQ)
with open("./data/kosdaq_company_info.txt", "wb") as fp:   #Pickling
    pickle.dump(company_info, fp)
with open("./data/kosdaq_company_fundamental.txt", "wb") as fp:   #Pickling
    pickle.dump(company_fundamental, fp)
with open("./data/kosdaq_company_year_sheet.txt", "wb") as fp:   #Pickling
    pickle.dump(company_sheet_year, fp)
"""with open("kosdaq_company_quarter_sheet.txt", "wb") as fp:   #Pickling
    pickle.dump(company_sheet_quarter, fp)"""

'with open("kosdaq_company_quarter_sheet.txt", "wb") as fp:   #Pickling\n    pickle.dump(company_sheet_quarter, fp)'

Kospi, Kosdaq 합치기

In [21]:
#읽어오기(KOSPI)
with open("./data/kospi_company_info.txt", "rb") as fp:   # Unpickling
    kospi_company_info = pickle.load(fp)
with open("./data/kospi_company_fundamental.txt", "rb") as fp:   # Unpickling
    kospi_company_fundamental = pickle.load(fp)
with open("./data/kospi_company_year_sheet.txt", "rb") as fp:   # Unpickling
    kospi_company_sheet_year = pickle.load(fp)
"""with open("./data/kospi_company_quarter_sheet.txt", "rb") as fp:   # Unpickling
    kospi_company_sheet_quarter = pickle.load(fp)"""

'with open("./data/kospi_company_quarter_sheet.txt", "rb") as fp:   # Unpickling\n    kospi_company_sheet_quarter = pickle.load(fp)'

In [22]:
#읽어오기(KOSDAQ)
with open("./data/kosdaq_company_info.txt", "rb") as fp:   # Unpickling
    kosdaq_company_info = pickle.load(fp)
with open("./data/kosdaq_company_fundamental.txt", "rb") as fp:   # Unpickling
    kosdaq_company_fundamental = pickle.load(fp)
with open("./data/kosdaq_company_year_sheet.txt", "rb") as fp:   # Unpickling
    kosdaq_company_sheet_year = pickle.load(fp)
"""with open("./data/kosdaq_company_quarter_sheet.txt", "rb") as fp:   # Unpickling
    kosdaq_company_sheet_quarter = pickle.load(fp)"""

'with open("./data/kosdaq_company_quarter_sheet.txt", "rb") as fp:   # Unpickling\n    kosdaq_company_sheet_quarter = pickle.load(fp)'

In [24]:
#전체 합치기
company_info = kosdaq_company_info + kospi_company_info
company_fundamental = kosdaq_company_fundamental + kospi_company_fundamental
company_sheet_year = kosdaq_company_sheet_year + kospi_company_sheet_year
"""company_sheet_quarter = kosdaq_company_sheet_quarter + kospi_company_sheet_quarter"""

In [25]:
print(len(company_info))

2147


In [26]:
#저장하기(전체)
with open("./data/company_info.txt", "wb") as fp:   #Pickling
    pickle.dump(company_info, fp)
with open("./data/company_fundamental.txt", "wb") as fp:   #Pickling
    pickle.dump(company_fundamental, fp)
with open("./data/company_year_sheet.txt", "wb") as fp:   #Pickling
    pickle.dump(company_sheet_year, fp)
"""with open("company_quarter_sheet.txt", "wb") as fp:   #Pickling
    pickle.dump(company_sheet_quarter, fp)"""

'with open("company_quarter_sheet.txt", "wb") as fp:   #Pickling\n    pickle.dump(company_sheet_quarter, fp)'

In [27]:
#읽어오기(전체)
with open("./data/company_info.txt", "rb") as fp:   # Unpickling
    company_info = pickle.load(fp)
with open("./data/company_fundamental.txt", "rb") as fp:   # Unpickling
    company_fundamental = pickle.load(fp)
with open("./data/company_year_sheet.txt", "rb") as fp:   # Unpickling
    company_sheet_year = pickle.load(fp)
"""with open("company_quarter_sheet.txt", "rb") as fp:   # Unpickling
    company_sheet_quarter = pickle.load(fp)"""

'with open("company_quarter_sheet.txt", "rb") as fp:   # Unpickling\n    company_sheet_quarter = pickle.load(fp)'

In [28]:
print(len(company_info))

2147
