# 1. 필요한 정보가 들어있는 웹사이트 찾기

<img src="image_lecture/crawling_01.png">

<img src="image_lecture/crawling_02.png">

# 2. 웹사이트 구조 파악

In [1]:
# Naver Finance daily-quote page for stock code 005930
url = 'https://finance.naver.com/item/sise_day.nhn?code=005930'

<img src="image_lecture/crawling_03.png">

# 3. 필요한 정보의 위치 파악

<img src="image_lecture/crawling_04.png">

# 4. BeautifulSoup 등으로 필요한 정보 추출

In [2]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

In [3]:
# Download the page and parse it with the lxml parser.
# The `with` block closes the HTTP response once the body has been read.
with urlopen(url) as response:
    source = response.read()
soup = BeautifulSoup(source, 'lxml')
soup

<html lang="ko">
<head>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<title>네이버 금융</title>
<link href="/css/newstock.css?20190411143845" rel="stylesheet" type="text/css"/>
<link href="/css/common.css?20190411143845" rel="stylesheet" type="text/css"/>
<link href="/css/layout.css?20190411143845" rel="stylesheet" type="text/css"/>
<link href="/css/main.css?20190411143845" rel="stylesheet" type="text/css"/>
<link href="/css/newstock2.css?20190411143845" rel="stylesheet" type="text/css"/>
<link href="/css/newstock3.css?20190411143845" rel="stylesheet" type="text/css"/>
<link href="/css/world.css?20190411143845" rel="stylesheet" type="text/css"/>
</head>
<body>
<script language="JavaScript">
function mouseOver(obj){
  obj.style.backgroundColor="#f6f4e5";
}
function mouseOut(obj){
  obj.style.backgroundColor="#ffffff";
}
</script>
<h4 class="tlline2"><strong><span class="red03">일별</span>시세</strong></h4>
<table cellspacing="0" class="type2">
<tr>
<th>날짜</th>
<th>종가</th>

In [4]:
soup.find_all('span', class_='tah p10 gray03')   # collect the date cells

[<span class="tah p10 gray03">2019.05.17</span>,
 <span class="tah p10 gray03">2019.05.16</span>,
 <span class="tah p10 gray03">2019.05.15</span>,
 <span class="tah p10 gray03">2019.05.14</span>,
 <span class="tah p10 gray03">2019.05.13</span>,
 <span class="tah p10 gray03">2019.05.10</span>,
 <span class="tah p10 gray03">2019.05.09</span>,
 <span class="tah p10 gray03">2019.05.08</span>,
 <span class="tah p10 gray03">2019.05.07</span>,
 <span class="tah p10 gray03">2019.05.03</span>]

In [5]:
# /html/body/table[1]/tbody/tr[3]/td[2]/span
# Collect the closing prices; note that this selector matches every numeric
# cell of the table, not only the closing-price column.
soup.find_all('td', class_='num')

[<td class="num"><span class="tah p11">41,200</span></td>, <td class="num">
 <img alt="하락" height="6" src="https://ssl.pstatic.net/imgstock/images/images4/ico_down.gif" style="margin-right:4px;" width="7"/><span class="tah p11 nv01">
 				350
 				</span>
 </td>, <td class="num"><span class="tah p11">41,950</span></td>, <td class="num"><span class="tah p11">42,050</span></td>, <td class="num"><span class="tah p11">40,850</span></td>, <td class="num"><span class="tah p11">12,294,470</span></td>, <td class="num"><span class="tah p11">41,550</span></td>, <td class="num">
 <img alt="하락" height="6" src="https://ssl.pstatic.net/imgstock/images/images4/ico_down.gif" style="margin-right:4px;" width="7"/><span class="tah p11 nv01">
 				1,000
 				</span>
 </td>, <td class="num"><span class="tah p11">42,350</span></td>, <td class="num"><span class="tah p11">42,400</span></td>, <td class="num"><span class="tah p11">41,350</span></td>, <td class="num"><span class="tah p11">13,687,828</span></td>,

In [8]:
# Too many cells to read by eye, so count how many there are
len(soup.find_all('td', class_='num'))

60

In [9]:
# The closing price seems to appear at every 6th 'num' cell -- print those to check.
# The document is queried once; the slice replaces the hard-coded range(0, 60, 6).
num_cells = soup.find_all('td', class_='num')
for cell in num_cells[::6]:
    print(cell)

<td class="num"><span class="tah p11">41,200</span></td>
<td class="num"><span class="tah p11">41,550</span></td>
<td class="num"><span class="tah p11">42,550</span></td>
<td class="num"><span class="tah p11">42,650</span></td>
<td class="num"><span class="tah p11">42,650</span></td>
<td class="num"><span class="tah p11">42,900</span></td>
<td class="num"><span class="tah p11">42,450</span></td>
<td class="num"><span class="tah p11">44,250</span></td>
<td class="num"><span class="tah p11">44,850</span></td>
<td class="num"><span class="tah p11">45,300</span></td>


# 5. 추출한 정보 정제 

In [11]:
# Print the text of every date cell.
# find_all() runs once instead of once per row, and the loop follows the
# actual number of rows rather than a hard-coded 10.
date_spans = soup.find_all('span', class_='tah p10 gray03')
for span in date_spans:
    d = span.text
    print(d)

2019.05.17
2019.05.16
2019.05.15
2019.05.14
2019.05.13
2019.05.10
2019.05.09
2019.05.08
2019.05.07
2019.05.03


In [12]:
import pandas as pd

In [13]:
# Convert each scraped date string into a datetime.date and print it.
# find_all() runs once; the loop covers however many rows the page has.
date_spans = soup.find_all('span', class_='tah p10 gray03')
for span in date_spans:
    d = pd.to_datetime(span.text).date()
    print(d)

2019-05-17
2019-05-16
2019-05-15
2019-05-14
2019-05-13
2019-05-10
2019-05-09
2019-05-08
2019-05-07
2019-05-03


In [14]:
# Print the raw text of every 6th numeric cell (the closing-price column).
# find_all() runs once; the slice replaces the hard-coded range(0, 60, 6).
num_cells = soup.find_all('td', class_='num')
for cell in num_cells[::6]:
    p = cell.text
    print(p)

41,200
41,550
42,550
42,650
42,650
42,900
42,450
44,250
44,850
45,300


In [15]:
# p holds the last price scraped by the preceding loop; check its type
type(p)

str

In [16]:
# Turn each closing price into a number: drop the thousands separators, then
# convert to float. find_all() runs once; the slice picks every 6th cell.
num_cells = soup.find_all('td', class_='num')
for cell in num_cells[::6]:
    p = float(cell.text.replace(',', ''))
    print(p)

41200.0
41550.0
42550.0
42650.0
42650.0
42900.0
42450.0
44250.0
44850.0
45300.0


In [17]:
# Check the type again after the comma removal and float() conversion
type(p)

float

# 6. 정제된 정보를 데이터화<br>
(데이터프레임 등으로 저장)

In [18]:
# Collect every date on the page as a datetime.date.
# One find_all() call feeds the comprehension, so the list length follows the
# page instead of a hard-coded 10.
date_spans = soup.find_all('span', class_='tah p10 gray03')
date = [pd.to_datetime(span.text).date() for span in date_spans]
date

[datetime.date(2019, 5, 17),
 datetime.date(2019, 5, 16),
 datetime.date(2019, 5, 15),
 datetime.date(2019, 5, 14),
 datetime.date(2019, 5, 13),
 datetime.date(2019, 5, 10),
 datetime.date(2019, 5, 9),
 datetime.date(2019, 5, 8),
 datetime.date(2019, 5, 7),
 datetime.date(2019, 5, 3)]

In [19]:
# Collect the closing prices as floats.
# Every 6th numeric cell is the closing price; commas are stripped before the
# float() conversion. find_all() is called once for the whole list.
num_cells = soup.find_all('td', class_='num')
price = [float(cell.text.replace(',', '')) for cell in num_cells[::6]]
price

[41200.0,
 41550.0,
 42550.0,
 42650.0,
 42650.0,
 42900.0,
 42450.0,
 44250.0,
 44850.0,
 45300.0]

In [20]:
# Spot-check a single element of the list (index 5)
price[5]

42900.0

In [21]:
# Pair each date with its closing price (date -> price).
# zip() walks both lists together, so no row count has to be hard-coded.
prices = dict(zip(date, price))

In [22]:
# Display the date -> closing price mapping
prices

{datetime.date(2019, 5, 17): 41200.0,
 datetime.date(2019, 5, 16): 41550.0,
 datetime.date(2019, 5, 15): 42550.0,
 datetime.date(2019, 5, 14): 42650.0,
 datetime.date(2019, 5, 13): 42650.0,
 datetime.date(2019, 5, 10): 42900.0,
 datetime.date(2019, 5, 9): 42450.0,
 datetime.date(2019, 5, 8): 44250.0,
 datetime.date(2019, 5, 7): 44850.0,
 datetime.date(2019, 5, 3): 45300.0}

In [23]:
# Build a Series from the dict (the dates become the index)
pd.Series(prices)

2019-05-17    41200.0
2019-05-16    41550.0
2019-05-15    42550.0
2019-05-14    42650.0
2019-05-13    42650.0
2019-05-10    42900.0
2019-05-09    42450.0
2019-05-08    44250.0
2019-05-07    44850.0
2019-05-03    45300.0
dtype: float64

In [24]:
# Build a DataFrame with the dates as index; the single unnamed column
# receives the default label 0
pd.DataFrame(price, index=date)

Unnamed: 0,0
2019-05-17,41200.0
2019-05-16,41550.0
2019-05-15,42550.0
2019-05-14,42650.0
2019-05-13,42650.0
2019-05-10,42900.0
2019-05-09,42450.0
2019-05-08,44250.0
2019-05-07,44850.0
2019-05-03,45300.0


함수로 만들기

In [25]:
# Normalise a date-like value to datetime.date
def date_format(d=''):
    """Return *d* as a ``datetime.date``.

    An empty string (the default) means "no value given" and yields today's
    date; any other value is parsed with ``pd.to_datetime``.
    """
    stamp = pd.to_datetime(d) if d != '' else pd.Timestamp.today()
    return stamp.date()

In [26]:
# Collect daily closing prices
def stock_prices(stock_cd, start_date='', end_date='', page_n=1, last_page=0):
    """Scrape daily closing prices for one stock from Naver Finance.

    Pages of the daily-quote table are fetched one after another, starting at
    ``page_n``. Rows are expected newest-first (the early exit below relies on
    it), so reading stops at the first row dated before ``start_date`` or
    after the last page has been processed.

    NOTE: results are written into the module-level dict
    ``historical_prices`` (``datetime.date`` -> closing price as float). The
    caller has to bind that name before calling; the same dict is returned.

    Args:
        stock_cd: Stock code as a string, e.g. ``'005930'``.
        start_date: First date to keep; ``''`` means one month before
            ``end_date``.
        end_date: Last date to keep; ``''`` means today.
        page_n: Page number to start reading from.
        last_page: Last page number to read; ``0`` means "look it up in the
            pagination links of the first page fetched".

    Returns:
        The module-level ``historical_prices`` dict.
    """
    # Resolve the date window
    end_date = date_format(end_date)
    if start_date == '':
        start_date = end_date - pd.DateOffset(months=1)
    start_date = date_format(start_date)

    # Walk the pages in a loop (one recursion level per page is not needed)
    while True:
        naver_stock = ('http://finance.naver.com/item/sise_day.nhn?code='
                       + stock_cd + '&page=' + str(page_n))
        # The `with` block closes the HTTP response after the body is read
        with urlopen(naver_stock) as response:
            source = response.read()
        soup = BeautifulSoup(source, 'lxml')

        dates = soup.find_all('span', class_='tah p10 gray03')   # date cells
        prices = soup.find_all('td', class_='num')   # all numeric cells

        for n, date_tag in enumerate(dates):
            this_date = date_format(date_tag.text)

            # Older than the requested window: nothing further is needed
            if this_date < start_date:
                return historical_prices

            if this_date <= end_date:
                # Each row contributes six 'num' cells; the first is the close
                this_close = prices[n * 6].text.replace(',', '')
                historical_prices[this_date] = float(this_close)

        # Page navigation: look up the last page number once
        if last_page == 0:
            last_page = _find_last_page(soup, page_n)

        if page_n >= last_page:
            return historical_prices
        page_n += 1


def _page_number(href):
    """Return the ``page`` query parameter of *href* as an int, or None."""
    from urllib.parse import parse_qs, urlsplit

    values = parse_qs(urlsplit(href).query).get('page')
    if not values:
        return None
    try:
        return int(values[0])
    except ValueError:
        return None


def _find_last_page(soup, current_page):
    """Return the last page number advertised by the pagination table.

    The 'pgRR' cell in the second table of the page is used when present.
    When it is missing (presumably a single-page listing or the final block of
    pages -- TODO confirm against the live markup), the largest page number
    linked from that table is used, never less than *current_page*.
    """
    tables = soup.find_all('table')
    if len(tables) < 2:
        return current_page
    nav = tables[1]

    last_cell = nav.find('td', class_='pgRR')
    last_link = last_cell.find('a') if last_cell is not None else None
    if last_link is not None:
        page = _page_number(last_link.get('href', ''))
        if page is not None:
            return page

    linked = [_page_number(a['href']) for a in nav.find_all('a', href=True)]
    return max([p for p in linked if p is not None] + [current_page])

In [27]:
# Collect several stocks
# The bare triple-quoted string below serves as a comment: it lists each stock
# code with its company name (Samsung Electronics, SK hynix, LG Electronics).
'''
005930	삼성전자
000660	SK하이닉스
066570	LG전자
'''
stocks = ['005930', '000660', '066570']

In [28]:
# Collection window shared by every stock
start_date = '2018-10-1'
end_date = '2018-12-31'

k10_historical_prices = {}
for stock_cd in stocks:
    # stock_prices() fills the module-level ``historical_prices`` dict and
    # returns it, so a fresh dict is bound before each call to keep the
    # stocks separate.
    historical_prices = {}
    k10_historical_prices[stock_cd] = stock_prices(stock_cd, start_date, end_date)

In [29]:
# One column per stock code, one row per date.
# The inner dicts were filled newest-first, and the row order of a DataFrame
# built from nested dicts is not guaranteed to be chronological (it can depend
# on the pandas version), so sort explicitly before showing the earliest rows.
k10_historical_price = pd.DataFrame(k10_historical_prices).sort_index()
k10_historical_price.head(3)

Unnamed: 0,005930,000660,066570
2018-10-01,46350.0,73700.0,71100.0
2018-10-02,45700.0,71700.0,69900.0
2018-10-04,44700.0,70000.0,69000.0
