# 크롤링 하기

## urllib 사용하기

In [1]:
# Import the submodule explicitly: a bare ``import urllib`` does not load
# ``urllib.request``, so ``urllib.request.urlopen`` can fail with
# AttributeError unless some other module happened to import it first.
import urllib.request


def download(url):
    """Open *url* and return the response object produced by ``urlopen``.

    Args:
        url: The URL to open.

    Returns:
        The open response object; the caller reads from it and closes it.

    Raises:
        Whatever ``urllib.request.urlopen`` raises; no errors are handled
        in this first version.
    """
    return urllib.request.urlopen(url)

In [2]:
from urllib.error import URLError, HTTPError, ContentTooShortError

def download(url):
    """Open *url* and return the response object, or None if the download fails.

    On a urllib download error the error's ``reason`` is printed after the
    text 'Download error' and None is returned instead of raising.
    """
    try:
        return urllib.request.urlopen(url)
    except (URLError, HTTPError, ContentTooShortError) as err:
        print('Download error', err.reason)
    return None

# Try the error-handling version on the Google front page; the cell output
# below shows that it returns the response object itself, not the page text.
download('https://www.google.com')

<http.client.HTTPResponse at 0x7f941c313450>

In [3]:
def download(url):
    """Fetch *url* and return the raw response body, or None on failure.

    Args:
        url: The URL to download.

    Returns:
        The body as ``bytes``, or None when a urllib download error occurs
        (the error's ``reason`` is printed in that case).
    """
    try:
        # Read inside a ``with`` block so the response is closed as soon as
        # the body has been read; ``urlopen(url).read()`` left it open.
        with urllib.request.urlopen(url) as response:
            html = response.read()
    except (URLError, HTTPError, ContentTooShortError) as e:
        print('Download error', e.reason)
        html = None
    return html

In [4]:
# With .read() added, the same call now returns the page body as bytes
# (note the b'...' prefix in the output below).
download('https://www.google.com')

b'<!doctype html><html itemscope="" itemtype="http://schema.org/WebPage" lang="ko"><head><meta content="text/html; charset=UTF-8" http-equiv="Content-Type"><meta content="/images/branding/googleg/1x/googleg_standard_color_128dp.png" itemprop="image"><title>Google</title><script nonce="0cnA+wytiiIVa7J03OS31Q==">(function(){window.google={kEI:\'PL23X-XdI8u4mAXR9774Dg\',kEXPI:\'0,18168,183994,1157247,730,224,5105,206,3204,10,1145,81,364,925,574,611,206,383,246,5,1128,226,222,426,653,2798,315,3,65,768,217,284,981,452,408,664,1925,409,7,1369,630,1115170,1197769,513,7,328978,13677,4855,32691,16115,28684,9188,8384,1326,3533,1361,284,9006,3024,4744,11033,1808,4020,978,7931,5297,2974,873,37,4155,2784,3646,14527,4521,2774,919,2277,8,2796,889,704,1279,2212,530,149,1103,840,517,1522,157,4101,312,1137,2,2063,606,2023,1777,520,1946,2210,3,110,328,1284,16,2927,2246,1813,1787,3227,2845,7,4773,38,7542,4456,641,7876,4929,108,3407,908,2,941,2614,2397,7468,3277,3,346,230,970,865,4624,149,5990,6355,1630,4,

## BeautifulSoup, requests 사용하기

### 웹페이지 다운로드

In [5]:
# Fetch the page with the requests library and display the Response object.
import requests
url = 'http://www.google.com'
response = requests.get(url)
response # status 200: the request succeeded

<Response [200]>

* 예외 처리를 추가한 코드

In [6]:
# class는 self에서만 사용..
def download2(url, timeout=10):
    """Fetch *url* with requests and return the decoded page text.

    Args:
        url: The URL to download.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.
            Without a timeout, requests can wait on an unresponsive server
            indefinitely. The default of 10 is an arbitrary but finite
            choice; existing callers that pass only ``url`` keep working.

    Returns:
        ``response.text`` on success, or None when the connection fails or
        the request times out (a short message is printed in either case).
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.ConnectionError:
        print('Connection error')
        return None
    except requests.Timeout:
        # Read timeouts are not ConnectionError subclasses, so they need
        # their own handler now that a timeout is set.
        print('Timeout error')
        return None
    return response.text

# download2 returns the page as a str (no b'' prefix in the output below).
download2('https://www.google.com')
    

'<!doctype html><html itemscope="" itemtype="http://schema.org/WebPage" lang="ko"><head><meta content="text/html; charset=UTF-8" http-equiv="Content-Type"><meta content="/images/branding/googleg/1x/googleg_standard_color_128dp.png" itemprop="image"><title>Google</title><script nonce="P6ZTzooqiIAs/sFHmWllNA==">(function(){window.google={kEI:\'Pb23X7KxFf-Hr7wPs7qhoAo\',kEXPI:\'0,18168,183994,1157247,730,224,5104,207,2415,789,10,1144,82,364,1499,612,205,383,246,5,1354,222,426,3452,314,3,65,769,1,215,283,982,452,408,2998,7,1999,1115170,1197790,493,6,328978,13677,4855,32691,16115,28684,9188,8384,4858,1362,9290,3023,4745,9217,1816,1808,4020,978,7931,5297,2054,920,873,4192,6430,14528,4519,1394,1381,919,2277,8,2796,1593,1279,2212,530,149,1103,840,517,1522,157,4101,312,1137,2,2063,606,2023,1777,520,1947,2229,93,328,1284,16,2927,2247,1812,1787,3227,2845,7,12354,4455,641,7876,2038,2999,3407,908,2,941,2614,2397,7468,3277,3,346,230,970,865,4624,149,5991,7984,4,1328,122,78,2304,1236,271,874,405,1863

## 웹페이지 분석하기

### 태그에 쉽게 접근하기

In [7]:
# Download the page and parse its text with Python's built-in 'html.parser'.
import requests
from bs4 import BeautifulSoup
# Note: despite its name, html holds the Response object; the markup is html.text.
html = requests.get('http://www.google.com')
soup = BeautifulSoup(html.text, 'html.parser')

In [8]:
# Tags can be reached as attributes: soup.html.body is the <body> element.
soup.html.body

<body bgcolor="#fff"><script nonce="4XSCvxHRv0oWyUPxuAkgCw==">(function(){var src='/images/nav_logo229.png';var iesg=false;document.body.onload = function(){window.n && window.n();if (document.images){new Image().src=src;}
if (!iesg){document.f&&document.f.q.focus();document.gbqf&&document.gbqf.q.focus();}
}
})();</script><div id="mngb"><div id="gbar"><nobr><b class="gb1">검색</b> <a class="gb1" href="http://www.google.co.kr/imghp?hl=ko&amp;tab=wi">이미지</a> <a class="gb1" href="http://maps.google.co.kr/maps?hl=ko&amp;tab=wl">지도</a> <a class="gb1" href="https://play.google.com/?hl=ko&amp;tab=w8">Play</a> <a class="gb1" href="http://www.youtube.com/?gl=KR&amp;tab=w1">YouTube</a> <a class="gb1" href="https://news.google.com/?tab=wn">뉴스</a> <a class="gb1" href="https://mail.google.com/mail/?tab=wm">Gmail</a> <a class="gb1" href="https://drive.google.com/?tab=wo">드라이브</a> <a class="gb1" href="https://www.google.co.kr/intl/ko/about/products?tab=wh" style="text-decoration:none"><u>더보기</u> »</a><

In [9]:
# Parse a one-element snippet and access its <span> tag by name.
soup = BeautifulSoup("<span>Wow it's so good!!</span>", 'html.parser')
soup.span

<span>Wow it's so good!!</span>

### find()  
### findAll()

* p태그 항목 찾기

In [10]:
# Triple quotes (''' or """) are used to write a multi-line string literal.
html='''<title>Fundamental</title> 
         <body>
          <p id='programming'>python</p> 
          <p id='programming'>java</p> 
          <p id='algorithm'>algorithm</p> 
          <p id='fundamental'>math</p> 
          <p id='programming'>C++</p> 
          </body>'''

soup = BeautifulSoup(html, 'html.parser')
# The tag name is passed inside a set here; the output below lists every <p> element.
soup.findAll({'p'})

[<p id="programming">python</p>,
 <p id="programming">java</p>,
 <p id="algorithm">algorithm</p>,
 <p id="fundamental">math</p>,
 <p id="programming">C++</p>]

* p 태그 중에서 id 속성값이 programming인 태그를 찾고 싶을 때

In [11]:
# Narrow the <p> search to elements whose id attribute is 'programming'.
soup.findAll('p', id='programming')

[<p id="programming">python</p>,
 <p id="programming">java</p>,
 <p id="programming">C++</p>]

# 크롤링 이용하기 - 환율 가져오기

## 네이버 환율 정보 크롤링

In [12]:
# import requests
# from bs4 import BeautifulSoup

In [13]:
# Inspect html: per the output below it still holds the sample markup string
# from the find/findAll example, not a downloaded page.
html

"<title>Fundamental</title> \n         <body>\n          <p id='programming'>python</p> \n          <p id='programming'>java</p> \n          <p id='algorithm'>algorithm</p> \n          <p id='fundamental'>math</p> \n          <p id='programming'>C++</p> \n          </body>"

In [14]:
# Inspect response: presumably still the Response from the earlier
# requests.get(url) cell, since nothing in between reassigns it.
response

<Response [200]>

In [15]:
# Naver mobile stock market-index page; passed to download3 in the cell below.
url = 'https://m.stock.naver.com/marketindex/index.nhn'

def download3(x, timeout=10):
    """Fetch the URL *x* with requests and return the page text, or None.

    When requests raises, control moves to an ``except`` branch, and the
    function still needs something to return; None is used for that case.

    Args:
        x: The URL to download.
        timeout: Seconds passed to ``requests.get`` as its ``timeout`` so an
            unresponsive server cannot block the call indefinitely. The
            default of 10 is an arbitrary but finite choice; callers that
            pass only ``x`` keep working.

    Returns:
        ``res.text`` on success, or None when the connection fails or the
        request times out (a short message is printed in either case).
    """
    try:
        res = requests.get(x, timeout=timeout)
    except requests.ConnectionError:
        print('Connection error')
        return None
    except requests.Timeout:
        # Read timeouts are not ConnectionError subclasses, so they are
        # handled separately now that a timeout is set.
        print('Timeout error')
        return None
    return res.text
            
    

In [16]:
# Download the Naver market-index page; the output below is its HTML as a str.
download3(url)

'\n\n\n\n\n<!-- header -->\n\n\n\n\n\n\n\n\n<!doctype html>\n\n<html lang="ko">\n<head>\n    <meta charset="utf-8">\n    <meta name="viewport" content="width=device-width,initial-scale=1.0,maximum-scale=1.0,minimum-scale=1.0,user-scalable=no">\n\n    \n    \n        \n        \n            <meta property="og:url" content="http://m.stock.naver.com/marketindex/index.nhn?menu=exchange"/>\n            \n                \n                    <meta property="og:title" content="네이버 증권"/>\n                \n                \n            \n            \n                \n                    <meta property="og:description" content="관심종목의 실시간 주가를 가장 빠르게 확인하는 곳"/>\n                \n                \n            \n            \n                \n                    <meta property="og:image" content="https://ssl.pstatic.net/static/m/stock/im/2016/08/og_stock-200.png"/>\n                \n                \n            \n        \n    \n\n    <meta property="og:type" content="article"/>\n    <meta pr