# 첫 번째 방법 - Beautiful Soup을 이용한 웹 크롤링

Beautiful Soup 공식 문서: https://www.crummy.com/software/BeautifulSoup/bs4/doc

## 크롤링하기 위한 준비

In [20]:
import numpy as np # pandas 라이브러리 의존성 때문에
import pandas as pd # 가져온 데이터를 가공하기 위해
from tqdm import tqdm_notebook # 진행률을 표시하기 위해
from bs4 import BeautifulSoup
from urllib.request import urlopen

## Example 1 - 네이버 환율 정보 가져오기

In [21]:
# Address of the Naver Finance exchange-rate list page to request.
url_base = 'https://finance.naver.com/marketindex/exchangeList.nhn'

# Request the page and parse the response with the built-in 'html.parser'.
# The response is opened in a `with` block so the underlying HTTP connection
# is closed as soon as BeautifulSoup has read the markup, rather than being
# left open for the rest of the session.
with urlopen(url_base) as page:
    # soup holds the whole parsed HTML document.
    soup = BeautifulSoup(page, 'html.parser')

In [22]:
# Take a look at the parsed document.
# It matches what Chrome's developer tools show for the same page.
print(soup)


<html lang="ko">
<head>
<title>네이버 금융</title>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<meta content="text/javascript" http-equiv="Content-Script-Type"/>
<meta content="text/css" http-equiv="Content-Style-Type"/>
<link href="/css/finance.css?20190306113304" rel="stylesheet" type="text/css"/>
<script language="javascript">document.domain="naver.com";</script>
<script src="/js/info/jindo.min.ns.1.5.3.euckr.js" type="text/javascript"></script>
<script src="/js/lcslog.js?20190306113304" type="text/javascript"></script>
</head>
<body>
<div class="tbl_area">
<table border="1" class="tbl_exchange" summary="환전 고시 환율 리스트">
<caption>환전 고시 환율</caption>
<colgroup>
<col width="162"/>
<col width="92"/>
<col width="92"/>
<col width="92"/>
<col width="93"/>
<col width="92"/>
<col width="90"/>
</colgroup>
<thead>
<tr>
<th class="th_ex1" rowspan="2">
<a href="#" onclick="javascript:changeOrder('exchange'); return false;"><span>통화명</span></a></th>
<th class="th_ex2" rowspan="

In [117]:
# Currency names: take every <td class="tit"> cell, read the text of the
# <a> tag inside it, and trim the surrounding whitespace.
name_cells = soup.select('td.tit')
cur_name = [cell.a.string.strip() for cell in name_cells]
cur_name

['미국 USD',
 '유럽연합 EUR',
 '일본 JPY (100엔)',
 '중국 CNY',
 '홍콩 HKD',
 '대만 TWD',
 '영국 GBP',
 '오만 OMR',
 '캐나다 CAD',
 '스위스 CHF',
 '스웨덴 SEK',
 '호주 AUD',
 '뉴질랜드 NZD',
 '체코 CZK',
 '칠레 CLP',
 '터키 TRY',
 '몽골 MNT',
 '이스라엘 ILS',
 '덴마크 DKK',
 '노르웨이 NOK',
 '사우디아라비아 SAR',
 '쿠웨이트 KWD',
 '바레인 BHD',
 '아랍에미리트 AED',
 '요르단 JOD',
 '이집트 EGP',
 '태국 THB',
 '싱가포르 SGD',
 '말레이시아 MYR',
 '인도네시아 IDR 100',
 '카타르 QAR',
 '카자흐스탄 KZT',
 '브루나이 BND',
 '인도 INR',
 '파키스탄 PKR',
 '방글라데시 BDT',
 '필리핀 PHP',
 '멕시코 MXN',
 '브라질 BRL',
 '베트남 VND 100',
 '남아프리카 공화국 ZAR',
 '러시아 RUB',
 '헝가리 HUF',
 '폴란드 PLN']

In [115]:
# Rate values: take every <td class="sale"> cell and trim the whitespace
# around its text.
sale_cells = soup.select('td.sale')
sale = [cell.string.strip() for cell in sale_cells]
sale

['1,134.50',
 '1,275.52',
 '1,019.73',
 '168.53',
 '144.53',
 '36.70',
 '1,473.66',
 '2,946.60',
 '845.44',
 '1,124.32',
 '120.55',
 '799.14',
 '772.25',
 '49.71',
 '1.69',
 '208.66',
 '0.43',
 '313.48',
 '170.97',
 '130.60',
 '302.51',
 '3,731.17',
 '3,009.20',
 '308.86',
 '1,602.29',
 '65.03',
 '35.75',
 '834.62',
 '277.42',
 '7.96',
 '311.61',
 '2.99',
 '834.62',
 '16.23',
 '8.12',
 '13.48',
 '21.71',
 '58.24',
 '293.36',
 '4.89',
 '78.96',
 '17.18',
 '4.04',
 '296.75']

In [119]:
# Pair the scraped currency names with their rates, one column each,
# and show the resulting table.
ex_info = {'Currency': cur_name, 'Exchange rate': sale}
ex_table = pd.DataFrame(data=ex_info)
ex_table

Unnamed: 0,Currency,Exchange rate
0,미국 USD,1134.5
1,유럽연합 EUR,1275.52
2,일본 JPY (100엔),1019.73
3,중국 CNY,168.53
4,홍콩 HKD,144.53
5,대만 TWD,36.7
6,영국 GBP,1473.66
7,오만 OMR,2946.6
8,캐나다 CAD,845.44
9,스위스 CHF,1124.32


In [172]:
# Collect every row of the first <tbody> in the document.
table_body = soup.find('tbody')
trs = table_body.find_all('tr')

# Sanity check: show the trimmed text of the first cell in the first row.
first_cell = trs[0].find_all('td')[0]
first_cell.string.strip()

'미국 USD'

In [174]:
# Build one list of cell texts per table row by iterating the rows and cells
# directly instead of indexing through range(len(...)).
# get_text(strip=True) is used rather than .string.strip() because .string is
# None whenever a cell holds more than one child node, which would raise
# AttributeError on .strip().
item = [
    [td.get_text(strip=True) for td in tr.find_all('td')]
    for tr in trs
]

# Name the columns at construction time so that an empty `trs` still yields
# a correctly labelled (empty) table instead of a length-mismatch error when
# the column names are assigned afterwards.
table = pd.DataFrame(
    item,
    columns=['curname', 'sale', 'buy', 'sell', 'remit', 'deposit', 'us_ex'],
)
table

Unnamed: 0,curname,sale,buy,sell,remit,deposit,us_ex
0,미국 USD,1134.5,1154.35,1114.65,1145.6,1123.4,1.0
1,유럽연합 EUR,1275.52,1300.9,1250.14,1288.27,1262.77,1.124
2,일본 JPY (100엔),1019.73,1037.57,1001.89,1029.72,1009.74,0.899
3,중국 CNY,168.53,176.95,160.11,170.21,166.85,0.149
4,홍콩 HKD,144.53,147.37,141.69,145.97,143.09,0.127
5,대만 TWD,36.7,41.5,34.14,0.0,0.0,0.032
6,영국 GBP,1473.66,1502.69,1444.63,1488.39,1458.93,1.299
7,오만 OMR,2946.6,3208.84,2769.81,0.0,0.0,2.597
8,캐나다 CAD,845.44,862.09,828.79,853.89,836.99,0.745
9,스위스 CHF,1124.32,1146.46,1102.18,1135.56,1113.08,0.991


---
## Example 2 - 네이버 영화에서 현재 인기 영화 순위 정보 가져오기

https://github.com/mssung94/ssmc_python_using_bigdata/blob/master/data_crawling/step1/bs4_ex2_naver_movie.ipynb

In [18]:
# Date of the ranking to request (appears to be formatted as YYYYMMDD).
search_time = '20190309'

# URL template for the Naver Movie ranking page; %s is filled with the date.
url = 'https://movie.naver.com/movie/sdb/rank/rmovie.nhn?sel=cur&tg=0&date=%s'

# Open the response in a `with` block so the HTTP connection is closed once
# BeautifulSoup has read the markup, rather than being left open.
with urlopen(url % search_time) as page:
    soup = BeautifulSoup(page, 'html.parser')

In [19]:
# Gather the <td class="point"> cells and the <div class="tit5"> blocks
# from the ranking page.
points = soup.find_all('td', class_='point')
divs = soup.find_all('div', class_='tit5')

# Peek at the first <div class="tit5"> element to check its structure.
divs[:1]

[<div class="tit5">
 <a href="/movie/bi/mi/basic.nhn?code=171539" title="그린 북">그린 북</a>
 </div>]