## Beautiful Soup 기초

In [1]:
from bs4 import BeautifulSoup

In [2]:
# Read the local HTML file and inspect its source.
# The with-block closes the file handle as soon as the contents have been read
# (the bare open(...).read() form leaves the handle open).
with open('D:/data/03. test_first.html', 'r') as html_file:
    page = html_file.read()
soup = BeautifulSoup(page, 'html.parser')  # html.parser: one of the available HTML parsing engines
print(soup.prettify())  # pretty-print the HTML with indentation


<!DOCTYPE html>
<html>
 <head>
  <title>
   Very Simple HTML Code by PinkWink
  </title>
 </head>
 <body>
  <div>
   <p class="inner-text first-item" id="first">
    Happy PinkWink.
    <a href="http://www.pinkwink.kr" id="pw-link">
     PinkWink
    </a>
   </p>
   <p class="inner-text second-item">
    Happy Data Science.
    <a href="https://www.python.org" id="py-link">
     Python
    </a>
   </p>
  </div>
  <p class="outer-text first-item" id="second">
   <b>
    Data Science is funny.
   </b>
  </p>
  <p class="outer-text">
   <b>
    All I need is Love.
   </b>
  </p>
 </body>
</html>



In [3]:
# Show only the source inside the <body> tag
soup.find('body')

<body>
<div>
<p class="inner-text first-item" id="first">
                Happy PinkWink.
                <a href="http://www.pinkwink.kr" id="pw-link">PinkWink</a>
</p>
<p class="inner-text second-item">
                Happy Data Science.
                <a href="https://www.python.org" id="py-link">Python</a>
</p>
</div>
<p class="outer-text first-item" id="second">
<b>
                Data Science is funny.
            </b>
</p>
<p class="outer-text">
<b>
                All I need is Love.
            </b>
</p>
</body>

In [4]:
# Attribute-style access behaves like find(): it returns only the first matching <p> tag
soup.p

<p class="inner-text first-item" id="first">
                Happy PinkWink.
                <a href="http://www.pinkwink.kr" id="pw-link">PinkWink</a>
</p>

In [5]:
# Collect every <p> tag in the document
soup.find_all(name='p')

[<p class="inner-text first-item" id="first">
                 Happy PinkWink.
                 <a href="http://www.pinkwink.kr" id="pw-link">PinkWink</a>
 </p>,
 <p class="inner-text second-item">
                 Happy Data Science.
                 <a href="https://www.python.org" id="py-link">Python</a>
 </p>,
 <p class="outer-text first-item" id="second">
 <b>
                 Data Science is funny.
             </b>
 </p>,
 <p class="outer-text">
 <b>
                 All I need is Love.
             </b>
 </p>]

In [6]:
# Tags can also be looked up by their CSS class
soup.find_all(attrs={'class': 'outer-text'})

[<p class="outer-text first-item" id="second">
 <b>
                 Data Science is funny.
             </b>
 </p>,
 <p class="outer-text">
 <b>
                 All I need is Love.
             </b>
 </p>]

In [7]:
# Print the text of every <p> tag, each preceded by a separator line
paragraphs = soup.find_all('p')
for paragraph in paragraphs:
    print('------------', paragraph.get_text(), sep='\n')



------------

                Happy PinkWink.
                PinkWink

------------

                Happy Data Science.
                Python

------------


                Data Science is funny.
            

------------


                All I need is Love.
            



In [10]:
# Print every outgoing link as "text -> URL".
# href=True keeps only anchors that actually carry an href attribute, so the
# each['href'] lookup below cannot raise KeyError.
links = soup.find_all('a', href=True)

for each in links:
    href = each['href']
    # get_text() always returns a str; .string is None when the anchor contains
    # nested tags, which would make the concatenation below raise TypeError.
    text = each.get_text()
    print(text + ' -> ' + href)

PinkWink -> http://www.pinkwink.kr
Python -> https://www.python.org


### 크롬 개발자 도구

In [11]:
# 웹 주소에 접근할 때 urllib의 request 모듈 필요
from urllib.request import urlopen

In [12]:
# Fetch the market-index page of Naver Finance and print its HTML source.
url = 'https://finance.naver.com/marketindex/'

# The with-block closes the HTTP response once BeautifulSoup has read it;
# the constructor consumes the file-like object, so nothing is needed from
# `page` afterwards.
with urlopen(url) as page:
    soup = BeautifulSoup(page, 'html.parser')
print(soup.prettify())

<script language="javascript" src="/template/head_js.naver?referer=info.finance.naver.com&amp;menu=marketindex&amp;submenu=market">
</script>
<script src="https://ssl.pstatic.net/imgstock/static.pc/20230531104931/js/info/jindo.min.ns.1.5.3.euckr.js" type="text/javascript">
</script>
<script src="https://ssl.pstatic.net/imgstock/static.pc/20230531104931/js/jindo.1.5.3.element-text-patch.js" type="text/javascript">
</script>
<div id="container" style="padding-bottom:0px;">
 <div class="market_include">
  <div class="market_data">
   <div class="market1">
    <div class="title">
     <h2 class="h_market1">
      <span>
       환전 고시 환율
      </span>
     </h2>
    </div>
    <!-- data -->
    <div class="data">
     <ul class="data_lst" id="exchangeList">
      <li class="on">
       <a class="head usd" href="/marketindex/exchangeDetail.naver?marketindexCd=FX_USDKRW" onclick="clickcr(this, 'fr1.usdt', '', '', event);">
        <h3 class="h_lst">
         <span class="blind">
          미국 U

In [13]:
# Text of the first <span> tag whose class is "value"
value_spans = soup.find_all('span', class_='value')
value_spans[0].string

'1,306.60'

In [16]:
# Every <span> tag whose class is "value", selected through an attribute dict
soup.find_all('span', attrs={'class': 'value'})

[<span class="value">1,306.60</span>,
 <span class="value">941.32</span>,
 <span class="value">1,406.95</span>,
 <span class="value">184.32</span>,
 <span class="value">138.9300</span>,
 <span class="value">1.0733</span>,
 <span class="value">1.2522</span>,
 <span class="value">103.5000</span>,
 <span class="value">70.1</span>,
 <span class="value">1594.03</span>,
 <span class="value">1995.5</span>,
 <span class="value">83213.72</span>]

### 다른 방법: requests와 CSS 선택자 사용

In [17]:
import requests

In [18]:
# Fetch the same market-index page with requests instead of urllib.
url = 'https://finance.naver.com/marketindex/'
# Without a timeout requests may wait indefinitely on an unresponsive server.
response = requests.get(url, timeout=10)
# Fail loudly on 4xx/5xx instead of silently parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
print(soup.prettify())

<script language="javascript" src="/template/head_js.naver?referer=info.finance.naver.com&amp;menu=marketindex&amp;submenu=market">
</script>
<script src="https://ssl.pstatic.net/imgstock/static.pc/20230531104931/js/info/jindo.min.ns.1.5.3.euckr.js" type="text/javascript">
</script>
<script src="https://ssl.pstatic.net/imgstock/static.pc/20230531104931/js/jindo.1.5.3.element-text-patch.js" type="text/javascript">
</script>
<div id="container" style="padding-bottom:0px;">
 <div class="market_include">
  <div class="market_data">
   <div class="market1">
    <div class="title">
     <h2 class="h_market1">
      <span>
       환전 고시 환율
      </span>
     </h2>
    </div>
    <!-- data -->
    <div class="data">
     <ul class="data_lst" id="exchangeList">
      <li class="on">
       <a class="head usd" href="/marketindex/exchangeDetail.naver?marketindexCd=FX_USDKRW" onclick="clickcr(this, 'fr1.usdt', '', '', event);">
        <h3 class="h_lst">
         <span class="blind">
          미국 U

In [19]:
# CSS selector: <li> elements that are direct children of the element with id "exchangeList"
exchange_item_selector = '#exchangeList > li'
exchangeList = soup.select(exchange_item_selector)
exchangeList

[<li class="on">
 <a class="head usd" href="/marketindex/exchangeDetail.naver?marketindexCd=FX_USDKRW" onclick="clickcr(this, 'fr1.usdt', '', '', event);">
 <h3 class="h_lst"><span class="blind">미국 USD</span></h3>
 <div class="head_info point_dn">
 <span class="value">1,306.20</span>
 <span class="txt_krw"><span class="blind">원</span></span>
 <span class="change"> 13.80</span>
 <span class="blind">하락</span>
 </div>
 </a>
 <a class="graph_img" href="/marketindex/exchangeDetail.naver?marketindexCd=FX_USDKRW" onclick="clickcr(this, 'fr1.usdc', '', '', event);">
 <img alt="" height="153" src="https://ssl.pstatic.net/imgfinance/chart/marketindex/FX_USDKRW.png" width="295"/>
 </a>
 <div class="graph_info">
 <span class="time">2023.06.02 14:36</span>
 <span class="source">하나은행 기준</span>
 <span class="count">고시회차<span class="num">424</span>회</span>
 </div>
 </li>,
 <li class="">
 <a class="head jpy" href="/marketindex/exchangeDetail.naver?marketindexCd=FX_JPYKRW" onclick="clickcr(this, 'fr1.jp

In [24]:
# Pull the individual fields out of the first exchange-rate entry.
first_item = exchangeList[0]

title = first_item.select_one('.h_lst').text
exchange = first_item.select_one('.value').text
change = first_item.select_one('.change').text
# Match on div.head_info only. Requiring the extra "point_dn" class makes
# select_one() return None (and .text raise AttributeError) for any entry whose
# div lacks that class — presumably when the rate is not falling; confirm on the
# live page. The child combinator (>) still skips the "blind" span nested inside
# span.txt_krw, so only the direction label is selected.
updown = first_item.select_one('div.head_info > .blind').text

title, exchange, change, updown

('미국 USD', '1,306.20', ' 13.80', '하락')

In [29]:
# Collect the data for every entry in exchangeList (four currencies in the captured output).
exchange_datas = []
baseurl = 'https://finance.naver.com'

for item in exchangeList:
    data = {
        'title':item.select_one('.h_lst').text,
        'exchange':item.select_one('.value').text,
        'change':item.select_one('.change').text,
        # Match on div.head_info only: requiring the "point_dn" class would make
        # select_one() return None (AttributeError on .text) for any entry whose
        # div lacks that class. The child combinator (>) still excludes the
        # "blind" span nested inside span.txt_krw.
        'updown':item.select_one('div.head_info > .blind').text,
        # hrefs on the page are site-relative, so prefix the site root
        'link':baseurl + item.select_one('a').get('href')
    }
    print(data)
    exchange_datas.append(data)
exchange_datas

{'title': '미국 USD', 'exchange': '1,306.20', 'change': ' 13.80', 'updown': '하락', 'link': 'https://finance.naver.com/marketindex/exchangeDetail.naver?marketindexCd=FX_USDKRW'}
{'title': '일본 JPY(100엔)', 'exchange': '939.95', 'change': ' 10.07', 'updown': '하락', 'link': 'https://finance.naver.com/marketindex/exchangeDetail.naver?marketindexCd=FX_JPYKRW'}
{'title': '유럽연합 EUR', 'exchange': '1,406.25', 'change': ' 9.45', 'updown': '하락', 'link': 'https://finance.naver.com/marketindex/exchangeDetail.naver?marketindexCd=FX_EURKRW'}
{'title': '중국 CNY', 'exchange': '184.17', 'change': ' 1.34', 'updown': '하락', 'link': 'https://finance.naver.com/marketindex/exchangeDetail.naver?marketindexCd=FX_CNYKRW'}


[{'title': '미국 USD',
  'exchange': '1,306.20',
  'change': ' 13.80',
  'updown': '하락',
  'link': 'https://finance.naver.com/marketindex/exchangeDetail.naver?marketindexCd=FX_USDKRW'},
 {'title': '일본 JPY(100엔)',
  'exchange': '939.95',
  'change': ' 10.07',
  'updown': '하락',
  'link': 'https://finance.naver.com/marketindex/exchangeDetail.naver?marketindexCd=FX_JPYKRW'},
 {'title': '유럽연합 EUR',
  'exchange': '1,406.25',
  'change': ' 9.45',
  'updown': '하락',
  'link': 'https://finance.naver.com/marketindex/exchangeDetail.naver?marketindexCd=FX_EURKRW'},
 {'title': '중국 CNY',
  'exchange': '184.17',
  'change': ' 1.34',
  'updown': '하락',
  'link': 'https://finance.naver.com/marketindex/exchangeDetail.naver?marketindexCd=FX_CNYKRW'}]

In [30]:
import pandas as pd

In [31]:
# Turn the list of per-currency dicts into a table: one row per dict, one column per key
df = pd.DataFrame(data=exchange_datas)
df

Unnamed: 0,title,exchange,change,updown,link
0,미국 USD,1306.2,13.8,하락,https://finance.naver.com/marketindex/exchange...
1,일본 JPY(100엔),939.95,10.07,하락,https://finance.naver.com/marketindex/exchange...
2,유럽연합 EUR,1406.25,9.45,하락,https://finance.naver.com/marketindex/exchange...
3,중국 CNY,184.17,1.34,하락,https://finance.naver.com/marketindex/exchange...
