# BeautifulSoup import하기

In [3]:
from bs4 import BeautifulSoup

# Example 1

In [10]:
# Sample HTML document held as a plain Python string (nothing is parsed yet).
html_str = """
<html>
    <head>
        <title>Intro to Web Scraping</title>
    </head>
    <body>
        Life is short. You need Python.
    </body>
</html>
"""

In [11]:
# Confirm that html_str is still an ordinary str at this point.
type(html_str)

str

- <span style = 'font-size:1.1em;line-height:1.5em'>현재까지 위의 html_str에 저장되어 있는 값은 단순 문자열일 뿐</span>
- <span style = 'font-size:1.1em;line-height:1.5em'>이제 이를 parsing하기 위해서 이 문자열이 HTML문서라는 것을 알게 해줘야 함</span>
    - <span style = 'font-size:1.0em;line-height:1.5em'><b>파싱(parsing): </b>텍스트의 구성 성분을 분해한다는 의미</span>
    - <span style = 'font-size:1.0em;line-height:1.5em'><b>예제: </b>You need Python -> You/need/Python</span>

## Find \<head\> tag

In [12]:
# Parse the string as HTML using the 'html.parser' backend so it can be searched by tag.
bs_obj = BeautifulSoup(html_str, 'html.parser')

In [19]:
# Look up the <head> tag in the parsed document and show it (tag markup included).
head = bs_obj.find('head')
print(head)

<head>
<title>Intro to Web Scraping</title>
</head>


- <span style = 'font-size:1.1em;line-height:1.5em'>find()로 생성된 결과물은 Tag라는 클래스의 객체(object)</span>

In [20]:
# Show the full class of the find() result, then only its short class name,
# one per line.
print(type(head), head.__class__.__name__, sep='\n')

<class 'bs4.element.Tag'>
Tag


- <span style = 'font-size:1.1em;line-height:1.5em'>이 객체의 속성(attribute)을 사용할 수 있다.</span>
    - <span style = 'font-size:1.0em;line-height:1.5em'>이 객체의 text라는 속성을 사용하면 내부 문자열만 추출 가능</span>

In [21]:
# .text gives only the string content inside the tag; the surrounding newlines are kept.
head.text

'\nIntro to Web Scraping\n'

In [23]:
# Remove the newline characters from the extracted text.
head.text.replace('\n','')

'Intro to Web Scraping'

In [22]:
# print() renders the newlines as real line breaks instead of '\n' escapes.
print(head.text)


Intro to Web Scraping



## \<body\>부분 찾기

In [17]:
# Look up the <body> tag the same way and show it.
body = bs_obj.find('body')
print(body)

<body>
        Life is short. You need Python.
    </body>


In [18]:
# Printed text still carries the newline and indentation from the source HTML.
print(body.text)


        Life is short. You need Python.
    


In [12]:
# The repr makes the leading/trailing whitespace visible.
body.text

'\n        Life is short. You need Python.\n    '

In [13]:
# Strip the leading and trailing whitespace (newlines and indentation).
body.text.strip()

'Life is short. You need Python.'

In [14]:
# strip(): str method that removes whitespace from both ends of a string;
# whitespace in the middle is left untouched.
print('   aa  a '.strip())

aa  a


# Example 2

In [15]:
# Second sample document: the body now contains two <div> elements.
html_str = """
<html>
    <head>
        <title>Intro to Web Scraping</title>
    </head>
    <body>
        <div>First div</div>
        <div>Second div</div>
    </body>
</html>
"""

## \<body\>부분 찾기

In [16]:
# Parse the new sample string; bs_obj now refers to this second document.
bs_obj = BeautifulSoup(html_str, 'html.parser')

In [17]:
# Display the whole parsed document.
bs_obj


<html>
<head>
<title>Intro to Web Scraping</title>
</head>
<body>
<div>First div</div>
<div>Second div</div>
</body>
</html>

In [18]:
# Parse html_str again (same call as the earlier cell, so this cell also works on its own),
# then look up the <body> tag and show it.
bs_obj = BeautifulSoup(html_str, 'html.parser')
body = bs_obj.find('body')
print(body)

<body>
<div>First div</div>
<div>Second div</div>
</body>


In [19]:
# .text concatenates the text of both <div> children, separated by the newlines between them.
body.text

'\nFirst div\nSecond div\n'

## \<div\>부분 전부 찾기

- <span style = 'font-size:1.1em;line-height:1.5em'><b>[KOR]</b> div부분이 여러 개 있는데, find 메소드를 사용하면 첫번째 결과만 반환합니다.</span>

In [20]:
# find() returns only the first matching <div>, even though the document has two.
# Kept under its own name so `div` is not used for both a single tag (here)
# and the list of tags produced by find_all() in a later cell.
first_div = bs_obj.find('div')
print(first_div)

<div>First div</div>


- <span style = 'font-size:1.1em;line-height:1.5em'><b>[KOR]</b> find_all 메소드를 사용하여 모든 결과를 반환하도록 합니다.</span>
- <span style = 'font-size:1.1em;line-height:1.5em'><b>[ENG]</b> You can get all the results by using find_all() method.</span>

In [21]:
# find_all() collects every matching <div>; the result prints like a list of tags.
div = bs_obj.find_all('div')
print(div)

[<div>First div</div>, <div>Second div</div>]


In [22]:
# The result can be indexed like a list; index 1 is the second <div>.
div[1]

<div>Second div</div>

In [23]:
# Print the inner text of the first and second <div>, one per line.
print(div[0].text, div[1].text, sep='\n')

First div
Second div


# Example 3

In [24]:
# Parse the practice HTML file from disk.
# The file is opened in a `with` block so the handle is closed as soon as
# BeautifulSoup has finished reading it, instead of being left open.
with open("practice05.html", encoding="utf8") as html_file:
    bs_obj = BeautifulSoup(html_file, "html.parser")

In [25]:
# Display the whole parsed file (two small tables).
bs_obj

<!DOCTYPE html>

<html>
<head>
<title>초간단 테이블</title>
</head>
<body>
<table border="1">
<caption>첫번째 표</caption>
<tr>
<th>상품</th>
<th>가격</th>
</tr>
<tr>
<td>item 01</td>
<td>1000</td>
</tr>
<tr>
<td>item 02</td>
<td>2000</td>
</tr>
</table>
<table border="2">
<caption>두번째 표</caption>
<tr>
<th>상품</th>
<th>가격</th>
</tr>
<tr>
<td>item 03</td>
<td>3000</td>
</tr>
<tr>
<td>item 04</td>
<td>4000</td>
</tr>
</table>
</body>
</html>

## table내의 내용 추출
## Extract the contents in the table

In [26]:
# Collect every <table> element in the document.
table = bs_obj.find_all('table')

In [27]:
# Display the collected tables.
table

[<table border="1">
 <caption>첫번째 표</caption>
 <tr>
 <th>상품</th>
 <th>가격</th>
 </tr>
 <tr>
 <td>item 01</td>
 <td>1000</td>
 </tr>
 <tr>
 <td>item 02</td>
 <td>2000</td>
 </tr>
 </table>,
 <table border="2">
 <caption>두번째 표</caption>
 <tr>
 <th>상품</th>
 <th>가격</th>
 </tr>
 <tr>
 <td>item 03</td>
 <td>3000</td>
 </tr>
 <tr>
 <td>item 04</td>
 <td>4000</td>
 </tr>
 </table>]

In [28]:
# Report how many tables were found, leave one blank line, then print them all.
print(f'table 수: {len(table)}', end='\n\n')
print(table)

table 수: 2

[<table border="1">
<caption>첫번째 표</caption>
<tr>
<th>상품</th>
<th>가격</th>
</tr>
<tr>
<td>item 01</td>
<td>1000</td>
</tr>
<tr>
<td>item 02</td>
<td>2000</td>
</tr>
</table>, <table border="2">
<caption>두번째 표</caption>
<tr>
<th>상품</th>
<th>가격</th>
</tr>
<tr>
<td>item 03</td>
<td>3000</td>
</tr>
<tr>
<td>item 04</td>
<td>4000</td>
</tr>
</table>]


## 태그와 속성을 함께 이용해 추출 가능

In [29]:
# Narrow the search with an attribute filter: only <table> tags whose border attribute is "2".
table = bs_obj.find_all('table', {'border':'2'})

In [30]:
# Display the filtered result (only the border="2" table remains).
table

[<table border="2">
 <caption>두번째 표</caption>
 <tr>
 <th>상품</th>
 <th>가격</th>
 </tr>
 <tr>
 <td>item 03</td>
 <td>3000</td>
 </tr>
 <tr>
 <td>item 04</td>
 <td>4000</td>
 </tr>
 </table>]