In [1]:
from bs4 import BeautifulSoup

In [2]:
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

In [6]:
soup = BeautifulSoup(html_doc)
soup.title

<title>The Dormouse's story</title>

In [7]:
soup.p

<p class="title"><b>The Dormouse's story</b></p>

In [8]:
soup.p['class']

['title']

In [13]:
soup.find_all('a')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [14]:
soup.find_all(id='link3')

[<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [16]:
soup.find_all(class_='sister')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [17]:
[link['href'] for link in soup.find_all(class_='sister')]

['http://example.com/elsie',
 'http://example.com/lacie',
 'http://example.com/tillie']

In [22]:
tag = soup.p
type(tag)

bs4.element.Tag

In [23]:
tag.name

'p'

In [25]:
tag.attrs

{'class': ['title']}

In [26]:
css_soup = BeautifulSoup('<p class="body strikeout"></p>')
css_soup.p['class']

['body', 'strikeout']

In [29]:
tag.string

u"The Dormouse's story"

In [31]:
soup.head

<head><title>The Dormouse's story</title></head>

In [32]:
soup.title

<title>The Dormouse's story</title>

In [34]:
soup.head.title

<title>The Dormouse's story</title>

In [36]:
soup.a

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

In [37]:
head_tag = soup.head
head_tag

<head><title>The Dormouse's story</title></head>

In [38]:
head_tag.contents

[<title>The Dormouse's story</title>]

In [41]:
title_tag = head_tag.contents[0]
title_tag

<title>The Dormouse's story</title>

In [43]:
text = title_tag.string
text

u"The Dormouse's story"

In [44]:
for child in title_tag.children:
    print child

The Dormouse's story


In [47]:
for string in soup.strings:
    print repr(string)

u"The Dormouse's story"
u'\n'
u'\n'
u"The Dormouse's story"
u'\n'
u'Once upon a time there were three little sisters; and their names were\n'
u'Elsie'
u',\n'
u'Lacie'
u' and\n'
u'Tillie'
u';\nand they lived at the bottom of a well.'
u'\n'
u'...'
u'\n'


In [48]:
for string in soup.stripped_strings:
    print repr(string)

u"The Dormouse's story"
u"The Dormouse's story"
u'Once upon a time there were three little sisters; and their names were'
u'Elsie'
u','
u'Lacie'
u'and'
u'Tillie'
u';\nand they lived at the bottom of a well.'
u'...'


In [49]:
import re

In [50]:
for tag in soup.find_all(re.compile(r'^b')):
    print tag.name

body
b


In [51]:
soup.find_all(['a', 'b'])

[<b>The Dormouse's story</b>,
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [52]:
soup.find_all('a', class_='sister')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [54]:
soup('a', class_='sister')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [60]:
%timeit soup.select('title')

The slowest run took 5.19 times longer than the fastest. This could mean that an intermediate result is being cached 
100000 loops, best of 3: 17 µs per loop


In [61]:
%timeit soup('title')

The slowest run took 4.05 times longer than the fastest. This could mean that an intermediate result is being cached 
10000 loops, best of 3: 21.4 µs per loop


In [76]:
markup = """"<a href="http://example.com/">\nI linked to <i>example.com</i>dd


<b>xx</b>\n</a>"""

In [77]:
soup = BeautifulSoup(markup)
soup.get_text(separator='#', strip=True)

u'"#I linked to#example.com#dd#xx'

In [78]:
soup.get_text(separator='#', strip=False)

u'"#\nI linked to #example.com#dd\n\n\n#xx#\n'