In [1]:
import copy

from bs4 import BeautifulSoup
from bs4.dammit import EntitySubstitution

In [2]:
# Sample markup: a single <a> tag that contains two identical <i>Python</i> children.
html_code = """<a>BeautifulSoup is a 
<i>Python</i> 
library and works with 
<i>Python</i> 3</a>"""

# Parse the markup with the "html.parser" parser.
soup = BeautifulSoup(html_code, "html.parser")

# prettify() returns the tree as an indented string; print() renders its newlines.
print(soup.prettify())

<a>
 BeautifulSoup is a
 <i>
  Python
 </i>
 library and works with
 <i>
  Python
 </i>
 3
</a>


### Comparing objects for equality
#### BeautifulSoup considers two Tag or NavigableString objects equal when they represent the same HTML/XML markup, even if they are different objects at different places in the tree

In [3]:
# Grab the <a> tag through attribute-style access on the soup, then display it.
a_tag = soup.a

a_tag

<a>BeautifulSoup is a 
<i>Python</i> 
library and works with 
<i>Python</i> 3</a>

In [4]:
# Unpack the find_all("i") result into two names; this only works when there are exactly two matches.
first_i, second_i = soup.find_all("i")

In [5]:
# Display the first <i> tag.
first_i

<i>Python</i>

In [6]:
# Display the second <i> tag.
second_i

<i>Python</i>

In [7]:
# == compares the markup the tags represent, so two tags with identical markup compare equal.
first_i == second_i

True

In [8]:
# The two tags sit at different positions in the tree: each one is preceded by a different text node.
first_i.previous_element, second_i.previous_element

('BeautifulSoup is a \n', ' \nlibrary and works with \n')

In [9]:
# Identity check: the two names refer to distinct objects, even though they compare equal with ==.
first_i is second_i

False

In [10]:
# Same markup shape as before, but the two <i> tags now carry different class attributes.
html_code = """<a>BeautifulSoup is a 
<i class="first">Python</i> 
library and works with 
<i class="second">Python</i> 3</a>"""

# Re-parse; this rebinds `soup` to the new document.
soup = BeautifulSoup(html_code, "html.parser")

# prettify() returns the tree as an indented string; print() renders its newlines.
print(soup.prettify())

<a>
 BeautifulSoup is a
 <i class="first">
  Python
 </i>
 library and works with
 <i class="second">
  Python
 </i>
 3
</a>


In [11]:
# Fetch the two <i> tags from the new soup (the unpacking requires exactly two matches) and show them.
first_i, second_i = soup.find_all("i")

first_i, second_i

(<i class="first">Python</i>, <i class="second">Python</i>)

In [12]:
# The tags now differ in their class attribute, so their markup differs and they no longer compare equal.
first_i == second_i

False

### Copying BeautifulSoup objects
#### Using copy.copy( ) we can create a copy of any Tag or NavigableString

In [13]:
# Display the original <a> tag, which is attached to the parsed document.
soup.a

<a>BeautifulSoup is a 
<i class="first">Python</i> 
library and works with 
<i class="second">Python</i> 3</a>

In [14]:
# Create a copy of the <a> tag with copy.copy(), then display the copy.
a_copy = copy.copy(soup.a)

a_copy

<a>BeautifulSoup is a 
<i class="first">Python</i> 
library and works with 
<i class="second">Python</i> 3</a>

In [15]:
# The copy represents the same markup as the original, so the two compare equal.
soup.a == a_copy

True

In [16]:
# Identity check: the copy is a distinct object from soup.a.
a_copy is soup.a

False

#### The original tag has a parent because it is part of the parsed HTML document

In [17]:
# The parent of soup.a is the BeautifulSoup object itself, whose name is "[document]".
print(soup.a.parent.name)

[document]


#### The copy has no parent because it is completely detached from the document

In [18]:
# The copy is not attached to any tree, so its parent is None.
print(a_copy.parent)

None


### Pretty printing
#### BeautifulSoup provides the prettify( ) method to turn the HTML into a well-formatted string: each tag and each string gets its own line, and the indentation shows clearly how the tags are nested in the document

In [19]:
# A small, compactly written HTML document (doctype, head, nested <p> tags, a comment) to format.
html_code = """<!DOCTYPE html>
<html><head>
<title>The three Apples that changed the world</title></head>
<body><h2>All you want to know</h2><p>
<p>Eve's Apple</p><p class="apple">Newton's Apple</p>
<p class="steve">Steve Jobs' Apple</p></p>
<p id="3apples">Not really an insignificant fruit<i>is it</i>!</p>
<!-- Interesting isn't it? -->
</body>
</html>
"""

In [20]:
# Parse the document with "html.parser" (rebinding `soup`) and display the soup without pretty-printing.
soup = BeautifulSoup(html_code, "html.parser")

soup

<!DOCTYPE html>

<html><head>
<title>The three Apples that changed the world</title></head>
<body><h2>All you want to know</h2><p>
<p>Eve's Apple</p><p class="apple">Newton's Apple</p>
<p class="steve">Steve Jobs' Apple</p></p>
<p id="3apples">Not really an insignificant fruit<i>is it</i>!</p>
<!-- Interesting isn't it? -->
</body>
</html>

In [21]:
# Without print(), the notebook shows the repr of the returned string, with newlines escaped as \n.
soup.prettify()

'<!DOCTYPE html>\n<html>\n <head>\n  <title>\n   The three Apples that changed the world\n  </title>\n </head>\n <body>\n  <h2>\n   All you want to know\n  </h2>\n  <p>\n   <p>\n    Eve\'s Apple\n   </p>\n   <p class="apple">\n    Newton\'s Apple\n   </p>\n   <p class="steve">\n    Steve Jobs\' Apple\n   </p>\n  </p>\n  <p id="3apples">\n   Not really an insignificant fruit\n   <i>\n    is it\n   </i>\n   !\n  </p>\n  <!-- Interesting isn\'t it? -->\n </body>\n</html>\n'

In [22]:
# Printing the string renders the newlines and indentation, which makes the nesting visible.
print(soup.prettify())

<!DOCTYPE html>
<html>
 <head>
  <title>
   The three Apples that changed the world
  </title>
 </head>
 <body>
  <h2>
   All you want to know
  </h2>
  <p>
   <p>
    Eve's Apple
   </p>
   <p class="apple">
    Newton's Apple
   </p>
   <p class="steve">
    Steve Jobs' Apple
   </p>
  </p>
  <p id="3apples">
   Not really an insignificant fruit
   <i>
    is it
   </i>
   !
  </p>
  <!-- Interesting isn't it? -->
 </body>
</html>



#### We can call the prettify( ) method on individual tags as well

In [23]:
# prettify() called on a single tag (<body>) formats only that tag and its contents.
print(soup.body.prettify())

<body>
 <h2>
  All you want to know
 </h2>
 <p>
  <p>
   Eve's Apple
  </p>
  <p class="apple">
   Newton's Apple
  </p>
  <p class="steve">
   Steve Jobs' Apple
  </p>
 </p>
 <p id="3apples">
  Not really an insignificant fruit
  <i>
   is it
  </i>
  !
 </p>
 <!-- Interesting isn't it? -->
</body>



### Non-pretty printing
#### If we don't want any special formatting, we can call str( ) directly on a BeautifulSoup object or a Tag (unicode( ) is the Python 2 equivalent)

In [24]:
# Displaying the soup object directly shows the markup without prettify()'s re-indentation.
soup

<!DOCTYPE html>

<html><head>
<title>The three Apples that changed the world</title></head>
<body><h2>All you want to know</h2><p>
<p>Eve's Apple</p><p class="apple">Newton's Apple</p>
<p class="steve">Steve Jobs' Apple</p></p>
<p id="3apples">Not really an insignificant fruit<i>is it</i>!</p>
<!-- Interesting isn't it? -->
</body>
</html>

In [25]:
# str() returns the same unformatted markup as a plain Python string (shown here as its repr).
str(soup)

'<!DOCTYPE html>\n\n<html><head>\n<title>The three Apples that changed the world</title></head>\n<body><h2>All you want to know</h2><p>\n<p>Eve\'s Apple</p><p class="apple">Newton\'s Apple</p>\n<p class="steve">Steve Jobs\' Apple</p></p>\n<p id="3apples">Not really an insignificant fruit<i>is it</i>!</p>\n<!-- Interesting isn\'t it? -->\n</body>\n</html>\n'

### Output formatters

In [26]:
# Markup with a bare "&" in a text node and another inside an attribute value.
html_code = """<b>I ate both; pizza & burger</b>
<a href="http://example.com/?foo=val1&bar=val2">Some link</a>"""

# Parse with "html.parser"; this rebinds `soup`.
soup = BeautifulSoup(html_code, "html.parser")

#### By default, the only characters escaped on output are bare ampersands and angle brackets; here we can see the bare ampersand converted into "&amp;"

In [27]:
# The bare "&" in the text is written out as "&amp;".
soup.b

<b>I ate both; pizza &amp; burger</b>

#### The same happens inside the link: the "&" in the href value is changed to "&amp;"

In [28]:
# The "&" inside the href attribute value is written out as "&amp;" as well.
soup.a

<a href="http://example.com/?foo=val1&amp;bar=val2">Some link</a>

https://www.crummy.com/software/BeautifulSoup/bs4/doc/#output-formatters

In [29]:
french = "<p>Il a dit &lt;&lt;une &eacute;cole&gt;&gt;</p>"

# Name the parser explicitly so the result does not depend on which parsers
# happen to be installed (and to avoid bs4's "no parser specified" warning).
# "lxml" wraps the fragment in <html><body>, which matches the recorded output.
soup = BeautifulSoup(french, "lxml")

# formatter=None switches off entity substitution on output: strings are
# written exactly as stored in the tree, so "<" and ">" come out unescaped
# and the result may not be valid HTML.
print(soup.prettify(formatter=None))

<html>
 <body>
  <p>
   Il a dit <<une école>>
  </p>
 </body>
</html>


In [30]:
# With "minimal", "<" and ">" are written as entities while "é" is left as a literal character.
print(soup.prettify(formatter="minimal"))

<html>
 <body>
  <p>
   Il a dit &lt;&lt;une école&gt;&gt;
  </p>
 </body>
</html>


In [31]:
# With "html", "é" is additionally written as the named entity "&eacute;".
print(soup.prettify(formatter = 'html'))

<html>
 <body>
  <p>
   Il a dit &lt;&lt;une &eacute;cole&gt;&gt;
  </p>
 </body>
</html>


In [32]:
# For this document, "html5" produces the same entity substitution as "html".
print(soup.prettify(formatter = 'html5'))

<html>
 <body>
  <p>
   Il a dit &lt;&lt;une &eacute;cole&gt;&gt;
  </p>
 </body>
</html>


In [33]:
# Name the parser explicitly so the result does not depend on which parsers are
# installed; "lxml" adds the <html><body> wrapper seen in the recorded outputs.
soup = BeautifulSoup("<br>", "lxml")

In [34]:
# With the "html" formatter the void <br> tag is written self-closed, as <br/>.
print(soup.prettify(formatter="html"))

<html>
 <body>
  <br/>
 </body>
</html>


In [35]:
# With the "html5" formatter the void tag is written without the closing slash, as <br>.
print(soup.prettify(formatter="html5"))

<html>
 <body>
  <br>
 </body>
</html>


#### If we want more control over the output, we can also pass our own function as the formatter

In [36]:
def convert_uppercase(str):
    """Custom output formatter: upper-case a string, then entity-substitute it.

    Intended to be passed as the ``formatter`` argument of ``prettify()``.

    Args:
        str: The text to transform. The name shadows the built-in ``str``
            inside this function; it is kept as-is because it is part of
            the function's signature.

    Returns:
        The upper-cased text after being passed through
        ``EntitySubstitution.substitute_html``.
    """
    shouted = str.upper()
    return EntitySubstitution.substitute_html(shouted)

#### Here we are converting the string data into upper case

In [37]:
# Name the parser explicitly so the result does not depend on which parsers are
# installed; "lxml" adds the <html><body> wrapper seen in the recorded output.
soup = BeautifulSoup("<p>Initially lowercase</p>", "lxml")

# A callable passed as `formatter` is applied to the document's strings on output.
print(soup.prettify(formatter=convert_uppercase))

<html>
 <body>
  <p>
   INITIALLY LOWERCASE
  </p>
 </body>
</html>
