In [3]:
import re
from bs4 import BeautifulSoup

# Load the sample page and parse it; the cells below query this `soup` object.
with open("files/TomJerry_WithImages.html") as html:
    # BeautifulSoup accepts an open file handle; "lxml" names the parser to use.
    soup = BeautifulSoup(html, "lxml")

# Pretty-print the parsed tree to see the structure the later searches rely on.
print(soup.prettify())

<html>
 <head>
  <title>
   The story of Tom and Jerry
  </title>
 </head>
 <body class="container">
  <h1>
   Tom and Jerry
  </h1>
  &gt;
  <img alt="cartoon_image" height="300" src="TomAndJerry.jpg" width="300"/>
  <p class="comedy animated series">
   Tom and Jerry is an American animated series of comedy short films created by
   <a class="creator" href="https://en.wikipedia.org/wiki/William_Hanna" id="link1">
    William Hanna
   </a>
   and
   <a class="creator" href="https://en.wikipedia.org/wiki/Joseph_Barbera" id="link2">
    Joseph Barbera
   </a>
   . 
        It centers on a rivalry between the title characters
   <a class="character" href="https://en.wikipedia.org/wiki/Tom_Cat" id="link3">
    Tom
   </a>
   , a cat, and
   <a class="character" href="https://en.wikipedia.org/wiki/Jerry_Mouse" id="link4">
    Jerry
   </a>
   , a mouse.
  </p>
  <div>
   <img alt="creator_image" height="300" name="William_Hanna" src="https://upload.wikimedia.org/wikipedia/commons/d/d2/Will

In [2]:
# Demonstrates the `limit` and `recursive` arguments of find_all().

# limit=1 stops the search after the first match.
limit_a = soup.find_all("a", limit=1)
print(limit_a)
print()

# recursive=False inspects only the direct children of the object it is
# called on; no <a> is a direct child of `soup`, so the result is empty.
direct_child_a = soup.find_all("a", recursive=False)
print(direct_child_a)
print()

# recursive=True is the default: every descendant is searched.
all_descendant_a = soup.find_all("a", recursive=True)
print(all_descendant_a)
print()


[<a class="creator" href="https://en.wikipedia.org/wiki/William_Hanna" id="link1">William Hanna</a>]

[]

[<a class="creator" href="https://en.wikipedia.org/wiki/William_Hanna" id="link1">William Hanna</a>, <a class="creator" href="https://en.wikipedia.org/wiki/Joseph_Barbera" id="link2">Joseph Barbera</a>, <a class="character" href="https://en.wikipedia.org/wiki/Tom_Cat" id="link3">Tom</a>, <a class="character" href="https://en.wikipedia.org/wiki/Jerry_Mouse" id="link4">Jerry</a>]



In [3]:
# Calling the soup object directly is shorthand for soup.find_all().
soup("a")

[<a class="creator" href="https://en.wikipedia.org/wiki/William_Hanna" id="link1">William Hanna</a>,
 <a class="creator" href="https://en.wikipedia.org/wiki/Joseph_Barbera" id="link2">Joseph Barbera</a>,
 <a class="character" href="https://en.wikipedia.org/wiki/Tom_Cat" id="link3">Tom</a>,
 <a class="character" href="https://en.wikipedia.org/wiki/Jerry_Mouse" id="link4">Jerry</a>]

In [4]:
# Default (recursive) search: every <p> anywhere below <body>.
print(soup.body.find_all("p"))
# recursive=False returns matches among the direct children only.
print(soup.body.find_all("p", recursive=False))

[<p class="comedy animated series">
        Tom and Jerry is an American animated series of comedy short films created by 
        <a class="creator" href="https://en.wikipedia.org/wiki/William_Hanna" id="link1">William Hanna</a> and  
        <a class="creator" href="https://en.wikipedia.org/wiki/Joseph_Barbera" id="link2">Joseph Barbera</a>. 
        It centers on a rivalry between the title characters
        <a class="character" href="https://en.wikipedia.org/wiki/Tom_Cat" id="link3">Tom</a>, a cat, and 
        <a class="character" href="https://en.wikipedia.org/wiki/Jerry_Mouse" id="link4">Jerry</a>, a mouse.</p>, <p class="comedy story">
<b>
                The series features comic fights between an iconic pair of adversaries, 
                a house cat (Tom) and a mouse (Jerry). The plots of each short usually center on Tom's 
                numerous attempts to capture Jerry and the mayhem and destruction that follows. 
                Tom rarely succeeds in catching Jerry

In [5]:
# Custom attributes: an HTML attribute literally called "name" cannot be
# passed as a keyword argument, because find_all()'s first parameter is also
# called `name` and 'img' is already bound to it. The call raises TypeError;
# catch and display it so the notebook still survives "Restart & Run All".
try:
    soup.find_all("img", name="William_Hanna")
except TypeError as err:
    print(f"TypeError: {err}")

TypeError: find_all() got multiple values for argument 'name'

In [None]:
# To avoid the TypeError, pass the attribute through the `attrs` dictionary.
soup.find_all("img", attrs={"name": "William_Hanna"})

In [None]:
# Attribute names containing a dash are not valid Python identifiers, so they
# cannot be written as keyword arguments: the call below is rejected by the
# parser before it ever runs. It is compiled from a string (never executed)
# so the SyntaxError can be caught and displayed instead of breaking the cell.
invalid_call = 'soup.find_all("img", extra-info="Tom_Cat")'
try:
    compile(invalid_call, "<dash-attribute-demo>", "eval")
except SyntaxError as err:
    print(f"SyntaxError: {err.msg}")

In [None]:
# A dashed attribute name is passed as a dictionary key through `attrs`.
soup.find_all('img', attrs={"extra-info": "Tom_Cat"})

In [None]:
# Two equivalent ways to match on the CSS class "character".
# `attrs` takes a dictionary mapping attribute names to the values to match.
print(soup.find_all(attrs={"class": "character"}))
print()
# `class_` is the keyword shortcut ("class" is a reserved word in Python).
print(soup.find_all(class_="character"))

In [None]:
import re
# A compiled regular expression is accepted as the value to match: this finds
# tags with a CSS class beginning with "ani". `class_` states the target
# attribute explicitly instead of passing a bare set as `attrs`.
print(soup.find_all(class_=re.compile(r"^ani")))

In [None]:
# string= searches text nodes rather than tags: here, strings equal to "Tom".
soup.find_all(string= "Tom")

In [None]:
# Combined with a tag name, string= keeps only <a> tags whose .string is "Tom".
soup.find_all("a", string= "Tom")

In [None]:
# A list of names matches any of them; string= also accepts a compiled regex,
# which is tested against each candidate tag's .string.
soup.find_all(["b", "p"], string= re.compile('cat'))

In [None]:
def is_the_only_string_within_a_tag(string_):
    """Tell whether ``string_`` is equal to its parent's ``.string``.

    Written to be passed as a ``string=`` filter to ``find_all()``.

    Args:
        string_: an object exposing a ``.parent`` attribute, which in turn
            exposes a ``.string`` attribute.

    Returns:
        The result of comparing ``string_`` with ``string_.parent.string``.
    """
    enclosing_tag = string_.parent
    return string_ == enclosing_tag.string
soup.find_all(string=is_the_only_string_within_a_tag)

In [None]:
# find_parent(): walk upwards from a tag to the closest matching ancestor.
a_tag = soup.find("a")
print(a_tag)
print()
# <body class="container"> is the ancestor carrying the "container" class.
a_tag.find_parent(class_="container")

In [None]:
# find_parents() collects every matching ancestor (here: enclosing <p> tags).
a_tag.find_parents('p')

In [None]:
# Sibling searches only consider nodes that share the tag's parent.
# Later siblings carrying an href attribute: the first one, then all of them.
print(a_tag.find_next_sibling(href=True))
print()
print(a_tag.find_next_siblings(href=True))
print()
# Same searches for a src attribute.
print(a_tag.find_next_sibling(src=True))
print()
print(a_tag.find_next_siblings(src=True))
print()
# For contrast, find_next() is not limited to siblings.
print(a_tag.find_next(src=True))

In [None]:
# find_next() / find_all_next() are not limited to siblings: they search
# everything that comes after the tag in the document.
print(a_tag.find_next(src=True))
print()
print(a_tag.find_all_next(src=True))
print()

In [None]:
# Grab the <img> flagged with extra-info="Tom_Cat" as the starting point for
# the backwards searches in the cells that follow.
img_tag = soup.find('img', attrs={'extra-info': 'Tom_Cat'})
img_tag

In [None]:
# Searching backwards: find_previous_sibling(s) mirror find_next_sibling(s)
# but look at the siblings that come before the tag. (The legacy fetch*
# spellings are deprecated aliases of these methods; prefer the find_* names.)
print(img_tag.find_previous_sibling(alt=True))
print()
print(img_tag.find_previous_siblings(alt=True))

In [None]:
# find_previous() returns the closest earlier element with an href attribute;
# find_all_previous() returns every earlier element that has one.
print(img_tag.find_previous(href=True))
print(img_tag.find_all_previous(href=True))

In [4]:
# CSS selectors
# Re-parse the page so this section starts from a fresh `soup`.
with open("files/TomJerry_WithImages.html") as html:
    soup = BeautifulSoup(html, "lxml")
# Like find_all(), select() returns a list of every match.
soup.select("title")

[<title> The story of Tom and Jerry </title>]

In [6]:
# :nth-of-type(1) selects a <p> that is the first <p> among its siblings.
soup.select("p:nth-of-type(1)")

[<p class="comedy animated series">
         Tom and Jerry is an American animated series of comedy short films created by 
         <a class="creator" href="https://en.wikipedia.org/wiki/William_Hanna" id="link1">William Hanna</a> and  
         <a class="creator" href="https://en.wikipedia.org/wiki/Joseph_Barbera" id="link2">Joseph Barbera</a>. 
         It centers on a rivalry between the title characters
         <a class="character" href="https://en.wikipedia.org/wiki/Tom_Cat" id="link3">Tom</a>, a cat, and 
         <a class="character" href="https://en.wikipedia.org/wiki/Jerry_Mouse" id="link4">Jerry</a>, a mouse.</p>]

In [7]:
# :nth-of-type(2) selects a <p> that is the second <p> among its siblings.
soup.select("p:nth-of-type(2)")

[<p class="comedy story">
 <b>
                 The series features comic fights between an iconic pair of adversaries, 
                 a house cat (Tom) and a mouse (Jerry). The plots of each short usually center on Tom's 
                 numerous attempts to capture Jerry and the mayhem and destruction that follows. 
                 Tom rarely succeeds in catching Jerry, mainly because of Jerry's cleverness, 
                 cunning abilities, and luck. 
             </b>
 </p>]

In [8]:
# select() with a bare tag name returns every matching tag, like find_all().
soup.select("i")

[<i>Tom and Jerry show is a full length comedy show</i>]

In [9]:
# Whitespace chains descendants: every <a> inside a <p> inside <body>.
soup.select("body p a")

[<a class="creator" href="https://en.wikipedia.org/wiki/William_Hanna" id="link1">William Hanna</a>,
 <a class="creator" href="https://en.wikipedia.org/wiki/Joseph_Barbera" id="link2">Joseph Barbera</a>,
 <a class="character" href="https://en.wikipedia.org/wiki/Tom_Cat" id="link3">Tom</a>,
 <a class="character" href="https://en.wikipedia.org/wiki/Jerry_Mouse" id="link4">Jerry</a>]

In [10]:
# A space is the descendant combinator: it matches whether the left-hand
# element is the immediate parent or a more distant ancestor.
# Here <body> is not the immediate parent of <b> (the <b> sits inside a <p>).
soup.select("body b")

[<b>
                 The series features comic fights between an iconic pair of adversaries, 
                 a house cat (Tom) and a mouse (Jerry). The plots of each short usually center on Tom's 
                 numerous attempts to capture Jerry and the mayhem and destruction that follows. 
                 Tom rarely succeeds in catching Jerry, mainly because of Jerry's cleverness, 
                 cunning abilities, and luck. 
             </b>]

In [11]:
# Here the left-hand element is the immediate parent: <title> sits directly
# inside <head>.
soup.select("head title")

[<title> The story of Tom and Jerry </title>]

In [17]:
# ">" is the child combinator: the element on the right must be a direct
# child of the element on the left.
# Empty result: no <img> that is a direct child of <body> is the second <img>
# among its siblings.
print(soup.select("body > img:nth-of-type(2)"))
print()
# The second <img> of a <div> that is itself a direct child of <body>.
print(soup.select("body > div > img:nth-of-type(2)"))

[]

[<img alt="creator_image" height="300" name="Joseph_Barbera" src="https://upload.wikimedia.org/wikipedia/commons/6/67/JBarbera.jpg" width="300"/>]


In [21]:
# Sibling combinators, starting from the element with id="link1" and
# requiring the sibling to carry the class "creator".
# "~" (general sibling) matches every later sibling that qualifies.
print(soup.select("#link1 ~ .creator"))
# "+" (adjacent sibling) matches only the element immediately following.
# The two are NOT the same operation; the results coincide here only because
# the next element after #link1 is also the only later sibling with the
# "creator" class.
print(soup.select("#link1 + .creator"))

[<a class="creator" href="https://en.wikipedia.org/wiki/Joseph_Barbera" id="link2">Joseph Barbera</a>]
[<a class="creator" href="https://en.wikipedia.org/wiki/Joseph_Barbera" id="link2">Joseph Barbera</a>]


In [23]:
# "." selects by class ...
print(soup.select(".creator"))
print()
# ... and "#" selects by id.
print(soup.select("#link1"))

[<a class="creator" href="https://en.wikipedia.org/wiki/William_Hanna" id="link1">William Hanna</a>, <a class="creator" href="https://en.wikipedia.org/wiki/Joseph_Barbera" id="link2">Joseph Barbera</a>]

[<a class="creator" href="https://en.wikipedia.org/wiki/William_Hanna" id="link1">William Hanna</a>]


In [25]:
# A class can be written as ".animated" or, equivalently, as the attribute
# selector below; ~= matches one word of a space-separated attribute value.
print(soup.select("[class~=animated]"))

[<p class="comedy animated series">
        Tom and Jerry is an American animated series of comedy short films created by 
        <a class="creator" href="https://en.wikipedia.org/wiki/William_Hanna" id="link1">William Hanna</a> and  
        <a class="creator" href="https://en.wikipedia.org/wiki/Joseph_Barbera" id="link2">Joseph Barbera</a>. 
        It centers on a rivalry between the title characters
        <a class="character" href="https://en.wikipedia.org/wiki/Tom_Cat" id="link3">Tom</a>, a cat, and 
        <a class="character" href="https://en.wikipedia.org/wiki/Jerry_Mouse" id="link4">Jerry</a>, a mouse.</p>]


In [30]:
# Tag name plus id: <a> tags whose id is "link1".
print(soup.select("a#link1"))
# Tag name, class and id combined: <a class="creator"> whose id is "link2".
print(soup.select("a.creator#link2"))


[<a class="creator" href="https://en.wikipedia.org/wiki/William_Hanna" id="link1">William Hanna</a>]
[<a class="creator" href="https://en.wikipedia.org/wiki/Joseph_Barbera" id="link2">Joseph Barbera</a>]


In [29]:
# [href] matches <a> tags that have an href attribute, whatever its value.
soup.select("a[href]")

[<a class="creator" href="https://en.wikipedia.org/wiki/William_Hanna" id="link1">William Hanna</a>,
 <a class="creator" href="https://en.wikipedia.org/wiki/Joseph_Barbera" id="link2">Joseph Barbera</a>,
 <a class="character" href="https://en.wikipedia.org/wiki/Tom_Cat" id="link3">Tom</a>,
 <a class="character" href="https://en.wikipedia.org/wiki/Jerry_Mouse" id="link4">Jerry</a>]

In [32]:
# [attr="value"] matches when the attribute value is exactly the given text.
soup.select('img[src="https://upload.wikimedia.org/wikipedia/commons/6/67/JBarbera.jpg"]')

[<img alt="creator_image" height="300" name="Joseph_Barbera" src="https://upload.wikimedia.org/wikipedia/commons/6/67/JBarbera.jpg" width="300"/>]

In [33]:
# CSS attribute selectors can match part of a value (these are selector
# operators, not regular expressions).
# $= matches when the attribute value ends with the given text.
soup.select('img[src$=".jpg"]')

[<img alt="cartoon_image" height="300" src="TomAndJerry.jpg" width="300"/>,
 <img alt="creator_image" height="300" name="William_Hanna" src="https://upload.wikimedia.org/wikipedia/commons/d/d2/William_Hanna_1977.jpg" width="300"/>,
 <img alt="creator_image" height="300" name="Joseph_Barbera" src="https://upload.wikimedia.org/wikipedia/commons/6/67/JBarbera.jpg" width="300"/>]

In [35]:
# ^= matches when the attribute value starts with the given text.
soup.select('img[src^="https"]')

[<img alt="creator_image" height="300" name="William_Hanna" src="https://upload.wikimedia.org/wikipedia/commons/d/d2/William_Hanna_1977.jpg" width="300"/>,
 <img alt="creator_image" height="300" name="Joseph_Barbera" src="https://upload.wikimedia.org/wikipedia/commons/6/67/JBarbera.jpg" width="300"/>,
 <img src="https://upload.wikimedia.org/wikipedia/en/2/2f/Jerry_Mouse.png"/>,
 <img extra-info="Tom_Cat" src="https://upload.wikimedia.org/wikipedia/en/f/f6/Tom_Tom_and_Jerry.png"/>]

In [36]:
# *= matches when the attribute value contains the given text.
soup.select('img[src*=".org"]')

[<img alt="creator_image" height="300" name="William_Hanna" src="https://upload.wikimedia.org/wikipedia/commons/d/d2/William_Hanna_1977.jpg" width="300"/>,
 <img alt="creator_image" height="300" name="Joseph_Barbera" src="https://upload.wikimedia.org/wikipedia/commons/6/67/JBarbera.jpg" width="300"/>,
 <img src="https://upload.wikimedia.org/wikipedia/en/2/2f/Jerry_Mouse.png"/>,
 <img extra-info="Tom_Cat" src="https://upload.wikimedia.org/wikipedia/en/f/f6/Tom_Tom_and_Jerry.png"/>]

In [None]:
"""XML"""