In [2]:
import re
from bs4 import BeautifulSoup

# Parse the sample page with the lxml parser; the file handle is closed as
# soon as the tree has been built.
with open("files/TomJerry_WithImages.html") as html:
    soup = BeautifulSoup(markup=html, features="lxml")

# Dump the parsed tree with one node per line so its structure is visible.
print(soup.prettify())

<html>
 <head>
  <title>
   The story of Tom and Jerry
  </title>
 </head>
 <body class="container">
  <h1>
   Tom and Jerry
  </h1>
  &gt;
  <img alt="cartoon_image" height="300" src="TomAndJerry.jpg" width="300"/>
  <p class="comedy animated series">
   Tom and Jerry is an American animated series of comedy short films created by
   <a class="creator" href="https://en.wikipedia.org/wiki/William_Hanna" id="link1">
    William Hanna
   </a>
   and
   <a class="creator" href="https://en.wikipedia.org/wiki/Joseph_Barbera" id="link2">
    Joseph Barbera
   </a>
   . 
        It centers on a rivalry between the title characters
   <a class="character" href="https://en.wikipedia.org/wiki/Tom_Cat" id="link3">
    Tom
   </a>
   , a cat, and
   <a class="character" href="https://en.wikipedia.org/wiki/Jerry_Mouse" id="link4">
    Jerry
   </a>
   , a mouse.
  </p>
  <div>
   <img alt="creator_image" height="300" name="William_Hanna" src="https://upload.wikimedia.org/wikipedia/commons/d/d2/Will

In [6]:
# limit: cap the number of matches that find_all() returns.
limit_a = soup.find_all("a", limit=1)
print(limit_a)
print()

# recursive defaults to True, which searches every descendant.
# With recursive=False only the direct children of the searched object are
# examined; the soup itself has no direct <a> children, so the result is empty.
limit_a_not_recursive = soup.find_all("a", recursive=False)
print(limit_a_not_recursive)
print()

# Explicit recursive=True is the same as leaving the argument out.
limit_a_recursive = soup.find_all("a", recursive=True)
print(limit_a_recursive)
print()


[<a class="creator" href="https://en.wikipedia.org/wiki/William_Hanna" id="link1">William Hanna</a>]
[]


In [7]:
# Calling the soup object directly is shorthand for soup.find_all(...).
soup("a")

[<a class="creator" href="https://en.wikipedia.org/wiki/William_Hanna" id="link1">William Hanna</a>,
 <a class="creator" href="https://en.wikipedia.org/wiki/Joseph_Barbera" id="link2">Joseph Barbera</a>,
 <a class="character" href="https://en.wikipedia.org/wiki/Tom_Cat" id="link3">Tom</a>,
 <a class="character" href="https://en.wikipedia.org/wiki/Jerry_Mouse" id="link4">Jerry</a>]

In [9]:
# Default search: every <p> found anywhere below <body>.
print(soup.body.find_all("p"))
# recursive=False returns matches among <body>'s direct children only.
print(soup.body.find_all("p", recursive=False))

[<p class="comedy animated series">
        Tom and Jerry is an American animated series of comedy short films created by 
        <a class="creator" href="https://en.wikipedia.org/wiki/William_Hanna" id="link1">William Hanna</a> and  
        <a class="creator" href="https://en.wikipedia.org/wiki/Joseph_Barbera" id="link2">Joseph Barbera</a>. 
        It centers on a rivalry between the title characters
        <a class="character" href="https://en.wikipedia.org/wiki/Tom_Cat" id="link3">Tom</a>, a cat, and 
        <a class="character" href="https://en.wikipedia.org/wiki/Jerry_Mouse" id="link4">Jerry</a>, a mouse.</p>, <p class="comedy story">
<b>
                The series features comic fights between an iconic pair of adversaries, 
                a house cat (Tom) and a mouse (Jerry). The plots of each short usually center on Tom's 
                numerous attempts to capture Jerry and the mayhem and destruction that follows. 
                Tom rarely succeeds in catching Jerry

In [15]:
# Custom attributes: an HTML attribute called "name" cannot be passed as a
# keyword argument, because the tag name given positionally already fills
# find_all()'s own `name` parameter. The TypeError is the point of this cell,
# so it is caught and shown instead of aborting a "Run All".
try:
    soup.find_all("img", name="William_Hanna")
except TypeError as exc:
    print("TypeError:", exc)

TypeError: find_all() got multiple values for argument 'name'

In [16]:
# An HTML attribute literally called "name" has to be passed through the
# attrs dictionary, since `name` is already one of find_all()'s parameters.
soup.find_all("img", attrs={"name": "William_Hanna"})

[<img alt="creator_image" height="300" name="William_Hanna" src="https://upload.wikimedia.org/wikipedia/commons/d/d2/William_Hanna_1977.jpg" width="300"/>]

In [17]:
# Attributes with a dash: `extra-info` is not a valid Python identifier, so it
# cannot be written as a keyword argument. Writing the call directly makes the
# whole cell fail to parse, so the call is kept as text and compiled here in
# order to display the SyntaxError without breaking the notebook.
dashed_kwarg_call = "soup.find_all('img', extra-info=\"Tom_Cat\")"
try:
    compile(dashed_kwarg_call, "<dashed-keyword-demo>", "eval")
except SyntaxError as exc:
    print("SyntaxError:", exc.msg)

SyntaxError: keyword can't be an expression (<ipython-input-17-464d04487e2d>, line 2)

In [19]:
# Attribute names that are not valid Python identifiers (here, one containing
# a dash) are matched by passing them as keys of the attrs dictionary.
soup.find_all("img", attrs={"extra-info": "Tom_Cat"})

[<img extra-info="Tom_Cat" src="https://upload.wikimedia.org/wikipedia/en/f/f6/Tom_Tom_and_Jerry.png"/>]

In [23]:
# Two equivalent ways to search by CSS class. The attrs form names the
# attribute explicitly; the keyword form is spelled class_ because `class`
# is a reserved word in Python.
print(soup.find_all(attrs={"class": "character"}))
print()
print(soup.find_all(class_="character"))

[<a class="character" href="https://en.wikipedia.org/wiki/Tom_Cat" id="link3">Tom</a>, <a class="character" href="https://en.wikipedia.org/wiki/Jerry_Mouse" id="link4">Jerry</a>]

[<a class="character" href="https://en.wikipedia.org/wiki/Tom_Cat" id="link3">Tom</a>, <a class="character" href="https://en.wikipedia.org/wiki/Jerry_Mouse" id="link4">Jerry</a>]


In [24]:
import re
# Match any tag that has a CSS class starting with "ani" (e.g. "animated").
print(soup.find_all(attrs={"class": re.compile(r"^ani")}))

[<p class="comedy animated series">
        Tom and Jerry is an American animated series of comedy short films created by 
        <a class="creator" href="https://en.wikipedia.org/wiki/William_Hanna" id="link1">William Hanna</a> and  
        <a class="creator" href="https://en.wikipedia.org/wiki/Joseph_Barbera" id="link2">Joseph Barbera</a>. 
        It centers on a rivalry between the title characters
        <a class="character" href="https://en.wikipedia.org/wiki/Tom_Cat" id="link3">Tom</a>, a cat, and 
        <a class="character" href="https://en.wikipedia.org/wiki/Jerry_Mouse" id="link4">Jerry</a>, a mouse.</p>]


In [25]:
# string= on its own searches the text nodes rather than the tags, so the
# result is a list of strings.
soup.find_all(string="Tom")

['Tom']

In [26]:
# Combined with a tag name, string= selects the <a> tags whose .string is "Tom".
soup.find_all("a", string="Tom")

[<a class="character" href="https://en.wikipedia.org/wiki/Tom_Cat" id="link3">Tom</a>]

In [27]:
# A list of names matches any of them; the regex is tested against each
# candidate tag's .string.
soup.find_all(["b", "p"], string=re.compile("cat"))

[<b>
                 The series features comic fights between an iconic pair of adversaries, 
                 a house cat (Tom) and a mouse (Jerry). The plots of each short usually center on Tom's 
                 numerous attempts to capture Jerry and the mayhem and destruction that follows. 
                 Tom rarely succeeds in catching Jerry, mainly because of Jerry's cleverness, 
                 cunning abilities, and luck. 
             </b>]

In [28]:
def is_the_only_string_within_a_tag(string_):
    """Tell whether ``string_`` equals the ``.string`` of its parent.

    Intended as a filter for ``find_all(string=...)``.

    Args:
        string_: an object exposing ``.parent``, whose ``.string`` attribute
            is compared against ``string_`` itself.

    Returns:
        The result of ``string_ == string_.parent.string``.
    """
    enclosing_tag = string_.parent
    return string_ == enclosing_tag.string
# A function passed as string= acts as a filter: each text node for which it
# returns True is kept.
soup.find_all(string=is_the_only_string_within_a_tag)

[' The story of Tom and Jerry ',
 'Tom and Jerry',
 'William Hanna',
 'Joseph Barbera',
 'Tom',
 'Jerry',
 "\n                The series features comic fights between an iconic pair of adversaries, \n                a house cat (Tom) and a mouse (Jerry). The plots of each short usually center on Tom's \n                numerous attempts to capture Jerry and the mayhem and destruction that follows. \n                Tom rarely succeeds in catching Jerry, mainly because of Jerry's cleverness, \n                cunning abilities, and luck. \n            ",
 'Tom and Jerry show is a full length comedy show']

In [45]:
# find_parent: starting from the first <a>, look upwards for the enclosing
# element whose class is "container".
a_tag = soup.find("a")
print(a_tag, end="\n\n")
a_tag.find_parent(class_="container")

<a class="creator" href="https://en.wikipedia.org/wiki/William_Hanna" id="link1">William Hanna</a>



<body class="container">
<h1>Tom and Jerry</h1>&gt;
    <img alt="cartoon_image" height="300" src="TomAndJerry.jpg" width="300"/>
<p class="comedy animated series">
        Tom and Jerry is an American animated series of comedy short films created by 
        <a class="creator" href="https://en.wikipedia.org/wiki/William_Hanna" id="link1">William Hanna</a> and  
        <a class="creator" href="https://en.wikipedia.org/wiki/Joseph_Barbera" id="link2">Joseph Barbera</a>. 
        It centers on a rivalry between the title characters
        <a class="character" href="https://en.wikipedia.org/wiki/Tom_Cat" id="link3">Tom</a>, a cat, and 
        <a class="character" href="https://en.wikipedia.org/wiki/Jerry_Mouse" id="link4">Jerry</a>, a mouse.</p>
<div>
<img alt="creator_image" height="300" name="William_Hanna" src="https://upload.wikimedia.org/wikipedia/commons/d/d2/William_Hanna_1977.jpg" width="300"/>
<img alt="creator_image" height="300" name="Joseph_Barbera" src="https://upload.wiki

In [39]:
# find_parents: list every <p> element that encloses a_tag.
a_tag.find_parents("p")

[<p class="comedy animated series">
         Tom and Jerry is an American animated series of comedy short films created by 
         <a class="creator" href="https://en.wikipedia.org/wiki/William_Hanna" id="link1">William Hanna</a> and  
         <a class="creator" href="https://en.wikipedia.org/wiki/Joseph_Barbera" id="link2">Joseph Barbera</a>. 
         It centers on a rivalry between the title characters
         <a class="character" href="https://en.wikipedia.org/wiki/Tom_Cat" id="link3">Tom</a>, a cat, and 
         <a class="character" href="https://en.wikipedia.org/wiki/Jerry_Mouse" id="link4">Jerry</a>, a mouse.</p>]

In [54]:
# Siblings: the *_sibling(s) searches only consider later nodes at the same
# level as a_tag, so looking for a src attribute there finds nothing, whereas
# find_next() also looks at the rest of the document.
print(
    a_tag.find_next_sibling(href=True),
    a_tag.find_next_siblings(href=True),
    a_tag.find_next_sibling(src=True),
    a_tag.find_next_siblings(src=True),
    a_tag.find_next(src=True),
    sep="\n\n",
)

<a class="creator" href="https://en.wikipedia.org/wiki/Joseph_Barbera" id="link2">Joseph Barbera</a>

[<a class="creator" href="https://en.wikipedia.org/wiki/Joseph_Barbera" id="link2">Joseph Barbera</a>, <a class="character" href="https://en.wikipedia.org/wiki/Tom_Cat" id="link3">Tom</a>, <a class="character" href="https://en.wikipedia.org/wiki/Jerry_Mouse" id="link4">Jerry</a>]

None

[]

<img alt="creator_image" height="300" name="William_Hanna" src="https://upload.wikimedia.org/wikipedia/commons/d/d2/William_Hanna_1977.jpg" width="300"/>


In [55]:
# find_next / find_all_next look at everything after a_tag in the document,
# i.e. siblings as well as elements elsewhere.
print(
    a_tag.find_next(src=True),
    a_tag.find_all_next(src=True),
    sep="\n\n",
    end="\n\n",
)

<img alt="creator_image" height="300" name="William_Hanna" src="https://upload.wikimedia.org/wikipedia/commons/d/d2/William_Hanna_1977.jpg" width="300"/>

[<img alt="creator_image" height="300" name="William_Hanna" src="https://upload.wikimedia.org/wikipedia/commons/d/d2/William_Hanna_1977.jpg" width="300"/>, <img alt="creator_image" height="300" name="Joseph_Barbera" src="https://upload.wikimedia.org/wikipedia/commons/6/67/JBarbera.jpg" width="300"/>, <img src="https://upload.wikimedia.org/wikipedia/en/2/2f/Jerry_Mouse.png"/>, <img extra-info="Tom_Cat" src="https://upload.wikimedia.org/wikipedia/en/f/f6/Tom_Tom_and_Jerry.png"/>]



In [58]:
# Grab the <img> carrying extra-info="Tom_Cat"; the dashed attribute name has
# to be passed through attrs.
img_tag = soup.find(
    "img",
    attrs={"extra-info": "Tom_Cat"},
)
img_tag

<img extra-info="Tom_Cat" src="https://upload.wikimedia.org/wikipedia/en/f/f6/Tom_Tom_and_Jerry.png"/>

In [65]:
# Previous siblings: search backwards among img_tag's siblings for elements
# that have an alt attribute (first match, then all matches).
print(
    img_tag.find_previous_sibling(alt=True),
    img_tag.find_previous_siblings(alt=True),
    sep="\n\n",
)

<img alt="creator_image" height="300" name="Joseph_Barbera" src="https://upload.wikimedia.org/wikipedia/commons/6/67/JBarbera.jpg" width="300"/>

[<img alt="creator_image" height="300" name="Joseph_Barbera" src="https://upload.wikimedia.org/wikipedia/commons/6/67/JBarbera.jpg" width="300"/>, <img alt="creator_image" height="300" name="William_Hanna" src="https://upload.wikimedia.org/wikipedia/commons/d/d2/William_Hanna_1977.jpg" width="300"/>]


In [68]:
# find_previous / find_all_previous search backwards from img_tag for
# elements that have an href attribute (first match, then all matches).
print(
    img_tag.find_previous(href=True),
    img_tag.find_all_previous(href=True),
    sep="\n",
)

<a class="character" href="https://en.wikipedia.org/wiki/Jerry_Mouse" id="link4">Jerry</a>
[<a class="character" href="https://en.wikipedia.org/wiki/Jerry_Mouse" id="link4">Jerry</a>, <a class="character" href="https://en.wikipedia.org/wiki/Tom_Cat" id="link3">Tom</a>, <a class="creator" href="https://en.wikipedia.org/wiki/Joseph_Barbera" id="link2">Joseph Barbera</a>, <a class="creator" href="https://en.wikipedia.org/wiki/William_Hanna" id="link1">William Hanna</a>]
