# loading the necessary libraries

In [2]:
import requests                              # use to interact with web services and resources over the internet
from bs4 import BeautifulSoup as bs          # web scraping library

# load our first page

In [None]:
# load the webpage content using request library
r = requests.get("https://keithgalli.github.io/web-scraping/example.html")

# converting to beautifullsoup object
soup = bs(r.content)

#print html
print(soup)

<html>
<head>
<title>HTML Example</title>
</head>
<body>
<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>
<h2>A Header</h2>
<p><i>Some italicized text</i></p>
<h2>Another header</h2>
<p id="paragraph-id"><b>Some bold text</b></p>
</body>
</html>



In [4]:
#print html
print(soup.prettify())

<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



### start using Beautifull Soup Scraping

#### find and find_all


In [None]:
# use of find
first_header = soup.find("h2")
print(first_header)

<h2>A Header</h2>


In [7]:
# use of find_all
headers = soup.find_all("h2")
print(headers)

[<h2>A Header</h2>, <h2>Another header</h2>]


In [None]:
# pass a list of elements in find
first_header = soup.find(['h1','h2'])
print(first_header)

<h1>HTML Webpage</h1>


In [10]:
# pass a list of elements in find_all
headers = soup.find_all(['h1','h2'])
print(headers)

[<h1>HTML Webpage</h1>, <h2>A Header</h2>, <h2>Another header</h2>]


In [None]:
# find will just give the element that is first occured
# find_all will give all elements that are given to find

In [None]:
# you can pass in attributes to the find/find_all function
paragraph = soup.find_all('p',attrs={'id':'paragraph-id'})  # attrs mean attribute of element
print(paragraph)

[<p id="paragraph-id"><b>Some bold text</b></p>]


In [17]:
# you can nest find/find_all cals
body = soup.find('body')
div = body.find('div')
header = div.find('h1')
print(header)

<h1>HTML Webpage</h1>


In [19]:
# we can find specific strings in find/find_all functions
print(soup.prettify())

<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



In [24]:
paragraphs = soup.find_all('p',string='Some bold text')
print(paragraphs)
# this is not a usefull case if we want to search a certain word what will we do then?
# hence we will re library function re.compile("text")

[<p id="paragraph-id"><b>Some bold text</b></p>]


In [26]:
# importing regex library
import re

paragraphs = soup.find_all('p',string=re.compile('Some'))
print(paragraphs)
# it print any thing that contain 'Some'

[<p><i>Some italicized text</i></p>, <p id="paragraph-id"><b>Some bold text</b></p>]


In [30]:
headers = soup.find_all('h2',string=re.compile('Header'))
print(headers)

[<h2>A Header</h2>]


In [32]:
headers = soup.find_all('h2',string=re.compile('(h|H)eader'))
# this (h|H)eader will find both uppercase Header and lowecase header
print(headers)

[<h2>A Header</h2>, <h2>Another header</h2>]


#### select method  (CSS selectors)

In [42]:
print(soup.body.prettify())

<body>
 <div align="middle">
  <h1>
   HTML Webpage
  </h1>
  <p>
   Link to more interesting example:
   <a href="https://keithgalli.github.io/web-scraping/webpage.html">
    keithgalli.github.io/web-scraping/webpage.html
   </a>
  </p>
 </div>
 <h2>
  A Header
 </h2>
 <p>
  <i>
   Some italicized text
  </i>
 </p>
 <h2>
  Another header
 </h2>
 <p id="paragraph-id">
  <b>
   Some bold text
  </b>
 </p>
</body>



In [43]:
content = soup.select('div p')
print(content)

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>]


In [44]:
paragraphs = soup.select('h2 ~ p')
print(paragraphs)

[<p><i>Some italicized text</i></p>, <p id="paragraph-id"><b>Some bold text</b></p>]


In [47]:
bold_text = soup.select('p#paragraph-id b')
print(bold_text)

[<b>Some bold text</b>]


In [51]:
paragraphs = soup.select('body > p')
print(paragraph)
print()

for paragraph in paragraphs:
    print(paragraph.select('i'))

[<p id="paragraph-id"><b>Some bold text</b></p>]

[<i>Some italicized text</i>]
[]


In [52]:
soup.select('[align=middle]')

[<div align="middle">
 <h1>HTML Webpage</h1>
 <p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
 </div>]

#### Getting different properties of html

In [None]:
header = soup.find('h2')
print(header)
print()
print(header.string)  # it will print string in element

<h2>A Header</h2>

A Header


In [54]:
div = soup.find('div')
print(div.prettify())
print(div.string)  # output will be None! then we will use get_text()

# for multiple child elements use get_text() instead of string funcion
print(div.get_text())

<div align="middle">
 <h1>
  HTML Webpage
 </h1>
 <p>
  Link to more interesting example:
  <a href="https://keithgalli.github.io/web-scraping/webpage.html">
   keithgalli.github.io/web-scraping/webpage.html
  </a>
 </p>
</div>

None

HTML Webpage
Link to more interesting example: keithgalli.github.io/web-scraping/webpage.html



In [None]:
#### Get a specific property from a element
link = soup.find("a")
link['href']

'https://keithgalli.github.io/web-scraping/webpage.html'

In [None]:
# paragraph id is printed:
paragraphs = soup.select("p#paragraph-id")
print(paragraphs[0]['id'])

paragraph-id


# Code Navigation

In [61]:
# path syntax
print(soup.body.div.h1.string)

HTML Webpage


In [None]:
print(soup.body.prettify())
"""
you should want to know the terms:(parent,child,siblings)
    'parent' is body
    div is 'child' of body
    the element on the same levels are the 'sibblings'

<body>  ---------------------------> parent tag
    <div>--------------------------> child tag of body
        <h1>heading</h1>-----------> child tag of div
    </div>
    <div>--------------------------> sibbling of upper div
        <h1>heading</h1>
    </div>
</body>

"""

<body>
 <div align="middle">
  <h1>
   HTML Webpage
  </h1>
  <p>
   Link to more interesting example:
   <a href="https://keithgalli.github.io/web-scraping/webpage.html">
    keithgalli.github.io/web-scraping/webpage.html
   </a>
  </p>
 </div>
 <h2>
  A Header
 </h2>
 <p>
  <i>
   Some italicized text
  </i>
 </p>
 <h2>
  Another header
 </h2>
 <p id="paragraph-id">
  <b>
   Some bold text
  </b>
 </p>
</body>



In [65]:
soup.body.find('div').find_next_siblings()

[<h2>A Header</h2>,
 <p><i>Some italicized text</i></p>,
 <h2>Another header</h2>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

### <a href='https://www.youtube.com/watch?v=GjKQ6V_ViQE&t=3847s'>Comprehensive Python Beautiful Soup Web Scraping Tutorial! (find/find_all, css select, scrape table)</a>