In [2]:
import re

import requests
from bs4 import BeautifulSoup as bs


In [3]:
# Download the example page. The timeout stops the cell from hanging on an
# unresponsive server, and raise_for_status() fails loudly on an HTTP error
# instead of letting an error page be parsed as if it were the example.
r = requests.get('https://keithgalli.github.io/web-scraping/example.html', timeout=10)
r.raise_for_status()

# Convert to a BeautifulSoup object. The parser is named explicitly: without
# it bs4 picks whichever parser happens to be installed (and warns about it),
# so the parsed tree could differ from one machine to another.
soup = bs(r.content, 'html.parser')

print(soup.prettify())

<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



##### find and find_all

In [4]:
# find() returns only the first matching tag; find_all() returns every match
# (shown below as a list of both <h2> tags)
first_header = soup.find('h2')
headers = soup.find_all('h2')
headers

[<h2>A Header</h2>, <h2>Another header</h2>]

In [5]:
# Pass a list of tag names to match any of them; find() still returns just
# the first match
first_header = soup.find(['h1', 'h2'])
print(first_header)

# find_all() with the same list collects every <h1> and <h2>
headers = soup.find_all(['h1', 'h2'])
print(headers)

<h1>HTML Webpage</h1>
[<h1>HTML Webpage</h1>, <h2>A Header</h2>, <h2>Another header</h2>]


In [6]:
# Narrow the search by attribute: only <p> tags whose id is "paragraph-id".
# The attribute filter is passed through the explicit `attrs` keyword.
paragraphs = soup.find_all('p', attrs={'id': 'paragraph-id'})
paragraphs

[<p id="paragraph-id"><b>Some bold text</b></p>]

In [7]:
# find() can be called on any tag, so searches can be nested to drill down
# from <body> to the <div> inside it, and then to that div's <h1>.
body = soup.find('body')
body_div = body.find('div')
# Keep the <h1> under its own name instead of overwriting body_div, so that
# body_div keeps referring to the <div> it is named after.
div_heading = body_div.find('h1')
div_heading

<h1>HTML Webpage</h1>

In [8]:
# Match on a tag's text with a compiled regular expression.
# The result is named for what it actually matches: every <p> whose text
# contains "Some", which includes the bold paragraph as well as the italic one.
some_paragraphs = soup.find_all('p', string=re.compile('Some'))
print(some_paragraphs)

# The pattern accepts either capitalisation: "header" or "Header".
headers = soup.find_all('h2', string=re.compile('(h|H)eader'))
print(headers)

[<p><i>Some italicized text</i></p>, <p id="paragraph-id"><b>Some bold text</b></p>]
[<h2>A Header</h2>, <h2>Another header</h2>]


##### select (CSS selectors)

In [9]:
# Descendant selector: every <p> nested inside a <div>
content = soup.select('div p')
content


[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>]

In [10]:
# General sibling selector (~): every <p> that comes after an <h2> under the
# same parent, not only the one immediately following it (that would be 'h2 + p')
paragraph  = soup.select('h2 ~ p')
paragraph

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [11]:
# Combine an id selector with nesting: <b> tags inside the <p> whose id is "paragraph-id"
bold_text = soup.select('p#paragraph-id b')
bold_text

[<b>Some bold text</b>]

In [12]:
# Child selector (>): only <p> tags that are direct children of <body>,
# so the <p> nested inside the <div> is not matched
paragraph = soup.select('body > p')
paragraph

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [13]:
# Attribute selector: any tag whose align attribute equals "middle"
aligned = soup.select('[align=middle]')
aligned

[<div align="middle">
 <h1>HTML Webpage</h1>
 <p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
 </div>]

#### Get properties of HTML elements

In [14]:
# .string gives the text of a tag that holds a single piece of text
header = soup.find('h2')
print(header.string)

# For a tag with several children, get_text() returns the text of all of
# its descendants joined together
div = soup.find('div')
print(div.get_text())

A Header

HTML Webpage
Link to more interesting example: keithgalli.github.io/web-scraping/webpage.html



In [15]:
# Index a tag like a dictionary to read one of its attributes
link = soup.find('a')
print(link['href'])

# `div` is the tag found in the previous cell (soup.find('div')), so that
# cell has to run before this one
print(div['align'])


https://keithgalli.github.io/web-scraping/webpage.html
middle


#### Code navigation

In [23]:
# Dotted path syntax: each attribute access steps into the first tag with
# that name, here body -> div -> h1, and .string reads the heading's text
soup.body.div.h1.string


'HTML Webpage'

In [38]:
# Navigating to a tag's parent, siblings, and children.
# Other navigation calls to try on the same tag:
# soup.body.div.find_parent()
# soup.body.div.find_next_sibling()
# .contents lists the tag's direct children, including the newline text
# nodes that sit between the child tags
soup.body.div.contents

['\n',
 <h1>HTML Webpage</h1>,
 '\n',
 <p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>,
 '\n']