## Web Scraping using Beautiful Soup

In [1]:
import requests

In [2]:
from bs4 import BeautifulSoup as bs

In [12]:
## Load the webpage content
## A timeout stops the cell from hanging forever if the server never answers,
## and raise_for_status() turns an HTTP error (4xx/5xx) into an exception here
## instead of letting an error page be parsed further down.
r = requests.get(
    "https://keithgalli.github.io/web-scraping/example.html",
    timeout=10,
)
r.raise_for_status()

In [15]:
r

<Response [200]>

In [17]:
## Convert it to a beautiful soup object
## Name the parser explicitly: without it Beautiful Soup picks whichever parser
## happens to be installed (and warns), so results can differ between machines.
## "html.parser" ships with Python and needs no extra install.

soup = bs(r.content, "html.parser")

## Print out our html

print(soup)

<html>
<head>
<title>HTML Example</title>
</head>
<body>
<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>
<h2>A Header</h2>
<p><i>Some italicized text</i></p>
<h2>Another header</h2>
<p id="paragraph-id"><b>Some bold text</b></p>
</body>
</html>



In [18]:
print(soup.prettify())

<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



## Start using beautiful soup to Scrape

In [20]:
## find and find_all

first_header = soup.find("h2")
first_header

<h2>A Header</h2>

In [21]:
print(first_header)

<h2>A Header</h2>


In [22]:
headers = soup.find_all("h2")

In [23]:
headers

[<h2>A Header</h2>, <h2>Another header</h2>]

In [24]:
## we can also pass in the list of elements to look for 

first_header = soup.find(['h1','h2'])
first_header

<h1>HTML Webpage</h1>

In [26]:
first_header = soup.find(['h2','h1'])
first_header            ## list order does not matter: find() returns the first match in document order

<h1>HTML Webpage</h1>

In [27]:
para = soup.find_all('p')
para

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>,
 <p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [29]:
len(para)

3

In [30]:
para = soup.find_all('p',attrs={'id':'paragraph-id'})  ## attrs is nothing but the properties (attributes) inside a tag
para

[<p id="paragraph-id"><b>Some bold text</b></p>]

In [31]:
## nested 

body = soup.find('body')

In [32]:
body

<body>
<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>
<h2>A Header</h2>
<p><i>Some italicized text</i></p>
<h2>Another header</h2>
<p id="paragraph-id"><b>Some bold text</b></p>
</body>

In [34]:
div = soup.find('div')
div

<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>

In [35]:
body = soup.find('body')
div = soup.find('div')
head = soup.find('h1')
head

<h1>HTML Webpage</h1>

In [36]:
## we can search for specific strings in our find/find_all calls

In [37]:
## lets find any paragraph that has 'some' in it
## A plain string is compared against the tag's whole .string (exact match),
## so the fragment 'some' matches nothing and the result is an empty list.
## `string=` is the current keyword; `text=` is its deprecated older name.
string_search = soup.find_all('p',string='some')
string_search

[]

In [43]:
## An exact match on the full text works.
## `string=` is the current keyword; `text=` is its deprecated older name.
string_search = soup.find_all('p',string='Some bold text')
string_search

[<p id="paragraph-id"><b>Some bold text</b></p>]

In [39]:
import re

## A compiled regex allows partial matches, but it is case-sensitive by default:
## lowercase 'some' does not match the capitalised 'Some ...' paragraphs.
string_search = soup.find_all('p',string=re.compile('some'))
string_search

[]

In [40]:
print(soup.prettify())

<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



In [41]:
string_search = soup.find_all('p',string=re.compile('Some'))
string_search

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [45]:
head = soup.find_all('h2',string = re.compile('header'))
head

[<h2>Another header</h2>]

In [46]:
## (H|h) accepts either capitalisation of the first letter, so both
## 'A Header' and 'Another header' match.
head = soup.find_all('h2',string = re.compile('(H|h)eader'))
head

[<h2>A Header</h2>, <h2>Another header</h2>]

## Select ( CSS Selector )

In [47]:
content = soup.select("p")
content

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>,
 <p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [49]:
print(soup.body.prettify())

<body>
 <div align="middle">
  <h1>
   HTML Webpage
  </h1>
  <p>
   Link to more interesting example:
   <a href="https://keithgalli.github.io/web-scraping/webpage.html">
    keithgalli.github.io/web-scraping/webpage.html
   </a>
  </p>
 </div>
 <h2>
  A Header
 </h2>
 <p>
  <i>
   Some italicized text
  </i>
 </p>
 <h2>
  Another header
 </h2>
 <p id="paragraph-id">
  <b>
   Some bold text
  </b>
 </p>
</body>



In [50]:
## "div p" (space) is the descendant selector: every <p> nested inside a <div>
content = soup.select("div p")
content 

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>]

In [57]:
## "h2 ~ p" is the sibling combinator: <p> tags that come after an <h2>
## and share its parent (so the <p> inside the <div> is not included)
parag = soup.select("h2 ~ p")

In [58]:
parag

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [53]:
bold = soup.select("p#paragraph-id b")
bold

[<b>Some bold text</b>]

In [54]:
parag = soup.select('body p')
parag

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>,
 <p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [59]:
## "h2 p" asks for <p> tags nested inside an <h2>; this page has none,
## so the result is an empty list
para = soup.select("h2 p")
para

[]

In [62]:
## 'body > p' keeps only <p> tags that are direct children of <body>
parag = soup.select('body > p')
print(parag)

## select() can also be called on an individual tag to search only inside it
for p in parag:
    p1 = p.select('i')
    print(p1)

[<p><i>Some italicized text</i></p>, <p id="paragraph-id"><b>Some bold text</b></p>]
[<i>Some italicized text</i>]
[]


In [65]:
## grab by element with specific property 

soup.select('[align = middle]')

[<div align="middle">
 <h1>HTML Webpage</h1>
 <p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
 </div>]

### Get different properties of the html

In [66]:
header = soup.find('h2')

In [68]:
header.string ## if there is no child element, use .string

'A Header'

In [89]:
div = soup.find('div')

In [91]:
div

<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>

In [74]:
## .string is None here (no output) because the div holds several children,
## so there is no single string to return
div.string

In [75]:
div.get_text()

'\nHTML Webpage\nLink to more interesting example: keithgalli.github.io/web-scraping/webpage.html\n'

In [76]:
print(div.get_text()) ## if there are multiple child elements, use get_text()


HTML Webpage
Link to more interesting example: keithgalli.github.io/web-scraping/webpage.html



In [78]:
link = soup.find('a')
link

<a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a>

In [79]:
## Dot access looks for a child *tag* named 'href', not the attribute, so this
## yields None (no output). Attributes are read with link['href'].
link.href

In [82]:
link.string

'keithgalli.github.io/web-scraping/webpage.html'

In [80]:
type(link)

bs4.element.Tag

In [81]:
link['href']

'https://keithgalli.github.io/web-scraping/webpage.html'

In [95]:
para = soup.select('p#paragraph-id')
para

[<p id="paragraph-id"><b>Some bold text</b></p>]

In [97]:
para[0]['id']

'paragraph-id'

In [92]:
head =  soup.find('h2')

In [93]:
head

<h2>A Header</h2>

In [94]:
## find() returns a single Tag, not a list, and indexing a Tag looks up an
## HTML attribute, so head[0] raises KeyError. Catch and print it so the
## notebook still runs top to bottom with Restart & Run All.
try:
    head[0]
except KeyError as exc:
    print(f"KeyError: {exc}")

KeyError: 0

## Code Navigation 

In [98]:
## path syntax 

In [99]:
soup.body.div

<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>

In [100]:
soup.body.div.h1.string

'HTML Webpage'

In [None]:
## Know the terms: Parent, Sibling and Child

In [102]:
print(soup.body.prettify())

<body>
 <div align="middle">
  <h1>
   HTML Webpage
  </h1>
  <p>
   Link to more interesting example:
   <a href="https://keithgalli.github.io/web-scraping/webpage.html">
    keithgalli.github.io/web-scraping/webpage.html
   </a>
  </p>
 </div>
 <h2>
  A Header
 </h2>
 <p>
  <i>
   Some italicized text
  </i>
 </p>
 <h2>
  Another header
 </h2>
 <p id="paragraph-id">
  <b>
   Some bold text
  </b>
 </p>
</body>



In [104]:
soup.body.find('div').find_next_siblings()

[<h2>A Header</h2>,
 <p><i>Some italicized text</i></p>,
 <h2>Another header</h2>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [108]:
soup.body.div.find_next_sibling()

<h2>A Header</h2>