In [3]:
import requests
from bs4 import BeautifulSoup as bs 

In [4]:
# Load webpage content.
# timeout= keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() fails fast instead of silently parsing an HTTP error page.
r = requests.get('https://keithgalli.github.io/web-scraping/example.html', timeout=10)
r.raise_for_status()

# Convert to a BeautifulSoup object.
# The parser is named explicitly so the parse tree does not depend on which
# optional parsers happen to be installed (and no "guessed parser" warning is raised).
soup = bs(r.content, 'html.parser')

# Print the HTML, indented one tag per line
print(soup.prettify())

<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



Start using Beautiful Soup to SCRAPE

In [5]:
# find() returns only the first matching tag; find_all() returns every match as a list.
first_header = soup.find('h2')
headers = soup.find_all('h2')
# Only this trailing expression is echoed as the cell's output.
headers

[<h2>A Header</h2>, <h2>Another header</h2>]

In [6]:
# Pass in a list of tag names to match any of them
first_header = soup.find(['h1', 'h2'])

headers = soup.find_all(['h1', 'h2'])
headers


[<h1>HTML Webpage</h1>, <h2>A Header</h2>, <h2>Another header</h2>]

In [7]:
# You can pass in attributes as well: attrs= narrows the match by attribute value.
# find_all still returns a list, here holding the single <p id="paragraph-id">.
paragraph = soup.find_all('p', attrs={'id': 'paragraph-id'})
paragraph

[<p id="paragraph-id"><b>Some bold text</b></p>]

In [8]:
# You can nest find and find_all calls: each call searches only inside the
# tag it is called on, which is good for narrowing down step by step
# (body -> first div in body -> h1 in that div).
body = soup.find('body')
div = body.find('div')
header = div.find('h1')
header

<h1>HTML Webpage</h1>

In [9]:
# The string= argument of find/find_all filters tags by their text;
# a compiled regular expression can be used as the filter.
import re

some_pattern = re.compile('Some')
header_pattern = re.compile('(H|h)eader')

# <p> tags whose text matches 'Some' (computed, not displayed)
paragraphs = soup.find_all('p', string=some_pattern)

# <h2> tags whose text contains "Header" or "header"
headers = soup.find_all('h2', string=header_pattern)
headers

[<h2>A Header</h2>, <h2>Another header</h2>]

CSS Selector

In [10]:
# 'div p' is a descendant selector: every <p> nested anywhere inside a <div>.
# select() always returns a list of matching tags.
content = soup.select('div p')
content

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>]

In [11]:
# 'h2 ~ p' is the general-sibling selector: <p> tags that come after an <h2>
# and share the same parent.
paragraphs = soup.select('h2 ~ p')
paragraphs

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [12]:
# 'p#paragraph-id b': <b> tags nested inside the <p> whose id is "paragraph-id"
bold_text = soup.select('p#paragraph-id b')
bold_text

[<b>Some bold text</b>]

In [13]:
# 'body > p' is the child selector: only <p> tags that are direct children of <body>
paragraphs = soup.select('body > p')
print(paragraphs)

# select() can also be called on an individual tag; it yields an empty list
# when that tag contains no match.
for paragraph in paragraphs:
    print(paragraph.select('i'))

[<p><i>Some italicized text</i></p>, <p id="paragraph-id"><b>Some bold text</b></p>]
[<i>Some italicized text</i>]
[]


Get Different Properties of the HTML

In [14]:
# Use .string to read the text of a tag
header = soup.find('h2')
# NOTE(review): this bare expression is not the last statement of the cell,
# so its value is computed but never shown in the cell output.
header.string

# If there are multiple elements inside, use get_text() to collect all of their text
div = soup.find('div')
print(div.prettify())
print(div.get_text())

<div align="middle">
 <h1>
  HTML Webpage
 </h1>
 <p>
  Link to more interesting example:
  <a href="https://keithgalli.github.io/web-scraping/webpage.html">
   keithgalli.github.io/web-scraping/webpage.html
  </a>
 </p>
</div>


HTML Webpage
Link to more interesting example: keithgalli.github.io/web-scraping/webpage.html



In [15]:
# Get a specific property/link from an element: tag attributes are read with
# dictionary-style indexing.
link = soup.find('a')
# NOTE(review): this bare expression is not the last statement of the cell,
# so the href is computed but never shown in the cell output.
link['href']

# select() returns a list, so index into it before reading the attribute
paragraphs = soup.select('p#paragraph-id')
paragraphs[0]['id']

'paragraph-id'

Code Navigation

In [16]:
# Path syntax: tags can be reached by chained attribute access, then .string reads the text
soup.body.div.h1.string

'HTML Webpage'

In [17]:
# Know the terms: parent, sibling, child.
# find_next_siblings() lists the tags that follow the div at the same level of the tree.
soup.body.find('div').find_next_siblings()

[<h2>A Header</h2>,
 <p><i>Some italicized text</i></p>,
 <h2>Another header</h2>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

EXERCISES
https://keithgalli.github.io/web-scraping/webpage.html

In [18]:
# Load the exercise webpage.
# timeout= keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() fails fast instead of silently parsing an HTTP error page.
wp = requests.get('https://keithgalli.github.io/web-scraping/webpage.html', timeout=10)
wp.raise_for_status()

# The parser is named explicitly so the parse tree does not depend on which
# optional parsers happen to be installed (and no "guessed parser" warning is raised).
page = bs(wp.content, 'html.parser')

Grab all social links from webpage

In [19]:
# Approach 1: collect the <ul> lists under <body> and pick one by position
body = page.find('body')

unordered_lists = body.select('ul')
# Index 1 is the second <ul> in <body>; on this page that is the list of social links
socials = unordered_lists[1]

links = socials.find_all('a')

# Print the visible text of each anchor
for link in links:
    print(link.string)
    


https://www.instagram.com/keithgalli/
https://twitter.com/keithgalli
https://www.linkedin.com/in/keithgalli/
https://www.tiktok.com/@keithgalli


In [20]:
# Approach 2: one CSS selector - every <a> nested inside <ul class="socials">
links = page.select('ul.socials a')
# Read the URL from each anchor's href attribute
actual_links = [anchor['href'] for anchor in links]
actual_links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

In [21]:
# Approach 3: find() the <ul> by its class attribute, then gather the anchors inside it
ulist = page.find('ul', attrs={'class': 'socials'})
links = ulist.find_all('a')
# Read the URL from each anchor's href attribute
actual_links = [anchor['href'] for anchor in links]
actual_links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

In [22]:
# Approach 4: select through the list items - every <a> inside <li class="social">
links = page.select('li.social a')
# Read the URL from each anchor's href attribute
actual_links = [anchor['href'] for anchor in links]
actual_links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

Scrape the Table

In [35]:
import pandas as pd

# First table carrying the "hockey-stats" class
table = page.select('table.hockey-stats')[0]

# Header cells -> column names.
# get_text(strip=True) is used instead of .string because .string is None for a
# cell that holds more than one child node; an empty header cell becomes ''.
columns = table.find('thead').find_all('th')
column_names = [th.get_text(strip=True) for th in columns]

table_rows = table.find('tbody').find_all('tr')

# One list of whitespace-stripped cell texts per <tr>.
# Distinct names for the row (tr) and cell (td) avoid the original's reuse of
# `tr` for both, and the list is built once rather than appended row by row.
rows = [
    [td.get_text().strip() for td in tr.find_all('td')]
    for tr in table_rows
]

df = pd.DataFrame(rows, columns=column_names)
df.head()

Unnamed: 0,S,Team,League,GP,G,A,TP,PIM,+/-,Unnamed: 10,POST,GP.1,G.1,A.1,TP.1,PIM.1,+/-.1
0,2014-15,MIT (Mass. Inst. of Tech.),ACHA II,17.0,3.0,9.0,12.0,20.0,,|,,,,,,,
1,2015-16,MIT (Mass. Inst. of Tech.),ACHA II,9.0,1.0,1.0,2.0,2.0,,|,,,,,,,
2,2016-17,MIT (Mass. Inst. of Tech.),ACHA II,12.0,5.0,5.0,10.0,8.0,0.0,|,,,,,,,
3,2017-18,Did not play,,,,,,,,|,,,,,,,
4,2018-19,MIT (Mass. Inst. of Tech.),ACHA III,8.0,5.0,10.0,15.0,8.0,,|,,,,,,,


Grab all fun facts that use the word 'is'

In [71]:
# Every <li> inside <ul class="fun-facts">
facts = page.select('ul.fun-facts li')

# Match "is" only as a whole word: a bare 'is' pattern is a substring search
# and would also accept facts containing e.g. "this" or "list".
word_is = re.compile(r'\bis\b')

# Test and return the full text of each <li>, so a fact whose matching text sits
# inside nested markup (a link, bold text, ...) is still returned in full.
facts_with_is = [
    fact.get_text()
    for fact in facts
    if word_is.search(fact.get_text())
]
facts_with_is

['Middle name is Ronald',
 'Dunkin Donuts coffee is better than Starbucks',
 "A favorite book series of mine is Ender's Game",
 'Current video game of choice is Rocket League',
 "The band that I've seen the most times live is the Zac Brown Band"]

Download an image

In [84]:
from os.path import basename
from urllib.parse import urlsplit

# Every <img> nested inside a <div class="row">
photos = page.select('div.row img')

for photo in photos:
    src = photo.get('src')
    # Skip images with no src attribute (get() returns None) and relative paths;
    # only absolute http(s) URLs can be fetched directly.
    if not src or not src.startswith(('http://', 'https://')):
        continue

    # Name the file from the URL path only, so a query string or fragment
    # does not end up in the filename.
    filename = basename(urlsplit(src).path)
    if not filename:
        continue

    # Download before opening the file, so a failed request does not leave an
    # empty file behind, and fail loudly rather than saving an HTTP error page.
    response = requests.get(src, timeout=10)
    response.raise_for_status()

    with open(filename, "wb") as f:
        f.write(response.content)
