In [None]:
import requests
from bs4 import BeautifulSoup as bs

In [None]:
# Load the example page. A timeout keeps the cell from hanging forever, and
# raise_for_status() fails fast on an HTTP error instead of parsing an error page.
r = requests.get("https://keithgalli.github.io/web-scraping/example.html", timeout=10)
r.raise_for_status()
# Convert the response body to a BeautifulSoup object. The parser is named
# explicitly so the parse tree does not depend on which optional parsers happen
# to be installed (and to avoid bs4's "no parser was explicitly specified" warning).
soup = bs(r.content, "html.parser")
# Print the parsed HTML, indented for readability.
print(soup.prettify())


<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



In [None]:
# find() returns only the first element matching the given tag name,
# so this prints the first <h2> header on the page.
first_header = soup.find("h2")
print(first_header)

<h2>A Header</h2>


In [None]:
# find_all() returns a list-like collection of every element with the given
# tag name, here all of the <h2> headers.
headers = soup.find_all("h2")
print(headers)

[<h2>A Header</h2>, <h2>Another header</h2>]


In [None]:
# Pass a list of tag names to match any of them. find() still returns a single
# element: the first match in document order, not in list order (here the <h1>,
# even though "h2" is listed first).
first_header = soup.find(["h2", "h1"])
print(first_header)

<h1>HTML Webpage</h1>


In [None]:
# With a list of tag names, find_all() returns every element matching any of them.
headers = soup.find_all(["h1", "h2"])
headers

[<h1>HTML Webpage</h1>, <h2>A Header</h2>, <h2>Another header</h2>]

In [None]:
# Attributes can be passed to find()/find_all() as well: attrs takes a
# dictionary mapping attribute names to the values to match.
paragraph = soup.find_all("p", attrs={"id":"paragraph-id"})
paragraph

[<p id="paragraph-id"><b>Some bold text</b></p>]

In [None]:
# find()/find_all() calls can be nested: each call searches only inside the
# element it is called on, narrowing the search step by step (body -> div -> h1).
# This is useful when working with large HTML pages.
body = soup.find('body')
div = body.find('div')
# NOTE: a bare expression in the middle of a cell is not displayed; only the
# cell's last expression (header) is rendered.
div
header = div.find('h1')
header

<h1>HTML Webpage</h1>

In [None]:
# find()/find_all() can also filter on an element's text with the string argument.
# The regex patterns below are case-sensitive, so watch the capital letters.
import re
# <p> elements whose text contains "Some".
paragraphs = soup.find_all("p", string=re.compile("Some"))
# NOTE: not displayed; only the last expression of the cell is rendered.
paragraphs

# <h2> elements whose text contains "Header" or "header".
headers = soup.find_all("h2", string = re.compile('(H|h)eader'))
headers

[<h2>A Header</h2>, <h2>Another header</h2>]

Select Method on Beautiful Soup (CSS selector)

In [None]:
# Print the <body> of the example page again, as a reference for the selectors below.
print(soup.body.prettify())

<body>
 <div align="middle">
  <h1>
   HTML Webpage
  </h1>
  <p>
   Link to more interesting example:
   <a href="https://keithgalli.github.io/web-scraping/webpage.html">
    keithgalli.github.io/web-scraping/webpage.html
   </a>
  </p>
 </div>
 <h2>
  A Header
 </h2>
 <p>
  <i>
   Some italicized text
  </i>
 </p>
 <h2>
  Another header
 </h2>
 <p id="paragraph-id">
  <b>
   Some bold text
  </b>
 </p>
</body>



In [None]:
# select() takes a CSS selector; "p" matches every <p> element.
content = soup.select('p')
content

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>,
 <p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [None]:
# "div p" is more specific than "p": it selects only the <p> elements located
# inside a <div> (see the CSS selector documentation for other combinators).
content = soup.select("div p")
content

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>]

In [None]:
# "h2 ~ p" selects the <p> elements that come after an <h2> sibling, i.e. both
# share the same parent and the <h2> appears earlier in the document.
paragraphs = soup.select('h2 ~ p')
paragraphs

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [None]:
# "p#paragraph-id b" selects the <b> elements inside the <p> whose id is "paragraph-id".
bold_text = soup.select("p#paragraph-id b")
bold_text

[<b>Some bold text</b>]

Getting different properties of the HTML

In [None]:
# .string gives only the text inside the element, without the surrounding <h2> tags.
header = soup.find("h2")
header.string

'A Header'

In [None]:
# get_text() returns the text of an element together with the text of all tags
# nested inside it (here the div contains an <h1> and a <p>).
div = soup.find("div")
print(div.prettify())
print(div.get_text())

<div align="middle">
 <h1>
  HTML Webpage
 </h1>
 <p>
  Link to more interesting example:
  <a href="https://keithgalli.github.io/web-scraping/webpage.html">
   keithgalli.github.io/web-scraping/webpage.html
  </a>
 </p>
</div>


HTML Webpage
Link to more interesting example: keithgalli.github.io/web-scraping/webpage.html



In [None]:
# Get a specific attribute from an element by indexing it like a dictionary.
link = soup.find("a")
# NOTE: not displayed; only the last expression of the cell is rendered.
link["href"]
paragraphs = soup.select('p#paragraph-id')
# select() returns a list, so take the first match before reading its id attribute.
paragraphs[0]['id']

'paragraph-id'

Code navigation

In [None]:
# Path syntax: chained attribute access walks down the tree by tag name,
# here to the <div> inside <body>.
soup.body.div

<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>

In [None]:
# Very specific path: the text of the <h1> inside the <div> of the body.
soup.body.div.h1.string

'HTML Webpage'

In [None]:
# Path syntax and find() can be mixed; here this returns the same <div> as soup.body.div.
soup.body.find("div")

<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>

In [None]:
# find_next_siblings() returns the elements that follow the <div> at the same level.
soup.body.find("div").find_next_siblings()

[<h2>A Header</h2>,
 <p><i>Some italicized text</i></p>,
 <h2>Another header</h2>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

EXERCISES: we will work on the page linked from the example above (webpage.html)

In [None]:
# Load the exercise page. A timeout keeps the cell from hanging forever, and
# raise_for_status() fails fast on an HTTP error instead of parsing an error page.
r = requests.get("https://keithgalli.github.io/web-scraping/webpage.html", timeout=10)
r.raise_for_status()
# Name the parser explicitly so the parse tree does not depend on which optional
# parsers happen to be installed (and to avoid bs4's unspecified-parser warning).
webpage = bs(r.content, "html.parser")
print(webpage.prettify())

<html>
 <head>
  <title>
   Keith Galli's Page
  </title>
  <style>
   table {
    border-collapse: collapse;
  }
  th {
    padding:5px;
  }
  td {
    border: 1px solid #ddd;
    padding: 5px;
  }
  tr:nth-child(even) {
    background-color: #f2f2f2;
  }
  th {
    padding-top: 12px;
    padding-bottom: 12px;
    text-align: left;
    background-color: #add8e6;
    color: black;
  }
  .block {
  width: 100px;
  /*float: left;*/
    display: inline-block;
    zoom: 1;
  }
  .column {
  float: left;
  height: 200px;
  /*width: 33.33%;*/
  padding: 5px;
  }

  .row::after {
    content: "";
    clear: both;
    display: table;
  }
  </style>
 </head>
 <body>
  <h1>
   Welcome to my page!
  </h1>
  <img src="./images/selfie1.jpg" width="300px"/>
  <h2>
   About me
  </h2>
  <p>
   Hi, my name is Keith and I am a YouTuber who focuses on content related to programming, data science, and machine learning!
  </p>
  <p>
   Here is a link to my channel:
   <a href="https://www.youtube.com/kgmi

In [None]:
# Exercise: grab all of the social links from the webpage (3 approaches).
# First attempt: not correct yet, since this returns every <ul> on the page
# (the fun-facts list as well as the socials list) rather than just the links.
webpage.body.find_all('ul')

[<ul class="fun-facts">
 <li>Owned my dream car in high school <a href="#footer"><sup>1</sup></a></li>
 <li>Middle name is Ronald</li>
 <li>Never had been on a plane until college</li>
 <li>Dunkin Donuts coffee is better than Starbucks</li>
 <li>A favorite book series of mine is <i>Ender's Game</i></li>
 <li>Current video game of choice is <i>Rocket League</i></li>
 <li>The band that I've seen the most times live is the <i>Zac Brown Band</i></li>
 </ul>, <ul class="socials">
 <li class="social instagram"><b>Instagram: </b><a href="https://www.instagram.com/keithgalli/">https://www.instagram.com/keithgalli/</a></li>
 <li class="social twitter"><b>Twitter: </b><a href="https://twitter.com/keithgalli">https://twitter.com/keithgalli</a></li>
 <li class="social linkedin"><b>LinkedIn: </b><a href="https://www.linkedin.com/in/keithgalli/">https://www.linkedin.com/in/keithgalli/</a></li>
 <li class="social tiktok"><b>TikTok: </b><a href="https://www.tiktok.com/@keithgalli">https://www.tiktok.c

In [None]:
# Another approach: a CSS selector for the <a> tags inside <ul class="socials">.
links = webpage.select('ul.socials a')
links

[<a href="https://www.instagram.com/keithgalli/">https://www.instagram.com/keithgalli/</a>,
 <a href="https://twitter.com/keithgalli">https://twitter.com/keithgalli</a>,
 <a href="https://www.linkedin.com/in/keithgalli/">https://www.linkedin.com/in/keithgalli/</a>,
 <a href="https://www.tiktok.com/@keithgalli">https://www.tiktok.com/@keithgalli</a>]

In [None]:
# Keep only the URL (the href attribute) of each link.
actual_links = [link['href'] for link in links]
actual_links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

In [None]:
# Another approach: locate the <ul class="socials"> element with find(),
# then collect the <a> tags inside it.
ulist = webpage.find("ul", attrs={"class": "socials"})
links2 = ulist.find_all('a')
# Iterate over links2 (this cell's own result). The original looped over `links`
# from an earlier cell, so this approach was never actually exercised.
actual_links2 = [link['href'] for link in links2]
actual_links2

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

In [None]:
# Last approach: select the <a> tags inside every <li class="social"> item.
links3= webpage.select("li.social a")
# Iterate over links3 (this cell's own result). The original looped over `links`
# from an earlier cell, so this selector was never actually exercised.
actual_links3 = [link['href'] for link in links3]
actual_links3

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

2nd exercise: scrape the table

In [None]:
# Inspecting the page shows that the whole table can be grabbed through its
# "hockey-stats" class; select() returns a list, so take the first match.
table = webpage.select("table.hockey-stats")[0]
table

<table class="hockey-stats">
<thead>
<tr>
<th class="season" data-sort="">S</th>
<th class="team" data-sort="team">Team</th>
<th class="league" data-sort="league">League</th>
<th class="regular gp" data-sort="gp">GP</th>
<th class="regular g" data-sort="g">G</th>
<th class="regular a" data-sort="a">A</th>
<th class="regular tp" data-sort="tp">TP</th>
<th class="regular pim" data-sort="pim">PIM</th>
<th class="regular pm" data-sort="pm">+/-</th>
<th class="separator"> </th>
<th class="postseason">POST</th>
<th class="postseason gp" data-sort="playoffs-gp">GP</th>
<th class="postseason g" data-sort="playoffs-g">G</th>
<th class="postseason a" data-sort="playoffs-a">A</th>
<th class="postseason tp" data-sort="playoffs-tp">TP</th>
<th class="postseason pim" data-sort="playoffs-pim">PIM</th>
<th class="postseason pm" data-sort="playoffs-pm">+/-</th>
</tr>
</thead>
<tbody>
<tr class="team-continent-NA ">
<td class="season sorted">
                  2014-15
              </td>
<td class="team

In [None]:
# Goal: load this table into a pandas DataFrame.
import pandas as pd
# The column headers are the <th> cells of the table's <thead>.
columns = table.find('thead').find_all('th')
columns

[<th class="season" data-sort="">S</th>,
 <th class="team" data-sort="team">Team</th>,
 <th class="league" data-sort="league">League</th>,
 <th class="regular gp" data-sort="gp">GP</th>,
 <th class="regular g" data-sort="g">G</th>,
 <th class="regular a" data-sort="a">A</th>,
 <th class="regular tp" data-sort="tp">TP</th>,
 <th class="regular pim" data-sort="pim">PIM</th>,
 <th class="regular pm" data-sort="pm">+/-</th>,
 <th class="separator"> </th>,
 <th class="postseason">POST</th>,
 <th class="postseason gp" data-sort="playoffs-gp">GP</th>,
 <th class="postseason g" data-sort="playoffs-g">G</th>,
 <th class="postseason a" data-sort="playoffs-a">A</th>,
 <th class="postseason tp" data-sort="playoffs-tp">TP</th>,
 <th class="postseason pim" data-sort="playoffs-pim">PIM</th>,
 <th class="postseason pm" data-sort="playoffs-pm">+/-</th>]

In [None]:
# Keep only the text of each header cell. Note that some names repeat
# (GP, G, A, TP, PIM and +/- appear once for the regular season and once for the
# postseason) and the separator column's name is a non-breaking space ('\xa0').
column_names = [c.string for c in columns]
column_names

['S',
 'Team',
 'League',
 'GP',
 'G',
 'A',
 'TP',
 'PIM',
 '+/-',
 '\xa0',
 'POST',
 'GP',
 'G',
 'A',
 'TP',
 'PIM',
 '+/-']

In [None]:
# Collect the whitespace-stripped text of every <td> cell, one inner list per
# <tr> row of the table body.
table_rows = table.find('tbody').find_all("tr")
l = [
    [cell.get_text().strip() for cell in tr.find_all('td')]
    for tr in table_rows
]
print(l)

[['2014-15', 'MIT (Mass. Inst. of Tech.)', 'ACHA II', '17', '3', '9', '12', '20', '', '|', '', '', '', '', '', '', ''], ['2015-16', 'MIT (Mass. Inst. of Tech.)', 'ACHA II', '9', '1', '1', '2', '2', '', '|', '', '', '', '', '', '', ''], ['2016-17', 'MIT (Mass. Inst. of Tech.)', 'ACHA II', '12', '5', '5', '10', '8', '0', '|', '', '', '', '', '', '', ''], ['2017-18', 'Did not play', '', '', '', '', '', '', '', '|', '', '', '', '', '', '', ''], ['2018-19', 'MIT (Mass. Inst. of Tech.)', 'ACHA III', '8', '5', '10', '15', '8', '', '|', '', '', '', '', '', '', '']]


In [None]:
# Build the DataFrame from the scraped rows, labelling the columns with the header text.
df = pd.DataFrame(l, columns=column_names)
df

Unnamed: 0,S,Team,League,GP,G,A,TP,PIM,+/-,Unnamed: 10,POST,GP.1,G.1,A.1,TP.1,PIM.1,+/-.1
0,2014-15,MIT (Mass. Inst. of Tech.),ACHA II,17.0,3.0,9.0,12.0,20.0,,|,,,,,,,
1,2015-16,MIT (Mass. Inst. of Tech.),ACHA II,9.0,1.0,1.0,2.0,2.0,,|,,,,,,,
2,2016-17,MIT (Mass. Inst. of Tech.),ACHA II,12.0,5.0,5.0,10.0,8.0,0.0,|,,,,,,,
3,2017-18,Did not play,,,,,,,,|,,,,,,,
4,2018-19,MIT (Mass. Inst. of Tech.),ACHA III,8.0,5.0,10.0,15.0,8.0,,|,,,,,,,


In [None]:
# Preview the first rows of the DataFrame.
df.head()

Unnamed: 0,S,Team,League,GP,G,A,TP,PIM,+/-,Unnamed: 10,POST,GP.1,G.1,A.1,TP.1,PIM.1,+/-.1
0,2014-15,MIT (Mass. Inst. of Tech.),ACHA II,17.0,3.0,9.0,12.0,20.0,,|,,,,,,,
1,2015-16,MIT (Mass. Inst. of Tech.),ACHA II,9.0,1.0,1.0,2.0,2.0,,|,,,,,,,
2,2016-17,MIT (Mass. Inst. of Tech.),ACHA II,12.0,5.0,5.0,10.0,8.0,0.0,|,,,,,,,
3,2017-18,Did not play,,,,,,,,|,,,,,,,
4,2018-19,MIT (Mass. Inst. of Tech.),ACHA III,8.0,5.0,10.0,15.0,8.0,,|,,,,,,,


In [None]:
# Keep only the seasons that were actually played: build the boolean mask
# first, then use it to select the matching rows.
played = df['Team'] != "Did not play"
df.loc[played]

Unnamed: 0,S,Team,League,GP,G,A,TP,PIM,+/-,Unnamed: 10,POST,GP.1,G.1,A.1,TP.1,PIM.1,+/-.1
0,2014-15,MIT (Mass. Inst. of Tech.),ACHA II,17,3,9,12,20,,|,,,,,,,
1,2015-16,MIT (Mass. Inst. of Tech.),ACHA II,9,1,1,2,2,,|,,,,,,,
2,2016-17,MIT (Mass. Inst. of Tech.),ACHA II,12,5,5,10,8,0.0,|,,,,,,,
4,2018-19,MIT (Mass. Inst. of Tech.),ACHA III,8,5,10,15,8,,|,,,,,,,


In [None]:
# df.style builds a separate Styler object and set_caption() returns that Styler;
# df itself is never modified. The styled object therefore has to be the cell's
# final expression for the caption to render (displaying plain `df` drops it).
df.style.set_caption("Hello World")

Unnamed: 0,S,Team,League,GP,G,A,TP,PIM,+/-,Unnamed: 10,POST,GP.1,G.1,A.1,TP.1,PIM.1,+/-.1
0,2014-15,MIT (Mass. Inst. of Tech.),ACHA II,17.0,3.0,9.0,12.0,20.0,,|,,,,,,,
1,2015-16,MIT (Mass. Inst. of Tech.),ACHA II,9.0,1.0,1.0,2.0,2.0,,|,,,,,,,
2,2016-17,MIT (Mass. Inst. of Tech.),ACHA II,12.0,5.0,5.0,10.0,8.0,0.0,|,,,,,,,
3,2017-18,Did not play,,,,,,,,|,,,,,,,
4,2018-19,MIT (Mass. Inst. of Tech.),ACHA III,8.0,5.0,10.0,15.0,8.0,,|,,,,,,,


Exercise: find all fun facts that use the word "is"

In [None]:
# Every <li> item of the <ul class="fun-facts"> list.
facts = webpage.select("ul.fun-facts li")
facts

[<li>Owned my dream car in high school <a href="#footer"><sup>1</sup></a></li>,
 <li>Middle name is Ronald</li>,
 <li>Never had been on a plane until college</li>,
 <li>Dunkin Donuts coffee is better than Starbucks</li>,
 <li>A favorite book series of mine is <i>Ender's Game</i></li>,
 <li>Current video game of choice is <i>Rocket League</i></li>,
 <li>The band that I've seen the most times live is the <i>Zac Brown Band</i></li>]

In [None]:
# Keep the full text of each fun fact that uses "is" as a whole word.
# find_all(string=...) returns only the individual text node that matched, so
# facts containing nested tags (e.g. <i>) came back truncated and facts without
# a match left empty lists behind. The bare "is" pattern would also match inside
# longer words such as "this"; \b restricts it to the standalone word.
word_is = re.compile(r"\bis\b")
fact_texts = (fact.get_text() for fact in facts)
facts_with = [text for text in fact_texts if word_is.search(text)]
facts_with

[[],
 ['Middle name is Ronald'],
 [],
 ['Dunkin Donuts coffee is better than Starbucks'],
 ['A favorite book series of mine is '],
 ['Current video game of choice is '],
 ["The band that I've seen the most times live is the "]]