In [3]:
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

Load the first page

In [2]:
url = "https://keithgalli.github.io/web-scraping/example.html"

# A timeout keeps the cell from hanging forever if the host never answers
r = requests.get(url, timeout=10)
# Stop here on a 4xx/5xx reply instead of parsing an error page further down
r.raise_for_status()

Convert to a BeautifulSoup object

In [4]:
# Name the parser explicitly: when none is given, BeautifulSoup picks whichever
# parser happens to be installed, so the parsed tree can vary between machines
soup = bs(r.content, "html.parser")

Print out the HTML

In [6]:
# print(soup) would also work, but prettify() indents the markup by nesting
# depth, which makes it easier to read:

print(soup.prettify())

<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



In [8]:
#help(bs)

Find and find_all

In [11]:
first_header = soup.find("h2")
first_header
# find() returns only the first matching tag

<h2>A Header</h2>

In [13]:
headers = soup.find_all("h2")
headers
# find_all() returns a list of every matching tag

[<h2>A Header</h2>, <h2>Another header</h2>]

In [15]:
# Pass a list of tag names to search for several kinds of tag at once
first_header = soup.find(["h1", "h2"])
first_header
# find() returns the first occurrence of any of the tags named in the list

<h1>HTML Webpage</h1>

In [16]:
# The result lists the <h1> before the <h2> tags even though "h2" comes first
# in the list passed in
headers = soup.find_all(["h2", "h1"])
headers

[<h1>HTML Webpage</h1>, <h2>A Header</h2>, <h2>Another header</h2>]

In [20]:
# Start with every <p> on the page; the next cell narrows this down by
# passing attributes to find_all
paras = soup.find_all("p")
paras

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>,
 <p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [21]:
# attrs takes a dict of attribute name -> required value; here only the <p>
# whose id is "paragraph-id" matches
paras = soup.find_all("p", attrs={"id" : "paragraph-id"})
paras

[<p id="paragraph-id"><b>Some bold text</b></p>]

In [22]:
# find/find_all calls can be nested: start by grabbing the <body> tag
body = soup.find("body")
body

<body>
<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>
<h2>A Header</h2>
<p><i>Some italicized text</i></p>
<h2>Another header</h2>
<p id="paragraph-id"><b>Some bold text</b></p>
</body>

In [24]:
# Calling find() on a tag instead of the soup searches only within that tag
div = body.find("div")
div

<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>

In [25]:
# Go one level deeper: take the <h1> from inside that div
header = div.find("h1")
header

<h1>HTML Webpage</h1>

In [27]:
# Search for specific strings in find/find_all.
# A plain str passed as string= has to equal the tag's text exactly, so the
# partial word "Some" matches nothing (hence the empty list).
soup.find_all("p", string = "Some")

[]

In [28]:
# A compiled regular expression matches any tag whose text contains the
# pattern, so the partial word "Some" is enough here
soup.find_all("p", string=re.compile("Some"))

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [31]:
# Accept either capitalisation of "header" inside the <h2> text
header_pattern = re.compile("(H|h)eader")
headers = soup.find_all("h2", string=header_pattern)
headers

[<h2>A Header</h2>, <h2>Another header</h2>]

### Select Method

Works like selecting elements with CSS selectors.

In [32]:
# select() takes a CSS selector; a bare tag name matches every tag of that name
content = soup.select("p")
content

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>,
 <p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [34]:
# "h2 ~ p" uses the CSS general-sibling combinator: <p> tags that come after
# an <h2> under the same parent
soup.select("h2 ~ p")

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [37]:
# The <b> tags nested inside the <p> whose id is "paragraph-id"
soup.select("p#paragraph-id b")

[<b>Some bold text</b>]

In [39]:
# "body > p" keeps only the <p> tags that are direct children of <body>
paras = soup.select("body > p")
print(paras)

# select() also works on an individual tag, searching only inside that tag
for p in paras:
    print(p.select("i"))

[<p><i>Some italicized text</i></p>, <p id="paragraph-id"><b>Some bold text</b></p>]
[<i>Some italicized text</i>]
[]


### Get different Properties of the HTML

In [41]:
# .string returns the text held by the tag
header = soup.find("h1")
header.string

'HTML Webpage'

In [45]:
# get_text() joins the text of all tags nested inside the div into one string
div = soup.find("div")
print(div.get_text())


HTML Webpage
Link to more interesting example: keithgalli.github.io/web-scraping/webpage.html



In [90]:
# How to get the link: find("a") returns the first <a> Tag, and indexing the
# Tag like a dict reads one of its attributes.
# NOTE(review): the saved output below shows a youtube.com link that does not
# appear in the example.html markup printed earlier — presumably this cell was
# last run after `soup` had been rebound to another page; re-run the notebook
# top to bottom to confirm.
link = soup.find("a")
print(link)
print(link["href"])
print(type(link))

<a href="https://www.youtube.com/kgmit">youtube.com/kgmit</a>
https://www.youtube.com/kgmit
<class 'bs4.element.Tag'>


### Code navigation

In [62]:
# Tag names can be chained as attributes to walk down the tree; each step
# yields the first tag of that name
soup.body.div.p.a["href"]

'https://keithgalli.github.io/web-scraping/webpage.html'

In [64]:
# important syntax:
# parent, sibling, child
# find_next_siblings() lists the tags that follow the div at the same level
soup.body.find("div").find_next_siblings()

[<h2>A Header</h2>,
 <p><i>Some italicized text</i></p>,
 <h2>Another header</h2>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

## Practice

In [178]:
url = "https://keithgalli.github.io/web-scraping/webpage.html"

# A timeout keeps the cell from hanging forever if the host never answers
r = requests.get(url, timeout=10)
# Stop here on a 4xx/5xx reply instead of scraping an error page
r.raise_for_status()
# Name the parser explicitly so the parsed tree does not depend on which
# optional parsers are installed
soup = bs(r.content, "html.parser")

### 1. Grab all of the social links from the webpage in three different ways

Use SELECT

In [154]:
# select("a") returns every <a> tag on the page, so it is too broad for
# picking out just the social links
socials = soup.select("a")

In [162]:
# In a CSS selector "#" targets an id and "." a class name, so this selects
# the <a> tags inside the <ul> with class "socials"
links = soup.select("ul.socials a")
links

[<a href="https://www.instagram.com/keithgalli/">https://www.instagram.com/keithgalli/</a>,
 <a href="https://twitter.com/keithgalli">https://twitter.com/keithgalli</a>,
 <a href="https://www.linkedin.com/in/keithgalli/">https://www.linkedin.com/in/keithgalli/</a>,
 <a href="https://www.tiktok.com/@keithgalli">https://www.tiktok.com/@keithgalli</a>]

In [163]:
# Keep only the URL stored in each anchor's href attribute
actual_links = [anchor["href"] for anchor in links]
actual_links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

Use FIND_ALL

In [183]:
# Locate the <ul> with class "socials" first, then collect the anchors inside it
ulinks = soup.find("ul", class_="socials")
links = ulinks.find_all("a")
links


[<a href="https://www.instagram.com/keithgalli/">https://www.instagram.com/keithgalli/</a>,
 <a href="https://twitter.com/keithgalli">https://twitter.com/keithgalli</a>,
 <a href="https://www.linkedin.com/in/keithgalli/">https://www.linkedin.com/in/keithgalli/</a>,
 <a href="https://www.tiktok.com/@keithgalli">https://www.tiktok.com/@keithgalli</a>]

In [186]:
# Reduce each anchor tag to the URL in its href attribute
reallinks = [anchor["href"] for anchor in links]
reallinks

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

In [189]:
# Third approach: each social <li> carries the class "social", so select the
# anchors nested inside those list items and read their href values
social_anchors = soup.select("li.social a")
links = [anchor["href"] for anchor in social_anchors]
links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

## Scrape the Table

In [196]:
# Get the entire table. select_one() returns the first match or None, so the
# assert gives a clear message if the table is missing from the page.
table = soup.select_one("table.hockey-stats")
assert table is not None, "no table with class 'hockey-stats' found on the page"

In [209]:
# Table to DataFrame, step 1: read the column labels from the <th> cells
# inside the table's <thead>
header_cells = table.find("thead").find_all("th")
columns = [cell.text for cell in header_cells]
columns

['S',
 'Team',
 'League',
 'GP',
 'G',
 'A',
 'TP',
 'PIM',
 '+/-',
 '\xa0',
 'POST',
 'GP',
 'G',
 'A',
 'TP',
 'PIM',
 '+/-']

In [205]:
# Show the table's raw markup to inspect its structure
table

<table class="hockey-stats">
<thead>
<tr>
<th class="season" data-sort="">S</th>
<th class="team" data-sort="team">Team</th>
<th class="league" data-sort="league">League</th>
<th class="regular gp" data-sort="gp">GP</th>
<th class="regular g" data-sort="g">G</th>
<th class="regular a" data-sort="a">A</th>
<th class="regular tp" data-sort="tp">TP</th>
<th class="regular pim" data-sort="pim">PIM</th>
<th class="regular pm" data-sort="pm">+/-</th>
<th class="separator"> </th>
<th class="postseason">POST</th>
<th class="postseason gp" data-sort="playoffs-gp">GP</th>
<th class="postseason g" data-sort="playoffs-g">G</th>
<th class="postseason a" data-sort="playoffs-a">A</th>
<th class="postseason tp" data-sort="playoffs-tp">TP</th>
<th class="postseason pim" data-sort="playoffs-pim">PIM</th>
<th class="postseason pm" data-sort="playoffs-pm">+/-</th>
</tr>
</thead>
<tbody>
<tr class="team-continent-NA">
<td class="season sorted">
                  2014-15
              </td>
<td class="team"