In [66]:
import requests 
# The requests module allows you to send HTTP requests using Python.
# The HTTP request returns a Response Object with all the response data (content, encoding, status, etc).
from bs4 import BeautifulSoup as bs 
import re
import pandas as pd 

# Load our first page ✨

In [3]:
# Load the webpage content; the timeout stops a stalled connection from hanging
# the kernel, and raise_for_status() turns an HTTP error page into an exception
# instead of silently parsing it.
r = requests.get("https://keithgalli.github.io/web-scraping/example.html", timeout=10)
r.raise_for_status()

# Convert to a Beautiful Soup object. The parser is named explicitly so the
# resulting tree does not depend on which optional parsers happen to be installed
# (and Beautiful Soup does not warn about having to guess one).
soup = bs(r.content, "html.parser")

# Print out HTML
print(soup.prettify())


<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



# Start using Beautiful Soup for Scraping 🍲

## **find and find_all**

In [4]:
# find returns only the first matching tag
first_header = soup.find("h2")
first_header

<h2>A Header</h2>

In [5]:
# find_all returns a list with every matching tag
headers = soup.find_all("h2")
headers

[<h2>A Header</h2>, <h2>Another header</h2>]

 ## pass a list 

In [6]:
# Given a list of tag names, find returns the first tag that matches any of them
first_header = soup.find(["h1", "h2"])
first_header

<h1>HTML Webpage</h1>

In [7]:
# Given a list of tag names, find_all returns every tag that matches any of them
headers = soup.find_all(["h1", "h2"])
headers

[<h1>HTML Webpage</h1>, <h2>A Header</h2>, <h2>Another header</h2>]

## pass an attribute "attrs" of the HTML tag

In [8]:
# Baseline before filtering by attribute: every <p> tag on the page
paragraph = soup.find_all("p")
paragraph

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>,
 <p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [9]:
# attrs narrows the search to <p> tags whose id attribute is "paragraph-id"
paragraph = soup.find_all("p", attrs = {"id":"paragraph-id"})
paragraph

[<p id="paragraph-id"><b>Some bold text</b></p>]

## nesting

In [10]:
# find / find_all can be called on a tag as well as on the soup,
# so searches can be chained: <body> -> <div> -> <h1>

body = soup.find("body")
div = body.find("div")
header = div.find("h1")
header

<h1>HTML Webpage</h1>


## finding strings using "string" and regex

In [11]:
# The string argument of find / find_all filters tags by their text;
# a compiled regex is used here to keep the <p> tags whose text contains "Some"

par = soup.find_all("p", string = re.compile("Some"))
par

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [12]:
# The alternation (H|h) lets the regex match both "Header" and "header"
par_2 = soup.find_all("h2", string = re.compile("(H|h)eader"))
par_2

[<h2>A Header</h2>, <h2>Another header</h2>]

## **Select (CSS selector)**

### LOGIC BEHIND SELECT IS PRETTY MUCH ABOUT CSS SELECTORS
## [Link to CSS selectors](https://www.w3schools.com/cssref/css_selectors.php)

In [13]:
# Show the <body> markup that the CSS selector examples are run against
print(soup.body.prettify())

<body>
 <div align="middle">
  <h1>
   HTML Webpage
  </h1>
  <p>
   Link to more interesting example:
   <a href="https://keithgalli.github.io/web-scraping/webpage.html">
    keithgalli.github.io/web-scraping/webpage.html
   </a>
  </p>
 </div>
 <h2>
  A Header
 </h2>
 <p>
  <i>
   Some italicized text
  </i>
 </p>
 <h2>
  Another header
 </h2>
 <p id="paragraph-id">
  <b>
   Some bold text
  </b>
 </p>
</body>



## Descendants (all elements inside the tag)

In [14]:
# A space between selectors is the descendant combinator:
# "body p" matches <p> tags nested at any depth inside <body>
content = soup.select("body p")
content

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>,
 <p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

## Direct child elements (elements nested exactly ONE level below the parent tag)

In [15]:
# ">" is the child combinator: only <p> tags that are direct children of <body> match,
# so the <p> nested inside the <div> is left out
paragraphs = soup.select("body > p")
paragraphs

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

## Sibling elements (elements on the same level that share the same parent)

In [16]:
# "~" is the subsequent-sibling combinator: it matches every <p> that comes after
# an <h2> under the same parent, not only the one immediately following it
# (the immediately-following case is the "+" combinator)
parag = soup.select("h2 ~ p")
parag

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

## Selecting by #identifier

In [17]:
# "p#paragraph-id b" selects <b> tags inside the <p> whose id is "paragraph-id"
bold_text = soup.select("p#paragraph-id b")
bold_text

[<b>Some bold text</b>]

In [18]:
# select can also be called on individual tags: look for <a> links inside each paragraph
paragraphs = soup.select("body p")
print(paragraphs)

for p in paragraphs:
    # Prints an empty list for a paragraph that contains no <a> tag
    print(p.select("a"))


[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>, <p><i>Some italicized text</i></p>, <p id="paragraph-id"><b>Some bold text</b></p>]
[<a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a>]
[]
[]


## **Getting different properties of HTML**

### Getting string

In [19]:
# .string gives only the text inside the tag, without the markup
header = soup.find("h2")
header.string

'A Header'

### With bigger tags like div or body, which contain more than one child, .string returns None.
### Instead we should use get_text()

In [20]:
# The <div> holds more than one child (<h1> and <p>), so .string has no single
# text to return and gives None
div = soup.find("div")
print(div.prettify())
print(div.string)

<div align="middle">
 <h1>
  HTML Webpage
 </h1>
 <p>
  Link to more interesting example:
  <a href="https://keithgalli.github.io/web-scraping/webpage.html">
   keithgalli.github.io/web-scraping/webpage.html
  </a>
 </p>
</div>

None


In [21]:
# get_text() collects the text of the tag together with the text of everything nested in it
print(div.get_text())


HTML Webpage
Link to more interesting example: keithgalli.github.io/web-scraping/webpage.html



## **Get a specific property from an element**

### Link

In [22]:
# First <a> tag on the page
a = soup.find("a")
a

<a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a>

In [23]:
# Tag attributes are read with dictionary-style indexing; ["href"] gives the link target
a["href"]

'https://keithgalli.github.io/web-scraping/webpage.html'

### Id

In [24]:
# select returns a list, so take its first element before reading the ["id"] attribute
paragraph = soup.select("p#paragraph-id")
paragraph[0]["id"]

'paragraph-id'

## **Code navigation**

### Path Syntax

In [25]:
# Attribute-style access walks down the tree one tag name at a time: body -> div -> h1
soup.body.div.h1.string

'HTML Webpage'

### Know 3 terms: parent, sibling and child

In [26]:
# Reprint the <body> markup to see which tags are parents, siblings and children of each other
print(soup.body.prettify())

<body>
 <div align="middle">
  <h1>
   HTML Webpage
  </h1>
  <p>
   Link to more interesting example:
   <a href="https://keithgalli.github.io/web-scraping/webpage.html">
    keithgalli.github.io/web-scraping/webpage.html
   </a>
  </p>
 </div>
 <h2>
  A Header
 </h2>
 <p>
  <i>
   Some italicized text
  </i>
 </p>
 <h2>
  Another header
 </h2>
 <p id="paragraph-id">
  <b>
   Some bold text
  </b>
 </p>
</body>



### Find_next_siblings(sibling) and find_previous_siblings(sibling)

In [27]:
# Called without arguments, find_next_siblings returns every tag that follows
# the <div> on the same level
soup.body.find("div").find_next_siblings()

[<h2>A Header</h2>,
 <p><i>Some italicized text</i></p>,
 <h2>Another header</h2>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

# **Exercises! 😇**

## [New_Web_Page](https://keithgalli.github.io/web-scraping/webpage.html)

### Load the webpage:

In [None]:
# Load the webpage content; the timeout stops a stalled connection from hanging
# the kernel, and raise_for_status() turns an HTTP error page into an exception
# instead of silently parsing it.
q = requests.get("https://keithgalli.github.io/web-scraping/webpage.html", timeout=10)
q.raise_for_status()

# Convert to a Beautiful Soup object. The parser is named explicitly so the
# resulting tree does not depend on which optional parsers happen to be installed
# (and Beautiful Soup does not warn about having to guess one).
web = bs(q.content, "html.parser")

# Print out HTML
print(web.prettify())

### **1.Grab all the social links (only social networks) from the webpage in 3 different ways 📷**

In [57]:
# 1 using find / find_all
# The social links live inside the <ul> whose class is "socials":
# locate that list first, then collect the href of every <a> inside it
ulist = web.find("ul", attrs = {"class": "socials"})
alist = ulist.find_all("a")
actual_links = [anchor["href"] for anchor in alist]
actual_links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

In [35]:
# 2 using select - the <ul> that holds the social links has class "socials"
# In a CSS selector "." introduces a class and "#" introduces an id
links = web.select("ul.socials a")
[anchor["href"] for anchor in links]
        

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

In [62]:
# 3 using select on the list items: <a> tags inside <li> elements with class "social"
links = web.select("li.social a")
actual_links = [anchor["href"] for anchor in links]
actual_links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

### **2. Scrape the table 📋**

In [None]:
# select returns a list; take the first table with class "hockey-stats"
table = web.select("table.hockey-stats")[0]
print(table.prettify())

### Columns

In [71]:
# Header cells (<th>) taken from the table's <thead>
columns = table.find("thead").find_all("th")
columns

[<th class="season" data-sort="">S</th>,
 <th class="team" data-sort="team">Team</th>,
 <th class="league" data-sort="league">League</th>,
 <th class="regular gp" data-sort="gp">GP</th>,
 <th class="regular g" data-sort="g">G</th>,
 <th class="regular a" data-sort="a">A</th>,
 <th class="regular tp" data-sort="tp">TP</th>,
 <th class="regular pim" data-sort="pim">PIM</th>,
 <th class="regular pm" data-sort="pm">+/-</th>,
 <th class="separator"> </th>,
 <th class="postseason">POST</th>,
 <th class="postseason gp" data-sort="playoffs-gp">GP</th>,
 <th class="postseason g" data-sort="playoffs-g">G</th>,
 <th class="postseason a" data-sort="playoffs-a">A</th>,
 <th class="postseason tp" data-sort="playoffs-tp">TP</th>,
 <th class="postseason pim" data-sort="playoffs-pim">PIM</th>,
 <th class="postseason pm" data-sort="playoffs-pm">+/-</th>]

### Column_names

In [77]:
# Keep just the text of each header cell
column_names = [header_cell.text for header_cell in columns]
column_names

['S',
 'Team',
 'League',
 'GP',
 'G',
 'A',
 'TP',
 'PIM',
 '+/-',
 '\xa0',
 'POST',
 'GP',
 'G',
 'A',
 'TP',
 'PIM',
 '+/-']

### Rows

In [None]:

# Data rows (<tr>) taken from the table's <tbody>
table_rows = table.find("tbody").find_all("tr")
table_rows

### Code from StackOverFlow about scraping the table 

In [84]:
# Build one list of stripped cell texts per table row, then load all rows into a
# DataFrame in a single call. get_text() already returns a str, so no extra str()
# conversion is needed, and distinct names for the row and the cell avoid reusing
# one name for both.
table_data = [
    [cell.get_text().strip() for cell in table_row.find_all("td")]
    for table_row in table_rows
]
df = pd.DataFrame(table_data, columns=column_names)
df


Unnamed: 0,S,Team,League,GP,G,A,TP,PIM,+/-,Unnamed: 10,POST,GP.1,G.1,A.1,TP.1,PIM.1,+/-.1
0,2014-15,MIT (Mass. Inst. of Tech.),ACHA II,17.0,3.0,9.0,12.0,20.0,,|,,,,,,,
1,2015-16,MIT (Mass. Inst. of Tech.),ACHA II,9.0,1.0,1.0,2.0,2.0,,|,,,,,,,
2,2016-17,MIT (Mass. Inst. of Tech.),ACHA II,12.0,5.0,5.0,10.0,8.0,0.0,|,,,,,,,
3,2017-18,Did not play,,,,,,,,|,,,,,,,
4,2018-19,MIT (Mass. Inst. of Tech.),ACHA III,8.0,5.0,10.0,15.0,8.0,,|,,,,,,,


## **3. Grab all the fun facts that use the word "is" 😜**