In [1]:
from requests import get 
from bs4 import BeautifulSoup
import os
import pandas as pd

# Intro to Web Scraping 
- use requests to download the HTML 
- use BeautifulSoup to parse that HTML to get the thing(s) you need

## Process 
- Step 1: Use the 'requests' library to make an HTTP request across the web 
- Step 2: Use the 'response.text' property on the 'response' object to get the text of the HTML 
- Step 3: Pass the HTML to BeautifulSoup and pull out the element(s) you need 

In [3]:
# Address of the demo page that is requested in the next cell.
url = "https://site-to-scrape.glitch.me"

In [4]:
# Identify ourselves to the server with a custom User-Agent header.
headers = {'User-Agent': 'Jemison Cohort'}
# requests applies no timeout unless one is passed, so without it this cell
# could hang indefinitely if the server never answers.
response = get(url, headers=headers, timeout=10)
# Stop here on a 4xx/5xx status instead of parsing an error page further down.
response.raise_for_status()
response

<Response [200]>

In [5]:
# The HTML of the page as one Python string.
response.text

'<!DOCTYPE html>\n<html lang="en">\n  <head>\n    <title>Site to Scrape!</title>\n    <meta charset="utf-8">\n    <meta http-equiv="X-UA-Compatible" content="IE=edge">\n    <meta name="viewport" content="width=device-width, initial-scale=1">\n    \n    <!-- import the webpage\'s stylesheet -->\n    <link rel="stylesheet" href="/style.css">\n    \n    <!-- import the webpage\'s javascript file -->\n    <script src="/script.js" defer></script>\n  </head>  \n  <body>\n    <header>\n      <h1>This is the header!</h1>\n      <hr>\n    </header>\n    \n    <main>\n      <div>\n        <h1 class="first">\n        This is the main\n        </h1>\n        <h2>\n          This is an h2 of main\n        </h2>\n        <h3>\n          H3 inside of first div inside of main\n        </h3>\n      </div>\n      <div>\n        <h3 class="first">\n          H3 inside of second div inside of main.\n        </h3>\n        <p>\n          Here\'s some text content for us to scrape! 👽\n        </p>\n        

In [6]:
# Parse the downloaded page with the 'html.parser' parser.
# response.content is the raw bytes of the body, so BeautifulSoup handles the decoding itself.
soup = BeautifulSoup(response.content, 'html.parser')

In [7]:
# Displaying the soup object shows the parsed document as HTML.
soup

<!DOCTYPE html>

<html lang="en">
<head>
<title>Site to Scrape!</title>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<!-- import the webpage's stylesheet -->
<link href="/style.css" rel="stylesheet"/>
<!-- import the webpage's javascript file -->
<script defer="" src="/script.js"></script>
</head>
<body>
<header>
<h1>This is the header!</h1>
<hr/>
</header>
<main>
<div>
<h1 class="first">
        This is the main
        </h1>
<h2>
          This is an h2 of main
        </h2>
<h3>
          H3 inside of first div inside of main
        </h3>
</div>
<div>
<h3 class="first">
          H3 inside of second div inside of main.
        </h3>
<p>
          Here's some text content for us to scrape! 👽
        </p>
<p>
          Here's another paragraph of content! ☠️
        </p>
<a href="https://ryanorsinger.com">Click here to visit my homepage</a>
</div>
</main>
<footer>
<h1>This is the f

In [8]:
# Dot access with a tag name returns that element, tags included.
soup.title

<title>Site to Scrape!</title>

In [9]:
# The page contains more than one <h1>; dot access gives back only the first (the one in the header).
soup.h1

<h1>This is the header!</h1>

In [10]:
# Dot syntax returns the first match only.
soup.h2

<h2>
          This is an h2 of main
        </h2>

In [11]:
# .text on the whole soup gives the text of every element in one string, whitespace and newlines included.
soup.text

"\n\n\nSite to Scrape!\n\n\n\n\n\n\n\n\n\n\nThis is the header!\n\n\n\n\n\n        This is the main\n        \n\n          This is an h2 of main\n        \n\n          H3 inside of first div inside of main\n        \n\n\n\n          H3 inside of second div inside of main.\n        \n\n          Here's some text content for us to scrape! 👽\n        \n\n          Here's another paragraph of content! ☠️\n        \nClick here to visit my homepage\n\n\n\nThis is the footer\n\n\n\n\n"

In [12]:
# print() turns the \n escapes into real line breaks, which is easier to read than the raw string.
print(soup.text)




Site to Scrape!










This is the header!





        This is the main
        

          This is an h2 of main
        

          H3 inside of first div inside of main
        



          H3 inside of second div inside of main.
        

          Here's some text content for us to scrape! 👽
        

          Here's another paragraph of content! ☠️
        
Click here to visit my homepage



This is the footer







In [13]:
# soup.<element> returns a BeautifulSoup Tag; .text gives the string inside it,
# still padded with the whitespace from the HTML source.
soup.h2.text

'\n          This is an h2 of main\n        '

In [14]:
# .text is an ordinary str, so string methods and slicing apply:
# strip the surrounding whitespace, then keep the last 5 characters.
soup.h2.text.strip()[-5:]

' main'

In [16]:
# find_all returns a ResultSet, which is like a list (so it can be indexed)
# but has more BeautifulSoup functionality.
soup.find_all('h3')[0]

<h3>
          H3 inside of first div inside of main
        </h3>

In [17]:
# Without an index, find_all gives back every <h3> on the page.
soup.find_all('h3')

[<h3>
           H3 inside of first div inside of main
         </h3>,
 <h3 class="first">
           H3 inside of second div inside of main.
         </h3>]

In [18]:
# Chain dot access and .text to get just the title string, without the tags.
soup.title.text 

'Site to Scrape!'

In [20]:
# select takes a CSS selector and returns all matching elements.
soup.select("p")

[<p>
           Here's some text content for us to scrape! 👽
         </p>,
 <p>
           Here's another paragraph of content! ☠️
         </p>]

In [21]:
# Loop over the selected paragraphs to work with each one's text individually.
for p in soup.select("p"):
    print(p.text)


          Here's some text content for us to scrape! 👽
        

          Here's another paragraph of content! ☠️
        


In [22]:
# "a" is the anchor (link) tag; select_one returns a single element rather than a list.
soup.select_one("a")

<a href="https://ryanorsinger.com">Click here to visit my homepage</a>

In [24]:
# .select returns a ResultSet even when only one element matches.
soup.select("a")

[<a href="https://ryanorsinger.com">Click here to visit my homepage</a>]

In [25]:
# The only <img> on the page is the counter image in the footer.
soup.select("img")

[<img alt="" aria-hidden="true" src="https://traffic-analytics.glitch.me/counter.png?fallback=MY_WEBSITE&amp;color=black" style="vertical-align: bottom;"/>]

In [26]:
# Selecting something that doesn't exist returns an empty result rather than raising an error.
soup.select('h5')

[]

In [28]:
# The datatype matters when scraping: select gives back a ResultSet, not a Tag,
# so index into it (or loop over it) before using Tag attributes such as .text.
type(soup.select("body"))

bs4.element.ResultSet

In [29]:
# Index the ResultSet to get the <body> element, then read its text.
soup.select("body")[0].text

"\n\nThis is the header!\n\n\n\n\n\n        This is the main\n        \n\n          This is an h2 of main\n        \n\n          H3 inside of first div inside of main\n        \n\n\n\n          H3 inside of second div inside of main.\n        \n\n          Here's some text content for us to scrape! 👽\n        \n\n          Here's another paragraph of content! ☠️\n        \nClick here to visit my homepage\n\n\n\nThis is the footer\n\n\n"

In [30]:
# select_one returns the <footer> element itself, including the elements nested inside it.
soup.select_one("footer")

<footer>
<h1>This is the footer</h1>
<img alt="" aria-hidden="true" src="https://traffic-analytics.glitch.me/counter.png?fallback=MY_WEBSITE&amp;color=black" style="vertical-align: bottom;"/>
</footer>

In [31]:
# .text on the footer keeps only its text; the <img> contributes nothing.
soup.select_one("footer").text

'\nThis is the footer\n\n'

In [32]:
# Dot access also works on a selected element, drilling down to the <img> inside the footer.
soup.select_one("footer").img

<img alt="" aria-hidden="true" src="https://traffic-analytics.glitch.me/counter.png?fallback=MY_WEBSITE&amp;color=black" style="vertical-align: bottom;"/>

In [33]:
# Use dictionary syntax on an element to access the value of one of its HTML attributes.
soup.select_one("footer").img["src"]

'https://traffic-analytics.glitch.me/counter.png?fallback=MY_WEBSITE&color=black'

In [34]:
# The anchor element again; its link target is stored in the href attribute.
soup.select_one("a")

<a href="https://ryanorsinger.com">Click here to visit my homepage</a>

In [36]:
# Read the link target from the anchor's href attribute.
# NOTE(review): this rebinds `url`, which earlier held the address of the page being scraped.
# Re-running the request cell after this one would fetch the homepage instead — consider a distinct name.
url = soup.select_one("a")["href"]
url

'https://ryanorsinger.com'