# 8.01 - Lab | Web Scraping Single Page

In [3]:
# 1. import libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [4]:
# 2. find url and store it in a variable
# Billboard Hot 100 chart page; this is the address the GET request in step 3 downloads.
url = "https://www.billboard.com/charts/hot-100"

In [5]:
# 3. download html with a get request
# `timeout` is in seconds: requests applies no timeout unless one is given, so
# without it this cell could hang forever on an unresponsive server.
response = requests.get(url, timeout=10)
# Stop here on a 4xx/5xx reply instead of letting later cells parse an error page.
response.raise_for_status()
response.status_code # 200 status code means OK!

200

In [6]:
# 4. parse html (create the 'soup')
# Feed the downloaded response body to Beautiful Soup, using the "html.parser" backend.
soup = BeautifulSoup(markup=response.content, features="html.parser")

In [7]:
# 5. retrieve/extract the desired info
# Every lookup below collects the <span> tags whose class attribute matches the
# given class string; one result list per chart column.

# song titles
songs = soup.find_all(
    "span",
    attrs={"class": "chart-element__information__song text--truncate color--primary"},
)

# artists
artists = soup.find_all(
    "span",
    attrs={"class": "chart-element__information__artist text--truncate color--secondary"},
)

# last week
last_week_ranks = soup.find_all(
    "span",
    attrs={"class": "chart-element__meta text--center color--secondary text--last"},
)

# peak rank
peak_ranks = soup.find_all(
    "span",
    attrs={"class": "chart-element__meta text--center color--secondary text--peak"},
)

# weeks on chart
weeks_on_chart = soup.find_all(
    "span",
    attrs={"class": "chart-element__meta text--center color--secondary text--week"},
)

In [8]:
# 6. Get the text
# Rebind each name to a new list of strings rather than overwriting the result
# items in place. Items that are already strings are passed through, so running
# this cell a second time on a live kernel is harmless (the in-place version
# failed on a re-run because `str` has no `.getText()`).
songs, artists, last_week_ranks, peak_ranks, weeks_on_chart = (
    [item if isinstance(item, str) else item.get_text() for item in column]
    for column in (songs, artists, last_week_ranks, peak_ranks, weeks_on_chart)
)

In [9]:
# 7. Build a dataframe
# One column per scraped list; keyword order sets the column order.

billboard = pd.DataFrame(
    data=dict(
        song=songs,
        artist=artists,
        last_week_rank=last_week_ranks,
        peak_rank=peak_ranks,
        weeks_on_chart=weeks_on_chart,
    )
)

In [10]:
# Show the assembled chart table as this cell's output.
billboard

Unnamed: 0,song,artist,last_week_rank,peak_rank,weeks_on_chart
0,My Universe,Coldplay x BTS,-,1,1
1,Stay,The Kid LAROI & Justin Bieber,1,1,12
2,Industry Baby,Lil Nas X & Jack Harlow,2,2,10
3,Way 2 Sexy,Drake Featuring Future & Young Thug,3,1,4
4,Fancy Like,Walker Hayes,5,5,15
...,...,...,...,...,...
95,Pipe Down,Drake,68,14,4
96,Papi's Home,Drake,66,8,4
97,Chosen,Blxst & Tyga Featuring Ty Dolla $ign,-,98,1
98,Toxic Punk,YoungBoy Never Broke Again,-,99,1


In [11]:
# Small in-memory HTML sample that the following cells parse and navigate,
# so the Beautiful Soup examples need no network access.
# The markup has no closing </body> tag; the parser supplies one (it appears in
# the prettify() output).
html_doc = """
<!DOCTYPE html>
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
</html>
"""

In [12]:
# Parse the sample document.
# NOTE(review): this rebinds `soup`, replacing the Billboard page parsed in step 4;
# re-running the step 5 lookups after this cell would search the sample document instead.
soup = BeautifulSoup(html_doc, 'html.parser')

# prettify() returns the parsed tree as an indented string, one tag per line.
# The indentation is not always perfect.
print(soup.prettify())

<!DOCTYPE html>
<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    Elsie
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>



In [13]:
# Attribute-style navigation: a tag name used as an attribute returns that tag.
soup.title

<title>The Dormouse's story</title>

In [14]:
# .name gives the tag's name as a plain string.
soup.title.name

'title'

In [15]:
# .string gives the text held inside the tag.
soup.title.string

"The Dormouse's story"

In [17]:
# .parent steps up to the enclosing tag; for <title> that is <head>.
soup.title.parent.name

'head'

In [18]:
# Attribute access returns only the first <p>, although the document contains three.
soup.p

<p class="title"><b>The Dormouse's story</b></p>

In [19]:
# Subscripting a tag reads one of its attributes; `class` comes back as a list of class names.
soup.p["class"]

['title']

In [20]:
# find_all() returns every matching tag as a list; here, all three <p> elements.
soup.find_all("p")

[<p class="title"><b>The Dormouse's story</b></p>,
 <p class="story">Once upon a time there were three little sisters; and their names were
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
 and they lived at the bottom of a well.</p>,
 <p class="story">...</p>]

In [21]:
# Take the first <a> from the result list and read its href attribute.
soup.find_all("a")[0].get("href")

'http://example.com/elsie'

In [22]:
# Print the target of every link. .get('href') reads the attribute dictionary-style;
# per the Beautiful Soup docs it returns None rather than raising when a tag lacks it.
for link in soup.find_all('a'):
    print(link.get('href'))


http://example.com/elsie
http://example.com/lacie
http://example.com/tillie


In [23]:
# Same loop using subscript access; unlike .get(), link['href'] raises KeyError
# for a tag that has no href attribute.
for link in soup.find_all('a'):
    print(link['href'])

http://example.com/elsie
http://example.com/lacie
http://example.com/tillie


In [24]:
# get_text() on the whole document returns all of its text with the tags removed.
print(soup.get_text())



The Dormouse's story

The Dormouse's story
Once upon a time there were three little sisters; and their names were
Elsie,
Lacie and
Tillie;
and they lived at the bottom of a well.
...




In [25]:
# get_text() on each <a> tag yields just that link's visible text.
for a in soup.find_all('a'):
    print(a.get_text())

Elsie
Lacie
Tillie
