Web Scraping using Python

Lab 1: Single page scraping

In [63]:
# Task: create a function, scrape_hot100(), that scrapes the current top 100 songs and their respective artists from https://www.billboard.com/charts/hot-100, puts the information into a pandas DataFrame, and saves the DataFrame to a CSV file in the current folder.

In [64]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

#from time import sleep
#from random import randint


In [65]:
# Address of the Billboard Hot 100 chart page passed to scrape_hot100().
url = "https://www.billboard.com/charts/hot-100"
# Optional random pause between requests (currently disabled):
#wait_time = randint(1, 5) # 5

In [66]:
# Function to scrape Billboard Hot 100 songs and artists
def scrape_hot100(url: str, output_path: str = '../data/billboard_hot100.csv',
                  timeout: float = 10) -> 'pd.DataFrame | None':
    """Scrape song titles and artists from a Billboard chart page and save them as CSV.

    Parameters
    ----------
    url : str
        Address of the chart page to download.
    output_path : str, optional
        Where the CSV file is written. The default uses forward slashes so the
        same relative location works on Windows as well as on POSIX systems.
    timeout : float, optional
        Seconds to wait for the server before giving up.

    Returns
    -------
    pandas.DataFrame or None
        A frame with the columns 'song_name' and 'artist' (one row per title
        element found), or None when the page could not be retrieved or no
        title elements were found. Nothing is written in the None case.

    Raises
    ------
    requests.exceptions.RequestException
        Propagated from requests.get (connection errors, timeouts, ...).
    """
    import os  # local import: only used to report the saved file's name

    # Without a timeout requests may wait forever on an unresponsive server.
    response = requests.get(url, timeout=timeout)  # fetch the page

    if response.status_code != 200:
        print("Failed to retrieve the webpage.")
        return None

    # Parse the HTML content of the page
    soup = BeautifulSoup(response.text, 'html.parser')

    # Each chart entry's title is an <h3 id="title-of-a-story" class="c-title"> inside an <li>
    title_tags = soup.select('li h3#title-of-a-story.c-title')
    if not title_tags:
        # Nothing matched the selector: do not write an empty file and claim success.
        print("No chart entries were found on the page; nothing was saved.")
        return None

    song_names = []
    artist_names = []
    for title_tag in title_tags:
        song_names.append(title_tag.text.strip())

        # The artist is the <span> immediately following the title (CSS: "h3 + span").
        # Looking it up per title keeps every song aligned with its own artist even
        # when one entry has no artist element.
        neighbour = title_tag.find_next_sibling(True)
        if neighbour is not None and neighbour.name == 'span':
            artist_names.append(neighbour.text.strip())
        else:
            artist_names.append(None)

    # Create a DataFrame to store the data
    df = pd.DataFrame({'song_name': song_names, 'artist': artist_names})

    # Save the DataFrame to a CSV file
    df.to_csv(output_path, index=False)
    print(f"Data has been scraped and saved to '{os.path.basename(output_path)}'.")
    return df

In [67]:
top_100 = scrape_hot100(url)

# scrape_hot100() returns None when the page could not be retrieved. Stop with a
# clear message instead of failing on the column assignment below.
if top_100 is None:
    raise RuntimeError("Scraping the Billboard Hot 100 failed; 'top_100' was not created.")

# Label every scraped song as hot.
top_100["hot"] = "H"


Data has been scraped and saved to 'billboard_hot100.csv'.


In [68]:
# Persist the labelled chart data (song_name, artist, hot) without the index column.
top_100.to_csv(path_or_buf=r'..\data\top100.csv', index=False)

Lab 6_2: NOT Hot Songs

In [69]:
# Load the genres dataset that serves as the pool of "not hot" songs.
# (Append .sample(10000) to the read_csv call to work on a random subset.)
not_top_100 = pd.read_csv(filepath_or_buffer=r"..\data\genres_v2.csv", low_memory=False)

# Number of missing values per column.
display(not_top_100.isnull().sum())

danceability            0
energy                  0
key                     0
loudness                0
mode                    0
speechiness             0
acousticness            0
instrumentalness        0
liveness                0
valence                 0
tempo                   0
type                    0
id                      0
uri                     0
track_href              0
analysis_url            0
duration_ms             0
time_signature          0
genre                   0
song_name           20786
Unnamed: 0          21525
title               21525
dtype: int64

In [70]:
#clean & rearrange dataset
# Rows without a song name cannot be compared with the chart, so drop them.
# Reassigning (instead of inplace=True) keeps this cell safe to re-run.
not_top_100 = not_top_100.dropna(subset=["song_name"])

# Drop the leftover index column(s) (names containing ":", e.g. "Unnamed: 0") and "title".
# Only columns that are actually present are listed, so running the cell a second
# time drops nothing instead of raising KeyError for the already-removed "title".
columns_to_drop = [col for col in not_top_100.columns if ":" in col or col == "title"]
display(columns_to_drop)
not_top_100 = not_top_100.drop(columns=columns_to_drop)
not_top_100.head()

['Unnamed: 0', 'title']

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature,genre,song_name
0,0.831,0.814,2,-7.364,1,0.42,0.0598,0.0134,0.0556,0.389,156.985,audio_features,2Vc6NJ9PW9gD9q343XFRKx,spotify:track:2Vc6NJ9PW9gD9q343XFRKx,https://api.spotify.com/v1/tracks/2Vc6NJ9PW9gD...,https://api.spotify.com/v1/audio-analysis/2Vc6...,124539,4,Dark Trap,Mercury: Retrograde
1,0.719,0.493,8,-7.23,1,0.0794,0.401,0.0,0.118,0.124,115.08,audio_features,7pgJBLVz5VmnL7uGHmRj6p,spotify:track:7pgJBLVz5VmnL7uGHmRj6p,https://api.spotify.com/v1/tracks/7pgJBLVz5Vmn...,https://api.spotify.com/v1/audio-analysis/7pgJ...,224427,4,Dark Trap,Pathology
2,0.85,0.893,5,-4.783,1,0.0623,0.0138,4e-06,0.372,0.0391,218.05,audio_features,0vSWgAlfpye0WCGeNmuNhy,spotify:track:0vSWgAlfpye0WCGeNmuNhy,https://api.spotify.com/v1/tracks/0vSWgAlfpye0...,https://api.spotify.com/v1/audio-analysis/0vSW...,98821,4,Dark Trap,Symbiote
3,0.476,0.781,0,-4.71,1,0.103,0.0237,0.0,0.114,0.175,186.948,audio_features,0VSXnJqQkwuH2ei1nOQ1nu,spotify:track:0VSXnJqQkwuH2ei1nOQ1nu,https://api.spotify.com/v1/tracks/0VSXnJqQkwuH...,https://api.spotify.com/v1/audio-analysis/0VSX...,123661,3,Dark Trap,ProductOfDrugs (Prod. The Virus and Antidote)
4,0.798,0.624,2,-7.668,1,0.293,0.217,0.0,0.166,0.591,147.988,audio_features,4jCeguq9rMTlbMmPHuO7S3,spotify:track:4jCeguq9rMTlbMmPHuO7S3,https://api.spotify.com/v1/tracks/4jCeguq9rMTl...,https://api.spotify.com/v1/audio-analysis/4jCe...,123298,4,Dark Trap,Venom


In [71]:
# Move 'song_name' to the front, keep the other columns in their current order,
# and mark every row of this dataset as not hot ("N").
column_order = ['song_name', *(name for name in not_top_100.columns if name != 'song_name')]
not_top_100 = not_top_100.reindex(columns=column_order).assign(hot="N")

# Preview, shape and remaining missing values after the rearrangement.
display(not_top_100.head(), not_top_100.shape)
display(not_top_100.isnull().sum())

Unnamed: 0,song_name,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,...,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature,genre,hot
0,Mercury: Retrograde,0.831,0.814,2,-7.364,1,0.42,0.0598,0.0134,0.0556,...,156.985,audio_features,2Vc6NJ9PW9gD9q343XFRKx,spotify:track:2Vc6NJ9PW9gD9q343XFRKx,https://api.spotify.com/v1/tracks/2Vc6NJ9PW9gD...,https://api.spotify.com/v1/audio-analysis/2Vc6...,124539,4,Dark Trap,N
1,Pathology,0.719,0.493,8,-7.23,1,0.0794,0.401,0.0,0.118,...,115.08,audio_features,7pgJBLVz5VmnL7uGHmRj6p,spotify:track:7pgJBLVz5VmnL7uGHmRj6p,https://api.spotify.com/v1/tracks/7pgJBLVz5Vmn...,https://api.spotify.com/v1/audio-analysis/7pgJ...,224427,4,Dark Trap,N
2,Symbiote,0.85,0.893,5,-4.783,1,0.0623,0.0138,4e-06,0.372,...,218.05,audio_features,0vSWgAlfpye0WCGeNmuNhy,spotify:track:0vSWgAlfpye0WCGeNmuNhy,https://api.spotify.com/v1/tracks/0vSWgAlfpye0...,https://api.spotify.com/v1/audio-analysis/0vSW...,98821,4,Dark Trap,N
3,ProductOfDrugs (Prod. The Virus and Antidote),0.476,0.781,0,-4.71,1,0.103,0.0237,0.0,0.114,...,186.948,audio_features,0VSXnJqQkwuH2ei1nOQ1nu,spotify:track:0VSXnJqQkwuH2ei1nOQ1nu,https://api.spotify.com/v1/tracks/0VSXnJqQkwuH...,https://api.spotify.com/v1/audio-analysis/0VSX...,123661,3,Dark Trap,N
4,Venom,0.798,0.624,2,-7.668,1,0.293,0.217,0.0,0.166,...,147.988,audio_features,4jCeguq9rMTlbMmPHuO7S3,spotify:track:4jCeguq9rMTlbMmPHuO7S3,https://api.spotify.com/v1/tracks/4jCeguq9rMTl...,https://api.spotify.com/v1/audio-analysis/4jCe...,123298,4,Dark Trap,N


(21519, 21)

song_name           0
danceability        0
energy              0
key                 0
loudness            0
mode                0
speechiness         0
acousticness        0
instrumentalness    0
liveness            0
valence             0
tempo               0
type                0
id                  0
uri                 0
track_href          0
analysis_url        0
duration_ms         0
time_signature      0
genre               0
hot                 0
dtype: int64

In [74]:
# To reuse a previously saved scrape instead of the in-memory frame:
#top_100 = pd.read_csv(r'..\data\top100.csv')
# Show the first five chart entries.
display(top_100.head(n=5))

Unnamed: 0,song_name,artist,hot
0,Paint The Town Red,Doja Cat,H
1,Snooze,SZA,H
2,Cruel Summer,Taylor Swift,H
3,Fast Car,Luke Combs,H
4,3D,Jung Kook & Jack Harlow,H


In [75]:
# Keep only the rows of not_top_100 whose song_name does not appear in top_100.
not_top_100 = not_top_100.loc[
    lambda frame: ~frame['song_name'].isin(top_100['song_name'])
]

In [76]:
# Write the filtered "not hot" songs to disk, leaving out the index column.
not_top_100.to_csv(path_or_buf=r'..\data\not_top100.csv', index=False)