In [1]:
## import libraries
import requests  # Makes HTTP requests to fetch web pages from URLs
from bs4 import BeautifulSoup  # Parses HTML content into navigable Python objects for web scraping
import pandas as pd  # Creates and manipulates DataFrames for organizing scraped data into tables
import time  # Adds delays between requests to avoid overwhelming the server
from random import uniform  # Generates random time intervals to make scraping delays less predictable

In [3]:
## request headers: a single User-Agent entry describing desktop Chrome 120 on macOS

headers = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
}

In [5]:
all_dfs = []  ## hold all dfs (dataframes) that will be created for each page
base_url = "https://bestsellingalbums.org/decade/2010"  ## base URL of the site to scrape
end_page = 10  ## total number of pages we want to scrape
failed_pages = []  ## page numbers whose request failed, so the final summary is honest

for url_number in range(1, end_page + 1):
    ## first page has no suffix; every later page appends "-page_number" to the base URL
    page_url = base_url if url_number == 1 else f"{base_url}-{url_number}"

    try:
        ## send the User-Agent headers built in the headers cell, and give up after
        ## 30 seconds instead of hanging the notebook on a stalled connection
        response = requests.get(page_url, headers=headers, timeout=30)
        ## treat HTTP 4xx/5xx responses as failures rather than parsing an error page
        response.raise_for_status()
    except requests.RequestException as err:
        ## only network/HTTP errors are caught (not KeyboardInterrupt or coding bugs);
        ## the page is skipped so a stale `response` from the previous page is never re-parsed
        failed_pages.append(url_number)
        status = f"Problem with {page_url}: {err}"
    else:
        ## convert the page's HTML into a BeautifulSoup object for parsing
        soup = BeautifulSoup(response.text, "html.parser")

        ## find all album entries on the page (each album is inside a div with class 'album_card')
        all_targets = soup.find_all("div", class_="album_card")

        ## extract artist names from each album card
        artists_list = [target.find("div", class_="artist").get_text() for target in all_targets]
        ## extract album titles from each album card
        albums_list = [target.find("div", class_="album").get_text() for target in all_targets]
        ## extract links for more info (the href attribute of the first <a> tag in the card)
        more_info_list = [target.find("a").get("href") for target in all_targets]
        ## extract sales numbers, strip the "Sales: " label and thousands separators, convert to int
        sales_list = [
            int(target.find("div", class_="sales").get_text().replace("Sales: ", "").replace(",", ""))
            for target in all_targets
        ]

        ## combine the extracted lists into a dictionary and convert it into a DataFrame
        all_dfs.append(pd.DataFrame({"artist": artists_list, "album": albums_list,
                                     "sales": sales_list, "more_info": more_info_list}))
        status = f"Created DF from page {url_number}"

    if url_number < end_page:
        ## pause between page requests to avoid overwhelming the server (random delay between 30-45 seconds);
        ## this also applies after a failed request, so retries of later pages stay polite
        snoozer = uniform(30, 45)
        print(f"{status} and snoozing for {snoozer} seconds before next page")
        time.sleep(snoozer)  ## actually wait the random time before continuing
    else:
        ## nothing left to fetch, so do not waste time sleeping after the last page
        print(status)

## report completion, and list any pages that were skipped because their request failed
if failed_pages:
    print(f"Done scraping; {len(failed_pages)} of {end_page} pages failed: {failed_pages}")
else:
    print(f"Done scraping all {end_page} pages")

Created DF from page 1 and snoozing for 36.89904923117118 seconds before next page
Created DF from page 2 and snoozing for 41.75067165699052 seconds before next page
Created DF from page 3 and snoozing for 42.02276311919892 seconds before next page
Created DF from page 4 and snoozing for 41.79314933053509 seconds before next page
Created DF from page 5 and snoozing for 41.17386190328989 seconds before next page
Created DF from page 6 and snoozing for 30.310483080631524 seconds before next page
Created DF from page 7 and snoozing for 43.38829212725903 seconds before next page
Created DF from page 8 and snoozing for 34.14939987370336 seconds before next page
Created DF from page 9 and snoozing for 33.133345127362084 seconds before next page
Created DF from page 10 and snoozing for 30.543097555112695 seconds before next page
Done scraping all 10 pages


In [7]:
## stack the per-page DataFrames row-wise into one frame with a fresh 0..n-1 index
df = pd.concat(objs=all_dfs, axis=0, ignore_index=True)
df

Unnamed: 0,artist,album,sales,more_info
0,ADELE,21,30000000,https://bestsellingalbums.org/album/1034
1,ADELE,25,23000000,https://bestsellingalbums.org/album/1035
2,MICHAEL BUBLÉ,CHRISTMAS,15000000,https://bestsellingalbums.org/album/30524
3,TAYLOR SWIFT,1989,14748116,https://bestsellingalbums.org/album/45488
4,JUSTIN BIEBER,PURPOSE,14000000,https://bestsellingalbums.org/album/23318
...,...,...,...,...
495,LOGIC,UNDER PRESSURE,1060000,https://bestsellingalbums.org/album/27268
496,HALESTORM,THE STRANGE CASE OF,1060000,https://bestsellingalbums.org/album/17960
497,ZAC BROWN BAND,UNCAGED,1055000,https://bestsellingalbums.org/album/56701
498,FUTURE,FUTURE,1050371,https://bestsellingalbums.org/album/16036
