In [None]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re 
import random
from time import sleep

In [17]:
#first we need the master list of all the pages
url = "https://en.wikipedia.org/wiki/Lists_of_Billboard_Hot_100_top-ten_singles"
headers = {"User-Agent": "Mozilla/5.0"}  # Important to avoid HTTP 403
response = requests.get(url, headers=headers)
response.raise_for_status()  # Ensure it succeeded
soup = BeautifulSoup(response.text, "html.parser")

In [28]:
# Find all relevant Wikipedia URLs for Billboard Hot 100 top-ten singles lists
links = soup.find_all('a', href=True)
billboard_links = [
    "https://en.wikipedia.org" + link['href']
    for link in links
    if link['href'].startswith('/wiki/List_of_Billboard_Hot_100_top-ten_singles_in')
]
billboard_links = list(set(billboard_links))

# Display the found URLs
for url in billboard_links:
    print(url)

https://en.wikipedia.org/wiki/List_of_Billboard_Hot_100_top-ten_singles_in_2015
https://en.wikipedia.org/wiki/List_of_Billboard_Hot_100_top-ten_singles_in_1986
https://en.wikipedia.org/wiki/List_of_Billboard_Hot_100_top-ten_singles_in_1962
https://en.wikipedia.org/wiki/List_of_Billboard_Hot_100_top-ten_singles_in_1970
https://en.wikipedia.org/wiki/List_of_Billboard_Hot_100_top-ten_singles_in_1996
https://en.wikipedia.org/wiki/List_of_Billboard_Hot_100_top-ten_singles_in_1979
https://en.wikipedia.org/wiki/List_of_Billboard_Hot_100_top-ten_singles_in_1997
https://en.wikipedia.org/wiki/List_of_Billboard_Hot_100_top-ten_singles_in_1972
https://en.wikipedia.org/wiki/List_of_Billboard_Hot_100_top-ten_singles_in_1994
https://en.wikipedia.org/wiki/List_of_Billboard_Hot_100_top-ten_singles_in_2002
https://en.wikipedia.org/wiki/List_of_Billboard_Hot_100_top-ten_singles_in_2017
https://en.wikipedia.org/wiki/List_of_Billboard_Hot_100_top-ten_singles_in_1984
https://en.wikipedia.org/wiki/List_of_Bi

In [29]:
print(f"We found {len(billboard_links)} URLs.")

We found 68 URLs.


In [6]:
test = requests.get(billboard_links[0], headers=headers)
test.raise_for_status()  # Ensure it succeeded
soup_test = BeautifulSoup(test.text, "html.parser")

In [37]:
def data_scraper(urls):
    headers = {"User-Agent": "Mozilla/5.0"}  # Avoid HTTP 403
    all_data = []

    # Final column names
    column_names = [
        'Top Ten Entry Date',
        'Single Name',
        'Artist(s)',
        'Peak',
        'Peak Date',
        'Weeks in Top Ten',
        'Ref',
        'Year'
    ]

    for url in urls:
        print(f"Scraping URL: {url}")
        response = requests.get(url, headers=headers)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, "html.parser")
        # We only need the first table on each page
        table = soup.find("table", {"class": "wikitable"})

        # Convert sentinel <th colspan="7"> into <td colspan="7"> so pandas keeps them
        for sentinel in table.find_all("th"):
            if sentinel.get("colspan"):  # only the multi-col sentinel headers
                sentinel.name = "td"

        # Extract default year from URL
        year = int(url[-4:])

        # Read table into DataFrame, but keep all rows (including sentinel rows)
        df = pd.read_html(str(table), header=0)[0]

        # Handle missing 'Ref' column
        if len(df.columns) == 6: 
            df['Ref'] = None

        # Add year column, initially filled with default
        df['Year'] = pd.NA

        # --- Sentinel logic ---
        # Find sentinel rows like "Singles from 2024"
        mask = df.apply(
            lambda row: row.astype(str).str.contains(r"Singles from \d{4}").any(),
            axis=1
        )
        print(f"Found {mask.sum()} sentinel rows in this table.")
        if mask.sum() == 0:
            # If no sentinels, fill year column with default year
            df["Year"] = year
        else:
            # Extract year from sentinel rows
            df.loc[mask, "Year"] = df.loc[mask].apply(
                lambda row: int(re.search(r"\d{4}", " ".join(row.astype(str))).group()),
                axis=1
            )
        print(df.head(6))

        # Forward-fill the year column
        df["Year"] = df["Year"].ffill()

        # Drop the sentinel rows
        df = df[~mask].reset_index(drop=True)

        # Rename columns consistently
        df.columns = column_names

        # Add year to peak date since this will always takes the url's year
        df['Peak Date'] = (
            df['Peak Date']
            .str.replace(r'\(.*\)', '', regex=True)    # remove anything in parentheses
            .str.replace(r'\[\d+\]', '', regex=True)   # remove citation brackets like [1]
            .str.strip()                               # remove leading/trailing whitespace
        )
        df['Peak Date'] = pd.to_datetime(df['Peak Date'].astype(str) + ' ' + str(year), errors='coerce')

        # add year to entry date
        df['Top Ten Entry Date'] = (
            df['Top Ten Entry Date']
            .str.replace(r'\(.*\)', '', regex=True)    # remove anything in parentheses
            .str.replace(r'\[\d+\]', '', regex=True)   # remove citation brackets like [1]
            .str.strip()                               # remove leading/trailing whitespace
        )
        df['Top Ten Entry Date'] = pd.to_datetime(df['Top Ten Entry Date'].astype(str) + ' ' + df['Year'].astype(str), errors='coerce')

        ### lets clean the data up a bit
        #remove everything after the second " in the single name
        df['Single Name'] = df['Single Name'].str.split('"').str[1]
        #convert weeks in top ten to int
        df['Weeks in Top Ten'] = df['Weeks in Top Ten'].astype(str).str.extract(r'(\d+)')  # extract the number
        df['Weeks in Top Ten'] = df['Weeks in Top Ten'].astype(int)
        #convert peak to int
        # Keep only digits and convert to integer
        df['Peak'] = df['Peak'].astype(str).str.extract(r'(\d+)')  # extract the number
        df['Peak'] = df['Peak'].astype(int)  # convert to integer

        all_data.append(df)
        sleep(random.uniform(1, 3))  # sleep between 1 and 3 seconds

    return pd.concat(all_data, ignore_index=True) if all_data else pd.DataFrame()


In [8]:
df = data_scraper(billboard_links[:5])

Scraping URL: https://en.wikipedia.org/wiki/List_of_Billboard_Hot_100_top-ten_singles_in_2025


  df = pd.read_html(str(table), header=0)[0]
  df["Year"] = df["Year"].ffill()


Found 2 sentinel rows in this table.
  Top ten entry date                      Single                 Artist(s)  \
0  Singles from 2024           Singles from 2024         Singles from 2024   
1          August 31  "Die with a Smile"[B][P] ↑  Lady Gaga and Bruno Mars   
2         November 2              "Apt."[B][H] ↑       Rosé and Bruno Mars   
3         December 7            "Luther"[B][O] ↑    Kendrick Lamar and SZA   
4  Singles from 2025           Singles from 2025         Singles from 2025   

                Peak          Peak date   Weeks in top ten               Ref.  \
0  Singles from 2024  Singles from 2024  Singles from 2024  Singles from 2024   
1                  1         January 11                 50             [2][3]   
2                  3         February 1                 14             [4][5]   
3                  1            March 1                 32             [6][7]   
4  Singles from 2025  Singles from 2025  Singles from 2025  Singles from 2025   

   Year

  df = pd.read_html(str(table), header=0)[0]


Scraping URL: https://en.wikipedia.org/wiki/List_of_Billboard_Hot_100_top-ten_singles_in_1959


  df = pd.read_html(str(table), header=0)[0]
  df["Year"] = df["Year"].ffill()


Found 2 sentinel rows in this table.
  Top ten entry date                     Single          Artist(s)  \
0  Singles from 1958          Singles from 1958  Singles from 1958   
1        December 15  "Smoke Gets in Your Eyes"       The Platters   
2        December 22       "A Lover's Question"    Clyde McPhatter   
3        December 29       "Whole Lotta Lovin'"        Fats Domino   
4  Singles from 1959          Singles from 1959  Singles from 1959   

                Peak          Peak date   Weeks in top ten   Ref  Year  
0  Singles from 1958  Singles from 1958  Singles from 1958  None  1958  
1                  1         January 19                 10  None  <NA>  
2                  6         January 19                  7  None  <NA>  
3                  6         January 12                  5  None  <NA>  
4  Singles from 1959  Singles from 1959  Singles from 1959  None  1959  
Scraping URL: https://en.wikipedia.org/wiki/List_of_Billboard_Hot_100_top-ten_singles_in_1960


  df = pd.read_html(str(table), header=0)[0]
  df["Year"] = df["Year"].ffill()


Found 2 sentinel rows in this table.
  Top ten entry date                            Single          Artist(s)  \
0  Singles from 1959                 Singles from 1959  Singles from 1959   
1        December 21                         "El Paso"      Marty Robbins   
2        December 21  "Way Down Yonder in New Orleans"      Freddy Cannon   
3        December 28                "Pretty Blue Eyes"     Steve Lawrence   
4  Singles from 1960                 Singles from 1960  Singles from 1960   

                Peak          Peak date   Weeks in top ten   Ref  Year  
0  Singles from 1959  Singles from 1959  Singles from 1959  None  1959  
1                  1          January 4                  9  None  <NA>  
2                  3         January 11                  7  None  <NA>  
3                  9          January 4                  6  None  <NA>  
4  Singles from 1960  Singles from 1960  Singles from 1960  None  1960  
Scraping URL: https://en.wikipedia.org/wiki/List_of_Billboard_

  df = pd.read_html(str(table), header=0)[0]
  df["Year"] = df["Year"].ffill()


Found 2 sentinel rows in this table.
  Top ten entry date                 Single                Artist(s)  \
0  Singles from 1960      Singles from 1960        Singles from 1960   
1        December 12  "Wonderland by Night"           Bert Kaempfert   
2        December 12               "Exodus"       Ferrante & Teicher   
3        December 26     "Corrina, Corinna"             Ray Peterson   
4     December 31[1]           "Angel Baby"  Rosie and the Originals   

                Peak          Peak date   Weeks in top ten   Ref  Year  
0  Singles from 1960  Singles from 1960  Singles from 1960  None  1960  
1                  1          January 9                 10  None  <NA>  
2                  2         January 23                 11  None  <NA>  
3                  9          January 9                  5  None  <NA>  
4                  5         January 23                  7  None  <NA>  


In [9]:
print(len(df))
print(df['Year'].unique())
print(df['Year'].value_counts())

365
[2024 2025 1958 1959 1960 1961]
Year
1961    101
1960     95
1959     91
1958     43
2025     32
2024      3
Name: count, dtype: int64


In [10]:
df[df['Year'] == 1958].head(20)

Unnamed: 0,Top Ten Entry Date,Single Name,Artist(s),Peak,Peak Date,Weeks in Top Ten,Ref,Year
35,1958-08-04,Poor Little Fool,Ricky Nelson,1,1958-08-04,6,,1958
36,1958-08-04,Patricia,Pérez Prado,2,1958-08-04,6,,1958
37,1958-08-04,Splish Splash,Bobby Darin,3,1958-08-04,3,,1958
38,1958-08-04,Hard Headed Woman,Elvis Presley,4,1958-08-04,2,,1958
39,1958-08-04,When,Kalin Twins,5,1958-08-04,5,,1958
40,1958-08-04,Rebel 'Rouser,Duane Eddy,6,1958-08-04,3,,1958
41,1958-08-04,Yakety Yak,The Coasters,7,1958-08-04,1,,1958
42,1958-08-04,My True Love,Jack Scott,3,1958-08-18,6,,1958
43,1958-08-04,Willie and the Hand Jive,The Johnny Otis Show,9,1958-08-04,2,,1958
44,1958-08-04,Fever,Peggy Lee,8,1958-08-25,3,,1958


In [15]:
df

Unnamed: 0,Top Ten Entry Date,Single Name,Artist(s),Peak,Peak Date,Weeks in Top Ten,Ref,Year
0,2024-08-31,Die with a Smile,Lady Gaga and Bruno Mars,1,2025-01-11,50,[2][3],2024
1,2024-11-02,Apt.,Rosé and Bruno Mars,3,2025-02-01,14,[4][5],2024
2,2024-12-07,Luther,Kendrick Lamar and SZA,1,2025-03-01,32,[6][7],2024
3,2025-01-18,Smile,Morgan Wallen,4,2025-01-18,1,[8],2025
4,2025-01-25,DTMF,Bad Bunny,2,2025-01-25,3,[9],2025
...,...,...,...,...,...,...,...,...
360,1961-12-04,I Understand (Just How You Feel),The G-Clefs,9,1961-12-04,1,,1961
361,1961-12-11,The Lion Sleeps Tonight,The Tokens,1,1961-12-18,8,,1961
362,1961-12-11,Run to Him,Bobby Vee,2,1961-12-25,6,,1961
363,1961-12-11,Tonight,Ferrante & Teicher,8,1961-12-11,1,,1961
