In [1]:
import json
import os
import re
import time
from urllib.parse import quote

import pandas as pd
import requests
import spotipy  # Note spotipy package must first be installed: go to terminal and type "pip install spotipy"
from spotipy.oauth2 import SpotifyClientCredentials

In [2]:
# Lower-case words that are filtered out of a query on the last retry of a failed lookup
FEATURE_STRINGS = {
    "feat",
    "featuring",
    "ft",
    "with",
}

## Get music charts

#### Data shape:

An $n \times 6$ DataFrame, where $n$ is the number of songs scraped.

Columns include:  
- `number`: where the song was positioned in the chart
- `song`: the name of the song
- `artist`: the name of the artist(s)
- `chart_name`: the name of the chart it appeared in
- `year`: the year it charted
- `region`: the country the chart is from


#### Billboard top 100 annual charts

Scrape the US Billboard Year-End Hot 100 chart data from Wikipedia and create a DataFrame of the top 100 songs for each year from 1980 to 2023.

In [3]:
# First and last chart year to scrape, both inclusive (reduced year range for working)
startYear = 2020
endYear = 2020

# Path the finished charts are written to as JSON
fileOut = "data/charts.json"


def scrape_billboard_wiki(
        year,
        url="https://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_{}",
        chartName="Billboard Top 100",
        chartRegion="US"
        ):
    """
    Scrape the chart table for one year from a web page into a DataFrame.

    Parameters
    ----------
    year
        Value substituted into `url` and stored in the `year` column.
    url : str
        URL template containing one `{}` placeholder for `year`.
    chartName : str
        Value stored in the `chart_name` column of every row.
    chartRegion : str
        Value stored in the `region` column of every row.

    Returns
    -------
    pandas.DataFrame
        The first table with class `wikitable` found on the page. Its first
        three columns are renamed to `number`, `song` and `artist`, and the
        columns `chart_name`, `year` and `region` are added.
    """

    url = url.format(year)
    print(f"Scraping URL: {url}", end="\r")

    # Get tables from URL
    wikitables = pd.read_html(url, attrs={'class': 'wikitable'},)

    # Get first table on page
    chart = wikitables[0]

    # Fix inconsistent header names by renaming the first three columns by position
    chart = chart.rename(columns={
        chart.columns[0]: 'number',
        chart.columns[1]: 'song',
        chart.columns[2]: 'artist'
        })

    # Remove surrounding quotation marks from song names. Only string cells are
    # touched, so a missing (NaN) title is passed through instead of raising.
    chart["song"] = chart["song"].map(
        lambda title: title.strip('"') if isinstance(title, str) else title
        )

    # Add metadata to entries
    chart["chart_name"] = chartName
    chart["year"] = year
    chart["region"] = chartRegion

    return chart


# Scrape every year from startYear to endYear (inclusive) and stack the yearly
# charts on top of each other in a single DataFrame
billboard_charts = pd.concat(
    list(map(scrape_billboard_wiki, range(startYear, endYear + 1)))
)

print(f"A total of {endYear - startYear + 1} years have been scraped successfully.".ljust(120))

A total of 1 years have been scraped successfully.                                                                      


#### Combine all charts to one DataFrame

In [4]:
# Optional cap on the number of rows kept while debugging; a falsy value keeps every row
debugLimit = None

# Combine all charts into one DataFrame.
# ignore_index=True renumbers the rows 0..n-1. Each scraped year carries its own
# 0-based index, so without renumbering the labels repeat across years and the
# later column-wise concat with the (default-indexed) audio-feature frame cannot
# align the rows.
charts = pd.concat([billboard_charts], ignore_index=True)
if debugLimit:
    charts = charts.iloc[:debugLimit]

print(f"A total of {len(charts)} song objects have been created for years {startYear}-{endYear}.".ljust(120))

A total of 100 song objects have been created for years 2020-2020.                                                      


## Get Spotify ID for songs

In [5]:
# Location of the credentials file and the pause (in seconds) taken after each Spotify search
credentialsPath = "credentials.json"
waitBetweenQueries = 0.2

# Element 0 of the parsed JSON file supplies the keyword arguments
# handed to SpotifyClientCredentials below
with open(credentialsPath) as fh:
    credentials = json.load(fh)[0]

# Build the Spotify client used by the lookup helpers
client_credentials_manager = SpotifyClientCredentials(**credentials)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)


def search_spotify(query, quiet=False):
    """
    Look a track up on Spotify and return the first search hit.

    The query is searched as given. While nothing is found, it is retried
    first with special characters removed and then lower-cased with the
    words in FEATURE_STRINGS removed. Returns the first track item of the
    search response, or None once no further variation of the query is left.
    Unless `quiet` is set, a one-line status is printed for every attempt.
    """

    # see https://developer.spotify.com/documentation/web-api/reference/#category-search for information
    # regarding using the Search API

    while True:
        # Only one hit is requested: the first song returned is assumed to be the best match
        hits = sp.search(query, type="track", limit=1)["tracks"]["items"]

        if not quiet: print(f"""{"--- Result" if hits else "*** No result"} found for '{query}'""".ljust(120), end="\r")

        time.sleep(waitBetweenQueries)

        if hits:
            return hits[0]

        # Next attempt: the same query without special characters
        retry = re.sub(r"[(){}[,.&/'\]]", "", query)
        if retry == query:
            # Nothing to strip, so drop feature strings from the lower-cased query instead
            retry = " ".join([w for w in query.lower().split(" ") if w not in FEATURE_STRINGS])
            if retry == query:
                # No variation left to try
                return None

        query = retry


# Search Spotify once per chart row using "<song> <artist>" as the query.
# search_spotify returns None when every retry fails, so "id" is only read from
# real track dictionaries; unmatched songs get None instead of aborting the run
# with a TypeError from dict.get.
charts["id"] = (
    (charts["song"] + " " + charts["artist"])
    .apply(search_spotify)
    .apply(lambda track: track.get("id", None) if isinstance(track, dict) else None)
)

print(f"""Found Spotify IDs for {charts["id"].notna().sum()} of {len(charts)} songs ({charts["id"].notna().sum()/len(charts)*100:.2f}% success)""".ljust(120))

Found Spotify IDs for 100 of 100 songs (100.00% success)                                                                


## Get Music Features for songs

In [6]:
# Keys kept from each Spotify audio-features record
selectFeatures = {
    "danceability",     "acousticness",
    "liveness",         "speechiness",
    "instrumentalness", "energy",
    "valence",          "tempo",
    "duration_ms"
    }


def get_music_feautures(id, features=selectFeatures, quiet=False):
    """
    Fetch the audio features of one track and keep only the selected keys.

    Parameters
    ----------
    id
        Spotify track ID passed to `sp.audio_features`. None or NaN (a song
        for which no ID was found) is accepted and yields an empty result.
    features
        Collection of feature names to keep from the returned record.
    quiet : bool
        When False, print a one-line status for the lookup.

    Returns
    -------
    dict
        The entries of the first returned record whose key is in `features`,
        or an empty dict when there is no ID or no record.
    """
    # Without an ID there is nothing to look up, so skip the API call
    missing_id = id is None or (isinstance(id, float) and id != id)
    records = None if missing_id else sp.audio_features(id)
    # The first record may itself be None; the status line below already allows for that
    result = records[0] if records else None
    if not quiet: print(f"""{"--- Result" if result else "*** No result"} found for '{id}'""".ljust(120), end="\r")
    if not result:
        return {}
    return {
        k: v
        for k, v
        in result.items()
        if k in features
        }


# Get a feature dictionary for each track
charts["features"] = charts["id"].apply(get_music_feautures)

# Expand feature dictionary keys to individual columns.
# The expanded frame is built from a plain list and therefore has a default
# 0..n-1 index. Give `charts` the same index first so the column-wise concat
# pairs rows by position even when `charts` carries repeated or non-default labels.
charts = charts.reset_index(drop=True)
charts = pd.concat([charts, pd.DataFrame(charts["features"].to_list())], axis=1)

# Convert duration feature to seconds
charts["duration"] = charts["duration_ms"]/1000
charts = charts.drop(["duration_ms", "features"], axis=1)

print(f"""Found Music Features for {charts["duration"].notna().sum()} of {len(charts)} songs ({charts["duration"].notna().sum()/len(charts)*100:.2f}% success)""".ljust(120))

Found Music Features for 100 of 100 songs (100.00% success)                                                             


## Get Lyrics for songs

In [7]:
# URL template of the lyrics endpoint; `{}` is replaced by the search text
lyricsAPI = "https://some-random-api.ml/lyrics?title={}"

# Pause (in seconds) after each request. This rebinds the same global that
# search_spotify reads, so it applies to any later Spotify search as well.
waitBetweenQueries = 0.5


def get_lyrics(query, quiet=False, timeout=10):
    """
    Fetch the lyrics for a search text from the lyrics API.

    The query is tried as given. While no lyrics come back, it is retried
    first with special characters removed and then lower-cased with the
    words in FEATURE_STRINGS removed.

    Parameters
    ----------
    query : str
        Search text inserted into the `lyricsAPI` URL template.
    quiet : bool
        When False, print a one-line status for every attempt.
    timeout : float
        Seconds to wait for each HTTP response before requests raises.

    Returns
    -------
    list[list[str]] or None
        The lyrics split into verses (on blank lines) and each verse split
        into lines, or None when no variation of the query returns lyrics.

    Raises
    ------
    requests.RequestException
        On connection problems or when `timeout` is exceeded.
    """
    while True:
        # Percent-encode the query so characters such as '&', '#' or '?' remain
        # part of the title instead of being parsed as URL syntax
        response = requests.get(lyricsAPI.format(quote(query, safe="")), timeout=timeout)

        # A body that is not a JSON object (e.g. an HTML error page) counts as "no lyrics"
        try:
            payload = json.loads(response.text)
        except ValueError:
            payload = None
        song = payload.get("lyrics", None) if isinstance(payload, dict) else None

        if not quiet: print(f"""{"--- Result" if song else "*** No result"} found for '{query}'""".ljust(120), end="\r")

        time.sleep(waitBetweenQueries)

        if song:
            verses = song.split("\n\n")
            return [verse.split("\n") for verse in verses]

        # Try again without special characters
        substring = re.sub(r"[(){}[,.&/'\]]", "", query)
        if substring == query:
            # Try again without feature strings
            substring = " ".join([w for w in query.lower().split(" ") if w not in FEATURE_STRINGS])
            if substring == query:
                # Give up if no matching song found
                return None

        query = substring
    

# Look up lyrics once per chart row, searching for "<song> <artist>"
charts["lyrics"] = (
    (charts["song"] + " " + charts["artist"])
    .apply(get_lyrics)
)

print(f"""Found lyrics for {charts["lyrics"].notna().sum()} of {len(charts)} songs ({charts["lyrics"].notna().sum()/len(charts)*100:.2f}% success)""".ljust(120))

Found lyrics for 61 of 100 songs (61.00% success)                                                                       


## Write charts to JSON

In [8]:
# Create the destination folder if it is missing, so the export cannot fail with
# FileNotFoundError after all the scraping work has already been done
os.makedirs(os.path.dirname(fileOut) or ".", exist_ok=True)

# Write one JSON object per chart row
with open(fileOut, 'w') as fh:
    charts.to_json(fh, orient="records", indent=4)