# ETL - Dataset Collection

## Documentation
- Spotipy: https://spotipy.readthedocs.io/en/latest/#
- billboard-charts: https://github.com/guoguo12/billboard-charts

## Dependencies

In [1]:
!pip install spotipy --upgrade

Defaulting to user installation because normal site-packages is not writeable


In [2]:
!pip install billboard.py

Defaulting to user installation because normal site-packages is not writeable


In [3]:
!pip install python-dotenv

Defaulting to user installation because normal site-packages is not writeable


In [4]:
### Import Libraries
import os
from dotenv import load_dotenv
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
import pandas as pd
import requests
import billboard
from billboard import BillboardNotFoundException
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import time

In [5]:
### Set pandas settings
pd.set_option('display.max_columns', None)

## Spotipy Credentials

In [8]:
### Set Spotify API credentials

# Load environment variables from .env file
load_dotenv('config.env')

# Spotify API credentials
SPOTIPY_CLIENT_ID = os.environ.get('SPOTIPY_CLIENT_ID')
SPOTIPY_CLIENT_SECRET = os.environ.get('SPOTIPY_CLIENT_SECRET')

# initialize Spotipy with client credentials
client_credentials_manager = SpotifyClientCredentials(client_id=SPOTIPY_CLIENT_ID,
                                                      client_secret=SPOTIPY_CLIENT_SECRET)

spotify = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

## Retrieve Billboard Data

In [7]:
### FUNCTION: Constructs Billboard charts with timeout
def construct_chart_with_timeout(chart_name, chart_date, chart_fetch, chart_max_retries, chart_timeout):
    url = f"https://www.billboard.com/charts/{chart_name}/{chart_date}"
    try:
        response = requests.get(url, timeout=chart_timeout)
        response.raise_for_status()  # Raise an exception for HTTP errors (e.g., 404)
        chart = billboard.ChartData(name=chart_name, date=chart_date, fetch=chart_fetch,
                                    max_retries=chart_max_retries, timeout=chart_timeout)
        return chart
    except requests.Timeout:
        print(f"Timeout occurred while fetching chart data for {chart_date}.")
        return None
    except requests.RequestException as e:
        print(f"Error occurred while fetching chart data for {chart_date}: {e}")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None

## Retrieve Spotify API Data

In [8]:
### FUNCTION: gathers metadata associated with a song when song_name passed as an argument
def get_song_info(song_name, spotify):
    
    # Search for the song
    results = spotify.search(q='track:' + song_name, type='track', limit=1)
    
    # Check if there are any tracks returned in the search results
    if len(results['tracks']['items']) == 0:
        print(f"No results found for the song '{song_name}'")
        return None
    
    # Extract relevant information
    track = results['tracks']['items'][0]
    
    # Get audio features of the track
    audio_features = spotify.audio_features(track['id'])[0]
    
    # Extract artist id to get genres
    artist_id = results['tracks']['items'][0]['artists'][0]['id']
    artist_info = spotify.artist(artist_id)
    
    # Combine basic track info and audio features
    song_info = {
        'track_name': track['name'],
        'main_artist': track['artists'][0]['name'],
        'all_artists': [artist['name'] for artist in track['artists']],
        'album': track['album']['name'],
        'release_date': track['album']['release_date'],
        'popularity': track['popularity'],
        'Duration (ms)': track['duration_ms'],
        'preview_url': track['preview_url'],
        'genres': artist_info['genres'], # The Spotify API returns ALL genres associated with the artist, not for an individual track
        'followers': artist_info['followers']['total'],
        'cover_art_url': track['album']['images'][0]['url'],
        'external_url': track['external_urls']['spotify'],
        'track_number': track['track_number'],
        'explicit': track['explicit'],
        'available_markets': track['available_markets'],
        'track_id': track['id'],
        'album_type': track['album']['album_type']
    }

    # Put extrated audio features in dictionary
    song_info.update(audio_features)
    
    return song_info

## Generate Dataframe(s)

In [9]:
### DICTIONARY: maps keys to respective tonal counterpart and solfege
pitch_class = {
    -1: [None, None],
    0: ["C (also B♯, Ddouble flat)", "do"],
    1: ["C♯, D♭ (also Bdouble sharp)", None],
    2: ["D (also Cdouble sharp, Edouble flat)", "re"],
    3: ["D♯, E♭ (also Fdouble flat)", None],
    4: ["E (also Ddouble sharp, F♭)", "mi"],
    5: ["F (also E♯, Gdouble flat)", "fa"],
    6: ["F♯, G♭ (also Edouble sharp)", None],
    7: ["G (also Fdouble sharp, Adouble flat)", "sol"],
    8: ["G♯, A♭", None],
    9: ["A (also Gdouble sharp, Bdouble flat)", "la"],
    10: ["A♯, B♭ (also Cdouble flat)", None],
    11: ["B (also Adouble sharp, C♭)", "si"]
}

In [10]:
### FUNCTION: iterate over the Billboard chart and retrieve song from Spotify information
all_song_info = []

def billboard_df(chart):
    
    for song in chart:

        song_title = song.title
        song_artist = song.artist
        song_weeks = song.weeks
        song_info = get_song_info(song_title, spotify)
        
        '''
        # ! NOT WORKING: retrieve lyrics using Genius API
        genius_artist = genius.search_artist(str(song_artist))
        if genius_artist is not None:
            genius_song = genius_artist.song(song_title)
            
            if genius_song is not None:
                if song_info is not None:
                    song_info['Weeks'] = song_weeks
                    song_info['Lyrics'] = genius_song.lyrics
                    all_song_info.append(song_info)
        '''
        if song_info is not None:
            song_info['weeks_on_chart'] = song_weeks
            all_song_info.append(song_info)

    # convert list of song information dictionaries to DataFrame
    df = pd.DataFrame(all_song_info)
    
    # adjust df before returing
    df['release_date'] = pd.to_datetime(df['release_date'])
    df['duration_sec'] = df['Duration (ms)'] * 0.001
    df['tonal_counterparts'] = df['key'].map(lambda x: pitch_class[x][0]) # The key the track is in. Integers map to pitches using standard Pitch Class notation. E.g. 0 = C, 1 = C♯/D♭, 2 = D, and so on. If no key was detected, the value is -1.
    df['solfege'] = df['key'].map(lambda x: pitch_class[x][1]) # The key the track is in. Integers map to pitches using standard Pitch Class notation. E.g. 0 = C, 1 = C♯/D♭, 2 = D, and so on. If no key was detected, the value is -1.
    df = df.drop(['type', 'Duration (ms)', 'duration_ms', 'id'], axis=1)
    
    return df

# Generate Giant Dataframe

In [11]:
### Store date range for desired data in list
start_date = datetime(2014, 4, 1) # Start date
end_date = datetime(2024, 4, 1) # End date (inclusive)

# Initialize an empty list to store the first day of each month
chart_dates = []

# Loop through each month within the date range
current_date = start_date
while current_date <= end_date:
    chart_dates.append(current_date.strftime('%Y-%m-%d'))  # Append the current date to the list
    # Move to the first day of the next month
    current_date += relativedelta(months=1)

# Type cast all values to strings
chart_dates = [str(month) for month in chart_dates]

In [14]:
### LOOP: Gather data for each month in date range and append to final_df
final_df = pd.DataFrame()

for month in chart_dates:
    chart_name = 'hot-100'
    chart_fetch = True
    chart_max_retries = 10
    chart_timeout = 30  # Set your desired timeout value in seconds
    
    try:
        month_str = datetime.strptime(month, '%Y-%m-%d')
        chart = construct_chart_with_timeout(chart_name, month, chart_fetch, chart_max_retries, chart_timeout)
        if chart:
            df = billboard_df(chart)
            df['chart_month'] = month_str
            final_df = pd.concat([final_df, df], ignore_index=True)
            final_df[['all_artists', 'genres', 'available_markets']] = final_df[['all_artists', 'genres', 'available_markets']].applymap(str)
            print(month_str, " done.")
    except BillboardNotFoundException:
        print(f"No chart found for {month}. Skipping...")
        continue
        
    # Add a time buffer to avoid rate limiting
    #time.sleep(90)

final_df = final_df.drop_duplicates()

'\n### LOOP: Gather data for each month in date range and append to final_df\nfinal_df = pd.DataFrame()\n\nfor month in chart_dates:\n    chart_name = \'hot-100\'\n    chart_fetch = True\n    chart_max_retries = 10\n    chart_timeout = 30  # Set your desired timeout value in seconds\n    \n    try:\n        month_str = datetime.strptime(month, \'%Y-%m-%d\')\n        chart = construct_chart_with_timeout(chart_name, month, chart_fetch, chart_max_retries, chart_timeout)\n        if chart:\n            df = billboard_df(chart)\n            df[\'chart_month\'] = month_str\n            final_df = pd.concat([final_df, df], ignore_index=True)\n            final_df[[\'all_artists\', \'genres\', \'available_markets\']] = final_df[[\'all_artists\', \'genres\', \'available_markets\']].applymap(str)\n            print(month_str, " done.")\n    except BillboardNotFoundException:\n        print(f"No chart found for {month}. Skipping...")\n        continue\n        \n    # Add a time buffer to avoid r

In [None]:
### Final changes to final_df before exporting
final_df['mode'] = final_df['mode'].replace({0: 'minor', 1: 'major'})
final_df['pitch_key'] = final_df['key']

In [13]:
### Show final_df before exporting
final_df

Unnamed: 0,track_name,main_artist,all_artists,album,release_date,popularity,preview_url,genres,followers,cover_art_url,external_url,track_number,explicit,available_markets,track_id,album_type,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,uri,track_href,analysis_url,time_signature,weeks_on_chart,duration_sec,tonal_counterparts,solfege,chart_month
0,HAPPY,NF,['NF'],HOPE,2023-04-07,74,,"['hip hop', 'pop rap']",9685562,https://i.scdn.co/image/ab67616d0000b273ff8a42...,https://open.spotify.com/track/3ZEno9fORwMA1HP...,5,False,"['AR', 'AU', 'AT', 'BE', 'BO', 'BR', 'BG', 'CA...",3ZEno9fORwMA1HPecdLi0R,album,0.732,0.8550,7,-4.064,1,0.0415,0.30600,0.000000,0.120,0.221,106.004,spotify:track:3ZEno9fORwMA1HPecdLi0R,https://api.spotify.com/v1/tracks/3ZEno9fORwMA...,https://api.spotify.com/v1/audio-analysis/3ZEn...,4,12,242.691,"G (also Fdouble sharp, Adouble flat)",sol,2014-04-01
1,all of me,21 Savage,['21 Savage'],american dream,2024-01-12,80,https://p.scdn.co/mp3-preview/fac460f3b30ac75e...,"['atl hip hop', 'hip hop', 'rap']",17962466,https://i.scdn.co/image/ab67616d0000b273bbdceb...,https://open.spotify.com/track/2FoahzOSxJnalPA...,2,True,"['AR', 'AU', 'AT', 'BE', 'BO', 'BR', 'BG', 'CA...",2FoahzOSxJnalPA8aBUme3,album,0.653,0.8050,5,-5.708,0,0.3040,0.12000,0.000012,0.842,0.791,159.947,spotify:track:2FoahzOSxJnalPA8aBUme3,https://api.spotify.com/v1/tracks/2FoahzOSxJna...,https://api.spotify.com/v1/audio-analysis/2Foa...,4,24,198.292,"F (also E♯, Gdouble flat)",fa,2014-04-01
2,Dark Horse,Katy Perry,"['Katy Perry', 'Juicy J']",PRISM,2013-01-01,81,,['pop'],32455667,https://i.scdn.co/image/ab67616d0000b2731e9a05...,https://open.spotify.com/track/4jbmgIyjGoXjY01...,6,False,"['AR', 'AU', 'AT', 'BE', 'BO', 'BR', 'BG', 'CA...",4jbmgIyjGoXjY01XxatOx6,album,0.647,0.5850,6,-6.123,1,0.0512,0.00314,0.000000,0.165,0.353,131.934,spotify:track:4jbmgIyjGoXjY01XxatOx6,https://api.spotify.com/v1/tracks/4jbmgIyjGoXj...,https://api.spotify.com/v1/audio-analysis/4jbm...,4,27,215.672,"F♯, G♭ (also Edouble sharp)",,2014-04-01
3,Talk Dirty (feat. 2 Chainz),Jason Derulo,"['Jason Derulo', '2 Chainz']",Talk Dirty,2013-09-10,67,https://p.scdn.co/mp3-preview/1bc07a5c8add9558...,"['dance pop', 'pop']",11892587,https://i.scdn.co/image/ab67616d0000b2730376bd...,https://open.spotify.com/track/6g6A7qNhTfUgOSH...,1,True,"['IT', 'US']",6g6A7qNhTfUgOSH7ROOxTD,album,0.760,0.6520,6,-7.321,1,0.2320,0.03480,0.000000,0.307,0.759,100.315,spotify:track:6g6A7qNhTfUgOSH7ROOxTD,https://api.spotify.com/v1/tracks/6g6A7qNhTfUg...,https://api.spotify.com/v1/audio-analysis/6g6A...,4,14,177.685,"F♯, G♭ (also Edouble sharp)",,2014-04-01
4,Let It Go,James Bay,['James Bay'],Chaos And The Calm,2014-12-15,70,,"['neo mellow', 'pop', 'uk pop']",3973330,https://i.scdn.co/image/ab67616d0000b273b36f5e...,https://open.spotify.com/track/13HVjjWUZFaWilh...,3,False,"['CA', 'MX', 'US']",13HVjjWUZFaWilh2QUJKsP,album,0.546,0.3110,1,-10.396,1,0.0288,0.81800,0.000017,0.107,0.246,147.464,spotify:track:13HVjjWUZFaWilh2QUJKsP,https://api.spotify.com/v1/tracks/13HVjjWUZFaW...,https://api.spotify.com/v1/audio-analysis/13HV...,4,17,260.532,"C♯, D♭ (also Bdouble sharp)",,2014-04-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20987,Russ Millions x Fumez The Engineer - Plugged In,Fumez The Engineer,"['Fumez The Engineer', 'Russ Millions']",Russ Millions x Fumez The Engineer - Plugged In,2021-01-14,62,https://p.scdn.co/mp3-preview/eaa050b513ec3efa...,"['melodic drill', 'uk drill', 'uk hip hop', 'u...",490607,https://i.scdn.co/image/ab67616d0000b273441c1b...,https://open.spotify.com/track/2Oojar4HgMDujLW...,1,True,"['AR', 'AU', 'AT', 'BE', 'BO', 'BR', 'BG', 'CA...",2Oojar4HgMDujLWRzkK2bz,single,0.859,0.7430,4,-9.342,1,0.4590,0.38000,0.021800,0.160,0.665,140.969,spotify:track:2Oojar4HgMDujLWRzkK2bz,https://api.spotify.com/v1/tracks/2Oojar4HgMDu...,https://api.spotify.com/v1/audio-analysis/2Ooj...,4,5,119.148,"E (also Ddouble sharp, F♭)",mi,2015-11-01
20988,Home,Good Neighbours,['Good Neighbours'],Home,2024-01-17,89,https://p.scdn.co/mp3-preview/b51601b08cf27141...,[],59336,https://i.scdn.co/image/ab67616d0000b27343a153...,https://open.spotify.com/track/6dpLxbF7lfCAnC9...,1,False,"['AR', 'AU', 'AT', 'BE', 'BO', 'BR', 'BG', 'CA...",6dpLxbF7lfCAnC9QRTjNLK,single,0.583,0.6760,6,-4.440,1,0.0298,0.02890,0.063400,0.121,0.134,77.002,spotify:track:6dpLxbF7lfCAnC9QRTjNLK,https://api.spotify.com/v1/tracks/6dpLxbF7lfCA...,https://api.spotify.com/v1/audio-analysis/6dpL...,4,1,157.485,"F♯, G♭ (also Edouble sharp)",,2015-11-01
20989,Irresistible,Fall Out Boy,['Fall Out Boy'],American Beauty/American Psycho,2015-01-16,63,,"['emo', 'modern rock', 'pop', 'rock']",10489429,https://i.scdn.co/image/ab67616d0000b2733cf1c1...,https://open.spotify.com/track/3znPiywA0q1VK2j...,1,False,"['AR', 'AU', 'AT', 'BE', 'BO', 'BR', 'BG', 'CA...",3znPiywA0q1VK2jgAZFDoI,album,0.484,0.9380,2,-3.123,1,0.1120,0.07060,0.000000,0.236,0.444,165.049,spotify:track:3znPiywA0q1VK2jgAZFDoI,https://api.spotify.com/v1/tracks/3znPiywA0q1V...,https://api.spotify.com/v1/audio-analysis/3znP...,4,2,206.506,"D (also Cdouble sharp, Edouble flat)",re,2015-11-01
20990,"Can You Feel the Love Tonight (From ""The Lion ...",Sleepyheadz,['Sleepyheadz'],Baby Love - Lullaby Love Songs for You and You...,2021-01-29,63,https://p.scdn.co/mp3-preview/6c76dfe22ffaa568...,['lullaby'],7170,https://i.scdn.co/image/ab67616d0000b273bf1770...,https://open.spotify.com/track/0OuXsSVuxNZLeQQ...,9,False,"['AR', 'AU', 'AT', 'BE', 'BO', 'BR', 'BG', 'CA...",0OuXsSVuxNZLeQQQP3HJjc,album,0.281,0.0228,10,-27.763,1,0.0412,0.98800,0.928000,0.104,0.176,108.671,spotify:track:0OuXsSVuxNZLeQQQP3HJjc,https://api.spotify.com/v1/tracks/0OuXsSVuxNZL...,https://api.spotify.com/v1/audio-analysis/0OuX...,4,1,250.666,"A♯, B♭ (also Cdouble flat)",,2015-11-01


In [None]:
### Save the DataFrame as a CSV file to directory

# Get current date and time
current_datetime = datetime.now()

# Format the date and time to include in the file name
timestamp = current_datetime.strftime("%Y%m%d_%H%M%S")

# Construct the file name with timestamp
file_name = f'bulk_data_{timestamp}.csv'

# Specify the path to the 'data' subfolder relative to the current directory
data_folder = os.path.join('data')

# Ensure that the 'data' subfolder exists, create it if it doesn't
os.makedirs(data_folder, exist_ok=True)

# Construct the file path within the 'data' subfolder
file_path = os.path.join(data_folder, file_name)

# Save the DataFrame as a CSV file to the 'data' subfolder
final_df.to_csv(file_path, index=False)

# Print confirmation message
print(f"DataFrame saved as CSV file: {file_path}")