## Import Modules

In [1]:
# import modules
import pandas as pd
import numpy as np
import spotipy

# jupyter notebook full-width display
# `display` and `HTML` come from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# pandas formatting
# Floats are shown with two decimal places and no thousands separator.
# NOTE: if a thousands separator is ever added, prefer underscores
# ('{:_.2f}') over commas ('{:,.2f}'), because numbers written with
# underscores are valid Python literals and work without any extra effort.
pd.set_option('display.float_format', '{:.2f}'.format)
pd.set_option('display.max_columns', None)  # None = no limit on displayed columns
pd.set_option('display.max_rows', 200)

### Data Sources

**The Billboard 100**

https://en.wikipedia.org/wiki/Billboard_Hot_100

https://www.kaggle.com/datasets/dhruvildave/billboard-the-hot-100-songs

**1.2M Songs with Metadata (csv)**

https://www.kaggle.com/datasets/rodolfofigueroa/spotify-12m-songs

**8+ M. Spotify Tracks, Genre, Audio Features (SQL)**

https://www.kaggle.com/datasets/maltegrosse/8-m-spotify-tracks-genre-audio-features


**Spotify API**

https://developer.spotify.com/documentation/web-api/

https://developer.spotify.com/console/get-search-item

https://developer.spotify.com/console/get-audio-features-track/

https://developer.spotify.com/documentation/web-api/reference/#/operations/get-audio-features

Spotipy Library:  https://spotipy.readthedocs.io/en/master/


**Data Description and Discussion:**

* The Billboard 100 data did not include audio features. It was combined with audio features from the following sources:
    * 1.2M Songs with Metadata (csv format)
    * 8+ M. Spotify Tracks, Genre, Audio Features (SQLite format)
    * Spotify API data gathered via the library Spotipy
* Overall, audio features were gathered for approximately 75% of songs from the Billboard 100.
    * Some songs were excluded based on data repetition issues
        * Typically this only affected hard-to-find songs with very similar names
        * For example, searching for 'Metallica The Unforgiven' and 'Metallica The Unforgiven Part 2' yielded the same Spotify id
            * It was determined that excluding these songs was less error-prone than manually fixing the issues
            * Alternatively, we could have kept 1 song. In this case, there is up to a 50% chance that the song is mislabelled, so this option appeared less favourable than dropping both repeat instances.
* A Quality Assurance (QA) check was performed on the final dataset.
    * Audio features from 100 songs were gathered from the Spotify API and compared to the datasets listed above.
    * There were 3 non-trivial issues noted in 2 of the 100 songs:
        * Madonna Live To Tell
            * A significant increase in loudness (~7 dB)
            * Approximately 1 second difference in length
            * All other audio features consistent between data sources
            * Both of these changes appear to result from remastering and re-uploading the track
                * https://artists.spotify.com/help/article/re-uploading-music
        * Lil Wayne Let It All Work Out
            * The key signature was not consistent between the 2 sources
            * The newer source (the API request from Sept 11, 2022) was correct (B major)
            * The SQL database was also different
            * My supposition is that these errors are due to the characteristics of the song:
                * atonal (most notably the singing)
                * detuned (bass pitch automation, and lo-fi detuning effects)
    * Overall, there is a large degree of consistency between datasets. Furthermore, inconsistencies are all explainable with reasonable suppositions.

In [2]:
# Columns kept, in display order, for the per-song tables.
desired_formatting = [
    'id', 'song', 'artist',
    'acousticness', 'danceability', 'duration_ms', 'energy', 
    'instrumentalness', 'key', 'liveness', 'loudness', 'mode', 
    'speechiness', 'tempo', 'time_signature', 'valence'
]

# Columns kept, in display order, for the weekly chart table: the per-song
# columns plus the chart date and chart-position columns.
desired_formatting_timeseries = [
    'date', 'id', 'song', 'artist',
    'rank', 'last-week', 'peak-rank', 'weeks-on-board',
    'acousticness', 'danceability', 'duration_ms', 'energy', 
    'instrumentalness', 'key', 'liveness', 'loudness', 'mode', 
    'speechiness', 'tempo', 'time_signature', 'valence'
]


def _read_columns(csv_path, columns):
    """Read only `columns` from a CSV file and return them in the given order.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read.
    columns : list of str
        Column names to load; also the column order of the result.

    Returns
    -------
    pandas.DataFrame
        A frame holding exactly `columns`, in that order.

    Notes
    -----
    `usecols` keeps pandas from parsing columns that would be dropped right
    after loading. It does not reorder the columns, so the trailing
    `[columns]` selection is what enforces the requested order.
    """
    return pd.read_csv(csv_path, usecols=columns)[columns]


# all songs with audio features (combined from 3 sources)
df_10M = _read_columns('every_song_with_data.csv', desired_formatting)

# all Billboard 100 lists, audio features included where possible
df_B100 = _read_columns(
    'all_audio_features_billboard_100.csv', desired_formatting_timeseries
)
# convert the chart date to datetime64 after loading, so a value that cannot
# be parsed raises instead of silently leaving the column as text
df_B100['date'] = pd.to_datetime(df_B100['date'])

# all unique songs from the Billboard 100 lists, audio features included where possible
df_B100_songs = _read_columns(
    'all_audio_features_billboard_100_songs.csv', desired_formatting
)

## Data Description

In [3]:
# (rows, columns) of each table, in the order:
# all songs, weekly Billboard lists, unique Billboard songs
tuple(frame.shape for frame in (df_10M, df_B100, df_B100_songs))

((9595992, 16), (329930, 21), (29681, 16))

In [4]:
# Summary statistics for the chart-position columns of the weekly lists;
# only the listed summary rows are shown.
(
    df_B100
    .loc[:, ['rank', 'last-week', 'peak-rank', 'weeks-on-board']]
    .describe()
    .loc[['mean', 'std', 'min', '25%', '50%', '75%', 'max'], :]
)

Unnamed: 0,rank,last-week,peak-rank,weeks-on-board
mean,50.5,47.59,40.97,9.16
std,28.87,28.05,29.35,7.62
min,1.0,1.0,1.0,1.0
25%,26.0,23.0,13.0,4.0
50%,51.0,47.0,38.0,7.0
75%,76.0,72.0,65.0,13.0
max,100.0,100.0,100.0,90.0


In [5]:
# Shorten every column label to its first four characters so the wide
# summary tables print better (e.g. 'acousticness' -> 'acou').
# The frames are renamed in place, so cells that use the full column names
# (such as 'last-week') must be run before this one.
for frame in (df_10M, df_B100, df_B100_songs):
    frame.rename(columns=lambda label: label[:4], inplace=True)

In [6]:
# Summary statistics for every song in the combined audio-feature table;
# only the listed summary rows are shown.
(
    df_10M
    .describe()
    .loc[['mean', 'std', 'min', '25%', '50%', '75%', 'max'], :]
)

Unnamed: 0,acou,danc,dura,ener,inst,key,live,loud,mode,spee,temp,time,vale
mean,0.42,0.53,238209.59,0.54,0.26,5.24,0.21,-10.89,0.66,0.1,118.56,3.84,0.48
std,0.38,0.19,159341.59,0.28,0.37,3.54,0.18,6.36,0.47,0.14,31.03,0.57,0.28
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-60.0,0.0,0.0,0.0,0.0,0.0
25%,0.03,0.4,169600.0,0.31,0.0,2.0,0.1,-13.68,0.0,0.04,95.08,4.0,0.23
50%,0.34,0.55,216933.0,0.57,0.0,5.0,0.13,-9.2,1.0,0.05,118.95,4.0,0.47
75%,0.82,0.68,275080.0,0.79,0.64,8.0,0.26,-6.4,1.0,0.08,137.45,4.0,0.71
max,1.0,1.0,19672058.0,1.0,1.0,11.0,1.0,7.23,1.0,0.97,249.99,5.0,1.0


In [7]:
# Summary statistics for the audio-feature columns of the weekly lists
# (column names are the four-character truncated forms).
(
    df_B100
    .loc[:, [
        'acou', 'danc', 'dura', 'ener', 'inst', 'key', 'live', 'loud',
        'mode', 'spee', 'temp', 'time', 'vale',
    ]]
    .describe()
    .loc[['mean', 'std', 'min', '25%', '50%', '75%', 'max'], :]
)

Unnamed: 0,acou,danc,dura,ener,inst,key,live,loud,mode,spee,temp,time,vale
mean,0.28,0.6,226879.65,0.63,0.03,5.22,0.19,-8.61,0.73,0.06,120.4,3.94,0.61
std,0.27,0.15,66552.15,0.2,0.14,3.56,0.16,3.59,0.44,0.07,27.79,0.3,0.24
min,0.0,0.0,30213.0,0.01,0.0,0.0,0.01,-30.35,0.0,0.0,0.0,0.0,0.0
25%,0.04,0.51,183360.0,0.48,0.0,2.0,0.09,-10.97,0.0,0.03,99.93,4.0,0.42
50%,0.18,0.61,221306.0,0.64,0.0,5.0,0.13,-8.15,1.0,0.04,119.0,4.0,0.63
75%,0.47,0.71,258399.0,0.79,0.0,8.0,0.24,-5.79,1.0,0.06,136.0,4.0,0.81
max,1.0,0.99,1561133.0,1.0,0.99,11.0,1.0,2.29,1.0,0.95,241.01,5.0,0.99


In [8]:
# Summary statistics for the unique songs from the Billboard 100 lists;
# only the listed summary rows are shown.
(
    df_B100_songs
    .describe()
    .loc[['mean', 'std', 'min', '25%', '50%', '75%', 'max'], :]
)

Unnamed: 0,acou,danc,dura,ener,inst,key,live,loud,mode,spee,temp,time,vale
mean,0.32,0.59,217638.34,0.61,0.04,5.2,0.2,-8.93,0.74,0.07,120.48,3.92,0.61
std,0.29,0.15,68403.26,0.2,0.15,3.56,0.17,3.62,0.44,0.08,28.09,0.33,0.24
min,0.0,0.0,30213.0,0.01,0.0,0.0,0.01,-30.35,0.0,0.0,0.0,0.0,0.0
25%,0.05,0.49,169533.0,0.46,0.0,2.0,0.09,-11.31,0.0,0.03,99.79,4.0,0.42
50%,0.22,0.6,210426.0,0.62,0.0,5.0,0.13,-8.55,1.0,0.04,119.07,4.0,0.64
75%,0.56,0.7,251333.0,0.77,0.0,8.0,0.25,-6.11,1.0,0.06,136.34,4.0,0.81
max,1.0,0.99,1561133.0,1.0,0.99,11.0,1.0,2.29,1.0,0.95,241.01,5.0,0.99


**Proportion of Songs With Audio Feature Data:**

~75% of songs on the Billboard lists have audio feature data, i.e. they were found on Spotify and were not removed for data errors.

In [9]:
# All Billboard 100 lists
# (rows with a non-null id, total rows, proportion of rows with a non-null id)
n_rows_with_id = int(df_B100['id'].notna().sum())
n_rows = len(df_B100)
(n_rows_with_id, n_rows, n_rows_with_id / n_rows)

(253254, 329930, 0.7675991877064832)

In [10]:
# All songs from Billboard 100 lists
# (songs with a non-null id, total songs, proportion of songs with a non-null id)
n_songs_with_id = int(df_B100_songs['id'].notna().sum())
n_songs = len(df_B100_songs)
(n_songs_with_id, n_songs, n_songs_with_id / n_songs)

(22189, 29681, 0.7475826286176341)