# Import

In [1]:
# import modules
import pandas as pd
import numpy as np
import spotipy

# jupyter notebook full-width display
# NOTE: `display` and `HTML` are imported from the public `IPython.display`
# module; importing them from `IPython.core.display` has been deprecated
# since IPython 7.14.
from IPython.display import display, HTML
# Inject a CSS rule that forces the notebook's `.container` element to 100% width.
display(HTML("<style>.container { width:100% !important; }</style>"))

# pandas display options, applied in a single pass:
#   * floats are rendered with two decimals,
#   * no limit on the number of columns shown,
#   * at most 200 rows shown.
# NOTE: no thousands separator is used. If one is ever added, prefer
# underscores ('{:_.2f}') over commas ('{:,.2f}'): underscore-grouped numbers
# are valid Python literals, so displayed values can be pasted back into code.
for option_name, option_value in {
    'display.float_format': '{:.2f}'.format,
    'display.max_columns': None,
    'display.max_rows': 200,
}.items():
    pd.set_option(option_name, option_value)

# Data Sources

**The Billboard 100**

https://en.wikipedia.org/wiki/Billboard_Hot_100

https://www.kaggle.com/datasets/dhruvildave/billboard-the-hot-100-songs

**1.2M Songs with Metadata (csv)**

https://www.kaggle.com/datasets/rodolfofigueroa/spotify-12m-songs

**8+ M. Spotify Tracks, Genre, Audio Features (SQL)**

https://www.kaggle.com/datasets/maltegrosse/8-m-spotify-tracks-genre-audio-features


**Spotify API**

https://developer.spotify.com/documentation/web-api/

https://developer.spotify.com/console/get-search-item

https://developer.spotify.com/console/get-audio-features-track/

https://developer.spotify.com/documentation/web-api/reference/#/operations/get-audio-features

Spotipy Library:  https://spotipy.readthedocs.io/en/master/


**Data Description and Discussion:**

* The Billboard 100 data did not include audio features. It was combined with audio features from the following sources:
    * 1.2M Songs with Metadata (csv format)
    * 8+ M. Spotify Tracks, Genre, Audio Features (SQLite format)
    * Spotify API data gathered via the library Spotipy
* Overall, audio features were gathered for approximately 75% of songs from the Billboard 100.
    * Some songs were excluded based on data repetition issues
        * Typically this only affected hard-to-find songs with very similar names
        * For example searching for 'Metallica The Unforgiven' and 'Metallica The Unforgiven Part 2' yielded the same Spotify id
            * It was determined that excluding these songs was less error-prone than manually fixing the issues
            * Alternatively, we could have kept 1 song. In this case, there is up to a 50% chance that the song is mislabelled, so this option appeared less favourable than dropping both repeat instances.
* A Quality Assurance (QA) check was performed on the final dataset.
    * Audio features from 100 songs were gathered from the Spotify API and compared to the datasets listed above.
    * There were 3 non-trivial issues noted in 2 of the 100 songs:
        * Madonna Live To Tell
            * A significant increase in loudness (~7 dB)
            * Approximately 1 second difference in length
            * All other audio features consistent between data sources
            * Both of these changes appear to result from remastering and re-uploading the track
                * https://artists.spotify.com/help/article/re-uploading-music
        * Lil Wayne Let It All Work Out
            * The key signature was not consistent between the 2 sources
            * The newer source (the API request from Sept 11, 2022) was correct (B major)
            * The SQL database was also different
            * My supposition is that these errors are due to the characteristics of the song:
                * atonal (most notably the singing)
                * detuned (bass pitch automation, and lo-fi detuning effects)
    * Overall, there is a large degree of consistency between datasets. Furthermore, inconsistencies are all explainable with reasonable suppositions.

In [2]:
# Column layout for the per-song tables: three identifier columns followed by
# the audio-feature columns.
desired_formatting = [
    'id', 'song', 'artist',
    'acousticness', 'danceability', 'duration_ms', 'energy',
    'instrumentalness', 'key', 'liveness', 'loudness', 'mode',
    'speechiness', 'tempo', 'time_signature', 'valence',
]

# Column layout for the weekly chart table. It is the per-song layout with
# 'date' in front and the chart-position columns inserted between the
# identifiers and the audio features.
desired_formatting_timeseries = [
    'date',
    *desired_formatting[:3],  # 'id', 'song', 'artist'
    'rank', 'last-week', 'peak-rank', 'weeks-on-board',
    *desired_formatting[3:],  # audio features
]

# NOTE on loading: `usecols` tells the CSV parser to skip every column that is
# not listed, so unused columns are never materialised (this matters most for
# the multi-million-row song table). `usecols` does not control column order,
# so each frame is re-indexed with the same list to get the desired layout.
# A column missing from a file raises ValueError at read time.

# all songs with audio features (combined from 3 sources)
df_10M = pd.read_csv('every_song_with_data.csv', usecols=desired_formatting)
df_10M = df_10M[desired_formatting]

# all Billboard 100 lists, audio features included where possible
df_B100 = pd.read_csv(
    'all_audio_features_billboard_100.csv',
    usecols=desired_formatting_timeseries,
)
df_B100 = df_B100[desired_formatting_timeseries]
# parse the chart date strings into datetime64 values
df_B100['date'] = pd.to_datetime(df_B100['date'])

# all unique songs from the Billboard 100 lists, audio features included where possible
df_B100_songs = pd.read_csv(
    'all_audio_features_billboard_100_songs.csv',
    usecols=desired_formatting,
)
df_B100_songs = df_B100_songs[desired_formatting]

# Data Description

In [3]:
# (rows, columns) of each dataset, in the order: all songs, weekly charts, unique chart songs
tuple(frame.shape for frame in (df_10M, df_B100, df_B100_songs))

((9595992, 16), (329930, 21), (29681, 16))

In [4]:
# earliest and latest chart dates in the weekly data
df_B100['date'].min(), df_B100['date'].max()

(Timestamp('1958-08-04 00:00:00'), Timestamp('2021-11-06 00:00:00'))

In [13]:
# summary statistics (all rows of describe() except 'count') for the
# chart-position columns, one column per output row
(
    df_B100
    .loc[:, ['rank', 'last-week', 'peak-rank', 'weeks-on-board']]
    .describe()
    .drop(index='count')
    .T
)

Unnamed: 0,mean,std,min,25%,50%,75%,max
rank,50.5,28.87,1.0,26.0,51.0,76.0,100.0
last-week,47.59,28.05,1.0,23.0,47.0,72.0,100.0
peak-rank,40.97,29.35,1.0,13.0,38.0,65.0,100.0
weeks-on-board,9.16,7.62,1.0,4.0,7.0,13.0,90.0


In [8]:
# summary statistics (all rows of describe() except 'count') for the full song table
df_10M.describe().drop(index='count').transpose()

Unnamed: 0,mean,std,min,25%,50%,75%,max
acousticness,0.42,0.38,0.0,0.03,0.34,0.82,1.0
danceability,0.53,0.19,0.0,0.4,0.55,0.68,1.0
duration_ms,238209.59,159341.59,0.0,169600.0,216933.0,275080.0,19672058.0
energy,0.54,0.28,0.0,0.31,0.57,0.79,1.0
instrumentalness,0.26,0.37,0.0,0.0,0.0,0.64,1.0
key,5.24,3.54,0.0,2.0,5.0,8.0,11.0
liveness,0.21,0.18,0.0,0.1,0.13,0.26,1.0
loudness,-10.89,6.36,-60.0,-13.68,-9.2,-6.4,7.23
mode,0.66,0.47,0.0,0.0,1.0,1.0,1.0
speechiness,0.1,0.14,0.0,0.04,0.05,0.08,0.97


In [9]:
# summary statistics (mean, std, min, quartiles, max) for the numeric columns
# of the weekly chart table.
# Numeric columns are selected explicitly and 'count' is dropped by label
# instead of slicing 'mean':'max'. On pandas >= 2.0, describe() also summarises
# the datetime 'date' column, which adds a 'date' row and places 'std' after
# 'max', so the label slice would silently lose 'std'.
(
    df_B100
    .select_dtypes(include='number')
    .describe()
    .drop(index='count')
    .T
)

Unnamed: 0,mean,std,min,25%,50%,75%,max
rank,50.5,28.87,1.0,26.0,51.0,76.0,100.0
last-week,47.59,28.05,1.0,23.0,47.0,72.0,100.0
peak-rank,40.97,29.35,1.0,13.0,38.0,65.0,100.0
weeks-on-board,9.16,7.62,1.0,4.0,7.0,13.0,90.0
acousticness,0.28,0.27,0.0,0.04,0.18,0.47,1.0
danceability,0.6,0.15,0.0,0.51,0.61,0.71,0.99
duration_ms,226879.65,66552.15,30213.0,183360.0,221306.0,258399.0,1561133.0
energy,0.63,0.2,0.01,0.48,0.64,0.79,1.0
instrumentalness,0.03,0.14,0.0,0.0,0.0,0.0,0.99
key,5.22,3.56,0.0,2.0,5.0,8.0,11.0


In [10]:
# summary statistics (all rows of describe() except 'count') for the unique chart songs
df_B100_songs.describe().drop(index='count').transpose()

Unnamed: 0,mean,std,min,25%,50%,75%,max
acousticness,0.32,0.29,0.0,0.05,0.22,0.56,1.0
danceability,0.59,0.15,0.0,0.49,0.6,0.7,0.99
duration_ms,217638.34,68403.26,30213.0,169533.0,210426.0,251333.0,1561133.0
energy,0.61,0.2,0.01,0.46,0.62,0.77,1.0
instrumentalness,0.04,0.15,0.0,0.0,0.0,0.0,0.99
key,5.2,3.56,0.0,2.0,5.0,8.0,11.0
liveness,0.2,0.17,0.01,0.09,0.13,0.25,1.0
loudness,-8.93,3.62,-30.35,-11.31,-8.55,-6.11,2.29
mode,0.74,0.44,0.0,0.0,1.0,1.0,1.0
speechiness,0.07,0.08,0.0,0.03,0.04,0.06,0.95


**Proportion of Songs With Audio Feature Data:**

~75% of songs on the Billboard list are available on Spotify, and weren't removed for data errors

In [11]:
# All Billboard 100 lists
# (rows with a non-null id, total rows, proportion of rows with a non-null id)
# The counts are converted to plain Python ints so the tuple displays as bare numbers.
(
    int(df_B100['id'].notna().sum()),
    len(df_B100),
    int(df_B100['id'].notna().sum()) / len(df_B100),
)

(253254, 329930, 0.7675991877064832)

In [12]:
# All songs from Billboard 100 lists
# (rows with a non-null id, total rows, proportion of rows with a non-null id)
# The counts are converted to plain Python ints so the tuple displays as bare numbers.
(
    int(df_B100_songs['id'].notna().sum()),
    len(df_B100_songs),
    int(df_B100_songs['id'].notna().sum()) / len(df_B100_songs),
)

(22189, 29681, 0.7475826286176341)