# Import

In [1]:
# import modules
import pandas as pd
import numpy as np
import spotipy

# jupyter notebook full-width display
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# pandas formatting
pd.set_option('display.float_format', '{:.3f}'.format)
# NOTE: underscore separaters ('_') are better than commas (',') because 
# numbers with underscores work in Python without any extra effort.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 200)

# Data Sources

**The Billboard 100**

https://en.wikipedia.org/wiki/Billboard_Hot_100

https://www.kaggle.com/datasets/dhruvildave/billboard-the-hot-100-songs

**1.2M Songs with Metadata (csv)**

https://www.kaggle.com/datasets/rodolfofigueroa/spotify-12m-songs

**8+ M. Spotify Tracks, Genre, Audio Features (SQL)**

https://www.kaggle.com/datasets/maltegrosse/8-m-spotify-tracks-genre-audio-features


**Spotify API**

https://developer.spotify.com/documentation/web-api/

https://developer.spotify.com/console/get-search-item

https://developer.spotify.com/console/get-audio-features-track/

https://developer.spotify.com/documentation/web-api/reference/#/operations/get-audio-features

Spotipy Library:  https://spotipy.readthedocs.io/en/master/


# Data Description and Discussion

* The Billboard 100 data did not include audio features. It was combined with audio features from the following sources:
    * 1.2M Songs with Metadata (csv format)
    * 8+ M. Spotify Tracks, Genre, Audio Features (SQLite format)
    * Spotify API data gathered via the library Spotipy
* Overall, audio features was gathered for approximately 75% of songs from the Billboard 100.
    * Some songs were excluded based on data repetition issues
        * Typically this was only hard to find songs with very similar names
        * For example searching for 'Metallica The Unforgiven' and 'Metallica The Unforgiven Part 2' yielded the same Spotify id
            * It was determined that excluding these songs was less error-prone than manually fixing the issues
            * Alternatively, we could have kept 1 song. In this case, there is up to a 50% chance that the song is mislabelled, so this option appeared less favourable than dropping both repeat instances.
* A Quality Assurance (QA) check was performed on the final dataset.
    * Audio features from 100 songs were gathered from the Spotify API and compared to the datasets listed above.
    * There were 3 non-trivial issues noted in 2 of the 100 songs:
        * Madonna Live To Tell
            * A significant increase in loudness (~7 dB)
            * Appoximately 1 second different in length
            * All other audio features consistent between data sources
            * Both of these changes appear to result from remastering and re-uploading the track
                * https://artists.spotify.com/help/article/re-uploading-music
        * Lil Wayne Let It All Work Out
            * The key signature was not consistent between the 2 sources
            * The newer source (the API request from Sept 11, 2022) was correct (B major)
            * The SQL database was also different
            * My supposition is that these errors are due to the characteristics of the song:
                * atonal (most notably the sining)
                * detuned (bass pitch automation, and low-fi detuning effects)
    * Overall, there is a large degree of consistency between datasets. Furthermore, inconsistencies are all explainable with reasonable suppositions.

# Import Data
from: Kevin 820 Data Pickling.ipynb

In [2]:
# all songs with audio features (combined from 3 sources)
df_10M = pd.read_pickle('df_10M.pickle')

# all Billboard 100 lists, audio features included where possible
df_B100 = pd.read_pickle('df_B100.pickle')

# all unique songs from the Billboard 100 lists, audio features included where possible
df_B100_songs = pd.read_pickle('df_B100_songs.pickle')

# Data Description

In [3]:
# sizes of the datasets
df_10M.shape, df_B100.shape, df_B100_songs.shape

((9595992, 16), (329930, 21), (29681, 16))

In [4]:
# data types
pd.concat(
    [df_10M.dtypes, df_B100.dtypes, df_B100_songs.dtypes], 
    keys=['df_10M.dtypes', 'df_B100.dtypes', 'df_B100_songs.dtypes'],
    axis=1
)

Unnamed: 0,df_10M.dtypes,df_B100.dtypes,df_B100_songs.dtypes
id,object,object,object
song,object,object,object
artist,object,object,object
acousticness,float32,float32,float32
danceability,float32,float32,float32
duration_ms,Int32,Int32,Int32
energy,float32,float32,float32
instrumentalness,float32,float32,float32
key,Int16,Int16,Int16
liveness,float32,float32,float32


In [5]:
# Date Range for Billboard Hot 100

df_B100.date.min(), df_B100.date.max()

(Timestamp('1958-08-04 00:00:00'), Timestamp('2021-11-06 00:00:00'))

In [6]:
df_B100[['rank', 'last-week', 'peak-rank', 'weeks-on-board']].describe().loc['mean':'max'].T

Unnamed: 0,mean,std,min,25%,50%,75%,max
rank,50.502,28.866,1.0,26.0,51.0,76.0,100.0
last-week,47.593,28.055,1.0,23.0,47.0,72.0,100.0
peak-rank,40.973,29.348,1.0,13.0,38.0,65.0,100.0
weeks-on-board,9.162,7.619,1.0,4.0,7.0,13.0,90.0


In [7]:
df_10M.describe().loc['mean':'max'].T

Unnamed: 0,mean,std,min,25%,50%,75%,max
acousticness,0.421,0.374,0.0,0.034,0.336,0.817,0.996
danceability,0.528,0.19,0.0,0.396,0.545,0.676,1.0
duration_ms,97.408,159341.591,0.0,169600.0,216933.0,275080.0,19672058.0
energy,0.545,0.282,0.0,0.31,0.567,0.789,1.0
instrumentalness,0.258,0.374,0.0,0.0,0.002,0.645,1.0
key,5.237,3.542,0.0,2.0,5.0,8.0,11.0
liveness,0.21,0.18,0.0,0.096,0.129,0.262,1.0
loudness,-10.967,6.318,-60.0,-13.675,-9.196,-6.398,7.234
mode,0.661,0.473,0.0,0.0,1.0,1.0,1.0
speechiness,0.098,0.135,0.0,0.036,0.047,0.082,0.974


In [8]:
df_B100.describe().loc['mean':'max'].T

Unnamed: 0,mean,std,min,25%,50%,75%,max
rank,50.502,28.866,1.0,26.0,51.0,76.0,100.0
last-week,47.593,28.055,1.0,23.0,47.0,72.0,100.0
peak-rank,40.973,29.348,1.0,13.0,38.0,65.0,100.0
weeks-on-board,9.162,7.619,1.0,4.0,7.0,13.0,90.0
acousticness,0.278,0.275,0.0,0.041,0.178,0.468,0.995
danceability,0.603,0.149,0.0,0.507,0.611,0.708,0.988
duration_ms,6410.969,66552.146,30213.0,183360.0,221306.0,258399.0,1561133.0
energy,0.625,0.199,0.007,0.479,0.643,0.786,0.999
instrumentalness,0.034,0.139,0.0,0.0,0.0,0.001,0.985
key,5.218,3.564,0.0,2.0,5.0,8.0,11.0


In [9]:
df_B100_songs.describe().loc['mean':'max'].T

Unnamed: 0,mean,std,min,25%,50%,75%,max
acousticness,0.316,0.29,0.0,0.052,0.224,0.556,0.995
danceability,0.59,0.152,0.0,0.491,0.598,0.697,0.988
duration_ms,24075.435,68403.261,30213.0,169533.0,210426.0,251333.0,1561133.0
energy,0.611,0.203,0.007,0.463,0.624,0.775,0.999
instrumentalness,0.039,0.151,0.0,0.0,0.0,0.001,0.985
key,5.196,3.556,0.0,2.0,5.0,8.0,11.0
liveness,0.197,0.168,0.012,0.091,0.132,0.255,0.999
loudness,-8.927,3.622,-30.346,-11.314,-8.554,-6.111,2.291
mode,0.741,0.438,0.0,0.0,1.0,1.0,1.0
speechiness,0.067,0.076,0.0,0.032,0.04,0.061,0.951


**Proportion of Songs With Audio Feature Data:**

~75% of songs on the Billboard list are available on Spotify, and weren't removed for data errors

In [10]:
# All Billboard 100 lists
# number not null, total, proportion not null
(
    df_B100[df_B100.id.notnull()].shape[0], 
    df_B100.shape[0], 
    df_B100[df_B100.id.notnull()].shape[0] / df_B100.shape[0]
)

(253254, 329930, 0.7675991877064832)

In [11]:
# All songs from Billboard 100 lists
# number not null, total, proportion not null
(
    df_B100_songs[df_B100_songs.id.notnull()].shape[0], 
    df_B100_songs.shape[0], 
    df_B100_songs[df_B100_songs.id.notnull()].shape[0] / df_B100_songs.shape[0]
)

(22189, 29681, 0.7475826286176341)

# Exploratory Data Analysis