# Import Modules

In [1]:
# import modules
import pandas as pd
import numpy as np
import spotipy

# jupyter notebook full-width display
# NOTE: `display` and `HTML` are imported from the public `IPython.display`
# module; importing them from `IPython.core.display` is deprecated
# (IPython >= 7.14).
from IPython.display import display, HTML
# Injects a CSS rule that stretches elements with class `container` to 100% width.
# NOTE(review): presumably this targets the classic Notebook page layout --
# confirm the selector still matches in the front end being used.
display(HTML("<style>.container { width:100% !important; }</style>"))

# pandas display settings, applied one option at a time.
# NOTE: floats use underscore separators ('_') instead of commas (',') because
# numbers written with underscores are valid Python literals as-is.
for option_name, option_value in {
    'display.float_format': '{:_.2f}'.format,  # e.g. 1234567.891 -> '1_234_567.89'
    'display.max_columns': None,               # no limit on displayed columns
    'display.max_rows': 200,
}.items():
    pd.set_option(option_name, option_value)

# Import Data and Update Formatting

In [11]:
# Columns kept from every source file, in the order they should appear:
# identifying columns first, then the audio-feature columns.
desired_formatting = [
    'id', 'song', 'artist',
    'acousticness', 'danceability', 'duration_ms', 'energy', 'instrumentalness', 
    'key', 'liveness', 'loudness', 'mode', 'speechiness', 'tempo', 'time_signature', 'valence'
]


def _load_features(csv_path, columns=desired_formatting):
    """Read `csv_path` and return only `columns`, in the order given.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read.
    columns : list of str
        Column names to keep; every name must exist in the file.

    Returns
    -------
    pandas.DataFrame
        A frame whose columns are exactly `columns`, in that order.

    Raises
    ------
    ValueError
        Raised by `pd.read_csv` if any of `columns` is missing from the file.
    """
    # `usecols` lets the parser skip unwanted columns instead of loading the
    # whole file and discarding them afterwards.
    # `read_csv` ignores the order of `usecols`, so the trailing selection
    # re-applies the requested column order.
    return pd.read_csv(csv_path, usecols=columns)[columns]


# all songs with audio features (combined from 3 sources)
df_10M = _load_features('every_song_with_data.csv')

# all Billboard 100 lists, audio features included where possible
df_B100 = _load_features('all_audio_features_billboard_100.csv')

# all unique songs from the Billboard 100 lists, audio features included where possible
df_B100_songs = _load_features('all_audio_features_billboard_100_songs.csv')

In [12]:
# (rows, columns) of each loaded frame, in load order
tuple(frame.shape for frame in (df_10M, df_B100, df_B100_songs))

((9595992, 16), (329930, 16), (29681, 16))

# Inspect and Describe Data

In [13]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric
# columns for the full song collection.
df_10M.describe()

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
count,9_595_992.00,9_595_992.00,9_595_992.00,9_595_992.00,9_595_992.00,9_595_992.00,9_595_992.00,9_595_992.00,9_595_992.00,9_595_992.00,9_595_992.00,9_595_992.00,9_595_992.00
mean,0.42,0.53,238_209.59,0.54,0.26,5.24,0.21,-10.89,0.66,0.10,118.56,3.84,0.48
std,0.38,0.19,159_341.59,0.28,0.37,3.54,0.18,6.36,0.47,0.14,31.03,0.57,0.28
min,0.00,0.00,0.00,0.00,0.00,0.00,0.00,-60.00,0.00,0.00,0.00,0.00,0.00
25%,0.03,0.40,169_600.00,0.31,0.00,2.00,0.10,-13.68,0.00,0.04,95.08,4.00,0.23
50%,0.34,0.55,216_933.00,0.57,0.00,5.00,0.13,-9.20,1.00,0.05,118.95,4.00,0.47
75%,0.82,0.68,275_080.00,0.79,0.64,8.00,0.26,-6.40,1.00,0.08,137.45,4.00,0.71
max,1.00,1.00,19_672_058.00,1.00,1.00,11.00,1.00,7.23,1.00,0.97,249.99,5.00,1.00


In [16]:
# Summary statistics of the numeric columns for every Billboard 100 chart entry.
# `count` only includes rows with non-null values, so it can be lower than the
# number of rows in the frame.
df_B100.describe()

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
count,253_254.00,253_254.00,253_254.00,253_254.00,253_254.00,253_254.00,253_254.00,253_254.00,253_254.00,253_254.00,253_254.00,253_254.00,253_254.00
mean,0.28,0.60,226_879.65,0.63,0.03,5.22,0.19,-8.61,0.73,0.06,120.40,3.94,0.61
std,0.27,0.15,66_552.15,0.20,0.14,3.56,0.16,3.59,0.44,0.07,27.79,0.30,0.24
min,0.00,0.00,30_213.00,0.01,0.00,0.00,0.01,-30.35,0.00,0.00,0.00,0.00,0.00
25%,0.04,0.51,183_360.00,0.48,0.00,2.00,0.09,-10.97,0.00,0.03,99.93,4.00,0.42
50%,0.18,0.61,221_306.00,0.64,0.00,5.00,0.13,-8.15,1.00,0.04,119.00,4.00,0.63
75%,0.47,0.71,258_399.00,0.79,0.00,8.00,0.24,-5.79,1.00,0.06,136.00,4.00,0.81
max,1.00,0.99,1_561_133.00,1.00,0.99,11.00,1.00,2.29,1.00,0.95,241.01,5.00,0.99


In [15]:
# Summary statistics of the numeric columns for the unique Billboard 100 songs.
# `count` only includes rows with non-null values, so it can be lower than the
# number of rows in the frame.
df_B100_songs.describe()

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
count,22_189.00,22_189.00,22_189.00,22_189.00,22_189.00,22_189.00,22_189.00,22_189.00,22_189.00,22_189.00,22_189.00,22_189.00,22_189.00
mean,0.32,0.59,217_638.34,0.61,0.04,5.20,0.20,-8.93,0.74,0.07,120.48,3.92,0.61
std,0.29,0.15,68_403.26,0.20,0.15,3.56,0.17,3.62,0.44,0.08,28.09,0.33,0.24
min,0.00,0.00,30_213.00,0.01,0.00,0.00,0.01,-30.35,0.00,0.00,0.00,0.00,0.00
25%,0.05,0.49,169_533.00,0.46,0.00,2.00,0.09,-11.31,0.00,0.03,99.79,4.00,0.42
50%,0.22,0.60,210_426.00,0.62,0.00,5.00,0.13,-8.55,1.00,0.04,119.07,4.00,0.64
75%,0.56,0.70,251_333.00,0.77,0.00,8.00,0.25,-6.11,1.00,0.06,136.34,4.00,0.81
max,1.00,0.99,1_561_133.00,1.00,0.99,11.00,1.00,2.29,1.00,0.95,241.01,5.00,0.99


In [18]:
# All Billboard 100 lists
# Result: (rows with a non-null id, total rows, proportion with a non-null id)
# Counting the boolean mask directly avoids building filtered copies of the
# frame just to read their length; int() keeps the result a plain Python int.
n_entries_with_id = int(df_B100['id'].notna().sum())
n_entries_total = len(df_B100)
n_entries_with_id, n_entries_total, n_entries_with_id / n_entries_total

(253254, 329930, 0.7675991877064832)

In [20]:
# All songs from Billboard 100 lists
# Result: (rows with a non-null id, total rows, proportion with a non-null id)
# Counting the boolean mask directly avoids building filtered copies of the
# frame just to read their length; int() keeps the result a plain Python int.
n_songs_with_id = int(df_B100_songs['id'].notna().sum())
n_songs_total = len(df_B100_songs)
n_songs_with_id, n_songs_total, n_songs_with_id / n_songs_total

(22189, 29681, 0.7475826286176341)

**Proportion of Songs With Audio Feature Data:**

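Roughly three quarters of the Billboard data has audio features: ~77% of chart entries (253,254 of 329,930) and ~75% of unique songs (22,189 of 29,681). These are the songs that are available on Spotify and weren't removed for data errors.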