In [3]:
# import modules
import pandas as pd
import numpy as np

# full-width display
# `display` is imported from the public IPython.display module: importing it
# from IPython.core.display is deprecated (since IPython 7.14).
from IPython.display import display, HTML
# Inject a CSS override so elements with class `container` use the full width.
display(HTML("<style>.container { width:100% !important; }</style>"))

# pandas display options
# Floats are rendered with zero decimals and '_' as the thousands separator.
# Underscore separators are preferred over commas because a number written
# that way (e.g. 1_234_567) is still a valid Python literal when pasted back
# into code.
# NOTE: with zero decimals, fractional values are shown rounded to integers.
pd.options.display.float_format = '{:_.0f}'.format
# Never truncate wide frames: show every column.
pd.options.display.max_columns = None

### Can we find Metadata for The Billboard 100?

In [57]:
# Billboard Hot 100 historical chart data (charts.csv)
# link:  https://www.kaggle.com/datasets/dhruvildave/billboard-the-hot-100-songs
# via:  https://toolbox.google.com/datasetsearch
# NOTE: despite the `url_` prefix this is an absolute path on a local Windows
# drive (raw string, so the backslashes stay literal). Point it at your own
# copy of the CSV before running.
url_billboard = r'D:\RYERSON\820\Datasets\Billboard The Hot 100 Songs\charts.csv'

In [None]:
# Read the Billboard chart CSV located at `url_billboard`, then show its
# first row as a quick look at the available columns.
df_billboard = pd.read_csv(filepath_or_buffer=url_billboard)
df_billboard.iloc[:1]

### Can we merge Billboard 100 with Song Metadata? (~90% missing)
* we could use this merge for NLP
* the matched songs could serve as a representative sample
* discard this merge for now, because it would be nice to use the entire Billboard 100 dataset

In [7]:
# Root folder that holds the downloaded datasets. This is a machine-specific
# absolute Windows path: change this one line when running elsewhere.
DATASETS_DIR = r'D:\RYERSON\820\Datasets'

# Billboard Hot 100 historical chart data
# link:  https://www.kaggle.com/datasets/dhruvildave/billboard-the-hot-100-songs
# via:  https://toolbox.google.com/datasetsearch
url_billboard = DATASETS_DIR + r'\Billboard The Hot 100 Songs\charts.csv'

# Music Dataset: Lyrics and Metadata from 1950 to 2019
# NOTE(review): the link recorded here used to be a copy of the Billboard
# link above. The dataset appears to be the one below -- confirm:
# link:  https://www.kaggle.com/datasets/saurabhshahane/music-dataset-1950-to-2019
# via:  https://toolbox.google.com/datasetsearch
# info re metadata:  https://developer.spotify.com/documentation/web-api/reference/#/operations/get-several-audio-features
# also available via Spotify web API:
    # link:  https://developer.spotify.com/documentation/web-api/
url_songdata = DATASETS_DIR + r'\Music Dataset Lyrics and Metadata from 1950 to 2019\tcc_ceds_music.csv'


In [29]:
# Load the chart data and derive the join keys in one step.
# `artist_name` / `track_name` are lowercase copies of `artist` / `song`:
# the names match the key columns of the song-metadata frame, and the values
# are lowercased because that dataset stores them in lowercase.
df_billboard = pd.read_csv(url_billboard).assign(
    artist_name=lambda charts: charts['artist'].str.lower(),
    track_name=lambda charts: charts['song'].str.lower(),
)

df_billboard.head(n=1)

Unnamed: 0,date,rank,song,artist,last-week,peak-rank,weeks-on-board,artist_name,track_name
0,2021-11-06,1,Easy On Me,Adele,1,1,3,adele,easy on me


In [31]:
# Read the lyrics/metadata CSV located at `url_songdata`.
# Its `artist_name` / `track_name` columns are kept under their own names;
# the Billboard frame carries matching columns, so no rename is needed here.
df_songdata = pd.read_csv(filepath_or_buffer=url_songdata)
df_songdata.iloc[:1]

Unnamed: 0.1,Unnamed: 0,artist_name,track_name,release_date,genre,lyrics,len,dating,violence,world/life,night/time,shake the audience,family/gospel,romantic,communication,obscene,music,movement/places,light/visual perceptions,family/spiritual,like/girls,sadness,feelings,danceability,loudness,acousticness,instrumentalness,valence,energy,topic,age
0,0,mukesh,mohabbat bhi jhoothi,1950,pop,hold time feel break feel untrue convince spea...,95,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,sadness,1


In [17]:
# (rows, columns) of each input frame: Billboard first, song metadata second
tuple(frame.shape for frame in (df_billboard, df_songdata))

((330087, 7), (28372, 31))

In [40]:
# Left-join song metadata onto every Billboard row using the lowercase keys.
# indicator=True adds a `_merge` column ('both' / 'left_only') recording
# whether a chart row found a metadata match.
df = df_billboard.merge(
    df_songdata,
    how='left',
    on=['artist_name', 'track_name'],
    indicator=True,
)
# shape of the full merge, and of the rows that did find a match
df.shape, df.loc[df['_merge'] == 'both'].shape

((330087, 39), (41146, 39))

In [41]:
# Many chart rows found no metadata match. To investigate, keep only the
# join keys and the merge indicator in a narrow frame.
temp_columns = ['artist_name', 'track_name', '_merge']
temp = df.loc[:, temp_columns]

In [42]:
# Eyeball ten random rows of the merge result. The fixed random_state makes
# the sample identical on every run, so the rows looked up in later cells
# stay in sync with this output.
temp.sample(n=10, random_state=42)

Unnamed: 0,artist_name,track_name,_merge
19707,dua lipa,new rules,both
279886,vikki carr,she'll be there,left_only
302031,sunny & the sunliners,rags to riches,left_only
60335,the black eyed peas,i gotta feeling,both
150676,richard marx,chains around my heart,both
216835,felix cavaliere,only a lonely heart sees,left_only
156847,emf,lies,left_only
33503,walk the moon,shut up and dance,left_only
157926,cathy dennis,too many walls,left_only
4646,"jack harlow featuring dababy, tory lanez & lil...",whats poppin,left_only


In [45]:
# Does the metadata set contain this unmatched artist at all?
df_songdata.loc[df_songdata['artist_name'] == 'vikki carr']

Unnamed: 0.1,Unnamed: 0,artist_name,track_name,release_date,genre,lyrics,len,dating,violence,world/life,night/time,shake the audience,family/gospel,romantic,communication,obscene,music,movement/places,light/visual perceptions,family/spiritual,like/girls,sadness,feelings,danceability,loudness,acousticness,instrumentalness,valence,energy,topic,age


In [44]:
# Which tracks does the metadata set hold for this unmatched artist?
df_songdata.loc[df_songdata['artist_name'] == 'walk the moon']

Unnamed: 0.1,Unnamed: 0,artist_name,track_name,release_date,genre,lyrics,len,dating,violence,world/life,night/time,shake the audience,family/gospel,romantic,communication,obscene,music,movement/places,light/visual perceptions,family/spiritual,like/girls,sadness,feelings,danceability,loudness,acousticness,instrumentalness,valence,energy,topic,age
6147,17721,walk the moon,tightrope,2012,pop,easy heart easy heart walk little tightrope wa...,65,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,sadness,0
6160,17770,walk the moon,anna sun,2012,pop,screen fall door door hang hinge feet sore fri...,60,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,violence,0
26798,79084,walk the moon,shiver shiver,2012,rock,grip hand throat strip button coat choose meth...,87,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,1,1,romantic,0


In [49]:
# ok, how many songs are missing? should we use the spotify API instead?

# Both counts must identify a track the same way. The merge ran on the
# lowercase keys, so the total is de-duplicated on those same keys rather
# than on the case-sensitive `song` / `artist` columns.
join_keys = ['artist_name', 'track_name']

# missing tracks: distinct key pairs that found no metadata match
missing = temp[temp['_merge'] == 'left_only'].drop_duplicates(keep='first')

# total tracks: one row per distinct key pair; the first-seen original
# `song` / `artist` spelling is kept alongside the keys
all_tracks = (
    df_billboard
    .drop_duplicates(subset=join_keys, keep='first')
    .loc[:, ['song', 'artist'] + join_keys]
)

In [56]:
# share of distinct Billboard tracks that have no metadata match
len(missing) / len(all_tracks)
# 90% of songs are missing

0.9031366867693137