In [1]:
# import modules
import pandas as pd
import numpy as np

# full-width display: inject a CSS rule so the notebook container uses the whole browser width
# `display` and `HTML` are imported from the public IPython.display module;
# importing `display` from IPython.core.display is deprecated (IPython >= 7.14)
# and raises ImportError on newer IPython releases.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# pandas display options, set in a single call as option/value pairs:
#   * floats are shown without decimals, using '_' as the thousands separator
#     NOTE: underscore separators ('_') are better than commas (',') because
#     numbers with underscores work in Python without any extra effort.
#   * the number of displayed columns is unlimited
pd.set_option(
    'display.float_format', '{:_.0f}'.format,
    'display.max_columns', None,
)

### Can we find Metadata for The Billboard 100?

In [2]:
# Billboard Top 100 Historical Data
# link:  https://www.kaggle.com/datasets/dhruvildave/billboard-the-hot-100-songs
# via:  https://toolbox.google.com/datasetsearch
# NOTE: despite the `url_` prefix this is a local Windows path to the downloaded CSV,
# not a web address; the notebook only runs on a machine that has this folder layout.
url_billboard = r'D:\RYERSON\820\Datasets\Billboard The Hot 100 Songs\charts.csv'

In [15]:
# read the weekly chart file and turn the `date` text column into real timestamps
df_billboard = (
    pd.read_csv(url_billboard)
    .assign(date=lambda chart: pd.to_datetime(chart['date']))
)
# peek at the first row to confirm the columns
df_billboard.head(1)

Unnamed: 0,date,rank,song,artist,last-week,peak-rank,weeks-on-board
0,2021-11-06,1,Easy On Me,Adele,1,1,3


In [16]:
# earliest and latest chart date present in the Billboard data
(df_billboard['date'].min(), df_billboard['date'].max())

(Timestamp('1958-08-04 00:00:00'), Timestamp('2021-11-06 00:00:00'))

In [8]:
# one row per distinct (song, artist) pair, ordered by artist and then song,
# with a fresh 0..n-1 index
list_of_songs = (
    df_billboard[['song', 'artist']]
    .drop_duplicates()
    .sort_values(['artist', 'song'])
    .reset_index(drop=True)
)
list_of_songs.head()

Unnamed: 0,song,artist
0,Misty,"""Groove"" Holmes"
1,What Now My Love,"""Groove"" Holmes"
2,May The Bird Of Paradise Fly Up Your Nose,"""Little"" Jimmy Dickens"
3,I Know I Know,"""Pookie"" Hudson"
4,Amish Paradise,"""Weird Al"" Yankovic"


In [9]:
# last rows of the artist-sorted list; the final index label shows how many distinct songs there are
list_of_songs.tail()

Unnamed: 0,song,artist
29676,Check It Out,will.i.am & Nicki Minaj
29677,#thatPOWER,will.i.am Featuring Justin Bieber
29678,T.H.E (The Hardest Ever),will.i.am Featuring Mick Jagger & Jennifer Lopez
29679,Fall Down,will.i.am Featuring Miley Cyrus
29680,Feelin' Myself,"will.i.am Featuring Miley Cyrus, French Montan..."


In [18]:
# there are ~30k songs on the Billboard 100 list
# let's see how many are here:
# https://www.kaggle.com/datasets/rodolfofigueroa/spotify-12m-songs
# via:  https://www.kaggle.com/datasets/
# Spotify 1.2M+ Songs
# NOTE: local Windows path to the downloaded CSV (not a web address, despite the `url_` prefix)
url_1M_songs = r'D:\RYERSON\820\Datasets\Spotify 1.2M+ Songs\tracks_features.csv'

# load the full track-features table into memory
df_1M_songs = pd.read_csv(url_1M_songs)

In [21]:
# column names available in the Spotify track-features table
df_1M_songs.columns

Index(['id', 'name', 'album', 'album_id', 'artists', 'artist_ids',
       'track_number', 'disc_number', 'explicit', 'danceability', 'energy',
       'key', 'loudness', 'mode', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms',
       'time_signature', 'year', 'release_date'],
      dtype='object')

In [56]:
# last five rows of the Spotify table
# NOTE(review): the recorded output already contains `artist_lower` / `track_lower`,
# which are only created in a later cell, so it was captured from an out-of-order run;
# re-run from a fresh kernel before trusting it.
df_1M_songs.tail(5)

Unnamed: 0,id,name,album,album_id,artists,artist_ids,track_number,disc_number,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,year,release_date,artist_lower,track_lower
1204020,0EsMifwUmMfJZxzoMPXJKZ,Gospel of Juke,Notch - EP,38O5Ys0W9PFS5K7dMb7yKb,['FVLCRVM'],['7AjItKsRnEYRSiBt2OxK1y'],2,1,False,0,1,5,-7,0,0,0,0,0,0,160,276213,4,2014,2014-01-09,['rage against the machine'],gospel of juke
1204021,2WSc2TB1CSJgGE0PEzVeiu,Prism Visions,Notch - EP,38O5Ys0W9PFS5K7dMb7yKb,['FVLCRVM'],['7AjItKsRnEYRSiBt2OxK1y'],3,1,False,1,1,11,-7,0,0,0,1,0,1,122,363179,4,2014,2014-01-09,['rage against the machine'],prism visions
1204022,6iProIgUe3ETpO6UT0v5Hg,Tokyo 360,Notch - EP,38O5Ys0W9PFS5K7dMb7yKb,['FVLCRVM'],['7AjItKsRnEYRSiBt2OxK1y'],4,1,False,1,1,9,-6,0,0,0,1,0,0,122,385335,4,2014,2014-01-09,['rage against the machine'],tokyo 360
1204023,37B4SXC8uoBsUyKCWnhPfX,Yummy!,Notch - EP,38O5Ys0W9PFS5K7dMb7yKb,['FVLCRVM'],['7AjItKsRnEYRSiBt2OxK1y'],5,1,False,1,1,6,-7,0,0,0,1,0,0,125,324455,4,2014,2014-01-09,['rage against the machine'],yummy!
1204024,3GgQmOxxLyRoAb4j86zOBX,That's The Way It Is,Notch - EP,38O5Ys0W9PFS5K7dMb7yKb,['FVLCRVM'],['7AjItKsRnEYRSiBt2OxK1y'],6,1,False,1,1,2,-9,0,0,0,0,0,0,118,304982,4,2014,2014-01-09,['rage against the machine'],that's the way it is


In [26]:
# merge billboard 100 with 1.2M songs

# lowercase join keys, stored under the same column names in both frames
df_1M_songs['track_lower'] = df_1M_songs['name'].str.lower()
df_1M_songs['artist_lower'] = df_1M_songs['artists'].str.lower()
df_billboard['track_lower'] = df_billboard['song'].str.lower()
df_billboard['artist_lower'] = df_billboard['artist'].str.lower()

# left join keeps every Billboard row; `_merge` records whether a Spotify match was found
# NOTE(review): `artists` holds list-looking text such as "['FVLCRVM']", so its lowercase
# form keeps the brackets and quotes and cannot equal a plain Billboard artist name;
# that is the likely reason for the zero matches recorded below.
df = df_billboard.merge(
    df_1M_songs,
    how='left',
    on=['artist_lower', 'track_lower'],
    indicator=True,
)
df.shape, df[df['_merge'] == 'both'].shape

((330087, 34), (0, 34))

In [54]:
# temp_columns = ['artist_lower']
# quick probe: drop the first two and last two characters of each value and summarise the result
# NOTE(review): the recorded output shows a single unique value for every row, which suggests
# the column had been overwritten in an earlier out-of-order run; confirm on a fresh kernel.
df_1M_songs['artist_lower'].str[2:-2].describe()  # nope

count                      1204025
unique                           1
top       rage against the machine
freq                       1204025
Name: artist_lower, dtype: object

In [61]:
# look at a few raw `artists` values (random, so the rows change on every run)
df_1M_songs['artists'].sample(10)

360433     ['Gerald Finzi', 'Roderick Williams', 'Iain Bu...
477938                  ['Flatbush Zombies', 'Sophie Faith']
146456         ['Claude Debussy', 'François-Joël Thiollier']
505809                                     ['Front Country']
358559                                            ['Kölsch']
1133547                                      ['Chris Beaty']
18972                                  ['xLooking Forwardx']
733314                                     ['The Afterbeat']
897768                                  ['YYY', 'Dem Atlas']
468783                               ['Merengue Latin Band']
Name: artists, dtype: object

In [62]:
# `artists` is not a column of lists: it is an object column whose values look like lists,
# so the first name probably has to be pulled out with a regex.
#
# The notes below are kept as comments rather than as a bare triple-quoted string: a string
# left as the last expression of the cell is echoed instead of the dtype, and as a non-raw
# literal it contains the invalid escape sequence `\[`.
#
# https://stackoverflow.com/questions/71469808/how-to-replace-a-list-with-first-element-of-list-in-pandas-dataframe-column
#
# regex:
#
#   (             # start capturing
#   (?<=\[["\'])  # if preceded by [" or ['
#   [^"\']*       # get all text until " or '
#   |             # OR
#   ^[^"\']+$     # get whole string if it doesn't contain " or '
#   )             # stop capturing
df_1M_songs['artists'].dtype

dtype('O')

In [73]:
# NOTE: `temp` is just another name for df_1M_songs (no copy is made), so the column
# written below also appears on df_1M_songs.
temp = df_1M_songs

# Capture the first artist name from the list-looking text, e.g. "['YYY', 'Dem Atlas']" -> "YYY";
# a value without any quote character is returned whole.
# Written as a raw literal because `\[` is an invalid escape sequence in a normal string
# (a warning on current Python versions); the pattern itself is unchanged.
first_artist_pattern = r'''((?<=\[["'])[^"']*|^[^"']+$)'''

# expand=False returns a Series for the single capture group instead of a one-column DataFrame
temp['artist_lower'] = temp['artists'].str.extract(first_artist_pattern, expand=False)

In [74]:
# spot-check the extraction on a few random rows (random, so the rows change on every run)
temp.sample(10)
# seems to work

Unnamed: 0,id,name,album,album_id,artists,artist_ids,track_number,disc_number,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,year,release_date,artist_lower,track_lower
913431,1v1gQLQTX5kLkrggbndesT,Sack Time,High School Caesar,0ELXB7nQaimBN00AUy88Z5,Nicholas Carras,['0wBQOu2dFCK8Vq3A7fgr5b'],9,1,False,0,0,6,-17,1,0,1,0,0,0,138,63444,3,2019,2019-08-23,Nicholas Carras,sack time
388656,5sSXTZeLV25Byz0ryxoc4O,Hysteria - Live,Mirror Ball - Live & More,3BO4hNrSnspOVwlloqedRc,Def Leppard,['6H1RjVyNruCmrBEWRbD0VZ'],4,2,False,1,1,1,-7,1,0,0,0,1,0,110,380880,4,2011,2011-06-07,Def Leppard,hysteria - live
815925,3f2X8zA1WpUgRamj5Yw9gF,Koko Sufu,Stereoexotique,1VCCIuXaFGx6nGbuBBhWld,The Tikiyaki Orchestra,['7kJPKUUnHCennOOEBxjidx'],5,1,False,1,0,0,-14,1,0,1,1,0,0,86,186340,4,2007,2007-01-01,The Tikiyaki Orchestra,koko sufu
60666,4qkmkAw5ZqBsQs33tpL4pF,Out of Your Head,Champ,2gBkyNwdBhnCDVdJJeA6c6,Kim Barlow,['1DFCGOLdv5hhgk0rR0sTZm'],3,1,False,1,0,2,-12,1,0,1,0,0,0,118,154653,4,2007,2007-03-16,Kim Barlow,out of your head
1011096,4PlNvD6CUr0c0OpZeKshcv,I Was Wondering,Chicago Blues Harmonica,5R6EqZTcPpRRm1EDBniWGV,Birmingham Jones,['4oHsdALsUUCPgNq253Y2RT'],15,1,False,1,1,2,-7,1,0,0,0,0,1,106,200520,4,1998,1998,Birmingham Jones,i was wondering
898429,0dnhIjn9sr662uZmvgrrty,Ra!,Null,7eG4VZBx0g7QzbujRo0Y4V,Copernicus,['1e9YpihmorSFjbeyr2GcmF'],2,1,False,0,1,10,-15,0,0,0,0,0,0,115,654373,4,2008,2008-12-02,Copernicus,ra!
641287,0bTy8utm40FTCeyPHXcSlR,My Second Record,Fred,1lwv4UMweqXLkAxp4K1lil,Mitch Friedman,['5weXUjgse8IMbMG4iuc204'],1,1,False,1,1,9,-4,1,0,0,0,0,1,93,38293,4,2002,2002-07-16,Mitch Friedman,my second record
8393,0JkSeeexqYmmMGXN1nXQxS,There's No Business Like Propa' Rungleclotted ...,Full English Breakfest,2wxfaIeMpCNWUPJivbA8sG,Shitmat,['6mPzVfnTE4JQJAuAdz2csx'],5,1,False,0,1,1,-6,1,0,0,0,0,1,85,316760,4,2004,2004-10-18,Shitmat,there's no business like propa' rungleclotted ...
1143826,2sZZ4n5SQmGPoCe32yB3sB,Gialinos Kosmos,Gialinos Kosmos,3BA1ToQzV1NTanqzWgNRAe,Nikos Kouroupakis,"['6YBoCHYFjKJvQrdD5Ycqdc', '3P3ITOu89EkU4tBevv...",1,1,False,0,0,6,-11,1,0,1,0,0,0,118,227007,4,2020,2020-05-18,Nikos Kouroupakis,gialinos kosmos
5749,0rQOtQ2N8IZr6fWMI5Rxhh,I'm Nuthin',Reality Bites,5SRINK0YUVEOEDMpIl57qA,Ethan Hawke,['3ZvNOfWpxcMrlB9DLq7jjN'],10,1,False,1,1,9,-7,1,0,0,0,0,0,125,217827,4,1990,1990-01-01,Ethan Hawke,i'm nuthin'


In [75]:
# Lowercase the extracted first-artist names so they can be compared with lowercase Billboard artists.
# The source must be `artist_lower` (filled by the regex extraction), not the raw `artists` column:
# lowercasing `artists` would overwrite the extraction with the bracketed list text again.
temp['artist_lower'] = temp['artist_lower'].str.lower()

In [82]:
# spot-check the lowercase artist key on a few random rows
temp.artist_lower.sample(10)

479435     allison brewster franzetti
858487                           atfc
388265                     will pound
1185239                          fish
293061                    david nevue
433444                   bob chilcott
360082                     dan israel
616595                rodolfo mederos
445666                 freon icy cold
1045591                  mother mercy
Name: artist_lower, dtype: object

### Can we merge Billboard 100 with Song Metadata? (~90% missing)
    * we could use this merge for NLP
    * this could be a representative sample
    * discard for now because it would be nice to use the entire Billboard 100 dataset

In [7]:
# Billboard Top 100 Historical Data
# link:  https://www.kaggle.com/datasets/dhruvildave/billboard-the-hot-100-songs
# via:  https://toolbox.google.com/datasetsearch
# NOTE: same local path as defined earlier in the notebook; repeated so this section can be run on its own
url_billboard = r'D:\RYERSON\820\Datasets\Billboard The Hot 100 Songs\charts.csv'

# Music Dataset: Lyrics and Metadata from 1950 to 2019
# link:  https://www.kaggle.com/datasets/dhruvildave/billboard-the-hot-100-songs
# NOTE(review): the link above is identical to the Billboard link and looks like a copy-paste
# slip; replace it with the Kaggle page this file was actually downloaded from
# (possibly https://www.kaggle.com/datasets/saurabhshahane/music-dataset-1950-to-2019 - TODO confirm).
# via:  https://toolbox.google.com/datasetsearch
# info re metadata:  https://developer.spotify.com/documentation/web-api/reference/#/operations/get-several-audio-features
# also available via Spotify web API:
    # link:  https://developer.spotify.com/documentation/web-api/
url_songdata = r'D:\RYERSON\820\Datasets\Music Dataset Lyrics and Metadata from 1950 to 2019\tcc_ceds_music.csv'


In [29]:
# Reload the chart data and add lowercase join keys.
# The key columns are named `artist_name` / `track_name` to match the song-metadata frame,
# whose values are already lowercase.
df_billboard = pd.read_csv(url_billboard).assign(
    artist_name=lambda chart: chart['artist'].str.lower(),
    track_name=lambda chart: chart['song'].str.lower(),
)

df_billboard.head(1)

Unnamed: 0,date,rank,song,artist,last-week,peak-rank,weeks-on-board,artist_name,track_name
0,2021-11-06,1,Easy On Me,Adele,1,1,3,adele,easy on me


In [31]:
# load the lyrics/metadata table
df_songdata = pd.read_csv(url_songdata)
# renaming is left disabled: the Billboard frame gets matching `artist_name` / `track_name` columns instead
# df_songdata.rename(columns={'artist_name': 'artist', 'track_name': 'song'}, inplace=True)
df_songdata.head(1)

Unnamed: 0.1,Unnamed: 0,artist_name,track_name,release_date,genre,lyrics,len,dating,violence,world/life,night/time,shake the audience,family/gospel,romantic,communication,obscene,music,movement/places,light/visual perceptions,family/spiritual,like/girls,sadness,feelings,danceability,loudness,acousticness,instrumentalness,valence,energy,topic,age
0,0,mukesh,mohabbat bhi jhoothi,1950,pop,hold time feel break feel untrue convince spea...,95,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,sadness,1


In [17]:
# (rows, columns) of both inputs before merging
# NOTE(review): the recorded output lists 7 Billboard columns although the frame shown above
# already has 9, so this output appears to come from an earlier run; re-run to refresh.
df_billboard.shape, df_songdata.shape

((330087, 7), (28372, 31))

In [40]:
# left join keeps every Billboard row; `_merge` is 'both' where song metadata was found
df = df_billboard.merge(
    df_songdata,
    how='left',
    on=['artist_name', 'track_name'],
    indicator=True,
)
# overall shape next to the shape of the matched rows only
df.shape, df[df['_merge'] == 'both'].shape

((330087, 39), (41146, 39))

In [41]:
# lots of missing hits, let's check what's up
# keep only the join keys and the merge indicator for inspection
temp_columns = ['artist_name', 'track_name', '_merge']
temp = df.loc[:, temp_columns]

In [42]:
# random rows showing which chart entries did ('both') or did not ('left_only') find metadata
temp.sample(10)

Unnamed: 0,artist_name,track_name,_merge
19707,dua lipa,new rules,both
279886,vikki carr,she'll be there,left_only
302031,sunny & the sunliners,rags to riches,left_only
60335,the black eyed peas,i gotta feeling,both
150676,richard marx,chains around my heart,both
216835,felix cavaliere,only a lonely heart sees,left_only
156847,emf,lies,left_only
33503,walk the moon,shut up and dance,left_only
157926,cathy dennis,too many walls,left_only
4646,"jack harlow featuring dababy, tory lanez & lil...",whats poppin,left_only


In [45]:
# is this unmatched artist present in the metadata at all?
df_songdata[df_songdata.artist_name=='vikki carr']

Unnamed: 0.1,Unnamed: 0,artist_name,track_name,release_date,genre,lyrics,len,dating,violence,world/life,night/time,shake the audience,family/gospel,romantic,communication,obscene,music,movement/places,light/visual perceptions,family/spiritual,like/girls,sadness,feelings,danceability,loudness,acousticness,instrumentalness,valence,energy,topic,age


In [44]:
# this artist is present; list its tracks to see whether the charting song is among them
df_songdata[df_songdata.artist_name=='walk the moon']

Unnamed: 0.1,Unnamed: 0,artist_name,track_name,release_date,genre,lyrics,len,dating,violence,world/life,night/time,shake the audience,family/gospel,romantic,communication,obscene,music,movement/places,light/visual perceptions,family/spiritual,like/girls,sadness,feelings,danceability,loudness,acousticness,instrumentalness,valence,energy,topic,age
6147,17721,walk the moon,tightrope,2012,pop,easy heart easy heart walk little tightrope wa...,65,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,sadness,0
6160,17770,walk the moon,anna sun,2012,pop,screen fall door door hang hinge feet sore fri...,60,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,violence,0
26798,79084,walk the moon,shiver shiver,2012,rock,grip hand throat strip button coat choose meth...,87,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,1,1,romantic,0


In [49]:
# ok, how many songs are missing? should we use the spotify API instead?

# missing tracks: distinct key rows that found no metadata
missing = temp.loc[temp['_merge'] == 'left_only'].drop_duplicates(keep='first')

# total tracks: distinct (song, artist) pairs on the chart
# NOTE(review): `missing` is deduplicated on the lowercase keys while `all_tracks` uses the
# original-case columns, so the two counts are not on exactly the same basis - TODO confirm
# whether any titles differ only by case.
all_tracks = df_billboard.loc[:, ['song', 'artist']].drop_duplicates(keep='first')

In [56]:
# share of distinct chart songs without metadata
len(missing) / len(all_tracks)
# 90% of songs are missing

0.9031366867693137