In [34]:
# import modules
import pandas as pd
import numpy as np

# full-width display
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# pandas format
pd.set_option('display.float_format', '{:_.0f}'.format)
# NOTE: underscore separaters ('_') are better than commas (',') because 
# numbers with underscores work in Python without any extra effort.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

### Can we find Metadata for The Billboard 100?

In [2]:
# Billboard Top 100 Historical Data
# link:  https://www.kaggle.com/datasets/dhruvildave/billboard-the-hot-100-songs
# via:  https://toolbox.google.com/datasetsearch
url_billboard = r'D:\RYERSON\820\Datasets\Billboard The Hot 100 Songs\charts.csv'

In [3]:
df_billboard = pd.read_csv(url_billboard)
df_billboard['date'] = pd.to_datetime(df_billboard['date'])
df_billboard.head(1)

Unnamed: 0,date,rank,song,artist,last-week,peak-rank,weeks-on-board
0,2021-11-06,1,Easy On Me,Adele,1,1,3


In [4]:
# date range for data
df_billboard.date.min(), df_billboard.date.max()

(Timestamp('1958-08-04 00:00:00'), Timestamp('2021-11-06 00:00:00'))

In [5]:
# list of billboard 100 songs
list_of_songs = df_billboard[['song', 'artist']].drop_duplicates().sort_values(['artist', 'song']).reset_index(drop=True)
list_of_songs.shape

(29681, 2)

In [6]:
# there are ~30k songs on the Billboard 100 list
# let's see how many are here:
# https://www.kaggle.com/datasets/rodolfofigueroa/spotify-12m-songs
# via:  https://www.kaggle.com/datasets/
# Spotify 1.2M+ Songs
url_1M_songs = r'D:\RYERSON\820\Datasets\Spotify 1.2M+ Songs\tracks_features.csv'

df_1M_songs = pd.read_csv(url_1M_songs)

In [7]:
# merge billboard 100 with 1.2M songs

# change merging fields to lowercase with matching names
df_billboard['artist_lower'] = df_billboard['artist'].str.lower()
df_billboard['track_lower'] = df_billboard['song'].str.lower()

# need to extract list-style formatting using Regex:
"""
https://stackoverflow.com/questions/71469808/how-to-replace-a-list-with-first-element-of-list-in-pandas-dataframe-column
regex:
(             # start capturing
(?<=\[["\'])  # if preceded by [" or ['
[^"\']*       # get all text until " or '
|             # OR
^[^"\']+$     # get whole string if it doesn't contain " or '
)             # stop capturing
"""
df_1M_songs['artist_lower'] = df_1M_songs['artists'].str.extract('((?<=\[["\'])[^"\']*|^[^"\']+$)')
df_1M_songs['artist_lower'] = df_1M_songs['artist_lower'].str.lower()
df_1M_songs['track_lower'] = df_1M_songs['name'].str.lower()

# join tables
df = pd.merge(df_billboard, df_1M_songs, how='outer', on=['artist_lower', 'track_lower'], indicator=True)
df.shape, df[df._merge=='both'].shape

((1618300, 34), (167191, 34))

In [8]:
# how many missing songs?
df[df._merge == 'left_only'].drop_duplicates(['artist_lower', 'track_lower']).reset_index().shape

(24488, 35)

In [9]:
# how many found songs?
df[df._merge == 'both'].drop_duplicates(['artist_lower', 'track_lower']).reset_index().shape
# lame

(5192, 35)

In [23]:
missing_songs = df[df._merge == 'left_only'].drop_duplicates(['artist_lower', 'track_lower']).sort_values(['artist_lower', 'track_lower']).reset_index()[['artist_lower', 'track_lower']]
missing_songs

Unnamed: 0,artist_lower,track_lower
0,"""groove"" holmes",misty
1,"""groove"" holmes",what now my love
2,"""little"" jimmy dickens",may the bird of paradise fly up your nose
3,"""pookie"" hudson",i know i know
4,"""weird al"" yankovic",amish paradise
...,...,...
24483,zz top,legs
24484,zz top,leila
24485,zz top,sleeping bag
24486,zz top,stages


In [24]:
# manually check database for these
manual_check = df_1M_songs[['artist_lower', 'track_lower', 'album']]

In [40]:
manual_check[manual_check.track_lower == 'velcro fly']

Unnamed: 0,artist_lower,track_lower,album


In [42]:
# can i use this to join?
manual_check[manual_check.track_lower.str.contains('velcro fly')]

# don't like this solution, it will be humongous
# https://stackoverflow.com/questions/58011182/pandas-merge-a-dataframe-on-an-exact-and-partial-match

Unnamed: 0,artist_lower,track_lower,album
1080750,zz top,velcro fly - 2019 remaster,Goin' 50


In [35]:
manual_check[manual_check.artist_lower=='zz top'].sort_values('track_lower')
# they're all in there, maybe we need a partial match for the merge?

Unnamed: 0,artist_lower,track_lower,album
771527,zz top,(let me be your) teddy bear,X X X
1080755,zz top,(somebody else been) shaking your tree - 2019 ...,Goin' 50
1080788,zz top,36-22-36,Goin' 50
771520,zz top,36-22-36,X X X
700937,zz top,antenna head,Antenna
1061198,zz top,arrested for driving while blind,Easy Riders: Rock
1080763,zz top,arrested for driving while blind - 2019 remaster,Goin' 50
1080761,zz top,backdoor medley: backdoor love affair / mellow...,Goin' 50
1080785,zz top,bang bang,Goin' 50
1061021,zz top,bar-b-q,Holiday Rock


In [10]:
# which songs were not matched to billboard 100?
unmatched = df[df._merge == 'right_only'].drop_duplicates(['artist_lower', 'track_lower'])[['artists', 'artist_lower', 'track_lower']].sort_values(['artist_lower', 'track_lower']).reset_index()

In [22]:
unmatched.sample(20)

Unnamed: 0,index,artists,artist_lower,track_lower
147983,1408905,"['BSCBR', 'Deradoorian', 'Nick Zinner', 'Mick ...",bscbr,sweet leaf
458862,1203498,['J Alvarez'],j alvarez,la verdad
372621,1545004,['George Benson'],george benson,give me the night - single version; 2000 remaster
866951,1282329,['Sean Sheridan'],sean sheridan,searchlights
1081638,1166066,['Wendy Beckerman'],wendy beckerman,afraid to say goodbye
338029,1167425,['Flight of Mavis'],flight of mavis,it's so easy
185093,618113,['Chris Isaak'],chris isaak,blue hotel
771411,807511,['Phil Driscoll'],phil driscoll,highway to heaven
1013929,1361280,['The Weight Band'],the weight band,common man
961792,937364,['The Cog is Dead'],the cog is dead,the ballad of stuart the sailor


### Can we merge Billboard 100 with Song Metadata?
### not well (~90% missing)
### NLP section only
    * we could use this merge for NLP
    * this could be a representative sample
    * discard for now because it would be nice to use the entire Billboard 100 dataset

In [None]:
# Billboard Top 100 Historical Data
# link:  https://www.kaggle.com/datasets/dhruvildave/billboard-the-hot-100-songs
# via:  https://toolbox.google.com/datasetsearch
url_billboard = r'D:\RYERSON\820\Datasets\Billboard The Hot 100 Songs\charts.csv'

# Music Dataset: Lyrics and Metadata from 1950 to 2019
# link:  https://www.kaggle.com/datasets/dhruvildave/billboard-the-hot-100-songs
# via:  https://toolbox.google.com/datasetsearch
# info re metadata:  https://developer.spotify.com/documentation/web-api/reference/#/operations/get-several-audio-features
# also available via Spotify web API:
    # link:  https://developer.spotify.com/documentation/web-api/
url_songdata = r'D:\RYERSON\820\Datasets\Music Dataset Lyrics and Metadata from 1950 to 2019\tcc_ceds_music.csv'


In [None]:
df_billboard = pd.read_csv(url_billboard)

# lowercase for joining datasets
# these match the names in the other dataframe, and are lowercase, as the other dataset is
df_billboard['artist_name'] = df_billboard['artist'].str.lower()
df_billboard['track_name'] = df_billboard['song'].str.lower()

df_billboard.head(1)

In [None]:
df_songdata = pd.read_csv(url_songdata)
# df_songdata.rename(columns={'artist_name': 'artist', 'track_name': 'song'}, inplace=True)
df_songdata.head(1)

In [None]:
df_billboard.shape, df_songdata.shape

In [None]:
df = pd.merge(df_billboard, df_songdata, how='left', on=['artist_name', 'track_name'], indicator=True)
df.shape, df[df._merge=='both'].shape

In [None]:
# lots of missing hits, let's check what's up
temp_columns = ['artist_name', 'track_name', '_merge']
temp = df[temp_columns]

In [None]:
temp.sample(10)

In [None]:
df_songdata[df_songdata.artist_name=='vikki carr']

In [None]:
df_songdata[df_songdata.artist_name=='walk the moon']

In [None]:
# ok, how many songs are missing? should we use the spotify API instead?

# missing tracks
missing = temp[temp._merge=='left_only'].drop_duplicates(keep='first')

# total tracks
all_tracks = df_billboard[['song', 'artist']].drop_duplicates(keep='first')

In [None]:
missing.shape[0] / all_tracks.shape[0]
# 90% of songs are missing