In [1]:
# import modules
import pandas as pd
import numpy as np
import spotipy

# full-width display
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# pandas format
pd.set_option('display.float_format', '{:_.0f}'.format)
# NOTE: underscore separaters ('_') are better than commas (',') because 
# numbers with underscores work in Python without any extra effort.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

### Can we find Metadata for The Billboard 100?

In [2]:
# Billboard Top 100 Historical Data
# link:  https://www.kaggle.com/datasets/dhruvildave/billboard-the-hot-100-songs
# via:  https://toolbox.google.com/datasetsearch
url_billboard = r'D:\RYERSON\820\Datasets\Billboard The Hot 100 Songs\charts.csv'

In [3]:
df_billboard = pd.read_csv(url_billboard)
df_billboard['date'] = pd.to_datetime(df_billboard['date'])
df_billboard.head(1)

Unnamed: 0,date,rank,song,artist,last-week,peak-rank,weeks-on-board
0,2021-11-06,1,Easy On Me,Adele,1,1,3


In [4]:
# date range for data
df_billboard.date.min(), df_billboard.date.max()

(Timestamp('1958-08-04 00:00:00'), Timestamp('2021-11-06 00:00:00'))

In [8]:
# just the songs on the billboard 100
df_billboard_songs = df_billboard[['song', 'artist']].drop_duplicates().sort_values(['artist', 'song']).reset_index(drop=True)
df_billboard_songs.shape

(29681, 2)

In [9]:
df_billboard_songs.sample(10)

Unnamed: 0,song,artist
29508,My Window,YoungBoy Never Broke Again Featuring Lil Wayne
13930,Back That Thang Up,Juvenile Featuring Mannie Fresh & Lil' Wayne
2287,Upgrade U,Beyonce Featuring Jay Z
15408,No Sucker,Lil Baby & Moneybagg Yo
16097,Ringo,Lorne Greene
23005,Beat Patrol,Starship
4972,High On Emotion,Chris de Burgh
9901,Praying For Time,George Michael
22323,Foolish Heart,Sharon Bryant
874,The Philly Freeze,Alvin Cash & The Registers


In [10]:
# there are ~30k songs on the Billboard 100 list
# let's see how many are here:
# https://www.kaggle.com/datasets/rodolfofigueroa/spotify-12m-songs
# via:  https://www.kaggle.com/datasets/
# Spotify 1.2M+ Songs
url_1M_songs = r'D:\RYERSON\820\Datasets\Spotify 1.2M+ Songs\tracks_features.csv'

df_1M_songs = pd.read_csv(url_1M_songs)

In [None]:
# many indirect matches, need be clever to merge
# match artist, look for billboard.song in 1M.name.str


In [15]:
# change merging fields to lowercase with matching names
df_billboard_songs['artist_lower'] = df_billboard_songs['artist'].str.lower()
df_billboard_songs['track_lower'] = df_billboard_songs['song'].str.lower()

df_billboard_songs = df_billboard_songs[['artist_lower', 'track_lower']]

In [17]:
# formatting in df_1M_songs is difficult to parse
# need to extract list-style formatting using Regex:
"""
https://stackoverflow.com/questions/71469808/how-to-replace-a-list-with-first-element-of-list-in-pandas-dataframe-column
regex:
(             # start capturing
(?<=\[["\'])  # if preceded by [" or ['
[^"\']*       # get all text until " or '
|             # OR
^[^"\']+$     # get whole string if it doesn't contain " or '
)             # stop capturing
"""
df_1M_songs['artist_lower'] = df_1M_songs['artists'].str.extract('((?<=\[["\'])[^"\']*|^[^"\']+$)')
df_1M_songs['artist_lower'] = df_1M_songs['artist_lower'].str.lower()
df_1M_songs['track_lower'] = df_1M_songs['name'].str.lower()

In [23]:
# now, let's see how many id matches we can get with the billboard tracks
merge_on = ['artist_lower']

df_temp = pd.merge(df_billboard_songs, df_1M_songs, how='left', on=merge_on, indicator=True)
df_temp.shape

(1216426, 28)

In [24]:
df_temp.head()

Unnamed: 0,artist_lower,track_lower_x,id,name,album,album_id,artists,artist_ids,track_number,disc_number,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,year,release_date,track_lower_y,_merge
0,"""groove"" holmes",misty,,,,,,,,,,,,,,,,,,,,,,,,,,left_only
1,"""groove"" holmes",what now my love,,,,,,,,,,,,,,,,,,,,,,,,,,left_only
2,"""little"" jimmy dickens",may the bird of paradise fly up your nose,,,,,,,,,,,,,,,,,,,,,,,,,,left_only
3,"""pookie"" hudson",i know i know,,,,,,,,,,,,,,,,,,,,,,,,,,left_only
4,"""weird al"" yankovic",amish paradise,,,,,,,,,,,,,,,,,,,,,,,,,,left_only


In [27]:
df_1M_songs[df_1M_songs.artist_lower.str.contains('yankovic')].shape

(76, 26)

In [31]:
df_1M_songs[df_1M_songs.artists.str.contains('Yankovic')].shape

(362, 26)

In [None]:
# merge billboard 100 with 1.2M songs

# change merging fields to lowercase with matching names
df_billboard['artist_lower'] = df_billboard['artist'].str.lower()
df_billboard['track_lower'] = df_billboard['song'].str.lower()

# need to extract list-style formatting using Regex:
"""
https://stackoverflow.com/questions/71469808/how-to-replace-a-list-with-first-element-of-list-in-pandas-dataframe-column
regex:
(             # start capturing
(?<=\[["\'])  # if preceded by [" or ['
[^"\']*       # get all text until " or '
|             # OR
^[^"\']+$     # get whole string if it doesn't contain " or '
)             # stop capturing
"""
df_1M_songs['artist_lower'] = df_1M_songs['artists'].str.extract('((?<=\[["\'])[^"\']*|^[^"\']+$)')
df_1M_songs['artist_lower'] = df_1M_songs['artist_lower'].str.lower()
df_1M_songs['track_lower'] = df_1M_songs['name'].str.lower()

# join tables
df = pd.merge(df_billboard, df_1M_songs, how='outer', on=['artist_lower', 'track_lower'], indicator=True)
df.shape, df[df._merge=='both'].shape

In [None]:
# how many missing songs?
df[df._merge == 'left_only'].drop_duplicates(['artist_lower', 'track_lower']).reset_index().shape

In [None]:
# how many found songs?
df[df._merge == 'both'].drop_duplicates(['artist_lower', 'track_lower']).reset_index().shape
# lame

In [None]:
missing_songs = df[df._merge == 'left_only'].drop_duplicates(['artist_lower', 'track_lower']).sort_values(['artist_lower', 'track_lower']).reset_index()[['artist_lower', 'track_lower']]
missing_songs

In [None]:
# manually check database for these
manual_check = df_1M_songs[['artist_lower', 'track_lower', 'album']]

In [None]:
manual_check[manual_check.track_lower == 'velcro fly']

In [None]:
# can i use this to join?
manual_check[manual_check.track_lower.str.contains('velcro fly')]

# don't like this solution, it will be humongous
# https://stackoverflow.com/questions/58011182/pandas-merge-a-dataframe-on-an-exact-and-partial-match

In [None]:
manual_check[manual_check.artist_lower=='zz top'].sort_values('track_lower')
# they're all in there, maybe we need a partial match for the merge?

In [None]:
# which songs were not matched to billboard 100?
unmatched = df[df._merge == 'right_only'].drop_duplicates(['artist_lower', 'track_lower'])[['artists', 'artist_lower', 'track_lower']].sort_values(['artist_lower', 'track_lower']).reset_index()

In [None]:
unmatched.sample(20)

### Can we merge Billboard 100 with Song Metadata?
### not well (~90% missing)
### NLP section only
    * we could use this merge for NLP
    * this could be a representative sample
    * discard for now because it would be nice to use the entire Billboard 100 dataset

In [None]:
# Billboard Top 100 Historical Data
# link:  https://www.kaggle.com/datasets/dhruvildave/billboard-the-hot-100-songs
# via:  https://toolbox.google.com/datasetsearch
url_billboard = r'D:\RYERSON\820\Datasets\Billboard The Hot 100 Songs\charts.csv'

# Music Dataset: Lyrics and Metadata from 1950 to 2019
# link:  https://www.kaggle.com/datasets/dhruvildave/billboard-the-hot-100-songs
# via:  https://toolbox.google.com/datasetsearch
# info re metadata:  https://developer.spotify.com/documentation/web-api/reference/#/operations/get-several-audio-features
# also available via Spotify web API:
    # link:  https://developer.spotify.com/documentation/web-api/
url_songdata = r'D:\RYERSON\820\Datasets\Music Dataset Lyrics and Metadata from 1950 to 2019\tcc_ceds_music.csv'


In [None]:
df_billboard = pd.read_csv(url_billboard)

# lowercase for joining datasets
# these match the names in the other dataframe, and are lowercase, as the other dataset is
df_billboard['artist_name'] = df_billboard['artist'].str.lower()
df_billboard['track_name'] = df_billboard['song'].str.lower()

df_billboard.head(1)

In [None]:
df_songdata = pd.read_csv(url_songdata)
# df_songdata.rename(columns={'artist_name': 'artist', 'track_name': 'song'}, inplace=True)
df_songdata.head(1)

In [None]:
df_billboard.shape, df_songdata.shape

In [None]:
df = pd.merge(df_billboard, df_songdata, how='left', on=['artist_name', 'track_name'], indicator=True)
df.shape, df[df._merge=='both'].shape

In [None]:
# lots of missing hits, let's check what's up
temp_columns = ['artist_name', 'track_name', '_merge']
temp = df[temp_columns]

In [None]:
temp.sample(10)

In [None]:
df_songdata[df_songdata.artist_name=='vikki carr']

In [None]:
df_songdata[df_songdata.artist_name=='walk the moon']

In [None]:
# ok, how many songs are missing? should we use the spotify API instead?

# missing tracks
missing = temp[temp._merge=='left_only'].drop_duplicates(keep='first')

# total tracks
all_tracks = df_billboard[['song', 'artist']].drop_duplicates(keep='first')

In [None]:
missing.shape[0] / all_tracks.shape[0]
# 90% of songs are missing