In [5]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import billboard
import requests, os, sys, json, orjson

import deepdish
import tables
import h5py
import sklearn,scipy
from tqdm import tqdm

import torch
# Report the installed PyTorch version and whether a CUDA device is visible,
# one item per line.
print(f"Torch Version: {torch.__version__}", f"Using GPU: {torch.cuda.is_available()}", sep="\n")

Torch Version: 2.0.0
Using GPU: True


In [80]:
# Names of the playlist JSON files in data/. os.listdir returns entries in
# arbitrary order, so sort them: the song table built from these files keeps the
# first occurrence of each duplicate, which makes its row order depend on the
# order the files are read in.
data_files = sorted(x for x in os.listdir('data/') if x.endswith('.json'))

def extract_songs(row):
    """Turn one playlist's track records into a de-duplicated DataFrame.

    Helper intended to be used with ``Series.apply`` over a column of track
    lists.

    Parameters
    ----------
    row : list of dict
        The track records of a single playlist. May be empty.

    Returns
    -------
    pandas.DataFrame
        One row per distinct track record, with the ``'pos'`` column removed.
        The original row labels of the kept rows are preserved.
    """
    # errors='ignore': an empty track list produces a frame without a 'pos'
    # column, which would otherwise make drop() raise KeyError.
    return pd.DataFrame(row).drop(columns='pos', errors='ignore').drop_duplicates()

def rip_songs(json_files, data_dir='data/'):
    """Build a table of unique songs from playlist JSON files.

    Each file is read as JSON and must contain a top-level ``'playlists'``
    list whose items each carry a ``'tracks'`` list of track records (dicts).

    Parameters
    ----------
    json_files : sequence of str
        File names, relative to ``data_dir``.
    data_dir : str, optional
        Directory containing the files. Defaults to ``'data/'``, the location
        that used to be hard-coded.

    Returns
    -------
    pandas.DataFrame
        One row per distinct track record (every field except ``'pos'``), in
        first-seen order, with a fresh ``0..n-1`` index. An empty DataFrame
        when ``json_files`` is empty.
    """
    df_list = []

    for file in tqdm(json_files, desc='Reading JSON Data'):
        with open(os.path.join(data_dir, file), 'rb') as f:
            content = orjson.loads(f.read())

        # Flatten the tracks of every playlist in this file into one list so
        # a single DataFrame is built per file instead of one per playlist.
        # drop_duplicates keeps the first occurrence, so row order is unchanged.
        tracks = [track
                  for playlist in content['playlists']
                  for track in playlist['tracks']]
        # errors='ignore' covers a file with no tracks at all (no 'pos' column).
        songs = pd.DataFrame(tracks).drop(columns='pos', errors='ignore').drop_duplicates()
        df_list.append(songs.reset_index(drop=True))

    # pd.concat raises ValueError on an empty list; return an empty table instead.
    if not df_list:
        return pd.DataFrame()

    # The same song can appear in several files, so de-duplicate once more.
    return pd.concat(df_list).drop_duplicates().reset_index(drop=True)

# Parsing the raw JSON is slow (the recorded run took about 35 minutes for
# 1000 files), so reuse the pickled table when it already exists and only
# rebuild it from the JSON files when it does not.
# NOTE: delete 'All_unique_songs.pkl' to force a rebuild, and only unpickle a
# cache file you created yourself (unpickling can execute arbitrary code).
if os.path.isfile('All_unique_songs.pkl'):
    data = pd.read_pickle('All_unique_songs.pkl')
else:
    data = rip_songs(json_files=data_files)
    data.to_pickle('All_unique_songs.pkl')

data

Reading JSON Data: 100%|██████████| 1000/1000 [34:44<00:00,  2.08s/it]


Unnamed: 0,artist_name,track_uri,artist_uri,track_name,album_uri,duration_ms,album_name
0,Missy Elliott,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,spotify:artist:2wIVse2owClT7go1WT98tk,Lose Control (feat. Ciara & Fat Man Scoop),spotify:album:6vV5UrXcfyQD1wu4Qo2I9K,226863,The Cookbook
1,Britney Spears,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,spotify:artist:26dSoYclwsYLMAKD3tpOr4,Toxic,spotify:album:0z7pVBGOD7HCIB7S8eLkLI,198800,In The Zone
2,Beyoncé,spotify:track:0WqIKmW4BTrj3eJFmnCKMv,spotify:artist:6vWDO969PvNqNYHIOW5v0m,Crazy In Love,spotify:album:25hVFAxTlDvXbx2X2QkUkE,235933,Dangerously In Love (Alben für die Ewigkeit)
3,Justin Timberlake,spotify:track:1AWQoqb9bSvzTjaLralEkT,spotify:artist:31TPClRtHm23RisEBtV3X7,Rock Your Body,spotify:album:6QPkyl04rXwTGlGlcYaRoW,267266,Justified
4,Shaggy,spotify:track:1lzr43nnXAijIGYnCT8M8H,spotify:artist:5EvFsr3kj42KNv97ZEnqij,It Wasn't Me,spotify:album:6NmFmPX56pcLBOFMhIiKvF,227600,Hot Shot
...,...,...,...,...,...,...,...
2262287,Pax217,spotify:track:2aOoiTTV0OR8DYxCk8o0JR,spotify:artist:52BrJWa29mZPy5hgDIs6d0,Forevermind - Pax217 Album Version,spotify:album:5ULVqJFwZ8AQ1Dldq9ZDby,206400,Two Seventeen
2262288,Pax217,spotify:track:1Uo65qTxnCg1N1X00lgcjr,spotify:artist:52BrJWa29mZPy5hgDIs6d0,Shalom - Pax217 Album Version,spotify:album:5ULVqJFwZ8AQ1Dldq9ZDby,268266,Two Seventeen
2262289,Pax217,spotify:track:5uEE5tii66I0cC7kZ7IMxE,spotify:artist:52BrJWa29mZPy5hgDIs6d0,Free To Be - Pax217 Album Version,spotify:album:5ULVqJFwZ8AQ1Dldq9ZDby,334280,Two Seventeen
2262290,Pax217,spotify:track:6A1RfnrMdxb24OYllzzTUX,spotify:artist:52BrJWa29mZPy5hgDIs6d0,Skwid - Pax217 Album Version,spotify:album:5ULVqJFwZ8AQ1Dldq9ZDby,240453,Two Seventeen


In [96]:
# Number of rows in the song table per artist name.
artist_counts = data['artist_name'].value_counts()
# Names of the artists that appear on more than 10 rows.
valid_artists = artist_counts.loc[artist_counts.gt(10)].index.values

# Show (without assigning) the rows by those artists whose duration is below
# 600000 ms, i.e. 10 minutes, re-indexed from 0.
data.loc[data['artist_name'].isin(valid_artists) & data['duration_ms'].lt(600000)].reset_index(drop=True)

Unnamed: 0,artist_name,track_uri,artist_uri,track_name,album_uri,duration_ms,album_name
0,Missy Elliott,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,spotify:artist:2wIVse2owClT7go1WT98tk,Lose Control (feat. Ciara & Fat Man Scoop),spotify:album:6vV5UrXcfyQD1wu4Qo2I9K,226863,The Cookbook
1,Britney Spears,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,spotify:artist:26dSoYclwsYLMAKD3tpOr4,Toxic,spotify:album:0z7pVBGOD7HCIB7S8eLkLI,198800,In The Zone
2,Beyoncé,spotify:track:0WqIKmW4BTrj3eJFmnCKMv,spotify:artist:6vWDO969PvNqNYHIOW5v0m,Crazy In Love,spotify:album:25hVFAxTlDvXbx2X2QkUkE,235933,Dangerously In Love (Alben für die Ewigkeit)
3,Justin Timberlake,spotify:track:1AWQoqb9bSvzTjaLralEkT,spotify:artist:31TPClRtHm23RisEBtV3X7,Rock Your Body,spotify:album:6QPkyl04rXwTGlGlcYaRoW,267266,Justified
4,Shaggy,spotify:track:1lzr43nnXAijIGYnCT8M8H,spotify:artist:5EvFsr3kj42KNv97ZEnqij,It Wasn't Me,spotify:album:6NmFmPX56pcLBOFMhIiKvF,227600,Hot Shot
...,...,...,...,...,...,...,...
1651012,Pax217,spotify:track:10UkkV6hmRLToXqnNbhdEF,spotify:artist:52BrJWa29mZPy5hgDIs6d0,Gratitude - Pax217 Album Version,spotify:album:5ULVqJFwZ8AQ1Dldq9ZDby,158386,Two Seventeen
1651013,Pax217,spotify:track:2aOoiTTV0OR8DYxCk8o0JR,spotify:artist:52BrJWa29mZPy5hgDIs6d0,Forevermind - Pax217 Album Version,spotify:album:5ULVqJFwZ8AQ1Dldq9ZDby,206400,Two Seventeen
1651014,Pax217,spotify:track:1Uo65qTxnCg1N1X00lgcjr,spotify:artist:52BrJWa29mZPy5hgDIs6d0,Shalom - Pax217 Album Version,spotify:album:5ULVqJFwZ8AQ1Dldq9ZDby,268266,Two Seventeen
1651015,Pax217,spotify:track:5uEE5tii66I0cC7kZ7IMxE,spotify:artist:52BrJWa29mZPy5hgDIs6d0,Free To Be - Pax217 Album Version,spotify:album:5ULVqJFwZ8AQ1Dldq9ZDby,334280,Two Seventeen


# data.world/typhon/billboard-hot-100-songs-2000-2018-w-spotify-data-lyrics

In [161]:
# Load the Billboard/Spotify/lyrics CSV, discard the columns that are not
# needed, drop rows with any missing value, and keep one row per
# (artist, title) pair.
data = (
    pd.read_csv('billboard_2000_2018_spotify_lyrics.csv')
    .drop(columns=[
        'simple_title', 'main_artist', 'spotify_link', 'spotify_id',
        'video_link', 'analysis_url', 'time_signature', 'speechiness',
        'acousticness', 'instrumentalness', 'key', 'mode', 'date', 'year',
        'duration_ms',
    ])
    .dropna(axis=0)
    .drop_duplicates(subset=['artist', 'title'])
)
# Remove rows whose lyrics field holds the "not found" placeholder text.
data = data.loc[data['lyrics'].ne('Error: Could not find lyrics.')].reset_index(drop=True)
data

Unnamed: 0,title,artist,peak_pos,last_pos,weeks,rank,change,genre,broad_genre,energy,liveness,tempo,danceability,loudness,valence,lyrics
0,...Ready For It?,taylor swift,4,59,17,61,-2,"[u'dance pop', u'pop', u'post-teen pop']",pop,0.776,0.155,160.053,0.616,-6.519,0.459,"Knew he was a killer, first time that I saw hi..."
1,1-800-273-8255,logic,3,34,35,34,0,[u'rap'],rap,0.572,0.192,100.015,0.629,-7.733,0.386,I've been on the low\nI been taking my time\nI...
2,A Holly Jolly Christmas,burl ives,38,38,5,46,-8,[u'christmas'],unknown,0.363,0.15,140.296,0.665,-12.983,0.824,Ding Dong Ding\nDing dong ding\nHave a holly j...
3,All I Want For Christmas Is You,mariah carey,9,9,24,13,-4,"[u'dance pop', u'hip pop', u'pop', u'pop rap',...",r&b,0.625,0.0708,150.277,0.335,-7.462,0.346,I don't want a lot for Christmas\nThere is jus...
4,Attention,charlie puth,5,44,36,39,5,"[u'dance pop', u'pop', u'post-teen pop', u'vir...",pop,0.626,0.0848,100.041,0.774,-4.432,0.777,"You've been running 'round, running 'round, ru..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6758,This Gift,98 degrees,49,49,3,79,-30,[u'dance pop'],pop,0.529,0.0824,86.984,0.623,-8.649,0.423,The snow is fallin' The city is white\nYou're ...
6759,Deck The Halls,shedaisy,61,69,3,61,8,"[u'contemporary country', u'country', u'countr...",country,0.837,0.144,118.827,0.575,-7.141,0.444,Deck the halls\n(Deck the halls)\nWith boughs ...
6760,I Love You,martina mcbride,24,66,20,71,-5,"[u'contemporary country', u'country', u'countr...",country,0.586,0.132,106.754,0.687,-5.991,0.732,She dropped the phone and burst into tears\nTh...
6761,Left & Right,d'angelo,70,70,5,91,-21,"[u'funk', u'hip hop', u'indie r&b', u'neo soul...",r&b,0.405,0.383,92.115,0.838,-9.684,0.843,"Yo, yo\nMy flows remarkable\n\nDoc walk like K..."


# Million Song Subset

In [137]:
# Paths of every .h5 file found anywhere below the MillionSongSubset directory.
shpfiles = [
    os.path.join(root, filename)
    for root, _subdirs, filenames in os.walk('MillionSongSubset')
    for filename in filenames
    if filename.endswith(".h5")
]

# Disabled alternative that loads a whole file through deepdish:
# pd.DataFrame(deepdish.io.load(shpfiles[0])).T

# Group names used to index into the file; only the first two are read below.
keys = ['analysis', 'metadata', 'musicbrainz']
# Open the first file read-only. The handle is intentionally left open because
# the group objects `t` and `tt` need it; call f.close() when done with them.
f = h5py.File(shpfiles[0], mode='r')
t, tt = f[keys[0]], f[keys[1]]
# Display the members of the 'analysis' group.
list(t.values())

[<HDF5 dataset "bars_confidence": shape (83,), type "<f8">,
 <HDF5 dataset "bars_start": shape (83,), type "<f8">,
 <HDF5 dataset "beats_confidence": shape (344,), type "<f8">,
 <HDF5 dataset "beats_start": shape (344,), type "<f8">,
 <HDF5 dataset "sections_confidence": shape (10,), type "<f8">,
 <HDF5 dataset "sections_start": shape (10,), type "<f8">,
 <HDF5 dataset "segments_confidence": shape (971,), type "<f8">,
 <HDF5 dataset "segments_loudness_max": shape (971,), type "<f8">,
 <HDF5 dataset "segments_loudness_max_time": shape (971,), type "<f8">,
 <HDF5 dataset "segments_loudness_start": shape (971,), type "<f8">,
 <HDF5 dataset "segments_pitches": shape (971, 12), type "<f8">,
 <HDF5 dataset "segments_start": shape (971,), type "<f8">,
 <HDF5 dataset "segments_timbre": shape (971, 12), type "<f8">,
 <HDF5 dataset "songs": shape (1,), type "|V220">,
 <HDF5 dataset "tatums_confidence": shape (688,), type "<f8">,
 <HDF5 dataset "tatums_start": shape (688,), type "<f8">]

In [81]:
# Display top10s.csv, decoded as ISO-8859-1, using its first column as the index.
pd.read_csv('top10s.csv', index_col=0, encoding='ISO-8859-1')

Unnamed: 0,title,artist,top genre,year,bpm,nrgy,dnce,dB,live,val,dur,acous,spch,pop
1,"Hey, Soul Sister",Train,neo mellow,2010,97,89,67,-4,8,80,217,19,4,83
2,Love The Way You Lie,Eminem,detroit hip hop,2010,87,93,75,-5,52,64,263,24,23,82
3,TiK ToK,Kesha,dance pop,2010,120,84,76,-3,29,71,200,10,14,80
4,Bad Romance,Lady Gaga,dance pop,2010,119,92,70,-4,8,71,295,0,4,79
5,Just the Way You Are,Bruno Mars,pop,2010,109,84,64,-5,9,43,221,2,4,78
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
599,Find U Again (feat. Camila Cabello),Mark Ronson,dance pop,2019,104,66,61,-7,20,16,176,1,3,75
600,Cross Me (feat. Chance the Rapper & PnB Rock),Ed Sheeran,pop,2019,95,79,75,-6,7,61,206,21,12,75
601,"No Brainer (feat. Justin Bieber, Chance the Ra...",DJ Khaled,dance pop,2019,136,76,53,-5,9,65,260,7,34,70
602,Nothing Breaks Like a Heart (feat. Miley Cyrus),Mark Ronson,dance pop,2019,114,79,60,-6,42,24,217,1,7,69


In [16]:
# Credentials must not be stored in the notebook: the access token is read from
# the environment instead. The values that were previously hard-coded in this
# cell should be treated as leaked and revoked.
token = os.environ.get('SPOTIFY_ACCESS_TOKEN')
if not token:
    raise RuntimeError(
        'Set the SPOTIFY_ACCESS_TOKEN environment variable to a Spotify OAuth access token.'
    )
endpoint = 'https://api.spotify.com/v1/me/top/tracks'
# The token is sent as "Authorization: Bearer <token>"; the earlier code used
# one secret as the header name and the other as its value.
bearer = f'Bearer {token}'
headers = {'Authorization': bearer}
# requests has no default timeout, so set one to avoid hanging the kernel.
response = requests.get(endpoint, headers=headers, timeout=30)

In [6]:
# Request the 2012 year-end 'hot-100-songs' chart and display its repr.
chart = billboard.ChartData(name='hot-100-songs', year=2012)
chart

billboard.ChartData('hot-100-songs', year=2012)

In [7]:
# Request the 2012 year-end 'pop-songs' chart.
chart2 = billboard.ChartData(name='pop-songs', year=2012)

In [11]:
# Display every entry of the pop-songs chart as a plain list.
[entry for entry in chart2]

[billboard.YearEndChartEntry(title='Lights', artist='Ellie Goulding'),
 billboard.YearEndChartEntry(title='We Found Love', artist='Rihanna Featuring Calvin Harris'),
 billboard.YearEndChartEntry(title='Glad You Came', artist='The Wanted'),
 billboard.YearEndChartEntry(title='Call Me Maybe', artist='Carly Rae Jepsen'),
 billboard.YearEndChartEntry(title='Payphone', artist='Maroon 5 Featuring Wiz Khalifa'),
 billboard.YearEndChartEntry(title='Good Feeling', artist='Flo Rida'),
 billboard.YearEndChartEntry(title='We Are Young', artist='fun. Featuring Janelle Monae'),
 billboard.YearEndChartEntry(title='Somebody That I Used To Know', artist='Gotye Featuring Kimbra'),
 billboard.YearEndChartEntry(title='Wide Awake', artist='Katy Perry'),
 billboard.YearEndChartEntry(title='Set Fire To The Rain', artist='Adele'),
 billboard.YearEndChartEntry(title="Stronger (What Doesn't Kill You)", artist='Kelly Clarkson'),
 billboard.YearEndChartEntry(title='It Will Rain', artist='Bruno Mars'),
 billboard.