# Imports

In [15]:
# import modules
import pandas as pd
import numpy as np
import spotipy
from ast import literal_eval

# jupyter notebook full-width display
# NOTE: `display`/`HTML` are imported from the public `IPython.display` module.
# Importing `display` from `IPython.core.display` is deprecated and is no
# longer possible on recent IPython releases.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# pandas formatting
pd.set_option('display.float_format', '{:_.2f}'.format)  # 2 decimals, '_' as thousands separator
pd.set_option('display.max_columns', None)               # never truncate columns
pd.set_option('display.max_rows', 200)

In [None]:
%%time
##### SQL 8+M
# Loads the SQL export of track / artist / audio-feature rows, attaches one
# genre and a release date per track id, and pickles the results.
# The bare string below records the query behind the CSV (documentation only).
"""
    SELECT * FROM tracks
    JOIN r_track_artist ON tracks.id = r_track_artist.track_id
    JOIN artists ON r_track_artist.artist_id = artists.id
    JOIN audio_features ON audio_features.id = tracks.audio_feature_id
"""
df_SQL = pd.read_csv('all_audio_features_sql.csv')

# import genre and release date data
"""
    SELECT tracks.id AS id, release_date, genre_id as genre FROM tracks
    JOIN r_albums_tracks ON tracks.id = r_albums_tracks.track_id
    JOIN albums ON r_albums_tracks.album_id = albums.id
    JOIN r_track_artist ON tracks.id = r_track_artist.track_id
    JOIN r_artist_genre ON r_track_artist.artist_id = r_artist_genre.artist_id
"""
df_genre = pd.read_csv('SQL_track_release_date_and_genre.csv')
df_genre['genre_count'] = df_genre.groupby('genre')['genre'].transform('count')  # add a count column

# Keep one row per track id: after sorting by genre_count (descending), the
# first row for each id carries that track's most common genre.
df_TEMP = df_genre.copy()
df_TEMP = df_TEMP.sort_values('genre_count', ascending=False).drop_duplicates(['id']).reset_index(drop=True)
# Default (inner) merge: tracks whose id is absent from df_TEMP are dropped.
df_SQL = df_SQL.merge(df_TEMP, on='id')

# now format and save df_genre: one row per genre with its count.
# FIX: selecting several columns requires a list; `df_genre['genre', 'genre_count']`
# is looked up as a single tuple-named column and raises KeyError.
df_genre = (
    df_genre[['genre', 'genre_count']]
    .sort_values('genre_count', ascending=False)
    .drop_duplicates(['genre'])
    .reset_index(drop=True)
)

# formatting
# release_date is parsed as milliseconds since the Unix epoch; unparseable
# values become NaT because of errors='coerce'.
df_SQL['release_date'] = pd.to_datetime(df_SQL['release_date'], unit='ms', origin='unix', errors='coerce')
df_SQL = df_SQL.rename({
    'name:1': 'artist',
    'name': 'song',
    'duration:1': 'duration_ms'
}, axis=1)[[
    'id', 'song', 'artist', 'genre', 'release_date',
    'acousticness', 'danceability', 'duration_ms',
    'energy', 'instrumentalness', 'key', 'liveness', 'loudness',
    'mode', 'speechiness', 'tempo', 'time_signature', 'valence'
]].reset_index(drop=True)

# NOTE: DO NOT DROP DUPLICATES YET, NEED FOR LOOKUP WITH B100

# save files as pickle
df_genre.to_pickle('df_genre.pickle')
df_SQL.to_pickle('df_SQL.pickle')

In [16]:
%%time
##### Spotify 1.2M+ Songs
# Reads the track-features CSV and reshapes it into the shared column layout:
# one row per (track, artist) pair, with an all-NA genre column.
url_1M_songs = r'D:\RYERSON\820\Datasets\Spotify 1.2M+ Songs\tracks_features.csv'

df_1M_songs = (
    pd.read_csv(url_1M_songs)
    # this dataset has no genre (fetching it through the API would take too long), so it stays NA
    .assign(genre=pd.NA)
    # 'artists' is stored as the text of a Python list; parse it into a real list ...
    .assign(artists=lambda frame: frame['artists'].apply(literal_eval))
    # ... then emit one row per artist
    .explode('artists', ignore_index=True)
    # unparseable dates become NaT
    .assign(release_date=lambda frame: pd.to_datetime(frame['release_date'], errors='coerce'))
    .rename(columns={'name': 'song', 'artists': 'artist'})
    .loc[:, [
        'id', 'song', 'artist', 'genre', 'release_date',
        'acousticness', 'danceability', 'duration_ms',
        'energy', 'instrumentalness', 'key', 'liveness', 'loudness',
        'mode', 'speechiness', 'tempo', 'time_signature', 'valence',
    ]]
    .reset_index(drop=True)
)

# NOTE: DO NOT DROP DUPLICATES YET, NEED FOR LOOKUP WITH B100

# save files as pickle (currently disabled)
# df_1M_songs.to_pickle('df_1M_songs.pickle')

Wall time: 15.8 s


In [17]:
# Preview 10 randomly chosen rows; no random_state is passed, so the rows differ on every run.
df_1M_songs.sample(10)

Unnamed: 0,id,song,artist,genre,release_date,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
1740313,73s2pe7BuwkjXqu1z7eclY,A Trumpter's Lullaby,Boston Pops Orchestra,,1964-01-01,0.98,0.54,166013,0.32,0.92,5,0.18,-12.73,1,0.05,86.77,3.0,0.33
830569,7aNG58Ok0iGhzFjVLwBrC6,Light - Extended,Sanjoy,,2016-05-20,0.03,0.77,253125,0.82,0.21,10,0.07,-4.55,0,0.16,128.01,4.0,0.5
1369381,0TAbL8RndgG9zjDTOhaqjY,Kool-Aid's My Cologne (Remastered),Compass Clique,,2019-02-01,0.11,0.32,191009,0.55,0.0,11,0.35,-13.15,0,0.34,66.08,4.0,0.47
1397331,4dmY8mWmGKvkOUYb73vlaA,Mademoiselle,Qveen Herby,,2019-09-04,0.41,0.9,84762,0.57,0.0,11,0.21,-6.25,1,0.19,128.01,4.0,0.52
1503667,7uTMSURSsZKg53DFr83R6c,Не надо быть мной,Marselle,,2008-01-01,0.44,0.72,271293,0.71,0.0,2,0.32,-5.7,1,0.31,143.58,4.0,0.58
34575,7nKq1r5B1tFXtDVPWcqJqr,Dart At The Map,The Futureheads,,2010-04-27,0.0,0.45,244753,0.83,0.01,7,0.05,-4.99,1,0.05,163.91,3.0,0.6
87543,3K1nFviVW5PMG9qqguFK6j,The Old Rugged Cross,Attalus,,2013-04-19,0.89,0.44,230840,0.24,0.0,9,0.1,-12.14,1,0.03,102.79,3.0,0.24
299833,2zypHT4C4ZWZ58TZZChvsU,The North Green Down II,Dakota Suite,,2011-02-25,0.99,0.59,234000,0.09,0.96,1,0.38,-22.13,0,0.03,101.74,4.0,0.04
1148005,1MRAJUsUOggr2gOMarB9Jm,I'll Never Let You Go,The Jokers,,2009-01-01,0.44,0.51,140227,0.45,0.0,7,0.1,-9.63,1,0.03,121.63,4.0,0.59
1396749,6m2Tzv5ttaO1KdaBbKIzbr,Never Let Her Go,Daniel Davies,,2015-09-18,0.89,0.5,205573,0.46,0.9,10,0.19,-9.09,1,0.03,130.71,4.0,0.11


In [18]:
# (rows, columns) of df_1M_songs
df_1M_songs.shape

(1798207, 18)

In [None]:
%%time
##### Billboard Top 100 Historical Data
# Loads the chart history, derives the unique (song, artist) pairs that
# appear in it, and pickles both frames.
url_B100 = r'D:\RYERSON\820\Datasets\Billboard The Hot 100 Songs\charts.csv'
df_B100 = pd.read_csv(url_B100)
df_B100['date'] = pd.to_datetime(df_B100['date'])

# Unique Songs from The Billboard 100 Dataset
df_B100_songs = (
    df_B100[['song', 'artist']]
    .drop_duplicates()
    .sort_values(['artist', 'song'])
    .reset_index(drop=True)
)
df_B100_songs['id'] = ''  # add a blank id column

# save files as pickle
# FIX: each frame is written to its own file; previously df_genre was written
# to both paths, so the Billboard data was never saved.
df_B100.to_pickle('df_B100.pickle')
df_B100_songs.to_pickle('df_B100_songs.pickle')