# Imports

In [274]:
# import modules
import pandas as pd
import numpy as np
import re
from ast import literal_eval
import spotipy

# jupyter notebook full-width display
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# pandas formatting
pd.set_option('display.float_format', '{:_.2f}'.format)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 200)

In [4]:
# helper functions

def find_all_tracks(track_title, artist_name):
    """ returns list of lists ['id', 'song', 'artist'] """
    track_info = spotify.search(q='artist:' + artist_name + ' track:' + track_title, type='track')
    
    if track_info['tracks']['items'] == []:  # if track doesn't exist on Spotify
        return 'MISSING'
    else:
        all_tracks = []
        number_of_results = len(track_info['tracks']['items'])
        
        # check if there is a better match
        for i in range(number_of_results):
            track_id = track_info['tracks']['items'][i]['id']
            artist_name = track_info['tracks']['items'][i]['artists'][0]['name']
            song_name = track_info['tracks']['items'][i]['name']
            all_tracks.append([track_id, song_name, artist_name])
        
        # if we made it through the loop without returning, note 'MISSING' and return the 0th id
        return all_tracks


def remove_punctuation(text_input):
    text_input = str(text_input)  # avoid float errors in applymap()
    text_input = re.sub(r'&', 'and', text_input)  # replaces & with 'and'
    text_input = re.compile(r'[^a-zA-Z 0-9]').sub('', text_input)
    return text_input.lower().strip()


def clean_text(text_input):
    text_input = str(text_input)  # avoid float errors in applymap()
    text_input = text_input.strip().lower()
    text_input = re.sub(r'&', 'and', text_input)  # replaces & with 'and'
    text_input = re.sub(r'and.+', '', text_input)  # removes text after the 'and'
    text_input = re.compile(r'the').sub('', text_input)  # remove all 'the' (maybe just need the 1st word?)
    text_input = re.sub(r',.+', '', text_input)  # removes all misc artists, after comma 
    text_input = re.sub(r'(?:feat).+', '', text_input)  # removes all misc artists, after 'feat' 
    text_input = re.sub(r'\(.+', '', text_input)  # removes text after first bracket
    text_input = re.sub(r'\-.+', '', text_input)  # removes text after first dash
    text_input = re.compile(r'[^a-zA-Z 0-9]').sub('', text_input)  # remove punctuation
    text_input = re.sub(' +', ' ', text_input)  # remove multiple spaces
    return text_input.strip()

# STEP 1: Import and Setup Datatypes

In [27]:
formatting = [
    'id', 'song', 'artist', 'genre', 'release_date',
    'acousticness', 'danceability', 'duration_ms', 
    'energy', 'instrumentalness', 'key', 'liveness', 'loudness', 
    'mode', 'speechiness', 'tempo','time_signature', 'valence'
]

dtypes = {
    'key': 'Int16', 'mode': 'Int16', 'time_signature': 'Int16', 'tempo': 'float32', 
    'acousticness': 'float32', 'danceability': 'float32', 'duration_ms': 'Int64',  
    'energy': 'float32', 'instrumentalness': 'float32', 'liveness': 'float32', 
    'loudness': 'float32', 'speechiness': 'float32', 'valence': 'float32'
} 

In [28]:
%%time
##### Billboard Top 100 Historical Data
url_B100 = r'D:\RYERSON\820\Datasets\Billboard The Hot 100 Songs\charts.csv'
dtypes_timeseries = {
    'rank': 'Int16', 'last-week': 'Int16', 'peak-rank': 'Int16', 'weeks-on-board': 'Int16'
}
df_B100 = pd.read_csv(url_B100, dtype=dtypes_timeseries)
df_B100['date'] = pd.to_datetime(df_B100['date'])

# Unique Songs from The Billboard 100 Dataset
df_B100_songs = df_B100[['song', 'artist']].drop_duplicates().sort_values(['artist', 'song']).reset_index(drop=True)

# Add Columns
df_TEMP = pd.DataFrame()
df_TEMP['release_date'] = pd.to_datetime(np.nan)
df_TEMP['genre'] = pd.NA
df_TEMP[formatting[5:]] = np.nan
df_TEMP = df_TEMP.astype(dtypes)
df_B100_songs = pd.concat([df_B100_songs, df_TEMP], axis=1)
df_B100_songs['id'] = pd.NA
df_B100_songs = df_B100_songs[formatting]

# save files as pickle
df_B100.to_pickle('init_df_B100.pickle')
df_B100_songs.to_pickle('init_df_B100_songs.pickle')

Wall time: 812 ms


In [86]:
%%time
##### SQL 8+M
"""
    SELECT * FROM tracks
    JOIN r_track_artist ON tracks.id = r_track_artist.track_id
    JOIN artists ON r_track_artist.artist_id = artists.id
    JOIN audio_features ON audio_features.id = tracks.audio_feature_id
"""
df_SQL = pd.read_csv('all_audio_features_sql.csv', dtype=dtypes)

# import genre and release date data
"""
    SELECT tracks.id AS id, release_date, genre_id as genre FROM tracks
    JOIN r_albums_tracks ON tracks.id = r_albums_tracks.track_id
    JOIN albums ON r_albums_tracks.album_id = albums.id
    JOIN r_track_artist ON tracks.id = r_track_artist.track_id
    JOIN r_artist_genre ON r_track_artist.artist_id = r_artist_genre.artist_id
"""
df_genre = pd.read_csv('SQL_track_release_date_and_genre.csv')
df_genre['genre_count'] = df_genre.groupby('genre')['genre'].transform('count')  # add a count column
df_TEMP = df_genre.copy()  # create temp df, sort by most common genre, merge with SQL data
df_TEMP = df_TEMP.sort_values('genre_count', ascending=False).drop_duplicates(['id']).reset_index(drop=True)
df_SQL = df_SQL.merge(df_TEMP, on='id', how='left')

# format and save df_genre
df_genre = (
    df_genre[['genre', 'genre_count']]
    .sort_values('genre_count', ascending=False)
    .drop_duplicates(['genre'])
    .reset_index(drop=True)
)

# formatting
df_SQL['release_date'] = pd.to_datetime(df_SQL['release_date'], unit='ms', origin='unix', errors = 'coerce')
df_SQL = df_SQL.rename(
    {'name:1': 'artist', 'name': 'song','duration:1': 'duration_ms'}, 
    axis=1
)[formatting].reset_index(drop=True)

# save files as pickle
df_genre.to_pickle('df_genre.pickle')
df_SQL.to_pickle('df_SQL.pickle')

Wall time: 3min 35s


In [87]:
len(set(df_SQL.id))

8741110

In [79]:
%%time
##### Spotify 1.2M+ Songs
url_1M_songs = r'D:\RYERSON\820\Datasets\Spotify 1.2M+ Songs\tracks_features.csv'
dtypes = {
    'key': 'Int16', 'mode': 'Int16', 'time_signature': 'Int16', 'tempo': 'float32', 
    'acousticness': 'float32', 'danceability': 'float32', 'duration_ms': 'Int64',  
    'energy': 'float32', 'instrumentalness': 'float32', 'liveness': 'float32', 
    'loudness': 'float32', 'speechiness': 'float32', 'valence': 'float32'
} 
df_1M_songs = pd.read_csv(url_1M_songs, dtype=dtypes)
print(df_1M_songs.shape[0])

# doesn't have genre (would take 1000+ hrs to get using API, will stay NA)
df_1M_songs['genre'] = pd.NA

# explode artists
df_1M_songs['artists'] = df_1M_songs['artists'].apply(literal_eval) #convert to list type
# print(df_1M_songs.explode('artists', ignore_index=True).shape[0]) # correct
df_1M_songs = df_1M_songs.explode('artists').reset_index(drop=True)
print(df_1M_songs.shape[0])

# formatting
df_1M_songs['release_date'] = pd.to_datetime(df_1M_songs['release_date'], errors = 'coerce')
df_1M_songs = df_1M_songs.rename({
    'name': 'song',
    'artists': 'artist'
}, axis=1)[[
    'id', 'song', 'artist', 'genre', 'release_date',
    'acousticness', 'danceability', 'duration_ms', 
    'energy', 'instrumentalness', 'key', 'liveness', 'loudness', 
    'mode', 'speechiness', 'tempo','time_signature', 'valence'
]].reset_index(drop=True)

# NOTE: DO NOT DROP DUPLICATES YET, NEED FOR LOOKUP WITH B100

# save files as pickle
df_1M_songs.to_pickle('df_1M_songs.pickle')

1204025
1798207
Wall time: 19.7 s


In [91]:
%%time
# merge SQL and CSV AF to get df_10M
ids_SQL = set(df_SQL['id'].to_list())
df_TEMP = df_1M_songs.copy()  
df_TEMP = df_TEMP[~df_TEMP.id.isin(ids_SQL)]  # drop songs already in SQL before merge
df_10M = pd.concat([df_SQL, df_TEMP]).reset_index(drop=True)

# save as pickle
df_10M.to_pickle('init_df_10M.pickle')

Wall time: 25.8 s


In [92]:
len(set(df_10M.id))

9592419

# STEP 2: Spotify API - GET id, then audio features

In [93]:
# reload data if required
df_B100_songs = pd.read_pickle('init_df_B100_songs.pickle')

# number missing ids
df_B100_songs[df_B100_songs.id.isnull()].shape[0]

29681

##### get a temporary authorization token from: https://developer.spotify.com/console/get-search-item

In [109]:
# input the temporary token
TEMP_TOKEN = input('Enter token: ')

# create a spotify object
spotify = spotipy.Spotify(auth=TEMP_TOKEN)

Enter token: BQC7g6bIGQdnc-SGz2llxofZdjOcE6-TD9T4ft9AYVjBUEGyxDetGEojqsp98avFRwnLKCtOa07iFlvvqm_ZZCHtfOnqVj6JgjBNvVyE9nWH8oyhqw6O4QPSbBDnTTZ1wfSSuHM3-W5hrRGn92Jzb2JfoKmUMJ4F_QDX7qGZoo2H


In [100]:
%%time
# loop to GET id

counter = 0
start_over_at = 17600
if start_over_at == 0:
    id_from_API = set()

for i, row in df_B100_songs.iterrows():
        
    if counter % 100 == 0:
        print(counter, end=' ')
    if counter % 1000 == 0:
        print()
    
    counter += 1
    
    if i < start_over_at:  # where we timed out last time
        continue
    
    # save temp file
    if counter % 1000 == 0:
        df_B100_songs.to_pickle('df_B100_songs_ID_TEMP.pickle')
    
    if not df_B100_songs.iloc[[i]].isnull()['id'].values[0]:
        continue    
    
    # these are the actual song and artist from the Billboard Hot 100
    song = df_B100_songs.loc[df_B100_songs.index[i], 'song']
    artist = df_B100_songs.loc[df_B100_songs.index[i], 'artist']
    
    # get all track info from Spotify API matching 'song' and 'artist'
    all_track_info = find_all_tracks(song, artist)
    
    # restart loop if there are no results
    if all_track_info == 'MISSING':
        continue
    
    # first subloop - check for direct matched
    is_exact_match = False
    id_found = False
    
    for track_info in all_track_info:
        temp_id = track_info[0]
        temp_song = track_info[1]
        temp_artist = track_info[2]
        
        # if there is an exact text match
        is_exact_match = remove_punctuation(temp_song) == remove_punctuation(song) and \
                         remove_punctuation(temp_artist) == remove_punctuation(artist)
        
        # continue subloop
        if is_exact_match:
            df_B100_songs.loc[df_B100_songs.index[i], 'id'] = temp_id
            id_from_API.add(temp_id)
            id_found = True
            break
    
    # if we found the id, go to the next row item, else check for approx matches
    if not id_found:
      
        is_probable_match = False    
            
        # second subloop - check for indirect matches (shouldn't run if direct match occurs)
        for track_info in all_track_info:
            temp_id = track_info[0]
            temp_song = track_info[1]
            temp_artist = track_info[2]

            # if there is a probable text match
            is_probable_match = clean_text(temp_song) == clean_text(song) and \
                                clean_text(temp_artist) == clean_text(artist)

            if is_probable_match:
                df_B100_songs.loc[df_B100_songs.index[i], 'id'] = temp_id
                id_from_API.add(temp_id)
                break
            

df_B100_songs.to_pickle('df_B100_songs_ID_COMPLETE.pickle')

0 
100 200 300 400 500 600 700 800 900 1000 
1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 
2100 2200 2300 2400 2500 2600 2700 2800 2900 3000 
3100 3200 3300 3400 3500 3600 3700 3800 3900 4000 
4100 4200 4300 4400 4500 4600 4700 4800 4900 5000 
5100 5200 5300 5400 5500 5600 5700 5800 5900 6000 
6100 6200 6300 6400 6500 6600 6700 6800 6900 7000 
7100 7200 7300 7400 7500 7600 7700 7800 7900 8000 
8100 8200 8300 8400 8500 8600 8700 8800 8900 9000 
9100 9200 9300 9400 9500 9600 9700 9800 9900 10000 
10100 10200 10300 10400 10500 10600 10700 10800 10900 11000 
11100 11200 11300 11400 11500 11600 11700 11800 11900 12000 
12100 12200 12300 12400 12500 12600 12700 12800 12900 13000 
13100 13200 13300 13400 13500 13600 13700 13800 13900 14000 
14100 14200 14300 14400 14500 14600 14700 14800 14900 15000 
15100 15200 15300 15400 15500 15600 15700 15800 15900 16000 
16100 16200 16300 16400 16500 16600 16700 16800 16900 17000 
17100 17200 17300 17400 17500 17600 17700 17800 17900 18000 
18100 1

In [101]:
# new ids
RUN2_id_from_API = id_from_API.copy()
len(id_from_API)

21277

In [103]:
df_id_from_API = pd.DataFrame(list(id_from_API))
df_id_from_API.to_pickle('df_id_from_API.pickle')

In [105]:
# 3 duplicates, remove later
# number of songs in new ids (should match len(id_from_API))
df_B100_songs[df_B100_songs.id.isin(id_from_API)].id.count()

21280

In [107]:
# was 4140 with sloppier (faster) method
# still missing ids
df_B100_songs[df_B100_songs.id.isnull()].shape[0]

8401

##### get a temporary authorization token from: https://developer.spotify.com/console/get-search-item

In [188]:
# input the temporary token
TEMP_TOKEN = input('Enter token: ')

# create a spotify object
spotify = spotipy.Spotify(auth=TEMP_TOKEN)

Enter token: BQDDf0krvUaAgJcHwIpBspgI4OQvVewwi1L8MW90b72DNhEHDQqVXCH0WTh20m8pgBewyDOXnkrB-fBc4okZUFoLXOCMNPxrUtWFd6dQ2iXtArUgGwL-IH1O17KpWUMZbm-xjwb5PC7Beem8EFXMAvHAkFd-fRwOl3dt15fVWGke


In [189]:
%%time
# loop to get genre, release_date, and audio features

# ordered list of genres for choosing the best genre
list_of_ordered_genres = list(df_genre.genre)

start_over_at = 25894
counter = 0
if start_over_at == 0:
    how_many_passes = 0  # for QA, how many genre should be missing
    how_many_fails = 0

for i, row in df_B100_songs.iterrows():
    
    if counter % 100 == 0:
        print(counter, end=' ')
    if counter % 1000 == 0:
        print()
    
    counter += 1
    
    if i < start_over_at:  # where we timed out last time
        continue
    
    # save temp file
    if counter % 1000 == 0:
        df_B100_songs.to_pickle('df_B100_songs_AF_TEMP.pickle')
    
    # get the current track id
    track_id = df_B100_songs['id'].iloc[i]
    
    # if we don't have a new ID, skip the entry (it's not on Spotify)
    if track_id not in id_from_API:
        continue
    
    # Get Audio Features - 1st GET request
    list_of_features = [
        'acousticness', 'danceability', 'duration_ms', 'energy',
        'instrumentalness', 'key', 'liveness', 'loudness', 'mode',
        'speechiness', 'tempo', 'time_signature', 'valence'
    ]
    
    try:
        temp_audio_features = spotify.audio_features(track_id)
        for key in list_of_features:
            df_B100_songs.loc[i, key] = temp_audio_features[0][key]
    except:  # madonna has none AF, maybe a remaster????
        how_many_fails += 1
        if how_many_fails < 100:
            print('fail')
        pass

    # Get Release Date - 2nd GET request
    track_info = spotify.track(track_id)
    df_B100_songs.loc[i, 'release_date'] = track_info['album']['release_date']
    
    try:
        # Get Release Date Genre - 3rd GET request
        artist_id = track_info['artists'][0]['id']
        artist_info = spotify.artist(artist_id)
        list_of_artist_genres = artist_info['genres']
        
        most_common_genre = list_of_artist_genres[0] # default to first genre
        if len(list_of_artist_genres) == 1:
            pass
        else:
            for genre in list_of_ordered_genres:
                if genre in list_of_artist_genres:
                    most_common_genre = genre
                    break
        df_B100_songs.loc[i, 'genre'] = most_common_genre
    except:
        how_many_passes += 1
        pass  # didn't have any genres (or other error), move on

    
df_B100_songs.to_pickle('df_B100_songs_AF_COMPLETE.pickle')

0 
100 200 300 400 500 600 700 800 900 1000 
1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 
2100 2200 2300 2400 2500 2600 2700 2800 2900 3000 
3100 3200 3300 3400 3500 3600 3700 3800 3900 4000 
4100 4200 4300 4400 4500 4600 4700 4800 4900 5000 
5100 5200 5300 5400 5500 5600 5700 5800 5900 6000 
6100 6200 6300 6400 6500 6600 6700 6800 6900 7000 
7100 7200 7300 7400 7500 7600 7700 7800 7900 8000 
8100 8200 8300 8400 8500 8600 8700 8800 8900 9000 
9100 9200 9300 9400 9500 9600 9700 9800 9900 10000 
10100 10200 10300 10400 10500 10600 10700 10800 10900 11000 
11100 11200 11300 11400 11500 11600 11700 11800 11900 12000 
12100 12200 12300 12400 12500 12600 12700 12800 12900 13000 
13100 13200 13300 13400 13500 13600 13700 13800 13900 14000 
14100 14200 14300 14400 14500 14600 14700 14800 14900 15000 
15100 15200 15300 15400 15500 15600 15700 15800 15900 16000 
16100 16200 16300 16400 16500 16600 16700 16800 16900 17000 
17100 17200 17300 17400 17500 17600 17700 17800 17900 18000 
18100 1

In [190]:
# started 5:20
df_B100_songs.to_pickle('df_B100_songs_AF_TEMP.pickle')

# how far did we get - Run 4
passes_counter = (how_many_passes, counter)
passes_counter

(1524, 29681)

In [192]:
(
    df_B100_songs.shape[0], 
    df_B100_songs.shape[0] - df_B100_songs.key.isnull().sum(), 
    df_B100_songs.shape[0] - df_B100_songs.genre.isnull().sum()
)
# all, with AF, with genre

(29681, 21277, 19756)

In [193]:
df_B100_songs.to_pickle('df_B100_songs_POST_API.pickle')

# STEP 3: Merge, Clean, and Save Datasets

In [221]:
%%time
# reimport data
df_10M = pd.read_pickle('init_df_10M.pickle')
df_B100 = pd.read_pickle('init_df_B100.pickle')
df_B100_songs = pd.read_pickle('df_B100_songs_POST_API.pickle')

Wall time: 6.63 s


In [222]:
dtypes = {
    'key': 'Int16', 'mode': 'Int16', 'time_signature': 'Int16', 'tempo': 'float32', 
    'acousticness': 'float32', 'danceability': 'float32', 'duration_ms': 'Int64',  
    'energy': 'float32', 'instrumentalness': 'float32', 'liveness': 'float32', 
    'loudness': 'float32', 'speechiness': 'float32', 'valence': 'float32'
} 
df_B100_songs = df_B100_songs.astype(dtypes)

In [223]:
%%time
# drop rows in df_10M matching B100 songs
B100_song_id = set(df_B100_songs[~df_B100_songs.id.isna()].id)
df_10M = (
    df_10M[~df_10M.id.isin(B100_song_id)]
    .sort_values('release_date')
    .drop_duplicates(subset=['id'])
    .sort_values('artist')
    .reset_index(drop=True)
)

Wall time: 45.9 s


In [224]:
df_10M.shape

(9577395, 18)

In [225]:
%%time
# also remove partial matches (based on cleaned text, alt versions, etc)

# add approx song and artist columns for approx match
df_10M['approx_song'] = df_10M[['song']].applymap(clean_text)
df_10M['approx_artist'] = df_10M[['artist']].applymap(clean_text)

# add approx song and artist columns to B100 copy() for merging
df_TEMP = df_B100_songs.copy()
df_TEMP['approx_song'] = df_TEMP[['song']].applymap(clean_text)
df_TEMP['approx_artist'] = df_TEMP[['artist']].applymap(clean_text)
df_TEMP['REMOVE'] = 1
df_TEMP = df_TEMP[['approx_song', 'approx_artist', 'REMOVE']]

# remove partial matches
df_10M = df_10M.merge(df_TEMP, on=['approx_song', 'approx_artist'], how='left')
df_10M = df_10M[df_10M.REMOVE != 1].reset_index(drop=True)

Wall time: 2min 36s


In [226]:
df_10M.shape

(9410481, 21)

In [227]:
# fix formatting
formatting = [
    'id', 'song', 'artist', 'genre', 'release_date',
    'acousticness', 'danceability', 'duration_ms', 
    'energy', 'instrumentalness', 'key', 'liveness', 'loudness', 
    'mode', 'speechiness', 'tempo','time_signature', 'valence'
]

df_B100_songs = df_B100_songs[formatting]

In [277]:
# drop duplicate ID and null values from df_B100_songs 
# NOTE: formerly this would be df_B100_songs_AF, redundant data
df_B100_songs = df_B100_songs[~df_B100_songs.key.isna()]
df_B100_songs = df_B100_songs.drop_duplicates(subset = ['id']).dropna(subset = ['id']).reset_index(drop=True)

In [229]:
# add in a flag from whether or not the song is in the B100
df_B100_songs['in_B100'] = True
df_10M['in_B100'] = False

# concat B100 songs at end of df_10M (all songs w data)
df_10M = pd.concat([df_10M, df_B100_songs]).sort_values('artist').reset_index(drop=True)

In [252]:
# FINAL DATASET: df_10M pickle file
df_10M = df_10M.drop(['approx_song', 'approx_artist', 'REMOVE'], axis=1)
df_10M.to_pickle('df_10M.pickle')

In [231]:
# FINAL DATASET: df_B100_songs pickle file
df_B100_songs = df_B100_songs[formatting]  # get rid of the in_B100 columns
df_B100_songs.to_pickle('df_B100_songs.pickle')

In [283]:
# merge B100 songs into B100
# EXACT match only
df_B100 = df_B100.merge(df_B100_songs, on=['song', 'artist'], how='left')

In [284]:
# confirm no songs in B100 songs, not in B100
len(set(df_B100_songs.id)), len(set(df_B100.id))
# looks good (only NA in B100)

(21274, 21275)

In [285]:
# FINAL DATASET: df_B100 pickle file
df_B100.to_pickle('df_B100.pickle')

### check datasets

In [253]:
# data types
pd.concat(
    [df_10M.dtypes, df_B100.dtypes, df_B100_songs.dtypes], 
    keys=['df_10M.dtypes', 'df_B100.dtypes', 'df_B100_songs.dtypes'],
    axis=1
)

Unnamed: 0,df_10M.dtypes,df_B100.dtypes,df_B100_songs.dtypes
id,object,object,object
song,object,object,object
artist,object,object,object
genre,object,object,object
release_date,datetime64[ns],datetime64[ns],datetime64[ns]
acousticness,float32,float32,float32
danceability,float32,float32,float32
duration_ms,Int64,Int64,Int64
energy,float32,float32,float32
instrumentalness,float32,float32,float32


In [254]:
df_10M.describe().loc['mean':'max'].T

Unnamed: 0,mean,std,min,25%,50%,75%,max
acousticness,0.42,0.37,0.00,0.03,0.34,0.82,1.00
danceability,0.53,0.19,0.00,0.40,0.54,0.68,1.00
duration_ms,238_311.97,156_917.57,1_000.00,169_587.00,216_933.00,275_400.00,6_072_187.00
energy,0.55,0.28,0.00,0.31,0.57,0.79,1.00
instrumentalness,0.26,0.37,0.00,0.00,0.00,0.66,1.00
key,5.24,3.54,0.00,2.00,5.00,8.00,11.00
liveness,0.21,0.18,0.00,0.10,0.13,0.26,1.00
loudness,-11.00,6.34,-60.00,-13.72,-9.21,-6.40,7.23
mode,0.66,0.47,0.00,0.00,1.00,1.00,1.00
speechiness,0.10,0.14,0.00,0.04,0.05,0.08,0.97


In [255]:
df_B100.describe().loc['mean':'max'].T

Unnamed: 0,mean,std,min,25%,50%,75%,max
rank,50.50,28.87,1.00,26.00,51.00,76.00,100.00
last-week,47.59,28.05,1.00,23.00,47.00,72.00,100.00
peak-rank,40.97,29.35,1.00,13.00,38.00,65.00,100.00
weeks-on-board,9.16,7.62,1.00,4.00,7.00,13.00,90.00
acousticness,0.28,0.27,0.00,0.04,0.18,0.47,1.00
danceability,0.60,0.15,0.00,0.50,0.61,0.71,0.99
duration_ms,226_926.40,65_973.00,37_013.00,183_560.00,221_400.00,258_533.00,1_292_293.00
energy,0.63,0.20,0.02,0.48,0.64,0.79,1.00
instrumentalness,0.03,0.13,0.00,0.00,0.00,0.00,0.99
key,5.22,3.56,0.00,2.00,5.00,8.00,11.00


In [256]:
df_B100_songs.describe().loc['mean':'max'].T

Unnamed: 0,mean,std,min,25%,50%,75%,max
acousticness,0.32,0.29,0.00,0.05,0.22,0.56,1.00
danceability,0.59,0.15,0.00,0.49,0.60,0.70,0.99
duration_ms,217_614.29,67_767.96,37_013.00,169_707.00,210_533.00,251_260.25,1_292_293.00
energy,0.61,0.20,0.02,0.46,0.62,0.77,1.00
instrumentalness,0.04,0.14,0.00,0.00,0.00,0.00,0.99
key,5.20,3.55,0.00,2.00,5.00,8.00,11.00
liveness,0.19,0.16,0.01,0.09,0.13,0.25,1.00
loudness,-8.90,3.62,-29.52,-11.32,-8.52,-6.08,2.29
mode,0.74,0.44,0.00,0.00,1.00,1.00,1.00
speechiness,0.07,0.07,0.00,0.03,0.04,0.06,0.94


In [257]:
# Date Range for Billboard Hot 100
df_B100.date.min(), df_B100.date.max()

(Timestamp('1958-08-04 00:00:00'), Timestamp('2021-11-06 00:00:00'))

In [286]:
# Genre Counts, AF counts
df_B100_songs.genre.count(), df_B100_songs.shape[0]

(19750, 21274)

In [287]:
# percentage of songs with audio features that also have genre data
# NOTE: not all songs on Spotify have an associated genre
df_B100_songs.genre.count() / df_B100_songs.valence.count()

0.9283632603177587

In [291]:
# number of total songs that include audio features with and without genre 
df_10M.genre.count(), df_10M.shape[0], df_10M.genre.count() / df_10M.shape[0]

(6320843, 9431755, 0.6701661567757008)

In [292]:
# All Billboard 100 lists
# number not null, total, proportion not null
(
    df_B100[df_B100.id.notnull()].shape[0], 
    df_B100.shape[0], 
    df_B100[df_B100.id.notnull()].shape[0] / df_B100.shape[0]
)

(243051, 330087, 0.7363240600205401)

# DISCUSSION: Outliers

In [293]:
# yep, I checked, that song is actually 37 seconds...
df_B100_songs[df_B100_songs.duration_ms == df_B100_songs.duration_ms.min()]

Unnamed: 0,id,song,artist,genre,release_date,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
10556,4IIuCotvqijraSdnVLaFnM,Beautiful Trip,Kid Cudi,hip hop,2020-12-11,0.97,0.33,37013,0.51,0.95,11,0.88,-15.39,0,0.63,133.97,4,0.42


In [294]:
# yep, I checked, that song is actually 21 minutes...
df_B100_songs[df_B100_songs.duration_ms == df_B100_songs.duration_ms.max()]

Unnamed: 0,id,song,artist,genre,release_date,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
14649,6pN3ra1mEPtjFsdCvDDHW3,Get Ready,Rare Earth,classic rock,1969-09-30,0.0,0.41,1292293,0.87,0.53,0,0.83,-5.84,1,0.04,127.36,4,0.65


#### not exactly zscore, but a guess
![image.png](attachment:image.png)

In [263]:
outside = 1 - .9973
outside/2, 1 - outside/2

(0.0013500000000000179, 0.99865)

In [295]:
# IQR doesn't work for outliers (it excludes almost none)
# but using Z=3, this "song" would be excluded
df_B100_songs.describe(percentiles=[0.00135, 0.999]).loc['mean':'max'].T

# OUTLIERS SHOULD BE INVESTIGATED IN MORE DETAIL, BEFORE CLUSTERING

Unnamed: 0,mean,std,min,0.14%,50%,99.9%,max
acousticness,0.32,0.29,0.00,0.00,0.22,0.98,1.00
danceability,0.59,0.15,0.00,0.16,0.60,0.96,0.99
duration_ms,217_614.29,67_767.96,37_013.00,95_309.39,210_533.00,677_275.35,1_292_293.00
energy,0.61,0.20,0.02,0.06,0.62,0.99,1.00
instrumentalness,0.04,0.14,0.00,0.00,0.00,0.95,0.99
key,5.20,3.55,0.00,0.00,5.00,11.00,11.00
liveness,0.19,0.16,0.01,0.02,0.13,0.98,1.00
loudness,-8.90,3.62,-29.52,-23.03,-8.52,-1.16,2.29
mode,0.74,0.44,0.00,0.00,1.00,1.00,1.00
speechiness,0.07,0.07,0.00,0.02,0.04,0.59,0.94


In [296]:
df_10M.query('in_B100 == True').sample(5)

Unnamed: 0,id,song,artist,genre,release_date,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,in_B100
3976753,5pxI50SA8WciKebbTFfAAA,Livingston Saturday Night,Jimmy Buffett,classic rock,1978-01-01,0.23,0.61,192867,0.88,0.78,4,0.13,-11.72,1,0.03,147.79,4,0.96,True
8464516,5BE7v9I2FUjUnObRAcopIs,You Know What I Mean,The Turtles,folk rock,1967-01-01,0.74,0.3,122400,0.64,0.0,11,0.25,-6.66,1,0.06,120.42,4,0.75,True
2028020,5yGTQzYbEdY6B9RFZJypgt,Rhythm Of The Night,Debarge,soul,1985-01-01,0.08,0.71,229107,0.76,0.0,11,0.08,-12.33,0,0.05,115.14,4,0.95,True
5673182,7bdUKJBcNob37UCRAs1wC6,Out Of Mind Out Of Sight,Models,australian rock,2017-07-05,0.01,0.63,217467,0.77,0.0,6,0.2,-13.64,0,0.05,122.09,4,0.79,True
830455,3SdTKo2uVsxFblQjpScoHy,Stand By Me,Ben E. King,soul,1962-08-20,0.57,0.65,180056,0.31,0.0,9,0.07,-9.44,1,0.04,118.07,4,0.61,True


In [297]:
df_10M.query('in_B100 == False').sample(5)

Unnamed: 0,id,song,artist,genre,release_date,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,in_B100
5404601,7v1ChixYWsmjprghoKlsgM,Through My Eyes,Matt Perrone,,2012-01-31,0.08,0.68,246600,0.79,0.0,7,0.08,-4.83,1,0.03,124.96,4,0.83,False
852059,6Q6Jw7R3CzPIWfEiu1KWzn,"Sing, Sing, Sing",Benny Goodman,adult standards,2010-10-10,0.91,0.58,505993,0.66,0.91,6,0.14,-8.05,1,0.11,116.3,4,0.25,False
545857,2xeT0rKPFa9YV3MWnz2Blp,Decaying Development Rave,Aria Bare,,2020-05-01,0.02,0.72,469500,0.9,0.81,1,0.11,-7.34,0,0.07,120.02,4,0.13,False
1715922,4F48BXHLvuxiB3CvOnHXPH,Wandering,Craig Taubman,judaica,2001-01-01,0.03,0.61,207533,0.81,0.0,9,0.36,-6.35,1,0.04,132.47,4,0.71,False
6316550,2i7SoOM7HSWOyh89mXmnv1,All Or Nothing - Paul Birken Remix,Paul Birken,acid techno,2016-08-15,0.0,0.85,367054,0.69,0.88,11,0.12,-7.55,0,0.07,135.99,4,0.2,False
