In [1]:
import os, sys
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import ast
from tqdm import tqdm
from sklearn.preprocessing import MinMaxScaler


def extract_track(row): 
    if "http" in row:
        return row.split("=")[-1]
    else: return row

def clean_lyrics(row):
    return [x for x in row.replace("\r", "").strip().split("\n") if x != ""]

def get_us_songs(row): 
    return "US" in row

def read_raw_billboard(path = "../billboard_2000_2018_spotify_lyrics.csv"):
    """Helper Function to Filter the Billboard Dataset"""
    billboard = pd.read_csv(path)
    billboard = billboard[['year', 'title', 'main_artist', 'artist', 'peak_pos', 'weeks', 'spotify_link',
    'genre', 'broad_genre', 'energy', 'liveness', 'tempo', 'speechiness', 'acousticness',
    'instrumentalness', 'time_signature', 'danceability', 'key',
    'duration_ms', 'loudness', 'valence', 'mode', 'lyrics',]]
    
    billboard = billboard.replace("unknown", np.nan)
    billboard = billboard.dropna().sort_values("weeks", ascending=False).drop_duplicates(subset="title").reset_index(drop=True)
    billboard["spotify_link"] = billboard["spotify_link"].apply(extract_track) # Making the columns the same 
    billboard = billboard.rename(columns = {"spotify_link" : "uri", "title" : 'name'})
    billboard['artist_popularity'] = 0
    billboard['followers'] = 0
   
    return billboard

def construct_spd(path = "../SpotGenTrack/Data Sources/"):
    """Helper Function to Filter the SPD Dataset"""
    spd = pd.read_csv(os.path.join(path, "spotify_tracks.csv"), index_col=0)
    spd = spd[spd.available_markets.apply(get_us_songs)]
    spd['artists_id'] = spd['artists_id'].apply(ast.literal_eval)
    
    m = pd.read_csv(os.path.join(path,"spotify_artists.csv"), index_col=0)
    spd_albums = pd.read_csv(os.path.join(path,"spotify_albums.csv"), index_col=0)
    
    artist_popularity = list()
    followers = list()
    year = list()
    
    spd.reset_index(drop=True, inplace=True)
    for i in tqdm(range(spd.shape[0]), desc='Creating Filtered SPD'): 
        row = m[m.id == spd.loc[i, "artists_id"][0]]

        artist_popularity.append(row.artist_popularity.values[0])
        followers.append(row.followers.values[0])
        year.append(spd_albums[spd_albums.id == spd.loc[i, "album_id"]]['release_date'].values[0])

    spd['artist_popularity'] = artist_popularity
    spd['followers'] = followers
    spd["peak_pos"] = 0
    spd["weeks"] = 0 
    spd['year'] = year
    
    return spd

# Using the Spotify API to collect Artist metadata
- The following requires a Spotify Application to be setup
- It is possible for the incorrect artist to be collected. As such we have added a filter to account for this, it isn't perfect but it works well enough.

In [None]:
import spotipy
from spotipy import SpotifyClientCredentials

sp = spotipy.Spotify(client_credentials_manager=SpotifyClientCredentials())
seen = set()
artists = dict()
invalid = []

billboard = read_raw_billboard("../billboard_2000_2018_spotify_lyrics.csv")

for _, track in tqdm(billboard.iterrows(), total=billboard.shape[0]):
    if track.main_artist in seen:  # Skip if they have already had their data collected 
        continue

    seen.add(track.main_artist) # Add them the first time they have been encountered

    df = pd.json_normalize(sp.artist(sp.track(track['uri'])['artists'][0]['uri']))
    df = df.rename(columns={"followers.total" : "followers"})
    df['artist'] = track['main_artist']

    artists[track['main_artist']] = df


for k, v in tqdm(artists.items(), desc="Adding Artist metadata"): 
    billboard.loc[billboard.main_artist == k, ["artist_popularity", "followers"]] = v["popularity"][0], v["followers"][0]

billboard = billboard[~((billboard.artist_popularity <= 25) & (billboard.followers <= 100_000))].reset_index(drop=True)
billboard.head(10)

# The following is used to create the combined dataset

In [12]:
#spd = construct_spd("../SpotGenTrack/Data Sources/")

common = np.intersect1d(billboard.columns, spd.columns)
spd_common = spd[spd.uri.isin(common)]
spd = spd[~spd.uri.isin(common)]
combined = pd.concat([spd[common], billboard[common]]).reset_index(drop=True)

combined['year'] = combined['year'].astype(str).str[:4].astype(int)
combined['lyrics'] = combined['lyrics'].apply(clean_lyrics)
combined = combined.astype({"key" : "int32", "mode" : "int16", "time_signature" : "int16"})
combined["hit"] = combined["weeks"] >= 1

# This section is used to filter songs that do not have 95% of their lyrics in English

In [22]:
from langdetect import detect_langs
from langdetect.lang_detect_exception import LangDetectException


def lyrics_to_sting(row): 
    return ", ".join(row)

combined.lyrics = combined.lyrics.apply(lyrics_to_sting)
lyrics = combined.lyrics

indicies = []
for i, song in tqdm(enumerate(lyrics), total=len(lyrics), desc="Checking lyrics"):  
    try:
        langlist = detect_langs(song)
        for l in langlist:
            if l.prob < 0.95 or l.lang != 'en':
                continue
            else:
                indicies.append(i)
    except LangDetectException:
        continue
    
combined_english = combined.iloc[indicies].reset_index(drop=True)
combined_english

Checking lyrics: 100%|██████████| 95736/95736 [10:37<00:00, 150.17it/s]


Unnamed: 0,acousticness,artist_popularity,danceability,duration_ms,energy,followers,instrumentalness,key,liveness,loudness,...,peak_pos,speechiness,tempo,time_signature,uri,valence,weeks,year,hit,popularity
0,0.294,28,0.698,235584.0,0.606,425,0.000003,10,0.151,-7.447,...,0,0.0262,115.018,4,spotify:track:5qljLQuKnNJf4F4vfxQB0V,0.622,0,2018,False,0.000000
1,0.863,36,0.719,656960.0,0.308,2965,0.0,6,0.253,-10.34,...,0,0.922,115.075,3,spotify:track:3VAX2MJdmdqARLSU5hPMpm,0.589,0,2011,False,0.000000
2,0.763,10,0.719,316578.0,0.126,158,0.0,3,0.113,-20.254,...,0,0.938,112.822,3,spotify:track:6aCe9zzoZmCojX7bbgKKtf,0.533,0,2005,False,0.000000
3,0.971,62,0.367,183653.0,0.349,201820,0.296,11,0.633,-7.74,...,0,0.0268,81.85,4,spotify:track:4PrAZpH9Ic7S47E78BN6E4,0.192,0,2017,False,0.000000
4,0.824,36,0.688,29240.0,0.304,2965,0.0,10,0.142,-9.96,...,0,0.531,77.056,3,spotify:track:1WJzRtI1ABzV3TPIeJZVvi,0.414,0,2011,False,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95730,0.447,65,0.605,212693,0.483,2580404,0.000318,2,0.0962,-7.775,...,92,0.0291,130.836,4,spotify:track:3ga3cmO4mb9dMhysyJUEaH,0.169,1,2011,True,2.189709
95732,0.00268,70,0.494,199493,0.762,7085762,0.00538,10,0.472,-2.885,...,57,0.0441,114.1,4,spotify:track:6XscPNlnKw0mnVYE7kvWRj,0.653,1,2011,True,6.167588
95733,0.00698,88,0.675,190977,0.842,26078781,0,11,0.349,-2.698,...,70,0.0337,127.019,4,spotify:track:6JH56gZC7EJDcoxabVcWVL,0.617,1,2011,True,5.001037
95734,0.0217,89,0.263,258874,0.315,44666058,0.163,8,0.109,-10.797,...,90,0.03,142.657,4,spotify:track:0kuv7BqWNDprDao3Tb5flN,0.196,1,2011,True,2.536367


# This section is used to add our defined popularity metrix to the dataset

In [25]:
hits = combined_english.hit == True # Records the locations of the hit songs 
combined_english["popularity"] = 0

values = np.sqrt(((101 - combined_english[hits].peak_pos).values) * combined_english[hits].weeks.values)
scaler = MinMaxScaler(feature_range=(0, 100))
combined_english.loc[hits, "popularity"] = scaler.fit_transform(values.reshape(-1,1))

# Used to select years and then split the dataset into training / testing sets

In [29]:
def split_dataset(df, threshold = 300): 
    df = df[(df.year >= 2000) & (df.year < 2018)].reset_index(drop=True)
    hits = df[df.hit].reset_index(drop=True)
    non_hits = df[~df.hit].reset_index(drop=True)
    selected_songs = non_hits.sort_values('followers', ascending=False).groupby('year').head(threshold)
    df = pd.concat([hits, selected_songs]).sort_values('year').reset_index(drop=True)
    df['train'] = True
    df.loc[df.year >= 2015, 'train'] = False
    return df

train_test = split_dataset(combined_english)
#train_test.to_pickle("../final_dataset.pkl")

Unnamed: 0,acousticness,artist_popularity,danceability,duration_ms,energy,followers,instrumentalness,key,liveness,loudness,...,speechiness,tempo,time_signature,uri,valence,weeks,year,hit,popularity,train
0,0.823,12,0.743,226520.0,0.35,2,0.0,2,0.0636,-7.202,...,0.046,138.7,4,spotify:track:6BoQdPEcadx6n6BBdqCS7k,0.707,0,2000,False,0.000000,True
1,0.00408,30,0.674,206733,0.623,37696,5.61E-05,2,0.0488,-5.97,...,0.0283,126.547,4,spotify:track:1PrZOkFxJqTZAH8P3x87hW,0.452,20,2000,True,42.424749,True
2,0.000192,44,0.623,226227,0.898,11811,0.303,4,0.138,-6.604,...,0.0309,130.993,4,spotify:track:4jCWcK6PhbnzOCAFJEztUm,0.786,29,2000,True,44.192963,True
3,0.779,35,0.655,44533.0,0.689,11106,0.0,8,0.694,-5.568,...,0.863,105.116,1,spotify:track:6RHrQuUrdu0qGym6dzG0Q2,0.724,0,2000,False,0.000000,True
4,0.238,38,0.636,268600.0,0.868,11219,0.0,11,0.0649,-6.673,...,0.0414,103.005,4,spotify:track:6mn15kWtqGdQtBYIx9L3Kc,0.49,0,2000,False,0.000000,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10821,0.0558,70,0.523,214800,0.725,7085762,6.96E-06,2,0.0937,-5.448,...,0.0339,155.855,4,spotify:track:0c4ICGb0jvszKj3KPR59JU,0.528,11,2017,True,23.264119,False
10822,0.0137,62,0.559,231674,0.531,1797400,0,10,0.293,-6.981,...,0.404,144.367,4,spotify:track:1fnRwgZLgAYMM1dVL1oqwG,0.333,1,2017,True,0.000000,False
10823,0.248,89,0.662,179613,0.496,40806605,0,5,0.0769,-8.267,...,0.109,81.951,4,spotify:track:5yuShbu70mtHXY0yLzCQLQ,0.133,11,2017,True,17.061253,False
10824,0.0198,68,0.707,186113,0.611,8846891,3.59E-06,2,0.0651,-4.474,...,0.0639,123.006,4,spotify:track:4lnAN2S1fcI0SjxEbksZVr,0.285,12,2017,True,31.531073,False
