In [2]:
import sys
import re
import functools
import operator
import concurrent.futures
import threading
import time
import random
import glob
import requests
import pickle
import numpy as np
import pandas as pd
from apis import Genius, Spotify

def map_chunkwise(df, func, n_workers=20, tmp="tmp/tmp{}.csv"):
    n_workers = 20
    n_chunks = n_workers*100
    chunksize = (len(df) + n_chunks - 1) // n_chunks
    chunks = [(i, df.iloc[i*chunksize:(i+1)*chunksize])
              for i in range(0, n_chunks)]
    def safe_func(args):
        i, chunk = args
        j = threading.get_ident()
        df = func(args)
        with open(tmp.format(j), 'a+') as f:
            chunk.to_csv(f, header=False)   # as a backup in case something goes wrong!
        return df
    with concurrent.futures.ThreadPoolExecutor(max_workers=n_workers) as pool:
        res = pool.map(safe_func, chunks)
    return pd.concat(res)

# If long-running parallel execution of a task fails, you can restore partial results
# from the tmp files using this function.
def restore_from_tmp(columns, tmp="tmp/tmp*.csv"):
    df = pd.DataFrame([], columns=columns)
    files = glob.glob(tmp)
    for file in files:
        this_df = pd.read_csv(file, header=None, index_col=0)
        this_df.columns = columns
        df = pd.concat([df, this_df])
    df.drop_duplicates()
    return df

# Last.FM Emotion Tags Data Set

These are the available emotion tags in the data set.

In [2]:
tags_pickle = "data/lastfm_id_to_emotion_tags.pickle"
emotions_csv = "data/lastfm_clean.csv"
with open(tags_pickle, "rb") as f:
    track_to_emotion = pickle.load(f)
    
all_tags = functools.reduce(set.union, [{emotion for emotion, _ in emotions}
                                        for _, emotions in track_to_emotion.items()], set())
    
emotions_table = {"lastfm_id": []}
emotions_table.update({tag: [] for tag in all_tags})
for track in track_to_emotion:
    emotions_table["lastfm_id"].append(track)
    track_emotions = {emotion: count for emotion, count in track_to_emotion[track]}
    for emotion in all_tags:
        count = track_emotions[emotion] if emotion in track_emotions else 0
        emotions_table[emotion].append(count)

In [3]:
emotions_df = pd.DataFrame(emotions_table)
emotions_df.loc[:, "tag_count"] = emotions_df.loc[:, all_tags].sum(axis=1).copy()
emotions_df

Unnamed: 0,lastfm_id,tranquil,intense,pesimism,tragic,outraged,sooth,mellow,peppy,fighting,...,lament,bittersweet,desperate,jovial,relax,annoying,sorry,dark,soothe,tag_count
0,TRRRRCH128F9342C72,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,6
1,TRRRRNA128F42948D2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,TRRRRLE128F147C97D,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,100
3,TRRRRYK128F93229FA,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,12
4,TRRRRGT128F4288741,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,66
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83274,TRWWPDA128E0792F83,0,0,0,0,0,0,10,0,0,...,0,0,0,0,0,0,0,0,0,10
83275,TRWWWGP128F146AE64,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,20
83276,TRWWWOH12903CB7288,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,66
83277,TRWWWKQ128F426EEDC,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,14


In [4]:
emotions_df = emotions_df[emotions_df["tag_count"]>0]
emotions_df.to_csv(emotions_csv)

In [91]:
emotions_df = pd.read_csv(emotions_csv)

# Wasabi Preprocess Song Data Set

We are only interested in rows that have emotion tags as well as Spotify data associated with them. Since this is a huge data set we process it in chunks and read only our relevant data.

In [27]:
input_csv = "data/wasabi_songs.csv"
clean_csv = "data/wasabi_clean.csv"

In [None]:
with pd.read_csv(input_csv,
                 chunksize=32000,
                 sep="\t",
                 usecols=["artist", "language", "title", "lastfm_id", "urlSpotify"],
                 low_memory=False) as reader:
    for i, songs_df in enumerate(reader):
        sys.stdout.write(".")  # Show some progress
        clean_df = songs_df[~songs_df["urlSpotify"].isnull()&   # only songs that have a spotify id
                            ~songs_df["lastfm_id"].isnull()&    # only songs that have a last.fm id
                            (songs_df["language"].str.lower().str.startswith("en"))] # english only
        clean_df.to_csv(clean_csv,
                        header=(i==1),
                        mode=("a" if i > 1 else "w"))

In [33]:
wasabi_df = pd.read_csv(clean_csv)
wasabi_df

Unnamed: 0.1,Unnamed: 0,artist,language,lastfm_id,title,urlSpotify
0,32241,Agnetha Fältskog,eng,TRUQYXC128F92E3437,The Heat Is On,https://play.spotify.com/track/1qnG5n2LtWlyD2V...
1,32256,Agnetha Fältskog,eng,TRUQYXC128F92E3437,The Heat Is On,https://play.spotify.com/track/1qnG5n2LtWlyD2V...
2,32286,Agnetha Fältskog,eng,TRBJXDD128E0786E86,Little White Secrets,https://play.spotify.com/track/5Up2AQdOew31hTG...
3,32288,Agnetha Fältskog,eng,TRLZNEW128E0786E88,Love In A World Gone Mad,https://play.spotify.com/track/5gIHa908vYk8R7c...
4,32289,Agnetha Fältskog,eng,TREITVF128E0786E89,Maybe It Was Magic,https://play.spotify.com/track/6S2Vqf2kx3Zrg21...
...,...,...,...,...,...,...
37702,2099183,50 Cent,eng,TRZWWZU128F9353DDA,God Gave Me Style,https://play.spotify.com/track/0RZQHGfpOVplGcU...
37703,2099215,50 Cent,eng,TRVMVPF128F9316377,Amusement Park,https://play.spotify.com/track/1irbL0PXRE0Nf9Q...
37704,2099216,50 Cent,eng,TRNUZHF128F4233666,Fully Loaded Clip,https://play.spotify.com/track/5Y1RgksowacV4hb...
37705,2099255,50 Cent,eng,TRNGHIA128F93220EB,Death To My Enemies,https://play.spotify.com/track/1eUmgy6Ep6Ot6Gq...


In [34]:
# Remove duplicates by Spotify ID
# Some songs are in the data set multiple times, e.g. if they are on multiple albums
wasabi_df = wasabi_df.drop_duplicates("urlSpotify").drop(columns=["Unnamed: 0"])
wasabi_df

Unnamed: 0,artist,language,lastfm_id,title,urlSpotify
0,Agnetha Fältskog,eng,TRUQYXC128F92E3437,The Heat Is On,https://play.spotify.com/track/1qnG5n2LtWlyD2V...
2,Agnetha Fältskog,eng,TRBJXDD128E0786E86,Little White Secrets,https://play.spotify.com/track/5Up2AQdOew31hTG...
3,Agnetha Fältskog,eng,TRLZNEW128E0786E88,Love In A World Gone Mad,https://play.spotify.com/track/5gIHa908vYk8R7c...
4,Agnetha Fältskog,eng,TREITVF128E0786E89,Maybe It Was Magic,https://play.spotify.com/track/6S2Vqf2kx3Zrg21...
5,Agnetha Fältskog,eng,TRHJGFA128E0786E8A,Let It Shine,https://play.spotify.com/track/0Dd60NViuR1wj6L...
...,...,...,...,...,...
37702,50 Cent,eng,TRZWWZU128F9353DDA,God Gave Me Style,https://play.spotify.com/track/0RZQHGfpOVplGcU...
37703,50 Cent,eng,TRVMVPF128F9316377,Amusement Park,https://play.spotify.com/track/1irbL0PXRE0Nf9Q...
37704,50 Cent,eng,TRNUZHF128F4233666,Fully Loaded Clip,https://play.spotify.com/track/5Y1RgksowacV4hb...
37705,50 Cent,eng,TRNGHIA128F93220EB,Death To My Enemies,https://play.spotify.com/track/1eUmgy6Ep6Ot6Gq...


# Merge Wasabi and Last.fm data sets

In [93]:
merged_df = wasabi_df.merge(emotions_df, on="lastfm_id")
merged_df

Unnamed: 0.1,artist,language,lastfm_id,title,urlSpotify,Unnamed: 0,tranquil,intense,pesimism,tragic,...,lament,bittersweet,desperate,jovial,relax,annoying,sorry,dark,soothe,tag_count
0,Agnetha Fältskog,eng,TRLZNEW128E0786E88,Love In A World Gone Mad,https://play.spotify.com/track/5gIHa908vYk8R7c...,72757,0,0,0,0,...,0,0,0,0,0,0,0,0,0,100
1,Agnostic Front,eng,TRIXIZP128F428B97A,Gotta Go,https://play.spotify.com/track/4E5DjeGH6tSZ5Sz...,10316,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,Agonoize,eng,TRONFBK128F9329E54,"Death, Murder, Kill",https://play.spotify.com/track/1xSBDHmJWjHBvDX...,36312,0,0,0,0,...,0,0,0,0,0,0,0,0,0,54
3,Aim,eng,TRZASCT128F92FD4D0,Cold Water Music,https://play.spotify.com/track/3JNTQoN7w6I5HWH...,19851,0,0,0,0,...,0,0,0,0,5,0,0,0,0,24
4,Aimee Mann,eng,TRRGMKE128F42482A7,Long Shot,https://play.spotify.com/track/7543cpWwdSur9zI...,747,0,0,0,0,...,0,0,0,0,0,0,0,0,0,80
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11202,3 Doors Down,eng,TRMEQLY128F424EAD2,Pages,https://play.spotify.com/track/1ClNImvq0kI7bKr...,44847,0,0,0,0,...,0,0,0,0,0,0,0,0,0,12
11203,3 Doors Down,eng,TRMJBDR128F424EADD,When It's Over,https://play.spotify.com/track/5psKPzHiwRUq6Fh...,44105,0,0,0,0,...,0,0,0,0,0,0,0,0,0,8
11204,3 Doors Down,eng,TRECJZR128F424EADF,She Don't Want The World,https://play.spotify.com/track/3a5nwO8Gzm7yM8s...,65331,0,0,0,0,...,0,0,0,0,0,0,0,0,0,86
11205,50 Cent,eng,TRRRVOS128F92CF7A4,Disco Inferno,https://play.spotify.com/track/75xfOAv01A3RwBd...,83,0,0,0,0,...,0,0,0,0,0,0,0,0,0,13


# Look at data

In [69]:
def is_tagged(df, tags, thresh=3):
    return sum(df[em] for em in tags) > thresh

tags_happy     = ("happy",)
tags_sad       = ("sad", "melancholy", "melancholic")
tags_energetic = ("energetic",)
tags_calm      = ("calm", "relax", "relaxing")

thresh       = 1
is_happy     = is_tagged(merged_df, tags_happy)
is_sad       = is_tagged(merged_df, tags_sad)
is_energetic = is_tagged(merged_df, tags_energetic)
is_calm      = is_tagged(merged_df, tags_calm)


select_df = merged_df[(is_happy^is_sad) & (is_energetic^is_calm)]

{tags_happy:     np.count_nonzero(is_tagged(select_df, tags_happy)),
 tags_sad:       np.count_nonzero(is_tagged(select_df, tags_sad)),
 tags_energetic: np.count_nonzero(is_tagged(select_df, tags_energetic)),
 tags_calm:      np.count_nonzero(is_tagged(select_df, tags_calm)),
 "total":        len(merged_df)}

{('happy',): 139,
 ('sad', 'melancholy', 'melancholic'): 566,
 ('energetic',): 77,
 ('calm', 'relax', 'relaxing'): 628,
 'total': 11207}

# Scrape Lyrics off of Genius

The Wasabi dataset does not provide us with a unique ID that can be used to retrieve lyrics. We search on Genius based on song title and artist instead. If the match is not exact, we set a flag in the data so we can manually check it (or drop the row).

In [46]:
genius = Genius()
genius.authenticate()

Please follow this link and authorize the app:
https://api.genius.com/oauth/authorize?client_id=HDGdy8grlfzttb5mZx3hKJzLdgJdlTnyxGTPY06Ldkrx21MCkug1Vi6UQwgW4KXW&redirect_uri=http%3A%2F%2Flocalhost&scope=&state=&response_type=code

Then, paste the URL you are redirected to here: 
http://localhost/?code=yl8GPN4XC7kk9MTvKlS_RUIu1CCgUii85Hcnb9vmmHektb-5O_hcrhPHSwK99pQo
Authentication successful.


In [87]:
space_regex = re.compile("\s+")
alphnum_regex = re.compile("[^a-zA-Z0-9 ]")
def clean_str(instr):
    return alphnum_regex.sub("", space_regex.sub(" ", instr.strip().lower()))

def fetch_lyrics_for_df(args):
    j, df = args
    for i, (idx, row) in enumerate(df.iterrows()):
        time.sleep(random.random())  # to avoid hitting rate limits
        if i % 100 == 0:
            sys.stdout.write("{0}: {1:4d}/{2:4d}.\n".format(j, i, len(df)))  # Show some progress
            sys.stdout.flush()
        try:
            res = genius.get("search", q="{} by {}".format(row["title"], row["artist"]))
            assert res["meta"]["status"] == 200
            assert len(res["response"]["hits"]) >= 1
            genius_id = None
            genius_path = None
            genius_title = None
            genius_artist = None
            exact_match = False
            # Go through all search results; if we have an exact match, we stop.
            # For inexact matches, since we are reversing the list, we will have the most relevant
            # one last, i.e. use that one.
            for hit in reversed(res["response"]["hits"]):
                if hit["type"] != "song":
                        continue
                genius_id = hit["result"]["id"]
                genius_path = hit["result"]["path"]
                genius_title = hit["result"]["title"]
                genius_artist = hit["result"]["primary_artist"]["name"]
                exact_match = (clean_str(genius_title.strip()) == clean_str(row["title"])
                               and clean_str(genius_artist) == clean_str(row["artist"]))
                if exact_match:
                    break
            if not exact_match:
                l = max(len(clean_str(genius_artist)), len(clean_str(row["artist"])))
                print()
                print("Inexact match: {1:{0:d}s} - {2:s}".format(l, clean_str(row["artist"]), clean_str(row["title"])))
                print("           vs: {1:{0:d}s} - {2:s}".format(l, clean_str(genius_artist), clean_str(genius_title)))
            df.loc[idx, "genius_id"] = genius_id
            df.loc[idx, "genius_artist"] = genius_artist
            df.loc[idx, "genius_title"] = genius_title
            df.loc[idx, "genius_path"] = genius_path
            lyrics = genius.lyrics_from_song_api_path(genius_path)
            df.loc[idx, "genius_lyrics"] = lyrics
            df.loc[idx, "exact_match"] = exact_match
        except Exception as e:
            print()
            print("{}: {}".format(i, repr(e)))
    return df

In [None]:
merged_df = map_chunkwise(merged_df, fetch_lyrics_for_df)
merged_df

In [129]:
# merged_df = restore_from_tmp(list(merged_df.columns) + ["genius_id", "genius_artist", "genius_title",
#                                                         "genius_path", "genius_lyrics", "exact_match"])

In [139]:
merged_df.to_csv("data/wasabi_lastfm_genius.csv")

In [3]:
merged_df = pd.read_csv("data/wasabi_lastfm_genius.csv")
merged_df

Unnamed: 0.2,Unnamed: 0,artist,language,lastfm_id,title,urlSpotify,Unnamed: 0.1,tranquil,intense,pesimism,...,dark,soothe,tag_count,genius_id,genius_artist,genius_title,genius_path,genius_lyrics,exact_match,spotify_id
0,66,Alexia,eng,TRHKKKV128F4238CB3,Uh La La La,https://play.spotify.com/track/1ahAVyeP1B027c0...,41876,0,0,0,...,0,0,7,796138.0,Alexia,Uh La La La,/Alexia-uh-la-la-la-lyrics,\n\nTo all the people all around the world\nOo...,True,1ahAVyeP1B027c03NlVSRd
1,67,Alexia,eng,TRDZQWY128F4238C8B,Gimme Love,https://play.spotify.com/track/2scKJ2GTRLCDvnz...,52523,0,0,0,...,0,0,28,796404.0,Alexia,Gimme Love,/Alexia-gimme-love-lyrics,\n\nGimme love gimme love\nBaby gimme your lov...,True,2scKJ2GTRLCDvnzUbSIEEv
2,68,Alexisonfire,eng,TRAPGKY128F42972A9,Mailbox Arson,https://play.spotify.com/track/5UDRAe48sxIpwQz...,32315,0,0,0,...,0,0,6,777010.0,Alexisonfire,Mailbox Arson,/Alexisonfire-mailbox-arson-lyrics,\n\n[Pre-Chorus: George Pettit]\nYour mail's n...,True,5UDRAe48sxIpwQzg5a77Uv
3,69,Alexisonfire,eng,TROCHHB128F14A664A,Boiled Frogs,https://play.spotify.com/track/3mE1W8j5OZstBKt...,37666,0,0,0,...,0,0,2,776969.0,Alexisonfire,Boiled Frogs,/Alexisonfire-boiled-frogs-lyrics,\n\n[Verse 1: George Pettit]\nA man sits at hi...,True,3mE1W8j5OZstBKtZhRKWXg
4,70,Alexisonfire,eng,TRQNBJV128F42972BF,Rough Hands,https://play.spotify.com/track/4zDPicZuWXyCzdH...,58308,0,0,0,...,0,0,16,199499.0,Alexisonfire,Rough Hands,/Alexisonfire-rough-hands-lyrics,\n\n[Verse 1: Dallas Green]\nWas I left behind...,True,4zDPicZuWXyCzdHnervvkH
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11178,11137,Zero 7,eng,TRYEGBI128E0795767,I Have Seen,https://play.spotify.com/track/343S0S5OXLWqznF...,77085,0,0,0,...,0,0,18,1465755.0,Zero 7,I Have Seen,/Zero-7-i-have-seen-lyrics,"\n\nOld man there, people stare\nThinking back...",True,343S0S5OXLWqznFNJDOyvy
11179,11138,Zero 7,eng,TRGCVSJ128E0795768,Polaris,https://play.spotify.com/track/4njttTYaIqT8V9Y...,17139,0,0,0,...,0,0,23,2181659.0,Zero 7,Polaris,/Zero-7-polaris-lyrics,\n\n(Instrumental)\n\n,True,4njttTYaIqT8V9YWQoT0Mt
11180,11139,Zero 7,eng,TROXFNG12903CF54B2,Destiny,https://play.spotify.com/track/6PcuC1LuoldUcOG...,38179,0,0,0,...,0,0,37,373113.0,Zero 7,Destiny,/Zero-7-destiny-lyrics,\n\n[Verse 1: Sia]\nI lie awake\nI've gone to ...,True,6PcuC1LuoldUcOGegswbQp
11181,11140,Zero 7,eng,TRDVHFU128E079576E,Red Dust,https://play.spotify.com/track/74uEUoR6nCR1aSI...,53832,0,0,0,...,0,0,18,2185451.0,Zero 7,Red Dust,/Zero-7-red-dust-lyrics,\n\nInstrumental\n\n,True,74uEUoR6nCR1aSIJIh4d2F


# Scrape Audio Features off of Spotify

In [4]:
spotify = Spotify()
spotify.authenticate()

In [5]:
# Add Spotify ID to tracks
spotify_id_regex = re.compile(r".*track\/([a-zA-Z0-9]+)$")
merged_df.loc[:,"spotify_id"] = [spotify_id_regex.match(str(url)).group(1)
                                 for url in list(merged_df.loc[:,"urlSpotify"])]

In [6]:
audio_features = {"energy", "key", "loudness", "mode", "speechiness", "acousticness", "instrumentalness",
                  "liveness", "valence", "tempo", "time_signature"}

In [7]:
def get_spotify_audio_features_for_df(args):
    i, df = args
    chunksize = 100  # Spotify allows at most 100 IDs per request
    for start in range(0, len(df), chunksize):
        time.sleep(1+random.random()*5)  # To avoid hitting the rate limits
        stop = start + chunksize
        if start//chunksize % 10:
            sys.stdout.write("{}: {}/{}.  ".format(i, start//chunksize+1, len(df)//chunksize))  # Show some progress
            sys.stdout.flush()
        try:
            ids = df.iloc[start:stop]["spotify_id"].copy()
            res = spotify.get("audio-features", ids=",".join(ids))
            if len(res["audio_features"]) < len(ids):
                raise Exception("Did not get enough results, {} v {} v {}".format(len(res["audio_features"]), stop-start, len(ids)))
            for features in res["audio_features"]:
                if not features:
                    continue
                spotify_id = features["id"]
                matching_row = (df["spotify_id"]==spotify_id).copy()
                for feature in audio_features:
                    if feature not in features:
                        raise Exception("Missing feature {} from response: {}".format(feature, repr(features)))
                    df.loc[matching_row, "feature_{}".format(feature)] = features[feature]
        except Exception as e:
            print(i, start)
            print(repr(e))
            continue
    return df

In [12]:
tags_spotify_csv = "data/wasabi_lastfm_genius_spotify.csv"
merged_df = map_chunkwise(merged_df, get_spotify_audio_features_for_df, n_workers=8)
merged_df.to_csv(tags_spotify_csv)

In [11]:
merged_df = pd.read_csv(tags_spotify_csv)
merged_df

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,artist,language,lastfm_id,title,urlSpotify,Unnamed: 0.1.1,tranquil,intense,...,feature_tempo,feature_valence,feature_energy,feature_speechiness,feature_acousticness,feature_mode,feature_liveness,feature_loudness,feature_key,feature_time_signature
0,0,66,Alexia,eng,TRHKKKV128F4238CB3,Uh La La La,https://play.spotify.com/track/1ahAVyeP1B027c0...,41876,0,0,...,92.308,0.8150,0.589,0.0893,0.28900,1.0,0.331,-11.562,5.0,4.0
1,1,67,Alexia,eng,TRDZQWY128F4238C8B,Gimme Love,https://play.spotify.com/track/2scKJ2GTRLCDvnz...,52523,0,0,...,98.010,0.7590,0.804,0.0348,0.04690,1.0,0.801,-6.682,0.0,4.0
2,2,68,Alexisonfire,eng,TRAPGKY128F42972A9,Mailbox Arson,https://play.spotify.com/track/5UDRAe48sxIpwQz...,32315,0,0,...,90.473,0.3620,0.949,0.0597,0.00025,0.0,0.462,-4.464,0.0,4.0
3,3,69,Alexisonfire,eng,TROCHHB128F14A664A,Boiled Frogs,https://play.spotify.com/track/3mE1W8j5OZstBKt...,37666,0,0,...,100.511,0.3520,0.955,0.0559,0.00120,0.0,0.402,-4.715,10.0,4.0
4,4,70,Alexisonfire,eng,TRQNBJV128F42972BF,Rough Hands,https://play.spotify.com/track/4zDPicZuWXyCzdH...,58308,0,0,...,120.937,0.0812,0.762,0.0430,0.00349,1.0,0.128,-6.129,6.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11178,11178,11137,Zero 7,eng,TRYEGBI128E0795767,I Have Seen,https://play.spotify.com/track/343S0S5OXLWqznF...,77085,0,0,...,158.065,0.7610,0.680,0.0308,0.00786,1.0,0.192,-8.414,8.0,4.0
11179,11179,11138,Zero 7,eng,TRGCVSJ128E0795768,Polaris,https://play.spotify.com/track/4njttTYaIqT8V9Y...,17139,0,0,...,78.636,0.1870,0.542,0.0476,0.02980,0.0,0.121,-10.469,8.0,4.0
11180,11180,11139,Zero 7,eng,TROXFNG12903CF54B2,Destiny,https://play.spotify.com/track/6PcuC1LuoldUcOG...,38179,0,0,...,138.031,0.4390,0.586,0.0306,0.21600,0.0,0.117,-9.104,7.0,4.0
11181,11181,11140,Zero 7,eng,TRDVHFU128E079576E,Red Dust,https://play.spotify.com/track/74uEUoR6nCR1aSI...,53832,0,0,...,86.004,0.1910,0.416,0.0274,0.46600,1.0,0.135,-10.246,4.0,4.0


# Data Cleanup

In [13]:
clean_df = merged_df.copy()

In [24]:
# Remove annotations such as [Chorus] from the lyrics
spaces_regex = re.compile("\s+")
annotations_regex = re.compile("[\[\(\{][\w+\s+-:'&\?,]+[\)\}\]]")
instrumental_regex = re.compile("(Instrumental|instrumental)")
def clean_lyrics(lyrics):
    if not isinstance(lyrics, str):
        return None
    lyrics = annotations_regex.sub("", lyrics)
    lyrics = spaces_regex.sub(" ", lyrics)
    lyrics = instrumental_regex.sub("", lyrics)
    lyrics = lyrics.strip()
    return lyrics
clean_df.loc[:,"lyrics_clean"] = clean_df.loc[:,"genius_lyrics"].copy().apply(clean_lyrics)

In [None]:
clean_df = clean_df[(clean_df["exact_match"] == True) &  # Lyrics match exactly the title
                    ~clean_df["genius_lyrics"].isnull() & (clean_df["lyrics_clean"] != "") &  # Not instrumental, has lyrics
                    ~clean_df["feature_acousticness"].isnull()  # has Spotify features
                   ]
clean_df = clean_df.drop(columns=["Unnamed: 0", "Unnamed: 0.1", "Unnamed: 0.1.1"])
clean_df.to_csv("data/final_clean.csv")

In [45]:
clean_df = pd.read_csv("data/final_clean.csv")
clean_df

Unnamed: 0.1,Unnamed: 0,artist,language,lastfm_id,title,urlSpotify,tranquil,intense,pesimism,tragic,...,feature_valence,feature_energy,feature_speechiness,feature_acousticness,feature_mode,feature_liveness,feature_loudness,feature_key,feature_time_signature,lyrics_clean
0,0,Alexia,eng,TRHKKKV128F4238CB3,Uh La La La,https://play.spotify.com/track/1ahAVyeP1B027c0...,0,0,0,0,...,0.8150,0.589,0.0893,0.28900,1.0,0.3310,-11.562,5.0,4.0,To all the people all around the world Ooh la ...
1,1,Alexia,eng,TRDZQWY128F4238C8B,Gimme Love,https://play.spotify.com/track/2scKJ2GTRLCDvnz...,0,0,0,0,...,0.7590,0.804,0.0348,0.04690,1.0,0.8010,-6.682,0.0,4.0,Gimme love gimme love Baby gimme your love I l...
2,2,Alexisonfire,eng,TRAPGKY128F42972A9,Mailbox Arson,https://play.spotify.com/track/5UDRAe48sxIpwQz...,0,0,0,0,...,0.3620,0.949,0.0597,0.00025,0.0,0.4620,-4.464,0.0,4.0,Your mail's not safe in this town Your mail's ...
3,3,Alexisonfire,eng,TROCHHB128F14A664A,Boiled Frogs,https://play.spotify.com/track/3mE1W8j5OZstBKt...,0,0,0,0,...,0.3520,0.955,0.0559,0.00120,0.0,0.4020,-4.715,10.0,4.0,A man sits at his desk one year from retiremen...
4,4,Alexisonfire,eng,TRQNBJV128F42972BF,Rough Hands,https://play.spotify.com/track/4zDPicZuWXyCzdH...,0,0,0,0,...,0.0812,0.762,0.0430,0.00349,1.0,0.1280,-6.129,6.0,4.0,"Was I left behind? Someone tell me, tell me I ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10421,11176,William Shatner,eng,TRDDSWW128F4284622,Common People,https://play.spotify.com/track/6pT57VywIGw8eyK...,0,0,0,0,...,0.7730,0.915,0.1540,0.03520,1.0,0.0955,-3.668,0.0,4.0,"She came from Greece, she had a thirst for kno..."
10422,11177,Zappacosta,eng,TRHYPZD128F429EC45,Overload,https://play.spotify.com/track/2Y1yIA6IPRCPzIV...,0,0,0,0,...,0.8980,0.684,0.0402,0.18400,0.0,0.1730,-11.016,9.0,4.0,This overload I can hear your heels clicking o...
10423,11178,Zero 7,eng,TRYEGBI128E0795767,I Have Seen,https://play.spotify.com/track/343S0S5OXLWqznF...,0,0,0,0,...,0.7610,0.680,0.0308,0.00786,1.0,0.1920,-8.414,8.0,4.0,"Old man there, people stare Thinking back, to ..."
10424,11180,Zero 7,eng,TROXFNG12903CF54B2,Destiny,https://play.spotify.com/track/6PcuC1LuoldUcOG...,0,0,0,0,...,0.4390,0.586,0.0306,0.21600,0.0,0.1170,-9.104,7.0,4.0,I lie awake I've gone to ground I'm watching p...


# Inspect the data

In [38]:
sad_synonyms =       ('sad', 'melancholy', 'melancholic', 'dark', 'depressing', 'sorrow', 'blue')
happy_synonyms =     ('happy', 'happiness', 'joyful', 'cheerful')
calm_synonyms =      ('calm', 'relax', 'relaxing', 'mellow', 'soothing', 'peaceful')
energetic_synonyms = ('energetic', 'lively', 'exciting')
axes = {(sad_synonyms,  happy_synonyms),
        (calm_synonyms, energetic_synonyms)}

Find songs that can be placed on the dimensions by the given axes, i.e. it must have ONE tag per axis (not none and not two).

In [46]:
thresh = 3  # require at least three tags of given emotion

# match tags that have 
matches = np.array([True]*len(clean_df))
for axis_l, axis_r in axes:
    l_matches = np.sum(clean_df[list(axis_l)], axis=1) > thresh
    r_matches = np.sum(clean_df[list(axis_r)], axis=1) > thresh
    matches &= l_matches ^ r_matches