In [318]:
import sys
import re
import functools
import operator
import random
import requests
import pickle
import numpy as np
import pandas as pd
from apis import Genius, Spotify

In [319]:
def invert_dict(d):
    inverted = {}
    for k, vs in d.items():
        for v in vs:
            if v in inverted:
                inverted[v].append(k)
            else:
                inverted[v] = [k]
    return inverted

# Emotion Tags

These are the available emotion tags in the data set.

In [358]:
tags_pickle = "data/lastfm_id_to_emotion_tags.pickle"
emotions_csv = "data/emotion_tags.csv"
with open(tags_pickle, "rb") as f:
    track_to_emotion = pickle.load(f)
    
all_tags = functools.reduce(set.union, [{emotion for emotion, _ in emotions}
                                        for _, emotions in track_to_emotion.items()], set())
    
emotions_table = {"lastfm_id": []}
emotions_table.update({tag: [] for tag in all_tags})
for track in track_to_emotion:
    emotions_table["lastfm_id"].append(track)
    track_emotions = {emotion: count for emotion, count in track_to_emotion[track]}
    for emotion in all_tags:
        count = track_emotions[emotion] if emotion in track_emotions else 0
        emotions_table[emotion].append(count)

In [359]:
original_df = pd.DataFrame(emotions_table)

In [360]:
original_df.loc[:, "tag_count"] = original_df.loc[:, all_tags].sum(axis=1).copy()
original_df

Unnamed: 0,lastfm_id,annoyed,cheerup,conflict,boisterous,rebel,playful,tough,cheer,celebrate,...,soothing,introvert,dreamy,gentle,desire,feral,sceptic,tragedy,glad,tag_count
0,TRRRRCH128F9342C72,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,6
1,TRRRRNA128F42948D2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,TRRRRLE128F147C97D,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,100
3,TRRRRYK128F93229FA,0,0,0,0,0,0,0,0,0,...,4,0,0,0,0,0,0,0,0,12
4,TRRRRGT128F4288741,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,66
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83274,TRWWPDA128E0792F83,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,10
83275,TRWWWGP128F146AE64,0,0,0,0,0,0,0,0,0,...,0,0,20,0,0,0,0,0,0,20
83276,TRWWWOH12903CB7288,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,66
83277,TRWWWKQ128F426EEDC,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,14


In [444]:
emotions_df = original_df.copy()
for tag in all_tags:
    emotions_df.loc[:, "{}_satisfied".format(tag)] = ((emotions_df[tag]/emotions_df["tag_count"]>0.2) & (emotions_df[tag]>2)).copy()
    
np.count_nonzero(np.sum(emotions_df.loc[:, ["{}_satisfied".format(tag) for tag in all_tags]], axis=1))
    

75805

# Preproces and Select Emotions

Merge duplicates, select interesting subset.

In [445]:
# Synonyms are merged and renamed to the first tag in the tuple
merge_similar = {('sad', 'melancholy', 'melancholic', 'dark', 'depressing', 'sorrow', 'blue'),
                 ('happy', 'happiness', 'joyful', 'cheerful'),
                 ('calm', 'relax', 'relaxing', 'mellow', 'soothing', 'peaceful'),
                 ('energetic', 'lively', 'exciting')}

for to_merge in merge_similar:
    name, *_ = to_merge
    emotions_df.loc[:, "{}_merged".format(name)] = emotions_df.loc[:, to_merge].sum(axis=1)
    emotions_df.loc[:, "{}_satisfied_merged".format(name)] = emotions_df.loc[:, ["{}_satisfied".format(x) for x in to_merge]].sum(axis=1) > 0

In [446]:
emotions_df

Unnamed: 0,lastfm_id,annoyed,cheerup,conflict,boisterous,rebel,playful,tough,cheer,celebrate,...,tragedy_satisfied,glad_satisfied,energetic_merged,energetic_satisfied_merged,calm_merged,calm_satisfied_merged,happy_merged,happy_satisfied_merged,sad_merged,sad_satisfied_merged
0,TRRRRCH128F9342C72,0,0,0,0,0,0,0,0,0,...,False,False,0,False,1,False,5,True,0,False
1,TRRRRNA128F42948D2,0,0,0,0,0,0,0,0,0,...,False,False,0,False,0,False,0,False,0,False
2,TRRRRLE128F147C97D,0,0,0,0,0,0,0,0,0,...,False,False,0,False,0,False,0,False,0,False
3,TRRRRYK128F93229FA,0,0,0,0,0,0,0,0,0,...,False,False,0,False,8,True,0,False,0,False
4,TRRRRGT128F4288741,0,0,0,0,0,0,0,0,0,...,False,False,0,False,0,False,0,False,0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83274,TRWWPDA128E0792F83,0,0,0,0,0,0,0,0,0,...,False,False,0,False,10,True,0,False,0,False
83275,TRWWWGP128F146AE64,0,0,0,0,0,0,0,0,0,...,False,False,0,False,0,False,0,False,0,False
83276,TRWWWOH12903CB7288,0,0,0,0,0,0,0,0,0,...,False,False,0,False,0,False,0,False,0,False
83277,TRWWWKQ128F426EEDC,0,0,0,0,0,0,0,0,0,...,False,False,0,False,0,False,0,False,0,False


In [447]:
emotion_axes = [("happy_satisfied_merged", "sad_satisfied_merged"),
                ("calm_satisfied_merged", "energetic_satisfied_merged")]

condition = functools.reduce(operator.iand,
                             [emotions_df[axis_l] ^ emotions_df[axis_r]
                              for axis_l, axis_r in emotion_axes],
                             [True]*len(emotions_df))

matching_df = emotions_df[condition].copy()
np.count_nonzero(condition)

4732

In [448]:
interesting_ones = ["lastfm_id",
                    "happy", "happy_merged", "happy_satisfied_merged",
                    "sad", "melancholy", "melancholic", "sad_merged", "sad_satisfied_merged",
                    "calm", "calm_merged", "calm_satisfied_merged",
                    "energetic", "energetic_merged", "energetic_satisfied_merged"]
matching_df[interesting_ones]

Unnamed: 0,lastfm_id,happy,happy_merged,happy_satisfied_merged,sad,melancholy,melancholic,sad_merged,sad_satisfied_merged,calm,calm_merged,calm_satisfied_merged,energetic,energetic_merged,energetic_satisfied_merged
41,TRRRAAG128F14744C5,6,6,True,0,0,0,0,False,0,3,True,0,0,False
47,TRRRAOJ128F148D7C9,0,0,False,0,8,2,10,True,2,16,True,0,0,False
101,TRRREPN128F42920D2,0,0,False,0,0,25,25,True,0,25,True,0,0,False
130,TRRRYMO128E0780E6B,0,0,False,10,43,7,60,True,0,20,True,0,0,False
163,TRRUIQI128F9314D83,0,0,False,0,3,1,4,True,0,3,True,0,0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83193,TRWWANS12903CF606A,5,5,True,0,0,0,0,False,0,5,True,0,0,False
83207,TRWWOKS128F9325C37,0,0,False,0,0,5,5,True,0,9,True,0,0,False
83246,TRWWXVW128F92C44C7,0,0,False,0,11,0,11,True,11,11,True,0,0,False
83260,TRWWLZJ128F426A946,0,0,False,0,21,7,28,True,0,42,True,0,0,False


In [449]:
class_sizes = [(k, np.count_nonzero(matching_df["{}_satisfied_merged".format(k)])) for k in {"happy", "sad", "energetic", "calm"}]
smallest_subset_size = min(count for _, count in class_sizes)
class_sizes, smallest_subset_size

([('happy', 1044), ('energetic', 386), ('calm', 4346), ('sad', 3688)], 386)

In [450]:
indices = [list(matching_df[matching_df['{}_satisfied_merged'.format(k)]].index)
           for k in {"energetic", "happy", "sad", "calm"}]
new_indices = []
picked_indices = set()
for idx in indices:
    pick = list(set(idx).difference(picked_indices))
    random.shuffle(pick)
    pick = pick[0:smallest_subset_size]
    picked_indices = picked_indices.union(pick)
    new_indices.extend(pick)
len(new_indices)

1458

In [375]:
columns = ["lastfm_id", "happy_satisfied_merged", "energetic_satisfied_merged"]
emotions_clean_df = matching_df.loc[new_indices, columns].copy()
emotions_clean_df.columns = ["lastfm_id", "tagged_happy", "tagged_energetic"]
emotions_clean_df

Unnamed: 0,lastfm_id,tagged_happy,tagged_energetic
75620,TRYZQEP128F146AE5D,True,False
20281,TRZHHIL128F4264EE0,True,False
5123,TRUAHKB128F427B1BA,True,True
28133,TRSXHIQ128F427AF6A,True,False
17813,TRGEZUO12903CFE581,True,True
...,...,...,...
72534,TRLNIBD128F92CA4FF,False,False
49575,TRCSNKA128F4298F10,False,False
2816,TRRXYTH128F93335FC,False,False
79327,TRPCGLW128F93335EA,False,False


In [376]:
emotions_clean_df.to_csv("data/emotions_songs.csv")
emotions_clean_df

Unnamed: 0,lastfm_id,tagged_happy,tagged_energetic
75620,TRYZQEP128F146AE5D,True,False
20281,TRZHHIL128F4264EE0,True,False
5123,TRUAHKB128F427B1BA,True,True
28133,TRSXHIQ128F427AF6A,True,False
17813,TRGEZUO12903CFE581,True,True
...,...,...,...
72534,TRLNIBD128F92CA4FF,False,False
49575,TRCSNKA128F4298F10,False,False
2816,TRRXYTH128F93335FC,False,False
79327,TRPCGLW128F93335EA,False,False


# Preprocess Song Data Set

We are only interested in rows that have emotion tags as well as Spotify data associated with them. Since this is a huge data set we process it in chunks and read only our relevant data.

In [377]:
input_csv = "data/wasabi_songs.csv"
clean_csv = "data/wasabi_clean.csv"

In [394]:
with pd.read_csv(input_csv,
                 chunksize=16000,
                 sep="\t",
                 usecols=["albumTitle", "artist", "genre", "language", "title", "lastfm_id", "urlSpotify"],
                 low_memory=False) as reader:
    for i, songs_df in enumerate(reader):
        sys.stdout.write(".")  # Show some progress
        clean_df = songs_df[#~songs_df["urlSpotify"].isnull()&
                            ~songs_df["lastfm_id"].isnull()&
                            (songs_df["language"]=="eng")]
        clean_df.to_csv(clean_csv,
                        header=(i==1),
                        mode=("a" if i > 1 else "w"))

....................................................................................................................................

In [398]:
wasabi_df = pd.read_csv(clean_csv)

In [399]:
len(wasabi_df)

47122

In [400]:
# Remove duplicates by Spotify ID
# Some songs are in the data set multiple times, e.g. if they are on multiple albums
wasabi_df = wasabi_df.drop_duplicates("lastfm_id")
len(wasabi_df), len(emotions_clean_df)

(29276, 3977)

# Select songs with top emotions

Select all the songs with a manual selection of emotion tags.

In [401]:
#tracks_with_emotion = functools.reduce(set.union, select_emotions.values(), set())
#reduced_df = wasabi_df[wasabi_df["lastfm_id"].isin(tracks_with_emotion)].copy()
reduced_df = wasabi_df.merge(emotions_clean_df, on="lastfm_id").drop(columns=["Unnamed: 0", "genre", "albumTitle"])
reduced_df

Unnamed: 0,artist,language,lastfm_id,title,urlSpotify,tagged_happy,tagged_energetic
0,Ace Of Base,eng,TRLNIBD128F92CA4FF,Ravine,https://play.spotify.com/track/7rTu61ILMBgshAb...,False,False
1,Adam And The Ants,eng,TRNWGOL128F427F679,Press Darlings,https://play.spotify.com/track/4l8nMSVlgRKxUZy...,True,True
2,Adele,eng,TRYEXDH128F423A724,Right As Rain,https://play.spotify.com/track/2Q2SougPFYpLWVZ...,True,False
3,Adele,eng,TRTWBQF128F425069C,Hometown Glory,https://play.spotify.com/track/7AFiFyTWOma1qPY...,False,False
4,Adriana Caselotti,eng,TROYCBC128F9333C58,Some Day My Prince Will Come,https://play.spotify.com/track/0L4ZfgqxgMhGifU...,False,False
...,...,...,...,...,...,...,...
1077,Yo La Tengo,eng,TRSLRAA128F429E9E3,Let's Save Tony Orlando's House,,True,False
1078,Zeromancer,eng,TRMNEPD128E0786D54,Eurotrash,https://play.spotify.com/track/3MstNhBWGLvh0aK...,False,True
1079,Zeromancer,eng,TRUGBUS128F4259029,Hollywood,https://play.spotify.com/track/6IxpHxE1N6zl7A4...,False,False
1080,3 Doors Down,eng,TRAXJHG128F427EA02,Be Like That,https://play.spotify.com/track/1UcWNtdPvWzKBsl...,False,False


In [261]:
#for emotion, tracks in select_emotions.items():
#    reduced_df.loc[:,"emotion_{}".format(emotion)] = reduced_df.loc[:,"lastfm_id"].isin(tracks).copy()

# Set up APIs for Scraping

In [4]:
spotify = Spotify()
genius = Genius()
spotify.authenticate()
genius.authenticate()

Please follow this link and authorize the app:
https://api.genius.com/oauth/authorize?client_id=HDGdy8grlfzttb5mZx3hKJzLdgJdlTnyxGTPY06Ldkrx21MCkug1Vi6UQwgW4KXW&redirect_uri=http%3A%2F%2Flocalhost&scope=&state=&response_type=code

Then, paste the URL you are redirected to here: 
http://localhost/?code=8CSpOn7q2YUeXx8NzKLh58XDbZvxlnkq2zT2t4maUBFlH6Qw2TSTbV0lGWn5AG9r
Authentication successful.


# Scrape Audio Features off of Spotify

In [18]:
spotify_id_regex = re.compile(r".*track\/([a-zA-Z0-9]+)$")
reduced_df.loc[:,"spotify_id"] = [spotify_id_regex.match(str(url)).group(1)
                                  for url in list(reduced_df.loc[:,"urlSpotify"])]

In [228]:
audio_features = {"energy", "key", "loudness", "mode", "speechiness", "acousticness", "instrumentalness",
                  "liveness", "valence", "tempo", "time_signature"}

In [None]:
chunksize = 100  # Spotify allows at most 100 IDs per request
for start in range(0, len(reduced_df), chunksize):
    stop = start+chunksize
    sys.stdout.write("{}/{}.  ".format(start//chunksize+1, len(reduced_df)//chunksize))  # Show some progress
    try:
        ids = reduced_df.iloc[start:stop]["spotify_id"]
        res = spotify.get("audio-features", ids=",".join(ids))
        if len(res["audio_features"]) < len(ids):
            raise Exception("Did not get enough results, {} v {} v {}".format(len(res["audio_features"]), stop-start, len(ids)))
        for features in res["audio_features"]:
            if not features:
                print("No Spotify features for song available.")
                continue
            spotify_id = features["id"]
            matching_row = (reduced_df["spotify_id"]==spotify_id).copy()
            if np.count_nonzero(matching_row) != 1:
                raise Exception("Found 0 or more than 1 matching Spotify ID in table. Duplicates?")
            for feature in audio_features:
                if feature not in features:
                    raise Exception("Missing feature {} from response: {}".format(feature, repr(features)))
                reduced_df.loc[matching_row, "feature_{}".format(feature)] = features[feature]
    except Exception as e:
        print(e)
        continue

In [247]:
reduced_df = reduced_df[~reduced_df["feature_energy"].isnull()]
reduced_df.to_csv("data/wasabi_spotify_reduced.csv")
len(reduced_df)

5313

In [2]:
reduced_df = pd.read_csv("data/wasabi_spotify_reduced.csv")

# Scrape Lyrics off of Genius

The Wasabi dataset does not provide us with a unique ID that can be used to retrieve lyrics. We search on Genius based on song title and artist instead. If the match is not exact, we set a flag in the data so we can manually check it (or drop the row).

In [53]:
space_regex = re.compile("\s+")
alphnum_regex = re.compile("[^a-zA-Z0-9 ]")
def clean_str(instr):
    return alphnum_regex.sub("", space_regex.sub(" ", instr.strip().lower()))

for i, (idx, row) in enumerate(reduced_df.iterrows()):
    if i % 10 == 0:
        sys.stdout.write("{0:4d}/{1:4d}.  ".format(i, len(reduced_df)))  # Show some progress
    try:
        if isinstance(row["genius_lyrics"], str) and row["exact_match"]:
            continue  # Already fetched exact lyrics for this row; continue
        res = genius.get("search", q="{} by {}".format(row["title"], row["artist"]))
        assert res["meta"]["status"] == 200
        assert len(res["response"]["hits"]) >= 1
        genius_id = None
        genius_path = None
        genius_title = None
        genius_artist = None
        exact_match = False
        # Go through all search results; if we have an exact match, we stop.
        # For inexact matches, since we are reversing the list, we will have the most relevant
        # one last, i.e. use that one.
        for hit in reversed(res["response"]["hits"]):
            if hit["type"] != "song":
                    continue
            genius_id = hit["result"]["id"]
            genius_path = hit["result"]["path"]
            genius_title = hit["result"]["title"]
            genius_artist = hit["result"]["primary_artist"]["name"]
            exact_match = (clean_str(genius_title.strip()) == clean_str(row["title"])
                           and clean_str(genius_artist) == clean_str(row["artist"]))
            if exact_match:
                break
        if not exact_match:
            l = max(len(clean_str(genius_artist)), len(clean_str(row["artist"])))
            print()
            print("Inexact match: {1:{0:d}s} - {2:s}".format(l, clean_str(row["artist"]), clean_str(row["title"])))
            print("           vs: {1:{0:d}s} - {2:s}".format(l, clean_str(genius_artist), clean_str(genius_title)))
        reduced_df.loc[idx, "genius_id"] = genius_id
        reduced_df.loc[idx, "genius_artist"] = genius_artist
        reduced_df.loc[idx, "genius_title"] = genius_title
        reduced_df.loc[idx, "genius_path"] = genius_path
        lyrics = genius.lyrics_from_song_api_path(genius_path)
        reduced_df.loc[idx, "genius_lyrics"] = lyrics
        reduced_df.loc[idx, "exact_match"] = exact_match
    except Exception as e:
        print()
        print("{}: {}".format(i, repr(e)))

   0/5313.    10/5313.  
Inexact match: ace of base - the juvenile
           vs: ace of base - juvenile
  20/5313.    30/5313.    40/5313.    50/5313.    60/5313.    70/5313.    80/5313.    90/5313.   100/5313.   110/5313.   120/5313.   130/5313.   140/5313.   150/5313.   160/5313.  
Inexact match: america         - sandman
           vs: homeboy sandman - america the beautiful
 170/5313.   180/5313.   190/5313.  
Inexact match: amy winehouse - cupid
           vs: amy winehouse - cupid deluxe edition version
 200/5313.  
Inexact match: andrew bird               - nuthinduan waltz
           vs: andrew birds bowl of fire - nuthinduan waltz
 210/5313.   220/5313.   230/5313.   240/5313.   250/5313.  
Inexact match: ania          - sound of silence
           vs: anna kendrick - the sound of silence
 260/5313.   270/5313.   280/5313.   290/5313.   300/5313.   310/5313.   320/5313.   330/5313.   340/5313.  
Inexact match: baby bash    - suga suga
           vs: brockhampton - things we l

1740/5313.  1750/5313.  1760/5313.  1770/5313.  
Inexact match: glenn miller                   - chattanooga choo choo
           vs: glenn miller and his orchestra - chattanooga choo choo
1780/5313.  1790/5313.  1800/5313.  1810/5313.  
Inexact match: grace potter and the nocturnals - white rabbit
           vs: grace potter  the nocturnals    - white rabbit
1820/5313.  1830/5313.  1840/5313.  
Inexact match: groove armada - dusk you  me
           vs: genius        - october 2020 album release calendar
1850/5313.  1860/5313.  1870/5313.  1880/5313.  1890/5313.  
Inexact match: hanson   - mmmbop
           vs: kickraux - feelin u

Inexact match: hanson                 - every word i say
           vs: sir arthur conan doyle - a study in scarlet part ii chapter 6
1900/5313.  
Inexact match: harry manx  - a single spark
           vs: web du bois - the quest of the silver fleece chap 21
1910/5313.  
Inexact match: heart     - say hello
           vs: bob dylan - if you see her say hello

3010/5313.  
Inexact match: najee        - as
           vs: bittereinder - a tale of three cities
3020/5313.  3030/5313.  3040/5313.  3050/5313.  
Inexact match: new order - icb
           vs: bushido   - kalter krieg
3060/5313.  3070/5313.  
Inexact match: nick cave and the bad seeds - do you love me
           vs: nick cave  the bad seeds    - do you love me

Inexact match: nick cave and the bad seeds - loverman
           vs: nick cave  the bad seeds    - loverman

Inexact match: nick cave and the bad seeds - i let love in
           vs: nick cave  the bad seeds    - i let love in

Inexact match: nick cave and the bad seeds - aint gonna rain anymore
           vs: nick cave  the bad seeds    - aint gonna rain anymore

Inexact match: nick cave and the bad seeds - brompton oratory
           vs: ewokabdevito                - song emotion catalog experiment

Inexact match: nick cave and the bad seeds - idiot prayer
           vs: nick cave  the bad seeds    - idiot prayer
3080/5313.  


4107: AttributeError("'NoneType' object has no attribute 'get_text'")
4110/5313.  4120/5313.  4130/5313.  4140/5313.  
Inexact match: tpau      - heart and soul
           vs: ash ellis - broken lightbulb
4150/5313.  4160/5313.  4170/5313.  4180/5313.  4190/5313.  4200/5313.  4210/5313.  4220/5313.  4230/5313.  
Inexact match: the ark         - disease
           vs: charles dickens - little dorrit chap 26

Inexact match: the ark - this piece of poetry is meant to do harm
           vs: ark     - this piece of poetry is meant to do harm

Inexact match: the ark          - rock city wankers
           vs: oxygen destroyer - cleansing the earth of humanitys existence

Inexact match: the automatic      - recover
           vs: the scp foundation - scp1980
4240/5313.  4250/5313.  4260/5313.  4270/5313.  4280/5313.  4290/5313.  4300/5313.  4310/5313.  4320/5313.  4330/5313.  4340/5313.  4350/5313.  4360/5313.  4370/5313.  4380/5313.  4390/5313.  
Inexact match: the coral - in the rain
     

In [55]:
# Remove annotations such as [Chorus] from the lyrics
spaces_regex = re.compile("\s+")
annotations_regex = re.compile("[\[\(\{][\w+\s+-:'&\?,]+[\)\}\]]")
def clean_lyrics(lyrics):
    if not isinstance(lyrics, str):
        return None
    lyrics = annotations_regex.sub("", lyrics)
    lyrics = spaces_regex.sub(" ", lyrics)
    return lyrics
reduced_df.loc[:,"lyrics_clean"] = reduced_df.loc[:,"genius_lyrics"].copy().apply(clean_lyrics)

In [23]:
reduced_df.to_csv("data/wasabi_spotify_genius_reduced.csv")
reduced_df[reduced_df["exact_match"]].to_csv("data/wasabi_spotify_genius_reduced_exact_matches.csv")

In [5]:
len(reduced_df), len(reduced_df[reduced_df["exact_match"]])

(5313, 5054)

In [29]:
reduced_df[reduced_df["exact_match"]].to_csv("data/wasabi_spotify_genius_reduced_exact_matches.csv")

In [None]:
reduced_df = pd.read_csv("data/wasabi_spotify_genius_reduced.csv")
len(reduced_df)