# Embeddings arithmetic

Ideas:
* Song / song
* Song / genre
* Song - its artist representative + other artist representative
* If representative meaningful: artist / genre

In [2]:
import pandas as pd
from zipfile import ZipFile
from gensim.models import Word2Vec
import matplotlib.pyplot as plt
from tqdm import tqdm
import seaborn as sns
import numpy as np
from scipy import spatial
from utils import read_lastfm, build_vocab, load_model

Let's load the dataset and the song2vec model.

In [3]:
songs, users = read_lastfm("data/lastfm-dataset-1K.zip")
songs.head()

Unnamed: 0,user_id,timestamp,artist_id,artist_name,track_id,track_name
0,user_000001,2009-05-04 23:08:57+00:00,f1b1cf71-bd35-4e99-8624-24a6e15f133a,Deep Dish,,Fuck Me Im Famous (Pacha Ibiza)-09-28-2007
1,user_000001,2009-05-04 13:54:10+00:00,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Composition 0919 (Live_2009_4_15)
2,user_000001,2009-05-04 13:52:04+00:00,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Mc2 (Live_2009_4_15)
3,user_000001,2009-05-04 13:42:52+00:00,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Hibari (Live_2009_4_15)
4,user_000001,2009-05-04 13:42:11+00:00,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Mc1 (Live_2009_4_15)


In [4]:
emb_vectors, model = load_model("data/word2vec/word2vec.model")
emb_list = list(emb_vectors.keys())
X = np.array([list(v) for k, v in emb_vectors.items()])

In [5]:
tree = spatial.KDTree(X)

We use of a KD tree to find the closest vector to any given song representation.

In [6]:
def create_song_id(df):
    """Add the song ID's to the dataset"""
    df = df.copy()
    df = df[~df.track_name.isna()]
    df["song_id"]= df.artist_name.cat.codes.astype("int64") * df.track_name.nunique() \
                            + df.track_name.cat.codes
    df = df.drop_duplicates("song_id")[["song_id", "artist_name", "track_name"]]
    return df

mapping = create_song_id(songs)

In [7]:
def emb2song(emb):
    """Convert a vector to the corresponding song"""
    if type(emb) == int:
        emb = [emb]
    return mapping[mapping.song_id.isin(emb)]

def song2emb(artist, song):
    """Convert a song to its vector representation"""
    return mapping.query("(artist_name == @artist) & (track_name == @song)").song_id.values[0]

In [8]:
emb2song(4616724870)

Unnamed: 0,song_id,artist_name,track_name
24969,4616724870,The Postal Service,Such Great Heights


In [9]:
song2emb("The Postal Service", "Such Great Heights")

4616724870

In [10]:
def avg(artist1, song1, artist2, song2, topn=10):
    """
    Computes the averages between two song representations and 
    return the closest song to that average
    """
    emb1 = song2emb(artist1, song1)
    emb2 = song2emb(artist2, song2)
    result = (emb_vectors[emb1] + emb_vectors[emb2]) / 2
    index = tree.query(result, k=topn)[1]
    result_embs = [emb_list[i] for i in index]
    return emb2song(result_embs)

In [11]:
avg("The Postal Service", "Nothing Better", "The Postal Service", "Such Great Heights")

Unnamed: 0,song_id,artist_name,track_name
24969,4616724870,The Postal Service,Such Great Heights
31248,4616710818,The Postal Service,Brand New Colony
46215,4616711620,The Postal Service,Clark Gable
46216,4616722437,The Postal Service,Recycled Air
46217,4616720761,The Postal Service,Nothing Better
46219,4616723888,The Postal Service,Sleeping In
46222,4616725808,The Postal Service,The District Sleeps Alone Tonight
50532,4616728247,The Postal Service,We Will Become Silhouettes
50534,4616726846,The Postal Service,This Place Is A Prison
50537,4616720331,The Postal Service,Natural Anthem


As expected the average of two songs from the same artist is closest to other songs of the same artist.

In [12]:
avg("Britney Spears", "Gimme More", "Boy Division", "Love Will Tear Us Apart")

Unnamed: 0,song_id,artist_name,track_name
102950,2341413539,Joy Division,Passover
231432,2341422409,Joy Division,A Means To An End
308223,2341458102,Joy Division,Novelty
308228,2341449850,Joy Division,Day Of The Lords
308233,2341453792,Joy Division,I Remember Nothing
308234,2341454473,Joy Division,Interzone
308235,2341420776,Joy Division,Wilderness
308236,2341415443,Joy Division,Shadowplay
311049,2341406910,Joy Division,Glass
485234,2341473500,Joy Division,No Love Lost


Interestingly the closest songs to Gimme More and Love will Tear Us Apart are songs from Joy Division. Is it a coincidence that the names are similar? Moreover, Joy Division also has a song called Love Will Tear Us Apart.

In [13]:
def artist_emb(artist):
    """Create an artist representation as the centroid of all the artist's song representations"""
    artist_songs = mapping.query("artist_name == @artist").song_id.values
    centroid = np.zeros(100)
    for emb in artist_songs:
        if emb in emb_vectors:
            centroid += emb_vectors[emb]
    centroid /= len(artist_songs)
    return centroid

In [14]:
radiohead = artist_emb("Radiohead")
emb2song(emb_list[tree.query(radiohead)[1]])

Unnamed: 0,song_id,artist_name,track_name
2696383,3583355688,Radiohead,"Polyethylene, Part 1 & 2"


The radioheadest song of radiohead looks to be Polyethylene.

In [15]:
def change_artist(artist1, song, artist2, topn=10):
    """
    Change the artist of a song by substracting the original artist's representation
    and adding the new artist representation
    """
    artist_emb1 = artist_emb(artist1)
    artist_emb2 = artist_emb(artist2)
    song_emb = song2emb(artist1, song)
    result = song_emb - artist_emb1 + artist_emb2
    index = tree.query(result, k=topn)[1]
    result_embs = [emb_list[i] for i in index]
    return emb2song(result_embs)

In [16]:
change_artist("Radiohead", "Karma Police", "Metallica")

Unnamed: 0,song_id,artist_name,track_name
33926,3488825901,Pixies,Where Is My Mind?
67875,3488809975,Pixies,Dig For Fire
67879,3488809728,Pixies,Debaser
67918,4419517436,The Cure,Boys Don'T Cry
68411,4419524268,The Cure,Just Like Heaven
236522,3488835438,Pixies,Here Comes Your Man
241128,3488839745,Pixies,Monkey Gone To Heaven
241132,3488848790,Pixies,Wave Of Mutilation
241145,3488848435,Pixies,Velouria
244570,3488813029,Pixies,Hey


Let's note two interesting facts:
* The most popular song of Radiohead has been mapped to the most popular song of the Pixies
* Where is My Mind sounds similar to Karma Police, with a more rock style but that is left to interpretation