In [3]:
import numpy as np
import pandas as pd
import os
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
data_folder_path = "data"

In [5]:
info_df = pd.read_csv(os.path.join(data_folder_path, "id_information_mmsr.tsv"), delimiter='\t')
genres_df = pd.read_csv(os.path.join(data_folder_path, "id_genres_mmsr.tsv"), delimiter='\t')

tfidf_df = pd.read_csv(os.path.join(data_folder_path, "id_lyrics_tf-idf_mmsr.tsv"), delimiter='\t')
tfidf_array = tfidf_df.to_numpy()

word2vec_df = pd.read_csv(os.path.join(data_folder_path, "id_lyrics_word2vec_mmsr.tsv"), delimiter='\t')
word2vec_array = word2vec_df.to_numpy()

bert_df = pd.read_csv(os.path.join(data_folder_path, "id_bert_mmsr.tsv"), delimiter='\t')
bert_array = bert_df.to_numpy()

In [6]:
display(info_df.iloc[1:2])
display(genres_df.iloc[0:1])
display(tfidf_df.iloc[0:1])
display(word2vec_df.iloc[0:1])
display(bert_df.iloc[0:1])

Unnamed: 0,id,artist,song,album_name
1,0010xmHR6UICBOYT,Oddisee,After Thoughts,The Beauty in All


Unnamed: 0,id,genre
0,0009fFIM1eYThaPg,['pop']


Unnamed: 0,id,abl,accept,across,act,addict,afraid,age,ago,ah,...,yea,yeah,year,yellow,yes,yesterday,yet,yo,young,youth
0,9jbSytob9XRzwvB6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.150511


Unnamed: 0,id,0,1,2,3,4,5,6,7,8,...,290,291,292,293,294,295,296,297,298,299
0,0LiOzxBZ1aPLlFsK,0.031109,0.018026,0.022785,0.028802,-0.026084,-0.006278,0.030599,-0.041043,0.036703,...,-0.025845,0.010468,-0.047819,0.00562,-0.025106,-0.017939,-0.009981,-0.027846,0.0211,-0.020994


Unnamed: 0,id,0,1,2,3,4,5,6,7,8,...,758,759,760,761,762,763,764,765,766,767
0,9jbSytob9XRzwvB6,0.009225,0.041393,-0.003659,-0.0305,-0.006346,-0.019719,-0.075958,-0.003737,-0.000486,...,-0.037228,-0.027013,0.029167,0.009537,-0.010819,0.004054,-0.018671,0.012545,0.020696,-0.019794


In this first task, please implement a simple reusable(!) framework for text-based retrieval/similarity of music pieces (we will use the term “song” and “track” synonymously in the following). The input (query) is a song, more precisely its meta-data, i.e., artist and track name. The output of the system should be a list of songs that are similar to the query song. Throughout the practical part you will investigate various ways to define this similarity. For this first exercise, only consider textual representations of the song, in particular lyrics features (i.e., TF-IDF, word2vec, and BERT embeddings). Start with something simple (e.g., cosine similarity computed on TF-IDF vectors); and then add one additional variant (i.e., combination of song representation and similarity metric).

In [7]:
def filter_df(df, **args):
    if not args:
        raise AttributeError("**args required")
    query = ''
    for k, v in args.items():
        query += f"{k}=='{v}' and "
    query = query[:-5]
    return df.query(query)

In [8]:
example_artist = "Cheryl"
example_song = "Rain on Me"
example_embedding_df = bert_df
example_embedding_array = bert_array

In [9]:
example_id = filter_df(info_df, artist=example_artist, song=example_song).to_numpy()[0, 0]
print(f"example_song found by artist='{example_artist}' and song='{example_song}' --> id='{example_id}'")

example_song found by artist='Cheryl' and song='Rain on Me' --> id='0009fFIM1eYThaPg'


In [10]:
example_Y = filter_df(example_embedding_df, id=example_id).to_numpy().reshape(1, -1)
cs = cosine_similarity(X=example_embedding_array[:, 1:], Y=example_Y[:, 1:])
print("avg_similarity (all songs):", np.mean(cs))

avg_similarity (all songs): 0.3988439414994515


In [11]:
example_X = filter_df(info_df.merge(example_embedding_df), artist=example_artist).to_numpy()[:, 3:]
cs = cosine_similarity(X=example_X[:, 1:], Y=example_Y[:, 1:])
print("similarities (within example_artist):\n", cs)

similarities (within example_artist):
 [[1.        ]
 [0.45588829]
 [0.415266  ]
 [0.56247426]
 [0.52213525]
 [0.54025452]
 [0.58719805]
 [0.51434849]
 [0.48975459]
 [0.50849349]
 [0.40145441]
 [0.48441612]
 [0.45773557]
 [0.49066422]]


In [12]:
# usage: 
#   1. get query song id by using "filter_df" on merged info_df & embedding_df (filter by artist & song)
#   2. call top_k_similar with song id from step 1 

def top_k_similar(id, embedding_array, k=None, similarity_measure=cosine_similarity):
    similarities = similarity_measure(X=embedding_array[:, 1:],
                                      Y=embedding_array[embedding_array[:, 0] == id].reshape(1, -1)[:, 1:]).reshape(-1)
    most_similar = [[this_id, this_sim] for this_sim, this_id in sorted(zip(similarities, embedding_array[:, 0]))][::-1]
    if k is None:
        return most_similar[1:]
    return most_similar[1:k + 1]  # return top k but strip self

In [18]:
similar = top_k_similar(example_id, example_embedding_array, k=10, similarity_measure=cosine_similarity)
print(similar)

[['WAFIWIziIPINi0MC', 0.8294106047397072], ['lbTW6YAzKbARhB59', 0.8273032882018979], ['NGhsmk5BSCAoQijc', 0.8212638765322849], ['wXcMv63aWS4KEPm8', 0.8183482638952273], ['HFLuvJXc6SjcJt7d', 0.8109073542430317], ['mWPcgcOpMTMcuYa7', 0.7921297632371399], ['Af926lrdYuaRdEQe', 0.7910895566674847], ['qmI07MLHa0lNsIIY', 0.7895209748498319], ['aY7VhvjZJ0vLZX5F', 0.787393971912822], ['e6OyomkSZeHMNZOu', 0.7850501472782985]]


In [61]:
similar_id = list(next(zip(*similar)))


Unnamed: 0,similar,query_id
0,WAFIWIziIPINi0MC,0009fFIM1eYThaPg
1,lbTW6YAzKbARhB59,0009fFIM1eYThaPg
2,NGhsmk5BSCAoQijc,0009fFIM1eYThaPg
3,wXcMv63aWS4KEPm8,0009fFIM1eYThaPg
4,HFLuvJXc6SjcJt7d,0009fFIM1eYThaPg


In [63]:
df.query_id

0    0009fFIM1eYThaPg
1    0009fFIM1eYThaPg
2    0009fFIM1eYThaPg
3    0009fFIM1eYThaPg
4    0009fFIM1eYThaPg
5    0009fFIM1eYThaPg
6    0009fFIM1eYThaPg
7    0009fFIM1eYThaPg
8    0009fFIM1eYThaPg
9    0009fFIM1eYThaPg
Name: query_id, dtype: object

In [58]:
def get_genre(x):
    genre = genres_df[genres_df.id == x]["genre"].values
    return genre

In [185]:
import ast
def get_result_genre(x):
    df = pd.DataFrame()
    df["similar"] = similar_id
    df["query_id"] = [example_id for _ in range(len(df))]
    df["query_genre"] = df.query_id.apply(lambda x: ast.literal_eval(get_genre(x)[0])[0])
    df["similar_genre"] = df.similar.apply(lambda x: ast.literal_eval(get_genre(x)[0]))
    df["similar_genre_value"] = df[["query_genre", "similar_genre"]].apply(
        lambda x: 0 if x["query_genre"] in x["similar_genre"] else 1,
        axis=1)
    df.drop(["similar", "query_id"], axis=1, inplace=True)
    df["query_genre_value"] = [0 for _ in range(len(df))]
    return df

In [186]:
result = get_result_genre(similar_id)
re

Unnamed: 0,query_genre,similar_genre,similar_genre_value,query_genre_value
0,pop,"[pop, new wave, synthpop, rain, rock, pop rock...",0,0
1,pop,"[rock and roll, rockabilly, classic rock, rock...",0,0
2,pop,"[hard rock, classic rock, rock, blues rock, me...",1,0
3,pop,"[pop, classic rock, country, rain, rockabilly,...",0,0
4,pop,"[pop, rain]",0,0
5,pop,"[soul, smooth soul, quiet storm, rhythm and bl...",0,0
6,pop,"[rock, progressive rock, symphonic rock, class...",0,0
7,pop,"[pop, rock, alternative rock, pop rock, soft r...",0,0
8,pop,"[soul, rain, neo soul, singer songwriter, r b,...",1,0
9,pop,"[pop, rain, rock, singer songwriter, pop rock,...",0,0


In [209]:

def mean_reciprocal_rank(rs):
    """Score is reciprocal of the rank of the first relevant item
    First element is 'rank 1'.  Relevance is binary (nonzero is relevant).
    Example from http://en.wikipedia.org/wiki/Mean_reciprocal_rank
    >>> rs = [[0, 0, 1], [0, 1, 0], [1, 0, 0]]
    >>> mean_reciprocal_rank(rs)
    0.61111111111111105
    >>> rs = np.array([[0, 0, 0], [0, 1, 0], [1, 0, 0]])
    >>> mean_reciprocal_rank(rs)
    0.5
    >>> rs = [[0, 0, 0, 1], [1, 0, 0], [1, 0, 0]]
    >>> mean_reciprocal_rank(rs)
    0.75
    Args:
        rs: Iterator of relevance scores (list or numpy) in rank order
            (first element is the first item)
    Returns:
        Mean reciprocal rank
    """
    rs = (np.asarray(r).nonzero()[0] for r in rs)
    return np.mean([1. / (r[0] + 1) if r.size else 0. for r in rs])


def r_precision(r):
    """Score is precision after all relevant documents have been retrieved
    Relevance is binary (nonzero is relevant).
    >>> r = [0, 0, 1]
    >>> r_precision(r)
    0.33333333333333331
    >>> r = [0, 1, 0]
    >>> r_precision(r)
    0.5
    >>> r = [1, 0, 0]
    >>> r_precision(r)
    1.0
    Args:
        r: Relevance scores (list or numpy) in rank order
            (first element is the first item)
    Returns:
        R Precision
    """
    r = np.asarray(r) != 0
    z = r.nonzero()[0]
    if not z.size:
        return 0.
    return np.mean(r[:z[-1] + 1])

In [210]:
print("precision:",r_precision(result["similar_genre_value"]))

precision: 0.2222222222222222
