In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
data_folder_path = "data"

In [3]:
info_df = pd.read_csv(os.path.join(data_folder_path, "id_information_mmsr.tsv"), delimiter='\t')
genres_df = pd.read_csv(os.path.join(data_folder_path, "id_genres_mmsr.tsv"), delimiter='\t')

tfidf_df = pd.read_csv(os.path.join(data_folder_path, "id_lyrics_tf-idf_mmsr.tsv"), delimiter='\t')
tfidf_array = tfidf_df.to_numpy()

word2vec_df = pd.read_csv(os.path.join(data_folder_path, "id_lyrics_word2vec_mmsr.tsv"), delimiter='\t')
word2vec_array = word2vec_df.to_numpy()

bert_df = pd.read_csv(os.path.join(data_folder_path, "id_bert_mmsr.tsv"), delimiter='\t')
bert_array = bert_df.to_numpy()

In [4]:
display(info_df.iloc[1:2])
display(genres_df.iloc[0:1])
display(tfidf_df.iloc[0:1])
display(word2vec_df.iloc[0:1])
display(bert_df.iloc[0:1])

Unnamed: 0,id,artist,song,album_name
1,0010xmHR6UICBOYT,Oddisee,After Thoughts,The Beauty in All


Unnamed: 0,id,genre
0,0009fFIM1eYThaPg,['pop']


Unnamed: 0,id,abl,accept,across,act,addict,afraid,age,ago,ah,...,yea,yeah,year,yellow,yes,yesterday,yet,yo,young,youth
0,9jbSytob9XRzwvB6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.150511


Unnamed: 0,id,0,1,2,3,4,5,6,7,8,...,290,291,292,293,294,295,296,297,298,299
0,0LiOzxBZ1aPLlFsK,0.031109,0.018026,0.022785,0.028802,-0.026084,-0.006278,0.030599,-0.041043,0.036703,...,-0.025845,0.010468,-0.047819,0.00562,-0.025106,-0.017939,-0.009981,-0.027846,0.0211,-0.020994


Unnamed: 0,id,0,1,2,3,4,5,6,7,8,...,758,759,760,761,762,763,764,765,766,767
0,9jbSytob9XRzwvB6,0.009225,0.041393,-0.003659,-0.0305,-0.006346,-0.019719,-0.075958,-0.003737,-0.000486,...,-0.037228,-0.027013,0.029167,0.009537,-0.010819,0.004054,-0.018671,0.012545,0.020696,-0.019794


In this first task, please implement a simple reusable(!) framework for text-based retrieval/similarity of music pieces (we will use the term “song” and “track” synonymously in the following). The input (query) is a song, more precisely its meta-data, i.e., artist and track name. The output of the system should be a list of songs that are similar to the query song. Throughout the practical part you will investigate various ways to define this similarity. For this first exercise, only consider textual representations of the song, in particular lyrics features (i.e., TF-IDF, word2vec, and BERT embeddings). Start with something simple (e.g., cosine similarity computed on TF-IDF vectors); and then add one additional variant (i.e., combination of song representation and similarity metric).

In [5]:
def filter_df(df, **args):
    if not args:
        raise AttributeError("**args required")
    query = ''
    for k, v in args.items():
        query += f"{k}=='{v}' and "
    query = query[:-5]  # Removing the last 'and' of our query
    return df.query(query)

In [6]:
example_artist = "Cheryl"
example_song = "Rain on Me"
example_embedding_df = bert_df
example_embedding_array = bert_array

In [7]:
example_id = filter_df(info_df, artist=example_artist, song=example_song).to_numpy()[0, 0]
print(f"example_song found by artist='{example_artist}' and song='{example_song}' --> id='{example_id}'")

example_song found by artist='Cheryl' and song='Rain on Me' --> id='0009fFIM1eYThaPg'


In [8]:
example_Y = filter_df(example_embedding_df, id=example_id).to_numpy().reshape(1, -1)
cs = cosine_similarity(X=example_embedding_array[:, 1:], Y=example_Y[:, 1:])
print("avg_similarity (all songs):", np.mean(cs))

avg_similarity (all songs): 0.3988439414994516


In [9]:
example_X = filter_df(info_df.merge(example_embedding_df), artist=example_artist).to_numpy()[:, 3:]
cs = cosine_similarity(X=example_X[:, 1:], Y=example_Y[:, 1:])
print("similarities (within example_artist):\n", cs)

similarities (within example_artist):
 [[1.        ]
 [0.45588829]
 [0.415266  ]
 [0.56247426]
 [0.52213525]
 [0.54025452]
 [0.58719805]
 [0.51434849]
 [0.48975459]
 [0.50849349]
 [0.40145441]
 [0.48441612]
 [0.45773557]
 [0.49066422]]


In [10]:
# usage: 
#   1. get query song id by using "filter_df" on merged info_df & embedding_df (filter by artist & song)
#   2. call top_k_similar with song id from step 1 

def top_k_similar(id, embedding_array, k=None, similarity_measure=cosine_similarity):
    similarities = similarity_measure(X=embedding_array[:, 1:],
                                      Y=embedding_array[embedding_array[:, 0] == id].reshape(1, -1)[:, 1:]).reshape(-1)
    most_similar = [[this_id, this_sim] for this_sim, this_id in sorted(zip(similarities, embedding_array[:, 0]))][::-1]
    if k is None:
        return most_similar[1:]
    return most_similar[1:k + 1]  # return top k but strip self

In [11]:
similar = top_k_similar(example_id, example_embedding_array, k=10, similarity_measure=cosine_similarity)
print(similar)

[['WAFIWIziIPINi0MC', 0.8294106047397072], ['lbTW6YAzKbARhB59', 0.827303288201898], ['NGhsmk5BSCAoQijc', 0.8212638765322852], ['wXcMv63aWS4KEPm8', 0.8183482638952274], ['HFLuvJXc6SjcJt7d', 0.810907354243032], ['mWPcgcOpMTMcuYa7', 0.7921297632371401], ['Af926lrdYuaRdEQe', 0.7910895566674848], ['qmI07MLHa0lNsIIY', 0.7895209748498319], ['aY7VhvjZJ0vLZX5F', 0.7873939719128221], ['e6OyomkSZeHMNZOu', 0.7850501472782984]]


In [12]:
similar_id = list(next(zip(*similar)))


In [13]:
def get_genre(song_id_):
    genre = genres_df[genres_df.id == song_id_]["genre"].values
    return genre

In [46]:
import ast


def get_result_genre(song_ids_):
    """
    Converts the list of ids to a usable result dataframe

    :param song_ids_: list of ids
    :return: dataframe with query_genre and similar_genres
    """
    df = pd.DataFrame()
    df["similar"] = song_ids_
    df["query_id"] = [example_id for _ in range(len(df))]
    df["query_genre"] = df.query_id.apply(lambda x: ast.literal_eval(get_genre(x)[0])[0])
    df["similar_genre"] = df.similar.apply(lambda x: ast.literal_eval(get_genre(x)[0]))
    # Calculate if the genre is in the query genre
    df["similar_genre_value"] = df[["query_genre", "similar_genre"]].apply(
        lambda x: 1 if x["query_genre"] in x["similar_genre"] else 0,
        axis=1)
    df.drop(["similar", "query_id"], axis=1, inplace=True)
    df["query_genre_value"] = [0 for _ in range(len(df))]
    return df



In [47]:
result = get_result_genre(similar_id)

In [57]:
def mean_reciprocal_rank(result_df, relevant_col="similar_genre"):
    """
    Calculates the mean reciprocal rank of the result dataframe

    :param result_df: dataframe with query_genre and similar_genres
    :return: mean reciprocal rank
    """
    sum = 0
    for i, row in result_df.iterrows():
        counter = 0
        for genre in row[relevant_col]:
            counter += 1
            if row["query_genre"] == genre:
                sum += 1 / counter
                break

    return sum / len(result_df)


In [58]:
def precision(df, relevant_col="similar_genre_value"):
    """
    Calculates the r-precision of the result dataframe

    :param df: dataframe with query_genre and similar_genres
    """
    return df[relevant_col].sum() / len(df)

In [59]:
# define ndcg function
def ndcg(result_df, relevant_col="similar_genre_value"):
    """
    Calculates the ndcg score for a given result vector

    :param result_vector: vector of results
    :return: ndcg score
    """
    # Calculate the dcg
    dcg = 0
    for i in range(len(result_df)):
        dcg += result_df[relevant_col][i] / np.log2(i + 2)

    # Calculate the idcg
    idcg = 0
    for i in range(len(result_df)):
        idcg += 1 / np.log2(i + 2)
    return dcg / idcg


In [61]:
print("precision:", precision(result))
print("mrr:", mean_reciprocal_rank(result))
print("ndcg:", ndcg(result))

precision: 0.8
mrr: 0.5443589743589744
ndcg: 0.8236998933965709


# Basic setup guide
Furthermore, serves as an example on how to use the functions above.

In [65]:
# Because running all those functions is tedious we add a framework
def recommender_framework(example_artist, example_song, k=10, similarity_measure=cosine_similarity, display_songs=False,
                          evaluation=True):
    """
    Framework to run the recommender system

    :param example_artist: artist of the query song
    :param example_song: song of the query song
    :param k: number of similar songs
    :param similarity_measure: similarity measure to use
    :return: result dataframe
    """
    # Filter the dataframe to get the song id
    example_id = filter_df(info_df, artist=example_artist, song=example_song).to_numpy()[0, 0]
    # Get the top k similar songs
    similar_k = top_k_similar(example_id, example_embedding_array, k=k, similarity_measure=similarity_measure)
    similar_id = list(next(zip(*similar_k)))
    # Get the result dataframe
    result = get_result_genre(similar_id)
    if display_songs:
        display(info_df.where(info_df.id.isin(similar_id)).dropna())
    if evaluation:
        print("precision:", precision(result))
        print("ndcg:", ndcg(result))
        print("mrr:", mean_reciprocal_rank(result))
    return result

In [66]:
recommender_framework("Elton John", "Nikita", k=10, similarity_measure=cosine_similarity)

precision: 0.4
ndcg: 0.5423640154200349
mrr: 0.27


Unnamed: 0,query_genre,similar_genre,similar_genre_value,query_genre_value
0,pop,"[soul, funk, rock, neo soul, pop, alternative ...",1,0
1,pop,"[rock, pop, new wave, celtic, alternative rock...",1,0
2,pop,"[pop, soul, r b, rock, singer songwriter, down...",1,0
3,pop,"[gothic rock, industrial]",0,0
4,pop,"[alternative rock, hard rock, rock, alternativ...",0,0
5,pop,"[metal, nwobhm, rock, hard rock, classic rock,...",0,0
6,pop,[pop],1,0
7,pop,[shoegaze],0,0
8,pop,"[metal, nwobhm, power metal, rock, hard rock, ...",0,0
9,pop,"[metalcore, math rock, hardcore, emo, experime...",0,0
