In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.metrics.pairwise import cosine_similarity, manhattan_distances
from tqdm import tqdm

In [2]:
data_folder_path = "data"

In [3]:
info_df = pd.read_csv(os.path.join(data_folder_path, "id_information_mmsr.tsv"), delimiter='\t')
genres_df = pd.read_csv(os.path.join(data_folder_path, "id_genres_mmsr.tsv"), delimiter='\t')

tfidf_df = pd.read_csv(os.path.join(data_folder_path, "id_lyrics_tf-idf_mmsr.tsv"), delimiter='\t')
tfidf_array = tfidf_df.to_numpy()

word2vec_df = pd.read_csv(os.path.join(data_folder_path, "id_lyrics_word2vec_mmsr.tsv"), delimiter='\t')
word2vec_array = word2vec_df.to_numpy()

bert_df = pd.read_csv(os.path.join(data_folder_path, "id_bert_mmsr.tsv"), delimiter='\t')
bert_array = bert_df.to_numpy()

In [4]:
display(info_df.iloc[1:2])
display(genres_df.iloc[0:1])
display(tfidf_df.iloc[0:1])
display(word2vec_df.iloc[0:1])
display(bert_df.iloc[0:1])

Unnamed: 0,id,artist,song,album_name
1,0010xmHR6UICBOYT,Oddisee,After Thoughts,The Beauty in All


Unnamed: 0,id,genre
0,0009fFIM1eYThaPg,['pop']


Unnamed: 0,id,abl,accept,across,act,addict,afraid,age,ago,ah,...,yea,yeah,year,yellow,yes,yesterday,yet,yo,young,youth
0,9jbSytob9XRzwvB6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.150511


Unnamed: 0,id,0,1,2,3,4,5,6,7,8,...,290,291,292,293,294,295,296,297,298,299
0,0LiOzxBZ1aPLlFsK,0.031109,0.018026,0.022785,0.028802,-0.026084,-0.006278,0.030599,-0.041043,0.036703,...,-0.025845,0.010468,-0.047819,0.00562,-0.025106,-0.017939,-0.009981,-0.027846,0.0211,-0.020994


Unnamed: 0,id,0,1,2,3,4,5,6,7,8,...,758,759,760,761,762,763,764,765,766,767
0,9jbSytob9XRzwvB6,0.009225,0.041393,-0.003659,-0.0305,-0.006346,-0.019719,-0.075958,-0.003737,-0.000486,...,-0.037228,-0.027013,0.029167,0.009537,-0.010819,0.004054,-0.018671,0.012545,0.020696,-0.019794


In this first task, please implement a simple reusable(!) framework for text-based retrieval/similarity of music pieces (we will use the term “song” and “track” synonymously in the following). The input (query) is a song, more precisely its meta-data, i.e., artist and track name. The output of the system should be a list of songs that are similar to the query song. Throughout the practical part you will investigate various ways to define this similarity. For this first exercise, only consider textual representations of the song, in particular lyrics features (i.e., TF-IDF, word2vec, and BERT embeddings). Start with something simple (e.g., cosine similarity computed on TF-IDF vectors); and then add one additional variant (i.e., combination of song representation and similarity metric).

In [5]:
from scipy import sparse


# I imported my LSA solution from another notebook. For our case we do not need the u and sigma_inv.

def LSA(matrix, k):
    # calculate the svd
    print("calculating svd, takes some time")
    u, s, vt = sparse.linalg.svds(matrix, k=k)
    # create m x k matrix
    s = np.diag(s)

    return u, np.linalg.inv(s), vt.T


def inference(matrix, u, sigma_inv):
    return (sigma_inv @ u.T @ matrix.T).T


ids = tfidf_df['id']
u, sigma_inv_1, tf_idf_small = LSA(tfidf_array[:, 1:].astype(np.float32).T, 10)
tf_idf_small = np.concatenate((ids[:, None], tf_idf_small), axis=1) #concatenate the id column again

ids = word2vec_df['id']
u, sigma_inv_1, word2vec_small = LSA(word2vec_array[:, 1:].astype(np.float32).T, 10)
word2vec_small = np.concatenate((ids[:, None], word2vec_small), axis=1) #concatenate the id column again

ids = bert_df['id']
u, sigma_inv_1, bert_small = LSA(bert_array[:, 1:].astype(np.float32).T, 10)
bert_small = np.concatenate((ids[:, None], bert_small), axis=1) #concatenate the id column again

calculating svd, takes some time


  tf_idf_small = np.concatenate((ids[:, None], tf_idf_small), axis=1) #concatenate the id column again


calculating svd, takes some time


  word2vec_small = np.concatenate((ids[:, None], word2vec_small), axis=1) #concatenate the id column again


calculating svd, takes some time


  bert_small = np.concatenate((ids[:, None], bert_small), axis=1) #concatenate the id column again


In [6]:
def filter_df(df, **args):
    if not args:
        raise AttributeError("**args required")
    query = ''
    for k, v in args.items():
        query += f"{k}=='{v}' and "
    query = query[:-5]  # Removing the last 'and' of our query
    return df.query(query)

In [7]:
example_artist = "Cheryl"
example_song = "Rain on Me"
example_embedding_df = bert_df
example_embedding_array = bert_array

In [8]:
example_id = filter_df(info_df, artist=example_artist, song=example_song).to_numpy()[0, 0]
print(f"example_song found by artist='{example_artist}' and song='{example_song}' --> id='{example_id}'")
example_Y = filter_df(example_embedding_df, id=example_id).to_numpy().reshape(1, -1)
cs = cosine_similarity(X=example_embedding_array[:, 1:], Y=example_Y[:, 1:])
print("avg_similarity (all songs):", np.mean(cs))
example_X = filter_df(info_df.merge(example_embedding_df), artist=example_artist).to_numpy()[:, 3:]
cs = cosine_similarity(X=example_X[:, 1:], Y=example_Y[:, 1:])
print("similarities (within example_artist):\n", cs)

example_song found by artist='Cheryl' and song='Rain on Me' --> id='0009fFIM1eYThaPg'
avg_similarity (all songs): 0.3988439414994516
similarities (within example_artist):
 [[1.        ]
 [0.45588829]
 [0.415266  ]
 [0.56247426]
 [0.52213525]
 [0.54025452]
 [0.58719805]
 [0.51434849]
 [0.48975459]
 [0.50849349]
 [0.40145441]
 [0.48441612]
 [0.45773557]
 [0.49066422]]


In [9]:
# usage: 
#   1. get query song id by using "filter_df" on merged info_df & embedding_df (filter by artist & song)
#   2. call top_k_similar with song id from step 1 

def top_k_similar(id, embedding_array, k=None, similarity_measure=cosine_similarity):
    """
    Return top k similar songs and their scores given an embedding and similarity meassure
    :param  id: song id
            embedding_array: embedding array to base similarities on
            k: the number of most similar songs to return
            similarity_measure: a function which returns the pairwise similarity of data points
    :return: list of top k songs and their scores
    """
    Y = embedding_array[embedding_array[:, 0] == id].reshape(1, -1)[:, 1:]
    if Y.shape == (1, 0):
        return None
    similarities = similarity_measure(X=embedding_array[:, 1:],
                                      Y=Y).reshape(-1)
    most_similar = [[this_id, this_sim] for this_sim, this_id in sorted(zip(similarities, embedding_array[:, 0]))][::-1]
    if k is None:
        return most_similar[1:]
    return most_similar[1:k + 1] 

In [10]:
similar = top_k_similar(example_id, tf_idf_small, k=10, similarity_measure=cosine_similarity)

In [11]:
def get_genre(song_id_):
    genre = genres_df[genres_df.id == song_id_]["genre"].values
    return genre

In [12]:
import ast


def get_result_genre(song_ids_):
    """
    Converts the list of ids to a usable result dataframe

    :param song_ids_: list of ids
    :return: dataframe with query_genre and similar_genres
    """
    df = pd.DataFrame()
    df["similar"] = song_ids_
    df["query_id"] = [example_id for _ in range(len(df))]
    df["query_genre"] = df.query_id.apply(lambda x: ast.literal_eval(get_genre(x)[0])[0])
    df["similar_genre"] = df.similar.apply(lambda x: ast.literal_eval(get_genre(x)[0]))
    # Calculate if the genre is in the query genre
    df["similar_genre_value"] = df[["query_genre", "similar_genre"]].apply(
        lambda x: 1 if x["query_genre"] in x["similar_genre"] else 0,
        axis=1)
    df.drop(["similar", "query_id"], axis=1, inplace=True)
    df["query_genre_value"] = [0 for _ in range(len(df))]
    return df

In [13]:
def mean_reciprocal_rank(result_df, relevant_col="similar_genre"):
    """
    Calculates the mean reciprocal rank of the result dataframe

    :param result_df: dataframe with query_genre and similar_genres
    :return: mean reciprocal rank
    """
    sum = 0
    for i, row in result_df.iterrows():
        counter = 0
        for genre in row[relevant_col]:
            counter += 1
            if row["query_genre"] == genre:
                sum += 1 / counter
                break

    return sum / len(result_df)

In [14]:
def precision(df, relevant_col="similar_genre_value"):
    """
    Calculates the r-precision of the result dataframe

    :param df: dataframe with query_genre and similar_genres
    """
    return df[relevant_col].sum() / len(df)

In [15]:
# define ndcg function
def ndcg(result_df, relevant_col="similar_genre_value"):
    """
    Calculates the ndcg score for a given result vector

    :param result_vector: vector of results
    :return: ndcg score
    """
    # Calculate the dcg
    dcg = 0
    for i in range(len(result_df)):
        dcg += result_df[relevant_col][i] / np.log2(i + 2)

    # Calculate the idcg
    idcg = 0
    for i in range(len(result_df)):
        idcg += 1 / np.log2(i + 2)
    return dcg / idcg


In [16]:
similar_id = list(next(zip(*similar)))

result = get_result_genre(similar_id)

print("precision:", precision(result))
print("mrr:", mean_reciprocal_rank(result))
print("ndcg:", ndcg(result))

precision: 0.5
mrr: 0.20416666666666666
ndcg: 0.5868924718493931


# Evaulation Framework

In [17]:
# Because running all those functions is tedious we add a framework

import time

def recommender(artist=None, song=None, example_id=None, embedding_array=tf_idf_small, k=10,
                similarity_measure=cosine_similarity,
                only_stat=False):
    """
    Framework to run the recommender system

    :param example_artist: artist of the query song
    :param example_song: song of the query song
    :param k: number of similar songs
    :param similarity_measure: similarity measure to use
    :return: result dataframe
    """
    # Filter the dataframe to get the song id
    if example_id is None:
        example_id = filter_df(info_df, artist=artist, song=song).to_numpy()[0, 0]
    # Get the top k similar songs
    similar_k = top_k_similar(example_id, embedding_array, k=k, similarity_measure=similarity_measure)

    # IF the song is not in the embedding array return None
    if similar_k == None:
        return [0, 0, 0]
    similar_id = list(next(zip(*similar_k)))
    # Get the result dataframe
    result = get_result_genre(similar_id)

    if only_stat == True:
        return np.array([precision(result), mean_reciprocal_rank(result), ndcg(result)])

    stat = {"precision": precision(result),
            "mrr": mean_reciprocal_rank(result),
            "ndcg": ndcg(result)}

    return result, stat, similar_id


In [18]:

result, stat, similar_id = recommender("Elton John", "Nikita", k=10, similarity_measure=cosine_similarity)
print(stat)
display(info_df.where(info_df.id.isin(similar_id)).dropna())

{'precision': 0.6, 'mrr': 0.3142857142857143, 'ndcg': 0.5501126236272343}


Unnamed: 0,id,artist,song,album_name
3155,2Tud1sKdzCfsgNuO,AFI,Hidden Knives,AFI (The Blood Album)
6371,54aJymX23XS3azJ3,Madeon,Beings,Adventure (Deluxe)
6397,569DDnQOSEyuJug9,Coldplay,Trouble,Parachutes
25408,KgU8jEngGmoWL33H,Blonde Redhead,Hated Because of Great Qualities,Melody of Certain Damaged Lemons
33332,R5kPPdlImSImgJXZ,Kelly Clarkson,I Hate Myself for Losing You,Breakaway
45785,bML5G3n9zcOK3V35,Glee Cast,Sweet Caroline,"Glee: The Music, Volume 1"
47053,cNylkOoSfpQ0Nnax,Townes Van Zandt,For The Sake of The Song,Townes Van Zandt
57037,kVMIjrvZHp021nv1,The Rolling Stones,The Last Time,Out Of Our Heads
69104,uIwvTI9FgpVE5jMZ,Girls Aloud,100 Different Ways,The Collection - Studio Albums / B Sides / Live
71771,wS29hpE8uaEO3y3t,Beach House,Wedding Bell,Devotion


I am using a breakpoint as "training set". Our evaluation framework seems to have an okay score :).

In [19]:
def recommender_evaluation_framework(embedding_array=tf_idf_small, k=10, similarity_measure=cosine_similarity, breakpoint=False):
    """
    Framework to evaluate the recommender system

    :param k: number of similar songs
    :param similarity_measure: similarity measure to use
    :return: result dataframe
    """
    stat = [0, 0, 0]
    for index, row in tqdm(info_df.iterrows(), total=len(info_df)):
        stat += recommender(example_id=row["id"], embedding_array=embedding_array, k=k, similarity_measure=similarity_measure, only_stat=True)
        if index == breakpoint:
            return stat / breakpoint

    return sum(stat) / len(info_df)

In [22]:
embedding_arrays = {
    "tf_idf_small": tf_idf_small,
    "word2vec_small": word2vec_small,
    "bert_small": bert_small
}

similarity_measures = {
    "cosine_similarity": cosine_similarity,
    "manhattan_distances": manhattan_distances,
}

result = []
for embedding_array_name in embedding_arrays.keys():
    for similarity_measure_name in similarity_measures.keys():
        stat = [embedding_array_name, similarity_measure_name]
        stat.extend(recommender_evaluation_framework(embedding_array=embedding_arrays[embedding_array_name],
                                                        k=10,
                                                        similarity_measure=similarity_measures[similarity_measure_name],
                                                        breakpoint=300))
        result.append(stat)
result = pd.DataFrame(data=result, columns=["embedding", "smilarity_measure", "precision", "mmr", "ndcg"])
display(result)

  0%|          | 300/76115 [01:10<4:55:47,  4.27it/s]
  0%|          | 300/76115 [01:09<4:53:06,  4.31it/s]
  0%|          | 300/76115 [01:09<4:54:40,  4.29it/s]
  0%|          | 300/76115 [01:07<4:45:51,  4.42it/s]
  0%|          | 300/76115 [01:09<4:54:04,  4.30it/s]
  0%|          | 300/76115 [01:08<4:50:36,  4.35it/s]


Unnamed: 0,embedding,smilarity_measure,precision,mmr,ndcg
0,tf_idf_small,cosine_similarity,0.416,0.21726,0.42129
1,tf_idf_small,manhattan_distances,0.550667,0.276557,0.552038
2,word2vec_small,cosine_similarity,0.446667,0.239571,0.45531
3,word2vec_small,manhattan_distances,0.397667,0.099297,0.411521
4,bert_small,cosine_similarity,0.447,0.244816,0.45262
5,bert_small,manhattan_distances,0.232333,0.142178,0.220981


In [23]:
result = []
for embedding_array_name in embedding_arrays.keys():
    for similarity_measure_name in similarity_measures.keys():
        stat = [embedding_array_name, similarity_measure_name]
        stat.extend(recommender_evaluation_framework(embedding_array=embedding_arrays[embedding_array_name],
                                                     k=100,
                                                     similarity_measure=similarity_measures[similarity_measure_name],
                                                     breakpoint=300))
        result.append(stat)
result = pd.DataFrame(data=result, columns=["embedding", "smilarity_measure", "precision", "mmr", "ndcg"])
display(result)

  0%|          | 300/76115 [04:47<20:10:21,  1.04it/s]
  0%|          | 300/76115 [04:42<19:50:30,  1.06it/s]
  0%|          | 300/76115 [04:42<19:50:17,  1.06it/s]
  0%|          | 300/76115 [04:41<19:47:42,  1.06it/s]
  0%|          | 300/76115 [04:42<19:50:57,  1.06it/s]
  0%|          | 300/76115 [04:41<19:44:57,  1.07it/s]


Unnamed: 0,embedding,smilarity_measure,precision,mmr,ndcg
0,tf_idf_small,cosine_similarity,0.4102,0.21498,0.410861
1,tf_idf_small,manhattan_distances,0.6145,0.347977,0.601449
2,word2vec_small,cosine_similarity,0.426867,0.230146,0.43033
3,word2vec_small,manhattan_distances,0.439167,0.187436,0.428085
4,bert_small,cosine_similarity,0.428133,0.231117,0.431133
5,bert_small,manhattan_distances,0.262433,0.146887,0.2554
