# Read and Explore Data

In [1]:
from typing import List, Union, Callable
from dataclasses import dataclass

import pandas as pd
import numpy as np
import random

In [2]:
def read(feature, h=None):
    """
    Load the tab-separated file ``data/id_<feature>_mmsr.tsv`` into a DataFrame.

    Parameters:
    - feature: Name fragment inserted into the file name.
    - h: Forwarded to ``pd.read_csv`` as ``header`` (None means no header row).

    Returns:
    - DataFrame with the parsed file contents.
    """
    tsv_path = f"data/id_{feature}_mmsr.tsv"
    return pd.read_csv(tsv_path, sep="\t", header=h)


def embed_and_merge(df1, df2, col_name):
    """
    Collapse the feature columns of ``df2`` into one vector column and left-join it onto ``df1``.

    Every column of ``df2`` except "id" is treated as one dimension of the feature
    vector. The vectors are stored as float numpy arrays in a single column named
    ``col_name``. ``df2`` itself is left untouched, so the raw feature frame can be
    reused (or the cell re-run) after this call.

    Parameters:
    - df1: Left frame; must contain an "id" column.
    - df2: Feature frame; must contain an "id" column plus the feature columns.
    - col_name: Name of the new vector column in the result.

    Returns:
    - ``df1`` with the additional ``col_name`` column (left join on "id").
    """
    embedding_cols = df2.columns.difference(["id"], sort=False)
    # One float array per row, built from all non-id columns in their original order.
    vectors = df2[embedding_cols].apply(lambda row: np.array(row, dtype=float), axis=1)
    # Build a fresh frame instead of adding/dropping columns on the caller's object.
    collapsed = df2.drop(columns=embedding_cols).assign(**{col_name: vectors})
    return pd.merge(df1, collapsed, left_on="id", right_on="id", how="left")

In [3]:
# Song metadata table; the first row of the file is used as the header.
df = read("information", h=0)

In [4]:
# Load the BERT lyrics embedding and attach it to df as one vector column.
bert = read("lyrics_bert", h=0)
df = embed_and_merge(df1=df, df2=bert, col_name="bert_embedding")

In [5]:
# Load the word2vec lyrics embedding and attach it to df as one vector column.
word2vec = read("lyrics_word2vec", h=0)
df = embed_and_merge(df1=df, df2=word2vec, col_name="word2vec_embedding")

In [6]:
# Load the tf-idf term weights and attach them to df as one vector column.
tfidf_weighting = read("lyrics_tf-idf", h=0)
df = embed_and_merge(df1=df, df2=tfidf_weighting, col_name="tf-idf")

In [7]:
df

Unnamed: 0,id,artist,song,album_name,bert_embedding,word2vec_embedding,tf-idf
0,01Yfj2T3YTwJ1Yfy,We As Human,Take The Bullets Away (feat. Lacey Sturm),We As Human,"[0.0302475523203611, 0.0352500043809413, 0.010...","[0.0193592727054678, 0.0232394714425702, 0.028...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0818293914712727, ..."
1,01gyRHLquwXDlhkO,The Notorious B.I.G.,Somebody's Gotta Die,Life After Death (Remastered Edition),"[0.0084422621876001, 0.0302564185112714, 0.009...","[0.018537292381979, 0.0113115924403394, 0.0107...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,01rMxQv6vhyE1oQX,Against the Current,Chasing Ghosts,In Our Bones,"[0.0490818135440349, 0.0148476688191294, 0.001...","[0.0227837218553759, 0.0231641749730655, 0.012...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,02RGE9FNH65RtMS7,Barthezz,Infected,Trance - The Early Years (1997-2002),"[0.0445394963026046, 0.0214906893670558, 0.013...","[0.0381116103401342, 0.0278804157207017, 0.016...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,02ZnlCGZEbkfCDxo,Laura Pausini,Tra Te E Il Mare,The Best of Laura Pausini - E Ritorno Da Te,"[0.0514551289379596, 0.0297695714980363, -0.01...","[0.0182936789026777, -0.0064870788035669, 0.00...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.2413163920156013, ..."
...,...,...,...,...,...,...,...
10090,zyzILCQvVeUFIINi,Crowded House,When You Come,Temple Of Low Men,"[0.006713552866131, 0.0480893477797508, -0.001...","[0.0195101330379449, 0.0236336907562543, 0.011...","[0.0, 0.0, 0.079623055470056, 0.0, 0.0, 0.0, 0..."
10091,zzgS4ZqyswamEWNj,Britney Spears,My Only Wish (This Year),Platinum Christmas,"[0.0098905526101589, 0.0401467233896255, -0.02...","[0.0268563718791583, 0.0082648759004199, 0.011...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
10092,zzoFYDMlqU1X2zz1,Thundercat,DUI,Drunk,"[0.0101165119558572, 0.0388841480016708, -0.01...","[0.0051499218912795, 0.0028818239457905, 0.017...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
10093,zzpkRCGA5ud8q4mv,Otis Redding,Rock Me Baby,Otis Blue,"[-0.0166116580367088, 0.0266939438879489, -0.0...","[0.0370260450523346, 0.0159991827379498, -0.00...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10095 entries, 0 to 10094
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   id                  10095 non-null  object
 1   artist              10095 non-null  object
 2   song                10095 non-null  object
 3   album_name          10095 non-null  object
 4   bert_embedding      10095 non-null  object
 5   word2vec_embedding  10095 non-null  object
 6   tf-idf              10095 non-null  object
dtypes: object(7)
memory usage: 630.9+ KB


In [9]:
df

Unnamed: 0,id,artist,song,album_name,bert_embedding,word2vec_embedding,tf-idf
0,01Yfj2T3YTwJ1Yfy,We As Human,Take The Bullets Away (feat. Lacey Sturm),We As Human,"[0.0302475523203611, 0.0352500043809413, 0.010...","[0.0193592727054678, 0.0232394714425702, 0.028...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0818293914712727, ..."
1,01gyRHLquwXDlhkO,The Notorious B.I.G.,Somebody's Gotta Die,Life After Death (Remastered Edition),"[0.0084422621876001, 0.0302564185112714, 0.009...","[0.018537292381979, 0.0113115924403394, 0.0107...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,01rMxQv6vhyE1oQX,Against the Current,Chasing Ghosts,In Our Bones,"[0.0490818135440349, 0.0148476688191294, 0.001...","[0.0227837218553759, 0.0231641749730655, 0.012...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,02RGE9FNH65RtMS7,Barthezz,Infected,Trance - The Early Years (1997-2002),"[0.0445394963026046, 0.0214906893670558, 0.013...","[0.0381116103401342, 0.0278804157207017, 0.016...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,02ZnlCGZEbkfCDxo,Laura Pausini,Tra Te E Il Mare,The Best of Laura Pausini - E Ritorno Da Te,"[0.0514551289379596, 0.0297695714980363, -0.01...","[0.0182936789026777, -0.0064870788035669, 0.00...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.2413163920156013, ..."
...,...,...,...,...,...,...,...
10090,zyzILCQvVeUFIINi,Crowded House,When You Come,Temple Of Low Men,"[0.006713552866131, 0.0480893477797508, -0.001...","[0.0195101330379449, 0.0236336907562543, 0.011...","[0.0, 0.0, 0.079623055470056, 0.0, 0.0, 0.0, 0..."
10091,zzgS4ZqyswamEWNj,Britney Spears,My Only Wish (This Year),Platinum Christmas,"[0.0098905526101589, 0.0401467233896255, -0.02...","[0.0268563718791583, 0.0082648759004199, 0.011...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
10092,zzoFYDMlqU1X2zz1,Thundercat,DUI,Drunk,"[0.0101165119558572, 0.0388841480016708, -0.01...","[0.0051499218912795, 0.0028818239457905, 0.017...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
10093,zzpkRCGA5ud8q4mv,Otis Redding,Rock Me Baby,Otis Blue,"[-0.0166116580367088, 0.0266939438879489, -0.0...","[0.0370260450523346, 0.0159991827379498, -0.00...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


# Define RS and functions

In [10]:
def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """
    Cosine of the angle between two vectors.

    Parameters:
    - a, b: Vectors of equal length.

    Returns:
    - dot(a, b) / (||a|| * ||b||), or 0.0 when either vector has zero norm.
      Without that guard the division yields NaN, and NaN scores end up at the
      top of a descending ``np.argsort`` ranking.
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0:
        # The angle to a zero vector is undefined; treat it as "no similarity".
        return 0.0
    return np.dot(a, b) / norm_product

In [11]:
def dot_product(a: np.ndarray, b: np.ndarray) -> float:
    """
    Unnormalized inner product of two vectors.

    Parameters:
    - a, b: Vectors of equal length.

    Returns:
    - The result of ``np.dot(a, b)``.
    """
    inner = np.dot(a, b)
    return inner

In [12]:
def manhattan_distance(a: np.ndarray, b: np.ndarray) -> float:
    """
    L1 distance between two vectors.

    Parameters:
    - a, b: Arrays of equal shape.

    Returns:
    - Sum of the absolute element-wise differences.
    """
    abs_diff = np.abs(a - b)
    return abs_diff.sum()

In [13]:
def euclidean_distance(a: np.ndarray, b: np.ndarray) -> float:
    """
    L2 distance between two vectors.

    Parameters:
    - a, b: Arrays of equal shape.

    Returns:
    - ``np.linalg.norm`` of the element-wise difference.
    """
    difference = a - b
    return np.linalg.norm(difference)

In [14]:
def random_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """
    Random score that ignores both inputs.

    Parameters:
    - a, b: Unused; present so the signature matches the other metrics.

    Returns:
    - A value drawn with ``random.uniform(0, 1)`` from the ``random`` module's
      global generator.
    """
    lower, upper = 0, 1
    return random.uniform(lower, upper)

In [15]:
@dataclass
class SongInfo:
    """Query key that identifies a song by its title and artist."""

    # Song title; RetrievalSystem.retrieve compares it against the "song" column.
    title: str
    # Artist name; RetrievalSystem.retrieve compares it against the "artist" column.
    artist: str


class RetrievalSystem:
    """
    Retrieve the songs most similar to a query song, using one feature column.

    The feature column must hold one equal-length vector per row. Similarity is
    computed with ``sim_metric`` and results are ranked in descending order, so
    ``sim_metric`` must return larger values for more similar songs.
    """

    def __init__(
            self,
            df: pd.DataFrame,
            sim_metric: Callable = cosine_similarity,
            sim_feature: str = "bert_embedding",
            enable_cache: bool = True
    ):
        """
        Parameters:
        - df: Song table with at least the columns "id", "artist", "song" and ``sim_feature``.
        - sim_metric: Callable ``(query_vector, other_vector) -> score``.
        - sim_feature: Name of the column holding the feature vectors.
        - enable_cache: If True, results of ``retrieve`` are cached per song id.

        Raises:
        - ValueError: If ``sim_feature`` is not a column of ``df``.
        """
        self.df = df
        self.sim_metric = sim_metric
        self.sim_feature = sim_feature

        if self.sim_feature not in self.df.columns:
            raise ValueError(
                f"'{self.sim_feature}' not found in the dataframe columns."
            )

        # Precompute the stacked version of the feature
        # array(list(array)) -> 2d array
        self.all_songs_stacked = np.vstack(self.df[self.sim_feature].values)

        self.cache_enabled = enable_cache
        self.cache = {}

    def _calc_similarity(self, song: pd.Series, n: int = 5) -> pd.DataFrame:
        """
        Calculate the similarity of the given song with all songs in the dataset.

        Rows whose feature vector equals the query vector (the query itself and
        exact duplicates) are never returned, and neither are rows whose score is
        NaN. Fewer than n rows are returned if fewer candidates remain.

        Parameters:
        - song: Row of the dataframe representing the query song.
        - n: Number of top similar songs to retrieve.

        Returns:
        - Copy of the top n rows of the dataframe with an added "similarity" column.
        """
        song_vector = song[self.sim_feature]
        # -inf cannot collide with a real score, unlike -1, which is a valid
        # cosine or dot-product value.
        excluded = -np.inf

        similarity = np.array(
            [
                excluded
                if np.array_equal(song_vector, song_vec)
                else self.sim_metric(song_vector, song_vec)
                for song_vec in self.all_songs_stacked
            ],
            dtype=float,
        )
        # np.argsort places NaN last, so a reversed ranking would list NaN scores
        # first; treat undefined scores as excluded instead.
        similarity[np.isnan(similarity)] = excluded

        ranked = np.argsort(similarity)[::-1]
        ranked = ranked[similarity[ranked] > excluded]
        top_n_indices = ranked[:n]

        # make pandas happy: no in-place modification
        top_n = self.df.iloc[top_n_indices].copy()
        top_n["similarity"] = similarity[top_n_indices]
        return top_n

    def random_baseline(self, query: Union[int, str, SongInfo], n: int = 5) -> pd.DataFrame:
        """
        Retrieve random songs from the dataset.

        Parameters:
        - query: Accepted for interface parity with ``retrieve``; not used in this method.
        - n: Number of songs to retrieve.

        Returns:
        - DataFrame of n randomly sampled songs without the feature columns.
        """
        rand_n = self.df.sample(n=n)
        return self._remove_embeddings(rand_n)

    @staticmethod
    def _remove_embeddings(df: pd.DataFrame) -> pd.DataFrame:
        """
        Do not return columns with "embedding" or "tf-idf" in the name

        Args:
            df (pd.DataFrame): Dataframe to remove columns from

        Returns:
            pd.DataFrame: Dataframe without embedding and tf-idf columns,
            with a fresh 0..len-1 index
        """
        return df.loc[:, ~df.columns.str.contains("embedding|tf-idf")].reset_index(
            drop=True
        )

    def _resolve_query(self, query: Union[int, str, SongInfo]) -> pd.Series:
        """Map a query (index label, song id or SongInfo) to its dataframe row."""
        if isinstance(query, str):
            matches = self.df[self.df["id"] == query]
            if matches.empty:
                raise ValueError(f"Song id {query} not in the dataset.")
            return matches.iloc[0]

        if isinstance(query, (int, np.integer)):
            if query not in self.df.index:
                raise ValueError(f"Song id {query} not in the dataset.")
            return self.df.loc[query]

        if isinstance(query, SongInfo):
            title, artist = query.title, query.artist
            matches = self.df[(self.df["song"] == title) & (self.df["artist"] == artist)]
            if matches.empty:
                raise ValueError(
                    f"Song with title '{title}' and artist '{artist}' not found in the dataset."
                )
            return matches.iloc[0]

        raise ValueError(
            "Invalid query type. Provide either song_id (int/str) or an instance of SongInfo."
        )

    def retrieve(self, query: Union[int, str, SongInfo], n: int = 5) -> pd.DataFrame:
        """
        Retrieve the top n songs similar to the given query song.

        Parameters:
        - query: If int, index label of a row of df. If str, song_id.
            If SongInfo, title and artist (the first matching row is used).
        - n: Number of songs to retrieve.

        Returns:
        - DataFrame of the top n similar songs with a "similarity" column and
          without the feature columns. The returned frame is independent of the
          cache, so callers may modify it freely.

        Raises:
        - ValueError: If the query has an unsupported type or matches no song.
        """
        song = self._resolve_query(query)
        song_id = song['id']

        if self.cache_enabled and song_id in self.cache:
            cached_result = self.cache[song_id]
            if len(cached_result) >= n:
                # Copy so that caller-side edits cannot alter the cached frame.
                return cached_result.head(n).copy()

        top_n = self._calc_similarity(song, n=n)
        # from the assignment, it is not 100% clear what to return --> return all except embeddings
        result = self._remove_embeddings(top_n)

        if self.cache_enabled:
            # Store a private copy; the caller receives `result` itself.
            self.cache[song_id] = result.copy()
        return result

## Random Baseline

In [16]:
rs = RetrievalSystem(df)

In [17]:
rs.random_baseline(1)

Unnamed: 0,id,artist,song,album_name
0,srsns1RKo2wyyDzL,San Cisco,Reckless,Awkward
1,8RJgdNrEKezWc3UW,Fad Gadget,Salt Lake City Sunday,Fireside Favourites
2,fLyRvg1EpNVAYxh3,Oasis,Up in the Sky,Definitely Maybe (Remastered) [Deluxe Edition]
3,F7kB7q4ezZZMgZbM,Ryan Adams,Oh My Sweet Carolina,Heartbreaker
4,W5corTpjrk7QEawo,Faun,Walpurgisnacht,Luna


In [18]:
rs.random_baseline(1)

Unnamed: 0,id,artist,song,album_name
0,1lCMmYgppb2NUsL8,The Dodos,Undeclared,Visiter
1,TUCyTCD0CYKhbo34,Dio,The Last In Line,The Last In Line
2,baerAOUagyrMzFlv,Fleetwood Mac,The Ledge,Tusk (Deluxe Edition)
3,F7kB7q4ezZZMgZbM,Ryan Adams,Oh My Sweet Carolina,Heartbreaker
4,X0PIltQAfBnIbcZr,Kate Bush,King of the Mountain,Aerial


Indeed, the system produces new results for each query/run.

In [19]:
rs.random_baseline("01gyRHLquwXDlhkO", n=10)

Unnamed: 0,id,artist,song,album_name
0,R97YjDivI6ZdnITw,Smino,TEQUILA MOCKINGBIRD,NOIR
1,6vPtCBMsfApvIcnN,Austra,The Beast,Feel It Break (Deluxe Version)
2,iHAUwRKkWyLqAmUc,Nick Cave & The Bad Seeds,Anthrocene,Skeleton Tree
3,rv9c5nbm6WsG0ZJl,Metallica,Cyanide,Death Magnetic
4,ZVVd8SCDX0vgDyTk,A Fine Frenzy,"Liar, Liar",One Cell In The Sea
5,QGkT0l8KlFpEtp1y,Gorguts,Drifting Remains,Considered Dead
6,VlGI96h3Ugt2YnTx,Karol Conká,Caxambu,Batuk Freak (Instrumentals)
7,nRwISjF6fC9mDEaG,Bright Eyes,June on the West Coast,Letting Off The Happiness
8,BKnTQxqe1KAj03wT,Omnium Gatherum,The Pit,Grey Heavens
9,qfdGKqj2QIuogYbc,lil skies,Real Ties,Real Ties


In [20]:
rs.random_baseline("01gyRHLquwXDlhkO", n=10)

Unnamed: 0,id,artist,song,album_name
0,mcSyQWsHtskCQIBJ,Psyche,Angel Lies Sleeping,Daydream Avenue
1,MrFnVJzJgtYKfDFl,Fall Out Boy,Dead on Arrival,Take This To Your Grave
2,9aPerUbk44DOzrH4,Hey,"Cisza, ja i czas",CDN
3,QCAR75jOamnYjNiz,Gong,Love Is How You Make It,Radio Gnome Invisible Trilogy
4,lU5PLPAPuaYt0E15,Decapitated,Deathvaluation,Anticult
5,7ZhsfreCAjHa8Nnh,Rosa de Saron,Linda Menina,Acústico
6,F3BJ8ghlBXftQaZI,Fat Freddy's Drop,Wandering Eye,Based on a True Story
7,GVyOXZMK7NtsjN3Y,Glee Cast,Be Okay (Glee Cast Version),Glee: The Music - Celebrating 100 Episodes
8,hMoLbFgENhtFTrS7,Lucy Pearl,Everyday,Lucy Pearl
9,Op6daPhkFznpw74Y,Falkenbach,Ufirstanan Folk,Asa


In [21]:
sample_song = SongInfo(title="Always", artist="Bon Jovi")

rs.random_baseline(sample_song, n=10)

Unnamed: 0,id,artist,song,album_name
0,dmlywJaBrKL3TCzl,Cut Copy,This Is All We've Got,Zonoscope
1,F0OeKHcvWROLs1Qm,Aline Barros,Vitória no Deserto,Extraordinária Graça
2,pKSjPjKxJC4JMnhG,Héroes del Silencio,El Estanque,Tour 2007
3,9lJvZjUDRHYR4TSA,Chet Baker,I Get Along Without You Very Well (Except Some...,Chet Baker Sings
4,IvgK80LY0sPNGQ1Z,Leprous,From The Flame,Malina
5,YzPwMqQZt7OWN2yp,Grand Funk Railroad,Loneliness,E Pluribus Funk (Remastered)
6,SwZTe2F3YvXKP0OH,Emperor,Thus Spake the Nightspirit,Anthems To The Welkin At Dusk
7,NoKpsI0lWZtgMLdj,Jorge Ben,Rita Jeep,Puro Suingue
8,mHtW3xpHpijTAJ6a,Dr. John,I Walk On Guilded Splinters,Gris Gris
9,ZjXaL0Esf2zyFwGx,Eminem,We Made You,Relapse: Refill


In [22]:
sample_song = SongInfo(title="Help! - Remastered 2009", artist="The Beatles")

rs.random_baseline(sample_song, n=10)

Unnamed: 0,id,artist,song,album_name
0,MxvB9jqkYOnP5mgo,Bob Marley & The Wailers,She's Gone,Kaya - Deluxe Edition
1,VlyPBH0EpsKX3zWB,Minutemen,The Glory of Man,Double Nickels on the Dime
2,tbF2tUUkAXDzk7sZ,Kult,Celina,Tata Kazika
3,AVpJkUAko5OybSog,Animal Collective,Taste,Merriweather Post Pavilion
4,Y64eyEfzGVX8R8uq,Elisa,Ancora Qui,Quentin Tarantino’s Django Unchained Original ...
5,tw6Z9JF1xXTdjx8M,Swing Out Sister,You On My Mind,Kaleidoscope World
6,pIZnbaINXFsTWIt4,Christine and the Queens,Safe and Holy,Christine and the Queens
7,J3Y3E5wh2TcJcUGM,Steel Panther,Death To All But Metal,Feel The Steel
8,UEzpsg4wh7Sgcvaf,Elvis Costello,Miracle Man,My Aim Is True
9,Vi7gUdTDwLrhFPDJ,Testament,The Haunting,The Legacy


In [23]:
sample_song = SongInfo(title="Zombie", artist="The Cranberries")

rs.random_baseline(sample_song, n=10)

Unnamed: 0,id,artist,song,album_name
0,cdOSxOYhgfyw616b,Iggy Pop,Don't Look Down,New Values
1,e1IW0PdbvgPUsanQ,City and Colour,Sensible Heart,Bring Me Your Love
2,RABqou3OcIQ2NUAJ,Bad Company,Feel Like Makin' Love - 2015 Remastered Version,Straight Shooter
3,HsrlGi96GhqIOYP3,Slaves,I Know a Lot of Artists,Beautiful Death
4,FzSZcJLdhYzlUQDy,Alter Bridge,The Uninvited,Fortress
5,gmYBaQbTipiF651O,Circa Survive,Act Appalled,Act Appalled (Acoustic)
6,eoJUmlEgtIjtAcYS,Funeral for a Friend,The End of Nothing,Hours
7,k2sTB8Yei5EDW7ba,Ashley Tisdale,Don't Let Me Down,"Music Sessions, Vol.1"
8,AVJK8bPATwp3gezy,Chic,Chic Cheer,The Chic Organization 1977-1979 (2018 Remaster)
9,hR4QXQYDKumawhBj,Kelly Clarkson,Catch My Breath,Greatest Hits - Chapter One


## Text-based (cos-sim, tf-idf)

In [24]:
rs_cosine = RetrievalSystem(df, cosine_similarity, "tf-idf")

In [25]:
rs_cosine.retrieve(1)

Unnamed: 0,id,artist,song,album_name,similarity
0,74k8qdan0o4DFa7L,Lil' Wayne,Megaman,Tha Carter IV (Deluxe),0.50826
1,wiw2rM2xb4CJ5sVP,Kanye West,Never Let Me Down,The College Dropout,0.504759
2,2rQiu54zZBs5Dmmi,2Pac,Heartz Of Men,All Eyez On Me,0.485196
3,8ptSfMLrxpbfRaxl,Nas,The Genesis,Illmatic,0.445888
4,h2b5Pi6kMqyAK83G,Bone Thugs-N-Harmony,Look Into My Eyes,Art of War,0.444548


In [26]:
# check reproducibility
rs_cosine.retrieve(1)

Unnamed: 0,id,artist,song,album_name,similarity
0,74k8qdan0o4DFa7L,Lil' Wayne,Megaman,Tha Carter IV (Deluxe),0.50826
1,wiw2rM2xb4CJ5sVP,Kanye West,Never Let Me Down,The College Dropout,0.504759
2,2rQiu54zZBs5Dmmi,2Pac,Heartz Of Men,All Eyez On Me,0.485196
3,8ptSfMLrxpbfRaxl,Nas,The Genesis,Illmatic,0.445888
4,h2b5Pi6kMqyAK83G,Bone Thugs-N-Harmony,Look Into My Eyes,Art of War,0.444548


In [27]:
rs_cosine.retrieve("xUjqXrsiCLb8tPkC", n=10)

Unnamed: 0,id,artist,song,album_name,similarity
0,TEYr2jFKqLiibyI7,Sophie Ellis-Bextor,Love It Is Love,Shoot from the Hip,0.636165
1,FoenRGxt8ED5UDj8,Danger,11h30,French Attack!,0.604088
2,r9jKmYrPBtJRgg2k,The Weeknd,Can't Feel My Face - Martin Garrix Remix,Can't Feel My Face (Martin Garrix Remix),0.603747
3,ISH3Weh5lSybSTNb,Wings,Silly Love Songs,Wings At The Speed Of Sound,0.591854
4,4fNOkOQVsp2lkBen,Alicia Keys,Fallin',Songs In A Minor (Expanded Edition),0.590371
5,ptSQMoK9d8CydnHP,Phoenix,Chloroform,Bankrupt!,0.585183
6,U0fgnyiJBI9Ypbsj,Mika,Lollipop,Life in Cartoon Motion,0.573458
7,t8M8yJHyRlalRn4E,Hombres G,Te Quiero,Las baladas (Los singles vol II),0.565957
8,zI9DjkxOs5x1EhJQ,Anna Calvi,No More Words,Anna Calvi,0.562936
9,xDmgjHEzePoJsvkJ,Kylie Minogue,Under the Influence of Love,Light Years,0.546204


In [28]:
rs_cosine.retrieve("xUjqXrsiCLb8tPkC", n=3)

Unnamed: 0,id,artist,song,album_name,similarity
0,TEYr2jFKqLiibyI7,Sophie Ellis-Bextor,Love It Is Love,Shoot from the Hip,0.636165
1,FoenRGxt8ED5UDj8,Danger,11h30,French Attack!,0.604088
2,r9jKmYrPBtJRgg2k,The Weeknd,Can't Feel My Face - Martin Garrix Remix,Can't Feel My Face (Martin Garrix Remix),0.603747


In [29]:
sample_song = SongInfo(title="Always", artist="Bon Jovi")

rs_cosine.retrieve(sample_song, n=10)

Unnamed: 0,id,artist,song,album_name,similarity
0,GuH9cRalvkhXvmaZ,Switchfoot,Always,Hello Hurricane,0.483889
1,yP56QH0tXHXsDNF8,Whitney Houston,I Will Always Love You,The Bodyguard - Original Soundtrack Album,0.483217
2,h47nxyjyRPFroIaQ,k.d. lang,Constant Craving,Recollection,0.472977
3,muLitfTmi9qTxcFV,HammerFall,Always Will Be,Gates Of Dalhalla,0.458007
4,7av2576enFL1qIFt,Sade,You're Not The Man,Promise,0.455112
5,d0ANC4yI7hpiCzPe,Taylor Dayne,I'll Always Love You,Tell It to My Heart (Expanded Edition),0.424737
6,4TzvRc17Z1Rn6LWj,Sophie Zelmani,Always You,My Best Friend's Wedding: Music From The Motio...,0.423746
7,aIJRXTjxYPzwX2Y7,Ace of Base,"Always Have, Always Will",Flowers (Remastered),0.421836
8,BRT3Xamcitwly4w6,Forfun,Pra Sempre,Alegria Compartilhada,0.418782
9,OH7BeCnl70Kq6bA3,Damian Marley,There For You,Welcome to Jamrock,0.408642


In [30]:
sample_song = SongInfo(title="Help! - Remastered 2009", artist="The Beatles")

rs_cosine.retrieve(sample_song, n=10)

Unnamed: 0,id,artist,song,album_name,similarity
0,F4whfybQ8r8xQtb0,Deep Purple,Help,"Sleeping Music: Calm Music to Help You Sleep, ...",0.975958
1,c74L3efP1zcd37Gw,Metallica,Purify,St. Anger,0.687646
2,9ttQerThGMqz8Npx,Christine and the Queens,Tilted,Christine and the Queens,0.649568
3,XSJyafhagXyCcxZ3,The Beatles,With a Little Help from My Friends,Sgt. Pepper's Lonely Hearts Club Band (Remaste...,0.567146
4,SNruiLQdBF8F9HJR,Rita Lee,With a Little Help from My Friends,"Aqui, Ali, Em Qualquer Lugar",0.565782
5,qtDorbJpWDPAEqbp,"nothing,nowhere.",vacanter,ruiner,0.53403
6,hxKAY2pAtxaioSWj,Beach House,Myth,Bloom,0.520281
7,Mw9ctlfuvobHQtxz,Michael Bublé,Can't Help Falling in Love,Come Fly With Me,0.479365
8,OZyqPkVhX1XGYciF,Zappacosta,Overload,Dirty Dancing (Original Motion Picture Soundtr...,0.473655
9,i6tUPu5Yxy7pMzAZ,Los Secretos,Pero A Tu Lado,Dos Caras Distintas,0.467779


In [31]:
sample_song = SongInfo(title="Zombie", artist="The Cranberries")

rs_cosine.retrieve(sample_song, n=10)

Unnamed: 0,id,artist,song,album_name,similarity
0,Eld0lALLIikybAst,Bad Wolves,Zombie,Zombie,0.842231
1,9QjC5RsVakjafpxO,Pixies,Head Carrier,Head Carrier,0.641547
2,TlJ3uClhlJBIPxVg,Rammstein,Mann gegen Mann,ROSENROT,0.617655
3,UgxT5MSPdBJ6wekm,Mayday Parade,In My Head,"Punk Goes Pop, Vol. 3",0.587988
4,x7JXGEHZv7dYCPxZ,Alabama Shakes,Over My Head,Sound & Color,0.519051
5,V8gpwrBWuEYG0f4S,Cannibal Corpse,Severed Head Stoning,The Wretched Spawn,0.475522
6,K6dklMBW6pcuxBol,Julie Byrne,Sea As It Glides,Not Even Happiness,0.469837
7,GX2ioTwjLPBWPFwA,Colbie Caillat,Land Called Far Away,Gypsy Heart,0.46381
8,bwF6WFgG72FvarGJ,Pixies,Where Is My Mind?,Surfer Rosa (Remastered),0.456904
9,Ycz3ZVDtOh5wTwuT,Iron Maiden,Look for the Truth,The X Factor (2015 - Remaster),0.455056


## Text-based (cos-sim, "feature")

We use BERT embeddings.

In [32]:
rs_bert = RetrievalSystem(df, cosine_similarity, "bert_embedding")

In [33]:
rs_bert.retrieve(1)

Unnamed: 0,id,artist,song,album_name,similarity
0,jj55fEqkySIv4qy1,YNW Melly,Murder On My Mind,I AM YOU,0.75099
1,OokmnsloeW1Sh3NF,Freeway,What We Do,Philadelphia Freeway,0.731522
2,jJEgEAF2iuW4yk9g,Danny Brown,When It Rain,Atrocity Exhibition,0.729569
3,M65mU1UIozrDxcvu,Bad Meets Evil,Fast Lane,Hell: The Sequel (Deluxe),0.720946
4,4r1IjzqHAOYQlgfS,Eminem,Fall,Kamikaze,0.718222


In [34]:
# check reproducibility
rs_bert.retrieve(1)

Unnamed: 0,id,artist,song,album_name,similarity
0,jj55fEqkySIv4qy1,YNW Melly,Murder On My Mind,I AM YOU,0.75099
1,OokmnsloeW1Sh3NF,Freeway,What We Do,Philadelphia Freeway,0.731522
2,jJEgEAF2iuW4yk9g,Danny Brown,When It Rain,Atrocity Exhibition,0.729569
3,M65mU1UIozrDxcvu,Bad Meets Evil,Fast Lane,Hell: The Sequel (Deluxe),0.720946
4,4r1IjzqHAOYQlgfS,Eminem,Fall,Kamikaze,0.718222


In [35]:
rs_bert.retrieve("xUjqXrsiCLb8tPkC", n=10)

Unnamed: 0,id,artist,song,album_name,similarity
0,KJyVTXfwdjUGtj5m,Boyzone,Love Is A Hurricane,Brother,0.838821
1,vMluKVsjLFKZEmdg,Rose Royce,Best Love,Essential - Soul Love,0.816716
2,uRNge3sN74NdIaYi,Pixie Lott,My Love,Turn It Up,0.809644
3,8nyPt8gB1K8g5FNv,Bon Jovi,Always,Cross Road,0.809535
4,w64KwSB7npnE8hb2,Feist,Inside and Out,Let It Die,0.809321
5,gkaNGhUTCZzICBcE,Frankie Valli,Can't Take My Eyes Off You,Relaxing Classical Playlist: Chilled Music for...,0.808088
6,DOlk8ct9zA9uIFX0,Selena Gomez & The Scene,A Year Without Rain,A Year Without Rain,0.807728
7,8xFD1UO8nr1qwOg6,Ed Sheeran,Thinking Out Loud,x (Deluxe Edition),0.804023
8,gX0rgtPnKPOD9qhT,Mariah Carey,To Be Around You,Emotions,0.802836
9,xQfqDtAQbxEn7PVM,Jackie Mendoza,De Lejos,De Lejos,0.80166


In [36]:
rs_bert.retrieve("xUjqXrsiCLb8tPkC", n=3)

Unnamed: 0,id,artist,song,album_name,similarity
0,KJyVTXfwdjUGtj5m,Boyzone,Love Is A Hurricane,Brother,0.838821
1,vMluKVsjLFKZEmdg,Rose Royce,Best Love,Essential - Soul Love,0.816716
2,uRNge3sN74NdIaYi,Pixie Lott,My Love,Turn It Up,0.809644


In [37]:
sample_song = SongInfo(title="Always", artist="Bon Jovi")

rs_bert.retrieve(sample_song, n=10)

Unnamed: 0,id,artist,song,album_name,similarity
0,g5Y3fz7C9EnMXdMA,Lana Del Rey,Blue Jeans (Gesaffelstein Remix),Blue Jeans Remixes,0.861671
1,G2otV6WAmea6VB1f,Steps,Heartbeat,Step One,0.855285
2,DzOzu2jdYJQY7xH9,Mariah Carey,Whenever You Call,Butterfly,0.850007
3,ire6sDD2ryFx62Vx,Supertramp,My Kind Of Lady,Famous Last Words (Remastered),0.846296
4,RvExtv1mIVglWrLc,Amber Pacific,Gone So Young,The Possibility and the Promise,0.845325
5,8xFD1UO8nr1qwOg6,Ed Sheeran,Thinking Out Loud,x (Deluxe Edition),0.844703
6,qbNvLKSCnMbHHD26,Atomic Kitten,If You Come to Me,Ladies Night,0.839742
7,C9TfpaaJWG6ZbRmg,Oh Land,Love You Better,Wishbone,0.832535
8,WYDhkPUgu71CQnF2,Shania Twain,Forever and for Always,Up!,0.832055
9,3vbtwqhpTpQXYKs5,Ed Sheeran,Give Me Love,+,0.831858


In [38]:
sample_song = SongInfo(title="Help! - Remastered 2009", artist="The Beatles")

rs_bert.retrieve(sample_song, n=10)

Unnamed: 0,id,artist,song,album_name,similarity
0,F4whfybQ8r8xQtb0,Deep Purple,Help,"Sleeping Music: Calm Music to Help You Sleep, ...",0.929083
1,x61tsmeyuj0c44LR,No Doubt,Undone,Push And Shove,0.750682
2,pKelJlGY1Vf89K17,Issues,Disappear [Remember When],Issues,0.735862
3,dst4n8hPwe3bvdPI,Underoath,Coming Down Is Calming Down,Lost In The Sound Of Separation,0.70717
4,vOGoMtp0LQ2fzS3F,Glasvegas,Geraldine,Glasvegas,0.706876
5,ENFhYQZ1NLMxTg4N,The Story So Far,If I Fall,Proper Dose,0.706575
6,BPfuDdxtlgtLNQ5D,Jennifer Hudson,I Am Changing,Dreamgirls (Music from the Motion Picture),0.699699
7,k2sTB8Yei5EDW7ba,Ashley Tisdale,Don't Let Me Down,"Music Sessions, Vol.1",0.698252
8,0EVEKKlvBiB0Dkge,Crown the Empire,Lead Me Out Of The Dark,The Fallout (Deluxe Reissue),0.690536
9,5vUvj6eHNb9hNBtC,The Stooges,I Need Somebody,Raw Power,0.682918


In [39]:
sample_song = SongInfo(title="Zombie", artist="The Cranberries")

rs_bert.retrieve(sample_song, n=10)

Unnamed: 0,id,artist,song,album_name,similarity
0,Eld0lALLIikybAst,Bad Wolves,Zombie,Zombie,0.965913
1,ez6FRB2E8jioUUZt,Of Mice & Men,Warzone,Defy,0.69017
2,i5vRDPz9mJERSjCZ,Mono Inc.,Out In The Fields,Together Till the End,0.651763
3,pqRrADoga2QzRddh,Megadeth,Death From Within,Dystopia,0.650113
4,DKxQNHrmTPllOC5b,Gary Moore,Out in the Fields,Run For Cover,0.648858
5,QGUHfFcgQc9SqkCN,Cavalera Conspiracy,Killing Inside,Blunt Force Trauma (Special Edition),0.645943
6,pEAawngmMyR3Qccb,Lamb of God,Still Echoes,VII: Sturm Und Drang (Deluxe),0.644222
7,dleNiwrlnwrvU1Mr,Metallica,Some Kind of Monster,St. Anger,0.639605
8,1TTaCU6DaHQ0nE3m,Hüsker Dü,What's Going On,Zen Arcade,0.633913
9,Ycz3ZVDtOh5wTwuT,Iron Maiden,Look for the Truth,The X Factor (2015 - Remaster),0.629737


## Text-based ("similarity", "feature")
Finally, we use word2vec embeddings as features and the simple Dot Product as similarity metric.

In [40]:
rs_w2v = RetrievalSystem(df, dot_product, "word2vec_embedding")

In [41]:
rs_w2v.retrieve(1)

Unnamed: 0,id,artist,song,album_name,similarity
0,kqVWk6G25ortikfF,Tame Impala,Beverly Laurel,Lonerism B-Sides & Remixes,0.20698
1,VJXAAxF84r7eg4bf,Blur,I Know,Leisure (Special Edition),0.200881
2,9ziWMjkmNbJKtU8Y,Tame Impala,Nangs,Currents,0.200703
3,PqjX8Jm9lA7ogAe4,Solange,Time (Is),When I Get Home,0.200372
4,Qh5XjFCInkeOAYzu,Supergrass,Time,I Should Coco,0.200032


In [42]:
# check reproducibility
rs_w2v.retrieve(1)

Unnamed: 0,id,artist,song,album_name,similarity
0,kqVWk6G25ortikfF,Tame Impala,Beverly Laurel,Lonerism B-Sides & Remixes,0.20698
1,VJXAAxF84r7eg4bf,Blur,I Know,Leisure (Special Edition),0.200881
2,9ziWMjkmNbJKtU8Y,Tame Impala,Nangs,Currents,0.200703
3,PqjX8Jm9lA7ogAe4,Solange,Time (Is),When I Get Home,0.200372
4,Qh5XjFCInkeOAYzu,Supergrass,Time,I Should Coco,0.200032


In [43]:
rs_w2v.retrieve("xUjqXrsiCLb8tPkC", n=10)

Unnamed: 0,id,artist,song,album_name,similarity
0,alViVmyWPgD1TNjV,Amy Winehouse,To Know Him Is to Love Him,Back To Black: B-Sides,0.25823
1,Qh5XjFCInkeOAYzu,Supergrass,Time,I Should Coco,0.257726
2,FoenRGxt8ED5UDj8,Danger,11h30,French Attack!,0.256744
3,bIDVY7FbsuHTgV50,Bent,To Be Loved,Intercept! Deluxe Edition,0.256257
4,9ziWMjkmNbJKtU8Y,Tame Impala,Nangs,Currents,0.251869
5,VJXAAxF84r7eg4bf,Blur,I Know,Leisure (Special Edition),0.249743
6,kqVWk6G25ortikfF,Tame Impala,Beverly Laurel,Lonerism B-Sides & Remixes,0.248669
7,wthZJWSU9CZmQ8XX,September,Because I Love You,September,0.245535
8,4XJZFt2CYHjejWh7,Avicii,Levels - Radio Edit,Sommer 2019,0.243641
9,oaSlacL5oON2ueP8,Orchestral Manoeuvres in the Dark,(Forever) Live and Die,The Pacific Age,0.241794


In [44]:
rs_w2v.retrieve("xUjqXrsiCLb8tPkC", n=3)

Unnamed: 0,id,artist,song,album_name,similarity
0,alViVmyWPgD1TNjV,Amy Winehouse,To Know Him Is to Love Him,Back To Black: B-Sides,0.25823
1,Qh5XjFCInkeOAYzu,Supergrass,Time,I Should Coco,0.257726
2,FoenRGxt8ED5UDj8,Danger,11h30,French Attack!,0.256744


In [45]:
sample_song = SongInfo(title="Always", artist="Bon Jovi")

rs_w2v.retrieve(sample_song, n=10)

Unnamed: 0,id,artist,song,album_name,similarity
0,9ziWMjkmNbJKtU8Y,Tame Impala,Nangs,Currents,0.242612
1,VJXAAxF84r7eg4bf,Blur,I Know,Leisure (Special Edition),0.23092
2,oaSlacL5oON2ueP8,Orchestral Manoeuvres in the Dark,(Forever) Live and Die,The Pacific Age,0.229072
3,kqVWk6G25ortikfF,Tame Impala,Beverly Laurel,Lonerism B-Sides & Remixes,0.228911
4,Qh5XjFCInkeOAYzu,Supergrass,Time,I Should Coco,0.224972
5,Y6S6YPEyfK5F8TMP,Blur,High Cool,Leisure,0.222764
6,alViVmyWPgD1TNjV,Amy Winehouse,To Know Him Is to Love Him,Back To Black: B-Sides,0.22207
7,TvyK48kBK3S8XOXD,Fatboy Slim,Going Out of My Head,The Greatest Hits: Why Try Harder,0.219967
8,qq3el8xwdckarmlh,The Brian Jonestown Massacre,Wasted,Methodrone,0.217051
9,Nblp573S43B1TMzx,Gipsy Kings,Tu Quieres Volver,Cantos de Amor / Love Songs,0.216332


In [46]:
sample_song = SongInfo(title="Help! - Remastered 2009", artist="The Beatles")

rs_w2v.retrieve(sample_song, n=10)

Unnamed: 0,id,artist,song,album_name,similarity
0,9ziWMjkmNbJKtU8Y,Tame Impala,Nangs,Currents,0.258853
1,lTJVFhGsiLggKCEq,Beat Happening,I Let Him Get to Me,Beat Happening,0.238568
2,VJXAAxF84r7eg4bf,Blur,I Know,Leisure (Special Edition),0.238529
3,Y6S6YPEyfK5F8TMP,Blur,High Cool,Leisure,0.233498
4,Nblp573S43B1TMzx,Gipsy Kings,Tu Quieres Volver,Cantos de Amor / Love Songs,0.232844
5,kqVWk6G25ortikfF,Tame Impala,Beverly Laurel,Lonerism B-Sides & Remixes,0.230786
6,qq3el8xwdckarmlh,The Brian Jonestown Massacre,Wasted,Methodrone,0.230433
7,VBlFbZN2NIEct6EH,Sophie,Bipp,BIPP,0.228629
8,TvyK48kBK3S8XOXD,Fatboy Slim,Going Out of My Head,The Greatest Hits: Why Try Harder,0.224768
9,YzfpVoUPlOhBoRGx,Heather Nova,All I Need,Storm,0.222857


In [47]:
sample_song = SongInfo(title="Zombie", artist="The Cranberries")

rs_w2v.retrieve(sample_song, n=10)

Unnamed: 0,id,artist,song,album_name,similarity
0,1TTaCU6DaHQ0nE3m,Hüsker Dü,What's Going On,Zen Arcade,0.133713
1,9SwCdYIwySZFIZEQ,Theory of a Deadman,Out of My Head,The Truth Is... (Special Edition),0.129773
2,kqVWk6G25ortikfF,Tame Impala,Beverly Laurel,Lonerism B-Sides & Remixes,0.128834
3,hYDBUkFGT8axnXGa,Peaches,Dumb Fuck,Rub,0.127192
4,TvyK48kBK3S8XOXD,Fatboy Slim,Going Out of My Head,The Greatest Hits: Why Try Harder,0.126975
5,Eld0lALLIikybAst,Bad Wolves,Zombie,Zombie,0.1268
6,bIDVY7FbsuHTgV50,Bent,To Be Loved,Intercept! Deluxe Edition,0.126117
7,K6dklMBW6pcuxBol,Julie Byrne,Sea As It Glides,Not Even Happiness,0.125346
8,lUgSuAl7kvy7MSon,New Kids on the Block,You Got It (The Right Stuff),Hangin' Tough,0.125102
9,04OjszRi9rC5BlHC,Grizzly Bear,Knife,Yellow House,0.124561


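The results already seem a lot less reasonable: the same few tracks (e.g. "Nangs", "I Know", "Beverly Laurel") are retrieved for very different queries, presumably because the unnormalized dot product favors vectors with a large norm. However, compared with the previous system we changed two variables at once (the feature: word2vec instead of BERT, and the metric: dot product instead of cosine similarity); for a fair comparison we should change only one variable at a time. Still, the results are interesting to look at.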

## Qualitative Analysis

Select as queries three tracks you are familiar with and retrieve 10 tracks with
each system, including the random baseline. This will result in the following number of lists:
N = 3(query songs) * 4(rs) = 12
For each query track, qualitatively compare the retrieved tracks (with the query and other tracks in
the result list), analyzing for instance whether the list includes tracks by the same artist or of the
same genre. Also, investigate the relevance of the retrieved tracks for the query, i.e., given the
query can you speculate why the tracks in the result list have been retrieved?


In [48]:
# Number of tracks to retrieve per query and system.
k = 10

# Query songs, given as (title, artist) pairs.
songs = [
    SongInfo(title=title, artist=artist)
    for title, artist in (
        ("Business", "Eminem"),
        ("Night Witches", "Sabaton"),
        ("Encoder", "Pendulum"),
    )
]

# Retrieval systems: a random-score baseline (cache disabled) followed by
# cosine similarity over each of the three lyrics features.
rss = [RetrievalSystem(df, random_similarity, enable_cache=False)]
rss += [
    RetrievalSystem(df, cosine_similarity, feature)
    for feature in ("word2vec_embedding", "tf-idf", "bert_embedding")
]

In [49]:
# Build one wide comparison table per query song: the top-k results of every
# retrieval system placed side by side, with each system's columns suffixed by a
# short feature label (e.g. `artist_bert`, `score_tf-idf`).

# Labels are paired positionally with `rss`, so the order must match.
feature_labels = ['rand', 'word2vec', 'tf-idf', 'bert']
# Base names for the five columns of a retrieval result, in result order.
result_fields = ['id', 'artist', 'song', 'album', 'score']

# Guard against silent truncation by zip() if systems and labels get out of sync.
assert len(rss) == len(feature_labels), "each retrieval system needs exactly one label"

dfs = []
for song in songs:
    per_system = []
    for rs, feat in zip(rss, feature_labels):
        # reset_index returns a new frame, so renaming its columns below does not
        # touch the frame object handed back by the retrieval system. The reset
        # also gives every result a 0..k-1 index so that rows align by rank.
        res = rs.retrieve(song, k).reset_index(drop=True)
        res.columns = [f'{field}_{feat}' for field in result_fields]
        per_system.append(res)
    # Concatenate once per song instead of growing a frame inside the loop.
    dfs.append(pd.concat(per_system, axis=1))

### Results for (title="Business", artist="Eminem")

In [50]:
# Side-by-side top-k lists of all retrieval systems for the first query in `songs`.
dfs[0]

Unnamed: 0,id_rand,artist_rand,song_rand,album_rand,score_rand,id_word2vec,artist_word2vec,song_word2vec,album_word2vec,score_word2vec,id_tf-idf,artist_tf-idf,song_tf-idf,album_tf-idf,score_tf-idf,id_bert,artist_bert,song_bert,album_bert,score_bert
0,TSaRgg8Jn20M2Ctj,Staind,Home,Dysfunction (Internet Release),0.999978,Q8HUKzo5muVvgIvP,Eminem,Marshall Mathers,The Marshall Mathers LP,0.962139,CaNBba5E9KttJyB3,Red Hot Chili Peppers,C'mon Girl,Stadium Arcadium,0.400296,pK2EZxyujhM6yp7Z,Eminem,W.T.P.,Recovery,0.735179
1,LHtNZBLhK0TLvQEO,Jessie J,Alive,Alive (Deluxe Edition),0.999925,PoTQ9felmQHjpYDi,Bad Meets Evil,Lighters,Hell: The Sequel (Deluxe),0.960641,lTJVFhGsiLggKCEq,Beat Happening,I Let Him Get to Me,Beat Happening,0.395667,6Ku7bJlCxqrwTQNi,The Roots,Double Trouble,Things Fall Apart,0.718591
2,zMU7p4zFa5YmBMFG,Mariah Carey,Without You,Music Box,0.9999,SgBvRaxOi7XaDEmU,Eminem,The Monster,The Marshall Mathers LP2,0.960415,9CuEQRYVaaGpgOGn,G-Eazy,Been On,These Things Happen,0.387929,07AjGMk31TkPW3O9,The Sugarhill Gang,Rapper's Delight,Rhino Hi-Five: The Sugarhill Gang,0.716156
3,dEHXaXsser89ODzo,The Presets,This Boys In Love,Apocalypso,0.999811,CfCJl3HWjZCvaTv3,Eve,Let Me Blow Ya Mind,Scorpion,0.958956,WPJqHS1ZUdqC6LWs,Sophie Ellis-Bextor,You Get Yours,Shoot From The Hip,0.382842,MoUTYdSeg2uc70Pa,Naughty By Nature,Hip Hop Hooray,19 Naughty III,0.710492
4,MrybNrXk4F6kltbb,Grandson,War,War,0.999791,TYoYVKiQpzhvcatl,Eminem,Without Me,The Eminem Show,0.958579,ZgUzwR5NDfRuhDgx,Prince & The Revolution,Let's Go Crazy,Purple Rain,0.37137,ER6vgmZp5VMwLzfq,A Tribe Called Quest,God Lives Through,Midnight Marauders,0.708904
5,tPR5Ai9IkeHwd32t,Paula Abdul,I Need You,Forever Your Girl,0.999785,e9Q9R0EmvWwSjAxy,Eminem,Groundhog Day,The Marshall Mathers LP2 (Deluxe),0.957774,kZV6wep0ogka756U,Mayday Parade,Get Up,Anywhere But Here (Deluxe),0.365913,VwApT9MehZ2gx1a8,BROCKHAMPTON,VIVID,iridescence,0.705232
6,u6NtZfPArPnfwGXL,Judas Priest,Ram It Down,Ram It Down,0.999778,nVVj7Ksowk6ILZge,"Tyler, the Creator",Pigs,Wolf,0.95691,wiw2rM2xb4CJ5sVP,Kanye West,Never Let Me Down,The College Dropout,0.363931,jsyIbokybtgb3pmJ,A Tribe Called Quest,Jazz (We've Got),The Low End Theory,0.703841
7,PiyCkX6IHCTpEgF5,Kanye West,RoboCop,808s & Heartbreak,0.999694,gE39Ms4AoMKpJNGi,Lloyd Banks,Karma,The Hunger For More,0.956683,Mj0A7uXuK1Dx28dz,Smino,WE GOT THE BISCUITS,NOIR,0.362756,C5p9Jr8WABOMgdCl,Beastie Boys,"Hold It Now, Hit It",Licensed To Ill,0.699053
8,7YUDrW4v01GSGMGz,The Cardigans,Never Recover,First Band On The Moon (Remastered),0.999639,wiw2rM2xb4CJ5sVP,Kanye West,Never Let Me Down,The College Dropout,0.956396,88bG363HAfPRE2vi,Deep Purple,No No No,Burn (Expanded & Remastered),0.34822,Q8HUKzo5muVvgIvP,Eminem,Marshall Mathers,The Marshall Mathers LP,0.68933
9,hLJsmo8X4Z11hVXk,Sonic Youth,Hey Joni (Live),Daydream Nation (Deluxe Edition),0.999603,mgjDCaqqLfxz1070,Eminem,Drug Ballad,The Marshall Mathers LP,0.955484,gqyDfchba6vWzULe,The Fratellis,Halloween Blues,We Need Medicine (Deluxe Edition),0.347137,4r1IjzqHAOYQlgfS,Eminem,Fall,Kamikaze,0.684791


`(title="Business", artist="Eminem")`
- The top random scores are almost always close to 1: every track gets a score drawn uniformly from [0, 1], so the largest values among many tracks end up near 1. Of course, this does not mean that the tracks are actually similar. For this reason we ignore the random baseline in the qualitative analysis of the following queries.

- For word2vec the results look promising: many songs with very high similarity. A total of 5 other songs by Eminem were found by the system, and one of them (Without Me) is even from the same album. The other 5 retrieved songs are all HipHop/Rap songs, and one of them (Lighters by Bad Meets Evil) even features Eminem.

- For tf-idf the similarity scores are much lower and the system did not retrieve any other songs by Eminem. It also retrieved mostly non-HipHop/Rap songs; only 3 songs are of the same genre.

- With bert we, again, retrieve some songs by the same artist and mostly HipHop/Rap songs.

- Without extensively comparing the lyrics of all the retrieved songs, we suspect that word2vec and bert find so many other songs by Eminem because he uses a few words in many of his songs that are rarely used by other artists, e.g. `Marshall` and `Dre` (both of which appear in Business).

### Results for (title="Night Witches", artist="Sabaton")

In [51]:
# Side-by-side top-k lists of all retrieval systems for the second query in `songs`.
dfs[1]

Unnamed: 0,id_rand,artist_rand,song_rand,album_rand,score_rand,id_word2vec,artist_word2vec,song_word2vec,album_word2vec,score_word2vec,id_tf-idf,artist_tf-idf,song_tf-idf,album_tf-idf,score_tf-idf,id_bert,artist_bert,song_bert,album_bert,score_bert
0,NPcSK26fKDd5mNNY,The Nearly Deads,Diamond in the Rough,Revenge of the Nearly Deads,0.999939,OUK5bBTgTGQgAVy7,Avantasia,Ghost In The Moon,Moonglow,0.859165,Zmb8idzkHjVyKSzG,Tim Buckley,Gypsy Woman,Happy Sad,0.351593,VSd47SUSc5Dja4Od,Iron Maiden,Where Eagles Dare,Piece of Mind (2015 - Remaster),0.64477
1,RvSa4O2tB10dtqQr,Yellow Magic Orchestra,Gradated Grey,Technodelic,0.999781,qBN7BnrB9uSXZDDg,Bathory,The Revenge of the Blood on Ice,Blood on Ice,0.854057,lK1wd2EwujDcpRFG,Rise Against,Far From Perfect,"The Ghost Note Symphonies, Vol.1",0.315371,dpdFc5WvD3qG1PzV,Dream Theater,The Dark Eternal Night,Systematic Chaos,0.620575
2,IPnJezdM2MJkcnjz,The White Stripes,We're Going to Be Friends,White Blood Cells,0.999776,xnPyxQcSGAC5dMzr,Between the Buried and Me,Silent Flight Parliament,The Parallax II: Future Sequence,0.851628,2HceDzG53j9404OD,Slayer,Captor Of Sin,Live Undead / Haunting the Chapel,0.311556,8xIeRqsIdyJPSZ2p,Rome,The Joys Of Stealth,Confessions d'un voleur d'âmes,0.613757
3,7U2ptxsghPXnYQWu,Steely Dan,Kid Charlemagne,The Royal Scam,0.999368,pqRrADoga2QzRddh,Megadeth,Death From Within,Dystopia,0.847954,C9ohq14tDIAJeodr,Juli,Perfekte Welle,Ein neuer Tag Live,0.307305,IW78SR7pLUMxhLwo,Einherjer,Spre Vingene,Spre Vingene,0.608947
4,3TIdOAlULKdqaSR0,Nico & Vinz,When the Day Comes,Black Star Elephant,0.999358,o6f1vBg76XF1vxJR,Rhapsody of Fire,Unholy Warcry,Symphony of Enchanted Lands II (The Dark Secret),0.845201,VSuR5YqH9v7NgN3n,I Love You But I've Chosen Darkness,According to Plan,Fear is on Our Side,0.3042,Pss60SSj8dqGts7y,Ólafur Arnalds,Reclaim,For Now I Am Winter,0.607159
5,Yoqqg9v9DJ4WtjIl,One Direction,Up All Night,Up All Night,0.999282,uyiAoox9wCrrVoQW,Sopor Aeternus & The Ensemble of Shadows,On Satur(n)days We Used To Sleep,Dead Lovers Sarabande,0.844565,2vroC72fBSgFKI6p,The Moody Blues,Are You Sitting Comfortably?,On The Threshold Of A Dream,0.295894,HmAiZIhgpTQh2HtY,Raubtier,Panzarmarsch,Bestia Borealis,0.603905
6,6w5ZaCB7VCZIt5Qv,Deftones,Beauty School,Diamond Eyes,0.999208,ZuujS0YWGIbArbP0,Dying Fetus,Weaken The Structure,Wrong One to Fuck With,0.84455,2KlTtldY45koDK06,Moby,The Perfect Life,Innocents,0.283617,l7C6Cplr8g1yZFKg,Deströyer 666,Clenched Fist,Cold Steel... For An Iron Age,0.603762
7,zoVx87fOsgF5xb6X,All Time Low,I Feel Like Dancin',Dirty Work,0.999139,Qbus1NSW0A9vhkuM,Ashbury,Vengeance,Endless Skies,0.844376,RSbNTd0jMg7I9R82,Cradle of Filth,The Cult of Venus Aversa,"Darkly, Darkly, Venus Aversa (Deluxe Edition)",0.275112,vPNIItkt1ru5ivn2,Sunn O))),Cursed Realms (Of the Winterdemons),Black One,0.602785
8,dst4n8hPwe3bvdPI,Underoath,Coming Down Is Calming Down,Lost In The Sound Of Separation,0.998849,R3H7yPXQab6G0qwo,Arcturus,The Arcturian Sign,Arcturian,0.840746,F7ebJa7eUiPkChuj,Cheap Trick,Mighty Wings,Top Gun - Motion Picture Soundtrack (Special E...,0.25885,pqRrADoga2QzRddh,Megadeth,Death From Within,Dystopia,0.602411
9,MMXgPpMDKctAw9jP,The Crystal Method,Born Too Slow,Legion of Boom,0.998566,Lyw8lI9YQZEfsZXX,Vader,Dark Age,Dark Age,0.84061,LalGH4emAyu0T5I0,Emperor,Lord of the Storms,In The Nightside Eclipse (20th Anniversary Edi...,0.250374,BRTTbs2WbkBKHynj,Dimmu Borgir,Alt Lys Er Svunnet Hen,Stormblåst,0.600827


`(title="Night Witches", artist="Sabaton")`

- word2vec retrieves songs of a similar genre, but no songs by the same artist.
- The tf-idf results are partially of the same genre, and none are by the same artist. The song `Perfekte Welle` seems out of place, but its lyrics contain the word `perfekt` a lot, while the query song's lyrics contain `perfect` a lot. This might be a reason for the similarity.
- bert retrieves songs of mostly the same genre, but no songs by the same artist.

### Results for (title="Encoder", artist="Pendulum")

In [52]:
# Side-by-side top-k lists of all retrieval systems for the third query in `songs`.
dfs[2]

Unnamed: 0,id_rand,artist_rand,song_rand,album_rand,score_rand,id_word2vec,artist_word2vec,song_word2vec,album_word2vec,score_word2vec,id_tf-idf,artist_tf-idf,song_tf-idf,album_tf-idf,score_tf-idf,id_bert,artist_bert,song_bert,album_bert,score_bert
0,5GJJXM3toU6ieQmK,Siouxsie and the Banshees,Mirage,The Scream,0.99999,xuvOIIAf7xHY4tvm,Joy Division,24 Hours,Last Order (Hd Remastered),0.926218,GjIaZcckFpTQqF8N,Wilco,Passenger Side,A.M.,0.419339,sUthByiziG0pdfZW,James Blunt,So Far Gone,Some Kind Of Trouble,0.679692
1,J4onmjAmjdnYYbpX,The Faint,Southern Belles In London Sing,Wet From Birth,0.999854,qysJrLPfxSCkdg9o,Emery,The Ponytail Parades,The Weak's End,0.925573,HshAaiNsDdV1W4nq,Rivermaya,214,Greatest Hits,0.395458,KNWTsGsMKrMUOZGP,Black Veil Brides,Knives and Pens,We Stitch These Wounds,0.6624
2,hWP1OD0kv7fJkiRm,The Strokes,Juicebox,First Impressions Of Earth,0.999471,f7nMTiV0NHDaPOIK,Phil Collins,Against All Odds,The Singles (Expanded),0.924615,Sr6cMgnJq7UmdnyL,Drowning Pool,Blindfold,Resilience (Deluxe),0.330584,PdcTvt7fWomuG5NV,The Dear Hunter,Whisper,Migrant (Deluxe),0.650715
3,0KEwSB8tPsFGrv8I,Silverstein,Brookfield,Short Songs,0.999463,fraglE0va6B3zUwB,Phil Collins,Against All Odds (Take a Look at Me Now),The Singles (Expanded),0.924312,mreAbgfRVLT1349Q,Toni Braxton,Let Me Show You the Way (Out),More Than A Woman,0.326977,h2ap0A9dSDkFmO8D,Alice in Chains,Your Decision,Black Gives Way To Blue,0.648354
4,i6GDfQmdAYYAqMvx,Lady Antebellum,Lookin' for a Good Time,Lady Antebellum,0.999157,0IHQDwO3FnNYE7VY,Mariah Carey,Against All Odds (Take a Look at Me Now),Greatest Hits,0.922265,EV3VRzYbYoFjmj1w,3 Doors Down,Still Alive,Us And The Night,0.317766,VHEVsKLSHv9FLNrx,Emilíana Torrini,Gollum's Song,The Lord Of The Rings: The Two Towers (Origina...,0.647456
5,eggqzHqY78lFBErr,L7,Monster,Bricks Are Heavy,0.999074,hSXdyzqVnjWnEZbQ,BTS,First Love,Wings,0.920089,tXJHzoQa51DdyOX7,Judas Priest,All the Way,Point Of Entry,0.311683,4lncHvBfKlBM4yzL,Kate Bush,Hammer Horror,Lionheart,0.646831
6,D67sHxcWGk1kJeiT,M.I.A.,Go Off,AIM (Deluxe),0.998985,HgImdPHY2r03vbTh,Dinosaur Jr.,Keep The Glove,Bug,0.918289,hPjLYCrpOH1xIdoi,The Alchemist,Hold You Down,1st Infantry,0.311585,FP3QItPNaS9TqPKm,Trapt,Headstrong,Trapt,0.642641
7,aLeiDS7Y2Tkqrvvd,Operation Ivy,Take Warning,Operation Ivy (2007 Remaster),0.998691,n2jEWkacDcvJ6eJ8,Stray Kids,I am YOU,I am YOU,0.918077,nDiuhKCc9z5DOBcD,The Cab,Moon,Lock Me Up,0.310115,JOXJtqBv7ptLmaK3,Breaking Benjamin,What Lies Beneath,Dear Agony,0.638531
8,DRiWwqJRaiJCz6r9,August Burns Red,40 Nights,Leveler (Deluxe Edition),0.998592,Q0FuvjirjmMi2Xy4,Dream Theater,The Bigger Picture,Dream Theater,0.917097,lLr42PhCGChOItPt,Zola Blood,Play Out,Play Out / Pieces of the Day,0.304793,gKHosWLDavB700UY,The Appleseed Cast,Fight Song,Two Conversations,0.634198
9,AIMISCWIplym2JNG,Kero Kero Bonito,Forever Summer Holiday,Forever Summer Holiday,0.998586,pAd7kQdenSBvW8Ml,Mineral,Parking Lot,The Power of Failing,0.916767,S7TjeaBqPK9kCnZG,Daron Malakian and Scars on Broadway,Never Forget,Dictator,0.302866,bUyESS0Y1RhTzjsM,Reba McEntire,Consider Me Gone,Keep On Loving You,0.62943


`(title="Encoder", artist="Pendulum")`

- We chose this song specifically because we wanted to see how the system would perform with electronic music. Electronic music often has very few lyrics, so we expected the system to have a very hard time finding similar songs for this query.
- word2vec retrieved the same song three times (`Against All Odds`, in two entries by Phil Collins and one by Mariah Carey, all with the same lyrics). `Against All Odds` probably has a lot of similar words or phrases in its lyrics, but other than that it is not very similar (different genre, different artist). word2vec also retrieved two K-Pop songs, which is a very different genre.
- With tf-idf, the results were also of a different genre and by different artists.
- With bert, we retrieved songs from many different genres, but no electronic music.
- These results show that the system has a hard time with electronic music. This gives us some evidence to strengthen our initial claim that it is because of the lyrics, or the lack thereof. However, the reason could also be a bias in the data: because we only have lyrics as features, electronic songs might be underrepresented.