# Read and Explore Data

In [1]:
from typing import List, Union, Callable
from dataclasses import dataclass

import pandas as pd
import numpy as np

In [2]:
def read(feature, h=None):
    """
    Load one tab-separated feature file from the local `data/` directory.

    Parameters:
    - feature: Middle part of the file name; the file read is
        `data/id_<feature>_mmsr.tsv`.
    - h: Forwarded to `pd.read_csv` as `header` (row number of the header
        line, or None if the file has no header).

    Returns:
    - DataFrame with the parsed file contents.
    """
    tsv_path = f"data/id_{feature}_mmsr.tsv"
    return pd.read_csv(tsv_path, delimiter="\t", header=h)


def embed_and_merge(df1, df2, col_name):
    """
    Collapse the feature columns of `df2` into one array-valued column and
    left-join it onto `df1` by "id".

    Unlike a naive in-place implementation, `df2` is NOT modified, so the cell
    calling this function can be re-run on the same frame and gives the same
    result every time.

    Parameters:
    - df1: Left dataframe; must contain an "id" column. All of its rows are kept.
    - df2: Feature dataframe with an "id" column; every other column is treated
        as one dimension of the feature vector.
    - col_name: Name of the new column holding one float numpy array per row.

    Returns:
    - New dataframe: `df1` plus `col_name`. Rows of `df1` whose id does not
      occur in `df2` get NaN in `col_name`.
    """
    # every column except the join key is one vector dimension (original order kept)
    embedding = df2.columns.difference(["id"], sort=False)
    # one bulk conversion instead of a Python-level call per row;
    # made C-contiguous so each row is a contiguous 1-D float array
    vectors = np.ascontiguousarray(df2[embedding].to_numpy(dtype=float))
    # build a fresh two-column frame so the caller's df2 stays untouched
    packed = df2[["id"]].copy()
    packed[col_name] = list(vectors)
    return pd.merge(df1, packed, left_on="id", right_on="id", how="left")

In [3]:
# read the base song table; row 0 of the file is used as the header
df = read("information", 0)

In [4]:
# read the BERT lyrics features and attach them to df as a single
# array-valued column "bert_embedding" (left join on "id")
bert = read("lyrics_bert", 0)
df = embed_and_merge(df, bert, "bert_embedding")

In [5]:
# read the word2vec lyrics features and attach them to df as a single
# array-valued column "word2vec_embedding" (left join on "id")
word2vec = read("lyrics_word2vec", 0)
df = embed_and_merge(df, word2vec, "word2vec_embedding")

In [6]:
# read the tf-idf term weights and attach them to df as a single
# array-valued column "tf-idf" (left join on "id")
tfidf_weighting = read("lyrics_tf-idf", 0)
df = embed_and_merge(df, tfidf_weighting, "tf-idf")

In [7]:
df

Unnamed: 0,id,artist,song,album_name,bert_embedding,word2vec_embedding,tf-idf
0,01Yfj2T3YTwJ1Yfy,We As Human,Take The Bullets Away (feat. Lacey Sturm),We As Human,"[0.0302475523203611, 0.0352500043809413, 0.010...","[0.0193592727054678, 0.0232394714425702, 0.028...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0818293914712727, ..."
1,01gyRHLquwXDlhkO,The Notorious B.I.G.,Somebody's Gotta Die,Life After Death (Remastered Edition),"[0.0084422621876001, 0.0302564185112714, 0.009...","[0.018537292381979, 0.0113115924403394, 0.0107...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,01rMxQv6vhyE1oQX,Against the Current,Chasing Ghosts,In Our Bones,"[0.0490818135440349, 0.0148476688191294, 0.001...","[0.0227837218553759, 0.0231641749730655, 0.012...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,02RGE9FNH65RtMS7,Barthezz,Infected,Trance - The Early Years (1997-2002),"[0.0445394963026046, 0.0214906893670558, 0.013...","[0.0381116103401342, 0.0278804157207017, 0.016...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,02ZnlCGZEbkfCDxo,Laura Pausini,Tra Te E Il Mare,The Best of Laura Pausini - E Ritorno Da Te,"[0.0514551289379596, 0.0297695714980363, -0.01...","[0.0182936789026777, -0.0064870788035669, 0.00...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.2413163920156013, ..."
...,...,...,...,...,...,...,...
10090,zyzILCQvVeUFIINi,Crowded House,When You Come,Temple Of Low Men,"[0.006713552866131, 0.0480893477797508, -0.001...","[0.0195101330379449, 0.0236336907562543, 0.011...","[0.0, 0.0, 0.079623055470056, 0.0, 0.0, 0.0, 0..."
10091,zzgS4ZqyswamEWNj,Britney Spears,My Only Wish (This Year),Platinum Christmas,"[0.0098905526101589, 0.0401467233896255, -0.02...","[0.0268563718791583, 0.0082648759004199, 0.011...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
10092,zzoFYDMlqU1X2zz1,Thundercat,DUI,Drunk,"[0.0101165119558572, 0.0388841480016708, -0.01...","[0.0051499218912795, 0.0028818239457905, 0.017...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
10093,zzpkRCGA5ud8q4mv,Otis Redding,Rock Me Baby,Otis Blue,"[-0.0166116580367088, 0.0266939438879489, -0.0...","[0.0370260450523346, 0.0159991827379498, -0.00...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10095 entries, 0 to 10094
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   id                  10095 non-null  object
 1   artist              10095 non-null  object
 2   song                10095 non-null  object
 3   album_name          10095 non-null  object
 4   bert_embedding      10095 non-null  object
 5   word2vec_embedding  10095 non-null  object
 6   tf-idf              10095 non-null  object
dtypes: object(7)
memory usage: 630.9+ KB


In [9]:
df

Unnamed: 0,id,artist,song,album_name,bert_embedding,word2vec_embedding,tf-idf
0,01Yfj2T3YTwJ1Yfy,We As Human,Take The Bullets Away (feat. Lacey Sturm),We As Human,"[0.0302475523203611, 0.0352500043809413, 0.010...","[0.0193592727054678, 0.0232394714425702, 0.028...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0818293914712727, ..."
1,01gyRHLquwXDlhkO,The Notorious B.I.G.,Somebody's Gotta Die,Life After Death (Remastered Edition),"[0.0084422621876001, 0.0302564185112714, 0.009...","[0.018537292381979, 0.0113115924403394, 0.0107...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,01rMxQv6vhyE1oQX,Against the Current,Chasing Ghosts,In Our Bones,"[0.0490818135440349, 0.0148476688191294, 0.001...","[0.0227837218553759, 0.0231641749730655, 0.012...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,02RGE9FNH65RtMS7,Barthezz,Infected,Trance - The Early Years (1997-2002),"[0.0445394963026046, 0.0214906893670558, 0.013...","[0.0381116103401342, 0.0278804157207017, 0.016...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,02ZnlCGZEbkfCDxo,Laura Pausini,Tra Te E Il Mare,The Best of Laura Pausini - E Ritorno Da Te,"[0.0514551289379596, 0.0297695714980363, -0.01...","[0.0182936789026777, -0.0064870788035669, 0.00...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.2413163920156013, ..."
...,...,...,...,...,...,...,...
10090,zyzILCQvVeUFIINi,Crowded House,When You Come,Temple Of Low Men,"[0.006713552866131, 0.0480893477797508, -0.001...","[0.0195101330379449, 0.0236336907562543, 0.011...","[0.0, 0.0, 0.079623055470056, 0.0, 0.0, 0.0, 0..."
10091,zzgS4ZqyswamEWNj,Britney Spears,My Only Wish (This Year),Platinum Christmas,"[0.0098905526101589, 0.0401467233896255, -0.02...","[0.0268563718791583, 0.0082648759004199, 0.011...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
10092,zzoFYDMlqU1X2zz1,Thundercat,DUI,Drunk,"[0.0101165119558572, 0.0388841480016708, -0.01...","[0.0051499218912795, 0.0028818239457905, 0.017...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
10093,zzpkRCGA5ud8q4mv,Otis Redding,Rock Me Baby,Otis Blue,"[-0.0166116580367088, 0.0266939438879489, -0.0...","[0.0370260450523346, 0.0159991827379498, -0.00...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


# Define the retrieval system (RS) and similarity/distance functions

In [10]:
def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """
    Cosine of the angle between two vectors.

    Parameters:
    - a: First vector.
    - b: Second vector, same length as `a`.

    Returns:
    - dot(a, b) / (||a|| * ||b||). If either vector has zero norm the cosine
      is undefined; 0.0 is returned in that case instead of NaN, so that
      all-zero vectors (e.g. a song without any weighted term) do not emit a
      RuntimeWarning and do not disturb a ranking built on these scores.
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        # 0/0 would be NaN; treat "no direction" as "no similarity"
        return 0.0
    return np.dot(a, b) / denominator

In [11]:
def dot_product(a: np.ndarray, b: np.ndarray) -> float:
    """
    Inner product of two vectors (not length-normalised).

    Parameters:
    - a: First vector.
    - b: Second vector, same length as `a`.

    Returns:
    - Result of `np.dot(a, b)`.
    """
    inner = np.dot(a, b)
    return inner

In [12]:
def manhattan_distance(a: np.ndarray, b: np.ndarray) -> float:
    """
    L1 (city-block) distance between two vectors.

    Note that this is a distance: smaller values mean the vectors are closer.

    Parameters:
    - a: First vector.
    - b: Second vector, same shape as `a`.

    Returns:
    - Sum of the absolute element-wise differences.
    """
    abs_diff = np.abs(a - b)
    return np.sum(abs_diff)

In [13]:
def euclidean_distance(a: np.ndarray, b: np.ndarray) -> float:
    """
    L2 (straight-line) distance between two vectors.

    Note that this is a distance: smaller values mean the vectors are closer.

    Parameters:
    - a: First vector.
    - b: Second vector, same shape as `a`.

    Returns:
    - Norm of the difference vector, as computed by `np.linalg.norm`.
    """
    difference = a - b
    return np.linalg.norm(difference)

In [14]:
@dataclass
class SongInfo:
    """Query key that identifies a song by its title and artist instead of by id."""
    # song title; RetrievalSystem.retrieve compares it for equality with the "song" column
    title: str
    # artist name; RetrievalSystem.retrieve compares it for equality with the "artist" column
    artist: str


class RetrievalSystem:
    """
    Similarity-based song retrieval over one vector-valued column of a dataframe.

    Every row of `df` is a song and the column named by `sim_feature` holds one
    feature vector per song. Candidates are ranked by
    `sim_metric(query_vector, candidate_vector)` in DESCENDING order, so
    `sim_metric` has to return larger values for more similar songs (a plain
    distance function would put the most distant songs first).
    """

    def __init__(
        self,
        df: pd.DataFrame,
        sim_metric: Callable = cosine_similarity,
        sim_feature: str = "bert_embedding",
    ):
        """
        Parameters:
        - df: Song dataframe. Must contain `sim_feature`; `retrieve` also reads
            the columns "id", "song" and "artist".
        - sim_metric: Callable taking two vectors and returning a score
            (higher = more similar).
        - sim_feature: Name of the column holding the feature vectors.

        Raises:
        - ValueError: If `sim_feature` is not a column of `df`.
        """
        self.df = df
        self.sim_metric = sim_metric
        self.sim_feature = sim_feature

        if self.sim_feature not in self.df.columns:
            raise ValueError(
                f"'{self.sim_feature}' not found in the dataframe columns."
            )

        # Precompute the stacked version of the feature
        # array(list(array)) -> 2d array, row i belongs to self.df.iloc[i]
        self.all_songs_stacked = np.vstack(self.df[self.sim_feature].values)

    def _calc_similarity(self, song: pd.Series, n: int = 5) -> pd.DataFrame:
        """
        Calculate the similarity of the given song with all songs in the dataset.

        Parameters:
        - song: Single row of the dataframe (a Series) representing the query song.
        - n: Number of top similar songs to retrieve.

        Returns:
        - Copy of the top n rows, most similar first, with an additional
          "similarity" column. The query song itself is not excluded.
        """
        song_vector = song[self.sim_feature]
        # one metric call per stored song; positions match self.df.iloc
        similarity = np.array(
            [
                self.sim_metric(song_vector, song_vec)
                for song_vec in self.all_songs_stacked
            ]
        )

        # np.argsort places NaN last, so after reversing NaN scores would lead
        # the ranking. Sort them as -inf instead so undefined scores end up at
        # the bottom; the reported similarity values stay untouched.
        sort_key = similarity
        if similarity.dtype.kind == "f":
            sort_key = np.where(np.isnan(similarity), -np.inf, similarity)

        top_n_indices = np.argsort(sort_key)[::-1][:n]
        # make pandas happy: work on a copy, no in-place modification of self.df
        top_n = self.df.iloc[top_n_indices].copy()
        top_n["similarity"] = similarity[top_n_indices]
        return top_n

    def random_baseline(
        self, query: Union[int, str, SongInfo], n: int = 5, random_state=None
    ) -> pd.DataFrame:
        """
        Retrieve random songs from the dataset.

        Parameters:
        - query: If int, row of df. If str, song_id. If SongInfo, title and artist.
            Not used in this method; accepted only so the call looks like `retrieve`.
        - n: Number of songs to retrieve.
        - random_state: Optional seed forwarded to `DataFrame.sample`. The
            default None draws a new random sample on every call.

        Returns:
        - DataFrame of n random songs (without embedding / tf-idf columns).
        """
        rand_n = self.df.sample(n=n, random_state=random_state)
        return self._remove_embeddings(rand_n)

    @staticmethod
    def _remove_embeddings(df: pd.DataFrame) -> pd.DataFrame:
        """
        Do not return columns with "embedding" or "tf-idf" in the name

        Args:
            df (pd.DataFrame): Dataframe to remove columns from

        Returns:
            pd.DataFrame: Dataframe without embedding and tf-idf columns,
                re-indexed from 0
        """
        return df.loc[:, ~df.columns.str.contains("embedding|tf-idf")].reset_index(
            drop=True
        )

    def retrieve(self, query: Union[int, str, SongInfo], n: int = 5) -> pd.DataFrame:
        """
        Retrieve the top n songs similar to the given query song.

        Parameters:
        - query: If int (Python or numpy integer), index label of the row in df.
            If str, song_id. If SongInfo, title and artist (first exact match).
        - n: Number of songs to retrieve.

        Returns:
        - DataFrame of top n similar songs with a "similarity" column and
          without the embedding / tf-idf columns.

        Raises:
        - ValueError: If the query has an unsupported type or no matching song
          exists in the dataset.
        """
        if isinstance(query, (int, np.integer)):
            # integer queries address a row by its index label
            if query not in self.df.index:
                raise ValueError(f"Song id {query} not in the dataset.")
            song = self.df.loc[query]
        elif isinstance(query, str):
            # string queries address a row by its "id" value
            matches = self.df[self.df["id"] == query]
            if matches.empty:
                raise ValueError(f"Song id {query} not in the dataset.")
            song = matches.iloc[0]
        elif isinstance(query, SongInfo):
            title, artist = query.title, query.artist
            song = self.df[(self.df["song"] == title) & (self.df["artist"] == artist)]
            if song.empty:
                raise ValueError(
                    f"Song with title '{title}' and artist '{artist}' not found in the dataset."
                )
            song = song.iloc[0]
        else:
            raise ValueError(
                "Invalid query type. Provide either song_id (int/str) or an instance of SongInfo."
            )

        top_n = self._calc_similarity(song, n=n)
        # from the assignment, it is not 100% clear what to return --> return all except embeddings
        return self._remove_embeddings(top_n)

## Random Baseline

In [15]:
# default configuration (cosine similarity on "bert_embedding");
# this instance is only used for the random baseline below
rs = RetrievalSystem(df)

In [16]:
rs.random_baseline(1)

Unnamed: 0,id,artist,song,album_name
0,kSDPUwWDCOkb8wOX,Cutting Crew,(I Just) Died in Your Arms,The Best Of Cutting Crew
1,rbw6CccHFvjcDTfV,Aerosmith,Lay It Down,The Essential Aerosmith
2,O7ZMaHhYYXqSDbaM,Grimes,Pretty Dark (Demo),Pretty Dark (Demo)
3,EhdIUt6FI5kchomy,Kimbra,Two Way Street,Vows
4,Wo7ttQito032fSkb,Willie Colón,Que Lio,The Hustler


In [17]:
rs.random_baseline(1)

Unnamed: 0,id,artist,song,album_name
0,HffyNf2eyQXnch8S,Miss May I,Masses Of A Dying Breed,Monument
1,oyF09e28NB4YgFkU,Nile,Smashing the Antiu,Amongst the Catacombs of Nephren-Ka
2,DiGxMKulgv7EFris,Texas,Let's Work It Out,Let's Work It Out
3,PidolWc530MvkzXy,REO Speedwagon,Roll with the Changes,"You Can Tune a Piano, But You Can't Tuna Fish"
4,cxQ8ubQsI3P1L41V,Easy Star All-Stars,Let Down,Radiodread


Indeed, the system produces new results for each query/run.

In [18]:
rs.random_baseline("01gyRHLquwXDlhkO", n=10)

Unnamed: 0,id,artist,song,album_name
0,pBW4hSMcAIFt7dnh,Tom Waits,Talking at the Same Time,Bad As Me (Deluxe Edition Remastered)
1,diJdQUJHfVOl3LVf,Ulver,Blinded By Blood,Blood Inside
2,qcTRoUCsoFooCVsD,Jellyfish,Joining A Fan Club,Spilt Milk
3,eGzbWXhQffOPbPwt,At the Drive-In,Catacombs,Relationship Of Command
4,KYfT1CK8m0esKr0N,John Lennon,Woman,Double Fantasy Stripped Down
5,A2xv1wUjyFo2G4qN,Lush,Laura,Spooky
6,1PMNW9sRMoFJVfU3,Magic!,Let Your Hair Down,Don't Kill the Magic
7,ldQsK4wsh2IaeGe9,Selena Gomez & Demi Lovato,One and the Same,Disney Channel Playlist
8,I7xwlyqUByL76Wpu,The Allman Brothers Band,Midnight Rider,Idlewild South (Deluxe Edition Remastered)
9,NNB2CFD4cHOS0dZp,Migos,Fight Night,No Label II


In [19]:
rs.random_baseline("01gyRHLquwXDlhkO", n=10)

Unnamed: 0,id,artist,song,album_name
0,OIVkRwA7EgcS0tZp,Lara Fabian,Love by Grace,Lara Fabian
1,cZzaCPaGM1bWPU9z,Robin Bengtsson,I Can't Go On,I Can't Go On
2,C7cP4Tb3y713OCkZ,Korpiklaani,Ievan polkka,Manala
3,04xUDjAYC14jsHyH,Jawbreaker,Jinx Removing,24 Hour Revenge Therapy (Remastered)
4,Yt5SsMd0HhEIXiaN,Kelis,Till the Wheels Fall Off,Kelis Was Here
5,q1DA6ytqomlWnFdM,Muse,The 2nd Law: Unsustainable,The 2nd Law
6,7ZhsfreCAjHa8Nnh,Rosa de Saron,Linda Menina,Acústico
7,Ecw9XVEBc3ejMnAb,Heidi Montag,Love It or Leave It,Superficial
8,ARfBHsLlhLprEt6U,Wilson Pickett,Mustang Sally,Wilson Pickett: A Man And A Half
9,t79ZBHGAN1A2NbOg,Rush,The Enemy Within,Grace Under Pressure


## Text-based (cos-sim, tf-idf)

In [20]:
# rank songs by cosine similarity of their tf-idf vectors
rs_cosine = RetrievalSystem(df, cosine_similarity, "tf-idf")

In [21]:
rs_cosine.retrieve(1)

Unnamed: 0,id,artist,song,album_name,similarity
0,01gyRHLquwXDlhkO,The Notorious B.I.G.,Somebody's Gotta Die,Life After Death (Remastered Edition),1.0
1,74k8qdan0o4DFa7L,Lil' Wayne,Megaman,Tha Carter IV (Deluxe),0.50826
2,wiw2rM2xb4CJ5sVP,Kanye West,Never Let Me Down,The College Dropout,0.504759
3,2rQiu54zZBs5Dmmi,2Pac,Heartz Of Men,All Eyez On Me,0.485196
4,8ptSfMLrxpbfRaxl,Nas,The Genesis,Illmatic,0.445888


In [22]:
# check reproducibility
rs_cosine.retrieve(1)

Unnamed: 0,id,artist,song,album_name,similarity
0,01gyRHLquwXDlhkO,The Notorious B.I.G.,Somebody's Gotta Die,Life After Death (Remastered Edition),1.0
1,74k8qdan0o4DFa7L,Lil' Wayne,Megaman,Tha Carter IV (Deluxe),0.50826
2,wiw2rM2xb4CJ5sVP,Kanye West,Never Let Me Down,The College Dropout,0.504759
3,2rQiu54zZBs5Dmmi,2Pac,Heartz Of Men,All Eyez On Me,0.485196
4,8ptSfMLrxpbfRaxl,Nas,The Genesis,Illmatic,0.445888


In [23]:
rs_cosine.retrieve("xUjqXrsiCLb8tPkC", n=10)

Unnamed: 0,id,artist,song,album_name,similarity
0,xUjqXrsiCLb8tPkC,Lena,Satellite,My Cassette Player,1.0
1,TEYr2jFKqLiibyI7,Sophie Ellis-Bextor,Love It Is Love,Shoot from the Hip,0.636165
2,FoenRGxt8ED5UDj8,Danger,11h30,French Attack!,0.604088
3,r9jKmYrPBtJRgg2k,The Weeknd,Can't Feel My Face - Martin Garrix Remix,Can't Feel My Face (Martin Garrix Remix),0.603747
4,ISH3Weh5lSybSTNb,Wings,Silly Love Songs,Wings At The Speed Of Sound,0.591854
5,4fNOkOQVsp2lkBen,Alicia Keys,Fallin',Songs In A Minor (Expanded Edition),0.590371
6,ptSQMoK9d8CydnHP,Phoenix,Chloroform,Bankrupt!,0.585183
7,U0fgnyiJBI9Ypbsj,Mika,Lollipop,Life in Cartoon Motion,0.573458
8,t8M8yJHyRlalRn4E,Hombres G,Te Quiero,Las baladas (Los singles vol II),0.565957
9,zI9DjkxOs5x1EhJQ,Anna Calvi,No More Words,Anna Calvi,0.562936


In [24]:
rs_cosine.retrieve("xUjqXrsiCLb8tPkC", n=3)

Unnamed: 0,id,artist,song,album_name,similarity
0,xUjqXrsiCLb8tPkC,Lena,Satellite,My Cassette Player,1.0
1,TEYr2jFKqLiibyI7,Sophie Ellis-Bextor,Love It Is Love,Shoot from the Hip,0.636165
2,FoenRGxt8ED5UDj8,Danger,11h30,French Attack!,0.604088


In [25]:
sample_song = SongInfo(title="Always", artist="Bon Jovi")

rs_cosine.retrieve(sample_song, n=10)

Unnamed: 0,id,artist,song,album_name,similarity
0,8nyPt8gB1K8g5FNv,Bon Jovi,Always,Cross Road,1.0
1,GuH9cRalvkhXvmaZ,Switchfoot,Always,Hello Hurricane,0.483889
2,yP56QH0tXHXsDNF8,Whitney Houston,I Will Always Love You,The Bodyguard - Original Soundtrack Album,0.483217
3,h47nxyjyRPFroIaQ,k.d. lang,Constant Craving,Recollection,0.472977
4,muLitfTmi9qTxcFV,HammerFall,Always Will Be,Gates Of Dalhalla,0.458007
5,7av2576enFL1qIFt,Sade,You're Not The Man,Promise,0.455112
6,d0ANC4yI7hpiCzPe,Taylor Dayne,I'll Always Love You,Tell It to My Heart (Expanded Edition),0.424737
7,4TzvRc17Z1Rn6LWj,Sophie Zelmani,Always You,My Best Friend's Wedding: Music From The Motio...,0.423746
8,aIJRXTjxYPzwX2Y7,Ace of Base,"Always Have, Always Will",Flowers (Remastered),0.421836
9,BRT3Xamcitwly4w6,Forfun,Pra Sempre,Alegria Compartilhada,0.418782


## Text-based (cos-sim, "feature")

We use BERT embeddings.

In [26]:
# rank songs by cosine similarity of their BERT embeddings
rs_bert = RetrievalSystem(df, cosine_similarity, "bert_embedding")

In [27]:
rs_bert.retrieve(1)

Unnamed: 0,id,artist,song,album_name,similarity
0,01gyRHLquwXDlhkO,The Notorious B.I.G.,Somebody's Gotta Die,Life After Death (Remastered Edition),1.0
1,jj55fEqkySIv4qy1,YNW Melly,Murder On My Mind,I AM YOU,0.75099
2,OokmnsloeW1Sh3NF,Freeway,What We Do,Philadelphia Freeway,0.731522
3,jJEgEAF2iuW4yk9g,Danny Brown,When It Rain,Atrocity Exhibition,0.729569
4,M65mU1UIozrDxcvu,Bad Meets Evil,Fast Lane,Hell: The Sequel (Deluxe),0.720946


In [28]:
# check reproducibility
rs_bert.retrieve(1)

Unnamed: 0,id,artist,song,album_name,similarity
0,01gyRHLquwXDlhkO,The Notorious B.I.G.,Somebody's Gotta Die,Life After Death (Remastered Edition),1.0
1,jj55fEqkySIv4qy1,YNW Melly,Murder On My Mind,I AM YOU,0.75099
2,OokmnsloeW1Sh3NF,Freeway,What We Do,Philadelphia Freeway,0.731522
3,jJEgEAF2iuW4yk9g,Danny Brown,When It Rain,Atrocity Exhibition,0.729569
4,M65mU1UIozrDxcvu,Bad Meets Evil,Fast Lane,Hell: The Sequel (Deluxe),0.720946


In [29]:
rs_bert.retrieve("xUjqXrsiCLb8tPkC", n=10)

Unnamed: 0,id,artist,song,album_name,similarity
0,xUjqXrsiCLb8tPkC,Lena,Satellite,My Cassette Player,1.0
1,KJyVTXfwdjUGtj5m,Boyzone,Love Is A Hurricane,Brother,0.838821
2,vMluKVsjLFKZEmdg,Rose Royce,Best Love,Essential - Soul Love,0.816716
3,uRNge3sN74NdIaYi,Pixie Lott,My Love,Turn It Up,0.809644
4,8nyPt8gB1K8g5FNv,Bon Jovi,Always,Cross Road,0.809535
5,w64KwSB7npnE8hb2,Feist,Inside and Out,Let It Die,0.809321
6,gkaNGhUTCZzICBcE,Frankie Valli,Can't Take My Eyes Off You,Relaxing Classical Playlist: Chilled Music for...,0.808088
7,DOlk8ct9zA9uIFX0,Selena Gomez & The Scene,A Year Without Rain,A Year Without Rain,0.807728
8,8xFD1UO8nr1qwOg6,Ed Sheeran,Thinking Out Loud,x (Deluxe Edition),0.804023
9,gX0rgtPnKPOD9qhT,Mariah Carey,To Be Around You,Emotions,0.802836


In [30]:
rs_bert.retrieve("xUjqXrsiCLb8tPkC", n=3)

Unnamed: 0,id,artist,song,album_name,similarity
0,xUjqXrsiCLb8tPkC,Lena,Satellite,My Cassette Player,1.0
1,KJyVTXfwdjUGtj5m,Boyzone,Love Is A Hurricane,Brother,0.838821
2,vMluKVsjLFKZEmdg,Rose Royce,Best Love,Essential - Soul Love,0.816716


In [31]:
sample_song = SongInfo(title="Always", artist="Bon Jovi")

rs_bert.retrieve(sample_song, n=10)

Unnamed: 0,id,artist,song,album_name,similarity
0,8nyPt8gB1K8g5FNv,Bon Jovi,Always,Cross Road,1.0
1,g5Y3fz7C9EnMXdMA,Lana Del Rey,Blue Jeans (Gesaffelstein Remix),Blue Jeans Remixes,0.861671
2,G2otV6WAmea6VB1f,Steps,Heartbeat,Step One,0.855285
3,DzOzu2jdYJQY7xH9,Mariah Carey,Whenever You Call,Butterfly,0.850007
4,ire6sDD2ryFx62Vx,Supertramp,My Kind Of Lady,Famous Last Words (Remastered),0.846296
5,RvExtv1mIVglWrLc,Amber Pacific,Gone So Young,The Possibility and the Promise,0.845325
6,8xFD1UO8nr1qwOg6,Ed Sheeran,Thinking Out Loud,x (Deluxe Edition),0.844703
7,qbNvLKSCnMbHHD26,Atomic Kitten,If You Come to Me,Ladies Night,0.839742
8,C9TfpaaJWG6ZbRmg,Oh Land,Love You Better,Wishbone,0.832535
9,WYDhkPUgu71CQnF2,Shania Twain,Forever and for Always,Up!,0.832055


## Text-based ("similarity", "feature")
Finally, we use word2vec embeddings as features and the plain dot product as the similarity metric.

In [32]:
# rank songs by the (unnormalised) dot product of their word2vec embeddings
rs_w2v = RetrievalSystem(df, dot_product, "word2vec_embedding")

In [33]:
rs_w2v.retrieve(1)

Unnamed: 0,id,artist,song,album_name,similarity
0,kqVWk6G25ortikfF,Tame Impala,Beverly Laurel,Lonerism B-Sides & Remixes,0.20698
1,VJXAAxF84r7eg4bf,Blur,I Know,Leisure (Special Edition),0.200881
2,9ziWMjkmNbJKtU8Y,Tame Impala,Nangs,Currents,0.200703
3,PqjX8Jm9lA7ogAe4,Solange,Time (Is),When I Get Home,0.200372
4,Qh5XjFCInkeOAYzu,Supergrass,Time,I Should Coco,0.200032


In [34]:
# check reproducibility
rs_w2v.retrieve(1)

Unnamed: 0,id,artist,song,album_name,similarity
0,kqVWk6G25ortikfF,Tame Impala,Beverly Laurel,Lonerism B-Sides & Remixes,0.20698
1,VJXAAxF84r7eg4bf,Blur,I Know,Leisure (Special Edition),0.200881
2,9ziWMjkmNbJKtU8Y,Tame Impala,Nangs,Currents,0.200703
3,PqjX8Jm9lA7ogAe4,Solange,Time (Is),When I Get Home,0.200372
4,Qh5XjFCInkeOAYzu,Supergrass,Time,I Should Coco,0.200032


In [35]:
rs_w2v.retrieve("xUjqXrsiCLb8tPkC", n=10)

Unnamed: 0,id,artist,song,album_name,similarity
0,alViVmyWPgD1TNjV,Amy Winehouse,To Know Him Is to Love Him,Back To Black: B-Sides,0.25823
1,Qh5XjFCInkeOAYzu,Supergrass,Time,I Should Coco,0.257726
2,FoenRGxt8ED5UDj8,Danger,11h30,French Attack!,0.256744
3,bIDVY7FbsuHTgV50,Bent,To Be Loved,Intercept! Deluxe Edition,0.256257
4,9ziWMjkmNbJKtU8Y,Tame Impala,Nangs,Currents,0.251869
5,VJXAAxF84r7eg4bf,Blur,I Know,Leisure (Special Edition),0.249743
6,kqVWk6G25ortikfF,Tame Impala,Beverly Laurel,Lonerism B-Sides & Remixes,0.248669
7,wthZJWSU9CZmQ8XX,September,Because I Love You,September,0.245535
8,4XJZFt2CYHjejWh7,Avicii,Levels - Radio Edit,Sommer 2019,0.243641
9,oaSlacL5oON2ueP8,Orchestral Manoeuvres in the Dark,(Forever) Live and Die,The Pacific Age,0.241794


In [36]:
rs_w2v.retrieve("xUjqXrsiCLb8tPkC", n=3)

Unnamed: 0,id,artist,song,album_name,similarity
0,alViVmyWPgD1TNjV,Amy Winehouse,To Know Him Is to Love Him,Back To Black: B-Sides,0.25823
1,Qh5XjFCInkeOAYzu,Supergrass,Time,I Should Coco,0.257726
2,FoenRGxt8ED5UDj8,Danger,11h30,French Attack!,0.256744


In [37]:
sample_song = SongInfo(title="Always", artist="Bon Jovi")

rs_w2v.retrieve(sample_song, n=10)

Unnamed: 0,id,artist,song,album_name,similarity
0,9ziWMjkmNbJKtU8Y,Tame Impala,Nangs,Currents,0.242612
1,VJXAAxF84r7eg4bf,Blur,I Know,Leisure (Special Edition),0.23092
2,oaSlacL5oON2ueP8,Orchestral Manoeuvres in the Dark,(Forever) Live and Die,The Pacific Age,0.229072
3,kqVWk6G25ortikfF,Tame Impala,Beverly Laurel,Lonerism B-Sides & Remixes,0.228911
4,Qh5XjFCInkeOAYzu,Supergrass,Time,I Should Coco,0.224972
5,Y6S6YPEyfK5F8TMP,Blur,High Cool,Leisure,0.222764
6,alViVmyWPgD1TNjV,Amy Winehouse,To Know Him Is to Love Him,Back To Black: B-Sides,0.22207
7,TvyK48kBK3S8XOXD,Fatboy Slim,Going Out of My Head,The Greatest Hits: Why Try Harder,0.219967
8,qq3el8xwdckarmlh,The Brian Jonestown Massacre,Wasted,Methodrone,0.217051
9,Nblp573S43B1TMzx,Gipsy Kings,Tu Quieres Volver,Cantos de Amor / Love Songs,0.216332


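The results already seem a lot less reasonable: the query song itself no longer ranks first, and the same few songs (e.g. "Nangs", "I Know", "Beverly Laurel") appear at the top for every query. This is what we would expect from the dot product, which is not length-normalised, so vectors with a large norm score highly against any query. However, for a full comparison, we should only change one variable at a time (here both the feature and the similarity metric differ from the previous section). Still, the results are interesting to look at.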