# Read and Explore Data

In [1]:
import sys

sys.path.append("../")

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from typing import Union, Tuple
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

In [2]:
from task1.retrieval_system import RetrievalSystem, SongInfo
from task1.similarity_measure import (
    cosine_similarity,
    dot_product,
    manhattan_distance,
    euclidean_distance,
    random_similarity,
)
from utils import read, embed_and_merge

In [3]:
# basic song information from task 1
df = read("information", 0)
df

Unnamed: 0,id,artist,song,album_name
0,01Yfj2T3YTwJ1Yfy,We As Human,Take The Bullets Away (feat. Lacey Sturm),We As Human
1,01gyRHLquwXDlhkO,The Notorious B.I.G.,Somebody's Gotta Die,Life After Death (Remastered Edition)
2,01rMxQv6vhyE1oQX,Against the Current,Chasing Ghosts,In Our Bones
3,02RGE9FNH65RtMS7,Barthezz,Infected,Trance - The Early Years (1997-2002)
4,02ZnlCGZEbkfCDxo,Laura Pausini,Tra Te E Il Mare,The Best of Laura Pausini - E Ritorno Da Te
...,...,...,...,...
10090,zyzILCQvVeUFIINi,Crowded House,When You Come,Temple Of Low Men
10091,zzgS4ZqyswamEWNj,Britney Spears,My Only Wish (This Year),Platinum Christmas
10092,zzoFYDMlqU1X2zz1,Thundercat,DUI,Drunk
10093,zzpkRCGA5ud8q4mv,Otis Redding,Rock Me Baby,Otis Blue


In [4]:
# add genre information for metric calculation
genres = read("genres", 0)
# convert genre to actual list via eval
genres["genre"] = genres["genre"].apply(eval).apply(set)
df = df.merge(genres, on="id", how="left")

We load one new feature and the features from the previous assignments. We need them for comparison and fusion.

In [5]:
visual_feature = "resnet"
stats = read(visual_feature, 0)
df = embed_and_merge(df, stats, visual_feature)

for audio_feature in ["mfcc_bow", "blf_spectral", "ivec256", "musicnn"]:
    stats = read(audio_feature, 0)
    df = embed_and_merge(df, stats, audio_feature)

for text_feature in ["lyrics_bert", "lyrics_word2vec", "lyrics_tf-idf"]:
    stats = read(text_feature, 0)
    df = embed_and_merge(df, stats, text_feature.split("_")[1])

In [6]:
df.isna().sum()

id              0
artist          0
song            0
album_name      0
genre           1
resnet          1
mfcc_bow        1
blf_spectral    1
ivec256         1
musicnn         1
bert            0
word2vec        0
tf-idf          0
dtype: int64

In [7]:
# data for task 2 does not include the item with id "03Oc9WeMEmyLLQbj" = row 5
df = df.drop(5)
df = df.reset_index()

# Define retrieval systems

## From Task 1 (text-based)

In [8]:
rs_random = RetrievalSystem(
    df=df,
    sim_metric=random_similarity,
)

In [9]:
rs_cos_tdidf = RetrievalSystem(
    df=df,
    sim_metric=cosine_similarity,
    sim_feature="tf-idf",
)

In [10]:
rs_cos_bert = RetrievalSystem(
    df=df,
    sim_metric=cosine_similarity,
    sim_feature="bert",
)

In [11]:
rs_dot_w2v = RetrievalSystem(
    df=df,
    sim_metric=dot_product,
    sim_feature="word2vec",
)

## From Task 2 (audio-based)

In [12]:
rs_cos_mfcc = RetrievalSystem(
    df=df,
    sim_metric=cosine_similarity,
    sim_feature="mfcc_bow",
)

In [13]:
rs_cos_blf = RetrievalSystem(
    df=df,
    sim_metric=cosine_similarity,
    sim_feature="blf_spectral",
)

In [14]:
rs_cos_ivec256 = RetrievalSystem(
    df=df,
    sim_metric=cosine_similarity,
    sim_feature="ivec256",
)

In [15]:
rs_cos_dnn = RetrievalSystem(
    df=df,
    sim_metric=cosine_similarity,
    sim_feature="musicnn",
)

## From Task 3 (video-based; new!)


In [16]:
rs_cos_resnet = RetrievalSystem(
    df=df,
    sim_metric=cosine_similarity,
    sim_feature="resnet",
)

## Fusion Techniques

### Early Fusion
In this section we will perform early fusion of 2 features by:
- Concatenating two features
- Normalizing the aggregated feature
- Defining a retrieval system on the aggregated feature


In [17]:
def concat_features(first_feature, second_feature):
    # Concat features to form aggregated feature
    first = df[first_feature]
    second = df[second_feature]

    combined_features = pd.concat([first, second], axis=1)

    combined_features['aggr_feature'] = combined_features.apply(lambda row: np.concatenate(row), axis=1)

    print(f"Number of columns in the first feature: {len(combined_features.iloc[0, 0])}")
    print(f"Number of columns in the second feature: {len(combined_features.iloc[0, 1])}")
    print(f"Number of columns in the combined features: {len(combined_features.iloc[0, 2])}")

    # returns dataframe with first, second and combined feature
    return combined_features

In [18]:
def scale_feature(combined_features):
    # scale features to mean=0, sd=1
    scaler = StandardScaler()
    arr= []

    # convert to arr where feature values are columns and rows are samples
    for row in combined_features["aggr_feature"]:
        arr.append(row)
    arr =  np.array(arr)

    # fit scaler to whole arr
    scaler.fit(arr)

    # transform aggregated feature
    for row in combined_features["aggr_feature"]:
        scaler.transform(row.reshape(1, -1))

    # returns dataframe containing only the scaled feature
    return combined_features["aggr_feature"]

In [19]:
def early_fusion(first_feature, second_feature):
    # concat
    features = concat_features(first_feature, second_feature)
    # scale
    aggr_feature = scale_feature(features)

    # Add aggregated feature to the dataframe
    df["early_fusion"] = aggr_feature

    # And define new retrieval system instance for aggregated feature
    new_rs = RetrievalSystem(
        df=df,
        sim_metric=cosine_similarity,
        sim_feature="early_fusion",
    )
    # returns retrieval system instance with early fusion
    return new_rs


In [20]:
# define new RS
rs_cos_early_fusion = early_fusion("bert", "musicnn")

Number of columns in the first feature: 768
Number of columns in the second feature: 50
Number of columns in the combined features: 818


### Late Fusion
In this section we will perform late fusion of 2 retrieval systems using score aggregation by:
- precomputing all retrievals and their similarities for chosen retrieval systems.
- checking statistical compatability of scores
- fusing systems via score average

In [21]:
def pre_compute_retrievals(first_rs, second_rs):
    # pre compute retrievals
    res = []
    items_to_consider = 100

    for system in tqdm([first_rs, second_rs], desc="Precomputing systems"):
        sim_matrix = df[["id", "song", "artist"]].copy()
        sim_matrix["id_n"] = None
        sim_matrix["song_n"] = None
        sim_matrix["artist_n"] = None
        sim_matrix["sim_n"] = None
        sim_matrix["genre"] = None

        for idx, song_id in tqdm(enumerate(df["id"]), total=len(df["id"]), desc=f"Retrieving songs"):
            sim = system.retrieve(song_id, items_to_consider)
            # save relevant information for top_n returns
            sim_matrix.loc[idx]["id_n"] = sim["id"]
            sim_matrix.loc[idx]["song_n"] = sim["song"]
            sim_matrix.loc[idx]["artist_n"] = sim["artist"]
            sim_matrix.loc[idx]["sim_n"] = sim["similarity"]
            sim_matrix.loc[idx]["genre"] = sim["genre"]

        res.append(sim_matrix)

    # returns list with 2 dataframes containing top_n retrievals for each song
    return res

In [22]:
def check_stat(res):
    # check compatability of top-n scores
    for i, sim_ma in enumerate(["first_rs", "second_rs"]):
        arr = []
        for row in res[i]["sim_n"]:
            arr.append(row)
        arr =  np.array(arr)
        print(f"\nStatistics for scores of {str(sim_ma)}:")
        print(f"  Mean: {arr.mean().mean()}")
        print(f"  Standard Deviation: {arr.std().mean()}")

In [23]:
# late fusion/ retrieval system
class LateFusion(RetrievalSystem):

    """
    By extending the RS class and overriding the retrieve method,
    we can utilize the testing pipeline from task2.
    As the pipeline only calls RS.retrieve which would then call other functions within retrieve.
    For late fusion we precomputed everything thus the function retrieve really only retrieves.
    """
    def __init__(self, first_rs: RetrievalSystem, second_rs: RetrievalSystem, df: pd.DataFrame):
        super().__init__(df)
        self.res = pre_compute_retrievals(first_rs, second_rs)
        check_stat(self.res)
    def retrieve(self, query,  n: int = 10):
        # retrieval process of song like in class rs
        if isinstance(query, (int, str)):
            song_id = query
            if song_id not in self.df["id"].values:
                raise ValueError(f"Song id {song_id} not in the dataset.")
            song = (self.df.loc[song_id]
                if isinstance(song_id, int)
                else self.df[self.df["id"] == song_id].iloc[0]
            )
        elif isinstance(query, SongInfo):
            title, artist = query.title, query.artist
            song = self.df[(self.df["song"] == title) & (self.df["artist"] == artist)]
            if song.empty:
                raise ValueError(
                    f"Song with title '{title}' and artist '{artist}' not found in the dataset."
                )
            song = song.iloc[0]
        else:
            raise ValueError(
                "Invalid query type. Provide either song_id (int/str) or an instance of SongInfo."
            )

        # Instead of calc sim, we use late fusion and our precomputed res
        system_one = self.res[0][self.res[0]['id'] == song["id"]]
        system_two = self.res[1][self.res[1]['id'] == song["id"]]
        merge = pd.concat([system_one, system_two], ignore_index=True)

        top_n = pd.DataFrame(columns=["id", "song", "artist", "genre", "similarity"])
        score_dict = {}
        for n_id, n_song, n_artist, n_genre, n_scores in zip(merge["id_n"],
                                                    merge["song_n"],
                                                    merge["artist_n"],
                                                    merge["genre"],
                                                    merge["sim_n"]):
            for track_id, track, artist, genre, score in zip(n_id,
                                                      n_song,
                                                      n_artist,
                                                      n_genre,
                                                      n_scores):
                if track_id not in score_dict.keys():
                    score_dict[track_id] = score
                    new_row = {"id": track_id, 'song': track, 'artist': artist, "genre": genre, 'similarity': score}
                    top_n.loc[len(top_n)] = new_row
                else:
                    score_dict[track_id] += score
                    score_dict[track_id] /= 2
                    top_n.loc[top_n["id"] == track_id, "similarity"] = score_dict[track_id]


        top_n = top_n.sort_values(by="similarity", ascending=False)
        top_n = top_n.iloc[:n]

        return top_n


In [31]:
# Create new instance of late fusion RS
late_fusion = LateFusion(first_rs=rs_cos_dnn,
                         second_rs=rs_cos_bert,
                         df=df)


Precomputing systems:   0%|          | 0/2 [00:00<?, ?it/s]

Retrieving songs:   0%|          | 0/10094 [00:00<?, ?it/s]

Retrieving songs:   0%|          | 0/10094 [00:00<?, ?it/s]


Statistics for scores of first_rs:
  Mean: 0.9586202066257871
  Standard Deviation: 0.03826152450783243

Statistics for scores of second_rs:
  Mean: 0.6447264367486122
  Standard Deviation: 0.07763360100333433


In [32]:
sample_song = SongInfo(title="Always", artist="Bon Jovi")
late_fusion.retrieve(sample_song)

Unnamed: 0,id,song,artist,genre,similarity
0,7s9HbLG6ol1RTlLv,Spaceman,4 Non Blondes,"{grunge, indie rock, pop rock, pop, modern roc...",0.98515
1,cSz91lcJDuziR5nx,Everything Will Flow,Suede,"{glam rock, alternative pop, britpop, easy lis...",0.982254
2,LuyxeVk8b2tEEmwk,Dulce Soledad,Enjambre,{alternative rock},0.975917
3,QcoCbyve5S025Eox,Are You Still Mad,Alanis Morissette,"{singer songwriter, hard rock, pop rock, pop, ...",0.974529
4,KfCJWRrBAcPjxJNM,The One I Love,R.E.M.,"{soft rock, grunge, singer songwriter, indie r...",0.974124
6,cnLuNtwtIxId4eKd,Acid Rain,Avenged Sevenfold,"{metal, symphonic metal, hard rock, doom metal...",0.970998
7,iLRK4vyFeDJ5qM7x,Pain,Blackfield,"{art pop, emo, singer songwriter, art rock, po...",0.969246
8,mH2suhduIEa58xdJ,"Love, Save The Empty",Erin McCarley,"{piano rock, soft rock, soundtrack, easy liste...",0.96922
9,h2ap0A9dSDkFmO8D,Your Decision,Alice in Chains,"{metal, sludge metal, grunge, alternative meta...",0.96867
10,HrZQyMUA0enusyuh,Micro Cuts,Muse,"{metal, space rock, britpop, hard rock, art ro...",0.966543


## Combine all systems

In [33]:
# create pd dataframe consisting of all retrieval systems, with metric and feature
rs = pd.DataFrame(
    [
        ["random", "random", rs_random],
        ["cosine", "tf-idf", rs_cos_tdidf],
        ["cosine", "bert", rs_cos_bert],
        ["dot", "word2vec", rs_dot_w2v],
        ["cosine", "mfcc_bow", rs_cos_mfcc],
        ["cosine", "blf_spectral", rs_cos_blf],
        ["cosine", "ivec256", rs_cos_ivec256],
        ["cosine", "musicnn", rs_cos_dnn],
        ["cosine", "resnet", rs_cos_resnet],
        ["cosine", "early_fusion", rs_cos_early_fusion],
        ["score aggregation", "late_fusion", late_fusion],

    ],
    columns=["metric", "feature", "rs_object"],
)

# Evaluation

In [34]:
from pipeline import Pipeline

evaluate = Pipeline(rs, genres)

Creating genre overlap matrix:   0%|          | 0/10094 [00:00<?, ?it/s]

Creating result lists for every rs (max_k=100):   0%|          | 0/11 [00:00<?, ?it/s]

loaded results for system.metric='random', system.feature='random' from "results/random_random_results_100.npy"
loaded results for system.metric='cosine', system.feature='tf-idf' from "results/cosine_tf-idf_results_100.npy"
loaded results for system.metric='cosine', system.feature='bert' from "results/cosine_bert_results_100.npy"
loaded results for system.metric='dot', system.feature='word2vec' from "results/dot_word2vec_results_100.npy"
loaded results for system.metric='cosine', system.feature='mfcc_bow' from "results/cosine_mfcc_bow_results_100.npy"
loaded results for system.metric='cosine', system.feature='blf_spectral' from "results/cosine_blf_spectral_results_100.npy"
loaded results for system.metric='cosine', system.feature='ivec256' from "results/cosine_ivec256_results_100.npy"
loaded results for system.metric='cosine', system.feature='musicnn' from "results/cosine_musicnn_results_100.npy"
loaded results for system.metric='cosine', system.feature='resnet' from "results/cosine_re

In [35]:
evaluate.load_results_csv("task_3.csv")
result = evaluate.run(
    [
        (Pipeline.mean_precision_at_k, dict(k=10)),
        (Pipeline.mean_recall_at_k, dict(k=10)),
        (Pipeline.precision_and_recall_interval, dict(k_min=1, k_max=100, step_size=5)),
        (Pipeline.mean_ndcg_at_k, dict(k=10)),
        (Pipeline.genre_coverage_at_k, dict(k=10)),
         (Pipeline.mean_genre_diversity_at_k, dict(k=10)),
    ]
)
result.to_csv("task_3.csv", index=False)
result

running pipeline:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating 'mean_precision_at_k' with {'k': 10}:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating 'mean_recall_at_k' with {'k': 10}:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating 'precision_and_recall_interval' with {'k_min': 1, 'k_max': 100, 'step_size': 5}:   0%|          | …

... for metric_name='random', feature_name='random':   0%|          | 0/10094 [00:00<?, ?it/s]

... for metric_name='cosine', feature_name='tf-idf':   0%|          | 0/10094 [00:00<?, ?it/s]

... for metric_name='cosine', feature_name='bert':   0%|          | 0/10094 [00:00<?, ?it/s]

... for metric_name='dot', feature_name='word2vec':   0%|          | 0/10094 [00:00<?, ?it/s]

... for metric_name='cosine', feature_name='mfcc_bow':   0%|          | 0/10094 [00:00<?, ?it/s]

... for metric_name='cosine', feature_name='blf_spectral':   0%|          | 0/10094 [00:00<?, ?it/s]

... for metric_name='cosine', feature_name='ivec256':   0%|          | 0/10094 [00:00<?, ?it/s]

... for metric_name='cosine', feature_name='musicnn':   0%|          | 0/10094 [00:00<?, ?it/s]

... for metric_name='cosine', feature_name='resnet':   0%|          | 0/10094 [00:00<?, ?it/s]

... for metric_name='cosine', feature_name='early_fusion':   0%|          | 0/10094 [00:00<?, ?it/s]

... for metric_name='score aggregation', feature_name='late_fusion':   0%|          | 0/10094 [00:00<?, ?it/s]

Calculating 'mean_ndcg_at_k' with {'k': 10}:   0%|          | 0/11 [00:00<?, ?it/s]

... for metric_name='random', feature_name='random':   0%|          | 0/10094 [00:00<?, ?it/s]

... for metric_name='cosine', feature_name='tf-idf':   0%|          | 0/10094 [00:00<?, ?it/s]

... for metric_name='cosine', feature_name='bert':   0%|          | 0/10094 [00:00<?, ?it/s]

... for metric_name='dot', feature_name='word2vec':   0%|          | 0/10094 [00:00<?, ?it/s]

... for metric_name='cosine', feature_name='mfcc_bow':   0%|          | 0/10094 [00:00<?, ?it/s]

... for metric_name='cosine', feature_name='blf_spectral':   0%|          | 0/10094 [00:00<?, ?it/s]

... for metric_name='cosine', feature_name='ivec256':   0%|          | 0/10094 [00:00<?, ?it/s]

... for metric_name='cosine', feature_name='musicnn':   0%|          | 0/10094 [00:00<?, ?it/s]

... for metric_name='cosine', feature_name='resnet':   0%|          | 0/10094 [00:00<?, ?it/s]

... for metric_name='cosine', feature_name='early_fusion':   0%|          | 0/10094 [00:00<?, ?it/s]

... for metric_name='score aggregation', feature_name='late_fusion':   0%|          | 0/10094 [00:00<?, ?it/s]

Calculating 'genre_coverage_at_k' with {'k': 10}:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating 'mean_genre_diversity_at_k' with {'k': 10}:   0%|          | 0/11 [00:00<?, ?it/s]

Unnamed: 0,metric,feature,rs_object,mean_precision_at_k,mean_recall_at_k,precision_and_recall_interval,mean_ndcg_at_k,genre_coverage_at_k,mean_genre_diversity_at_k
0,random,random,<task1.retrieval_system.RetrievalSystem object...,0.445265,0.001015,"[[0.009954202217941022, 0.44519714681989475], ...",0.132403,1.0,5.069324
1,cosine,tf-idf,<task1.retrieval_system.RetrievalSystem object...,0.510709,0.00134,"[[0.012571036678462849, 0.4968733901327588], [...",0.163754,0.982014,4.974558
2,cosine,bert,<task1.retrieval_system.RetrievalSystem object...,0.557192,0.001862,"[[0.015937234048039744, 0.5369635426986397], [...",0.196265,0.956835,4.845966
3,dot,word2vec,<task1.retrieval_system.RetrievalSystem object...,0.455934,0.000933,"[[0.010528252219201853, 0.4745710322964205], [...",0.12416,0.480216,4.68167
4,cosine,mfcc_bow,<task1.retrieval_system.RetrievalSystem object...,0.588518,0.001737,"[[0.01545524587717471, 0.5700604319397719], [0...",0.21605,0.981115,4.743527
5,cosine,blf_spectral,<task1.retrieval_system.RetrievalSystem object...,0.589935,0.0018,"[[0.015473361125256452, 0.5671250247671918], [...",0.218365,0.972122,4.73767
6,cosine,ivec256,<task1.retrieval_system.RetrievalSystem object...,0.565564,0.001712,"[[0.014549593969567245, 0.5329869229245116], [...",0.208839,0.999101,4.905104
7,cosine,musicnn,<task1.retrieval_system.RetrievalSystem object...,0.623598,0.001937,"[[0.017839413791002175, 0.6090033683376298], [...",0.231601,0.997302,4.70599
8,cosine,resnet,<task1.retrieval_system.RetrievalSystem object...,0.542074,0.00155,"[[0.012884095659153591, 0.5148622944323407], [...",0.202587,0.967626,4.967308
9,cosine,early_fusion,<task1.retrieval_system.RetrievalSystem object...,0.664058,0.002245,"[[0.019825429197236427, 0.6416049138101897], [...",0.256975,0.922662,4.693437


In [37]:
for rs in result.itertuples():
    metrics_df = rs.precision_and_recall_interval

    plt.figure(figsize=(12, 8))
    plt.plot(
        metrics_df["recall"],
        metrics_df["precision"],
        marker="o",
        label=f"{rs.metric=} ({rs.feature=})",
    )

    # Annotate each point with its k value
    for k, recall, precision in zip(
        metrics_df.index, metrics_df["recall"], metrics_df["precision"]
    ):
        plt.annotate(
            f"k={k}",
            (recall, precision),
            textcoords="offset points",
            xytext=(0, 10),
            ha="center",
        )

    plt.title(f"Precision-Recall Curve for {rs.metric=} with {rs.feature=}")
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.legend()
    plt.grid(True)
    plt.show()

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

<Figure size 1200x800 with 0 Axes>