In [34]:
import sys

sys.path.append("../")

import pandas as pd
import os
import matplotlib.pyplot as plt
from task1.retrieval_system import RetrievalSystem, SongInfo
from task1.similarity_measure import (
    cosine_similarity,
    dot_product,
    manhattan_distance,
    euclidean_distance,
    random_similarity,
)

from utils import embed_and_merge

In [39]:
def read(feature, h=0):
    """Load one of the tab-separated ``id_<feature>_mmsr.tsv`` tables.

    The file is looked up in the ``data`` directory that sits next to the
    current working directory's parent (``../data``).

    Args:
        feature: Middle part of the file name, e.g. ``"genres"`` for
            ``id_genres_mmsr.tsv``.
        h: Passed to ``pandas.read_csv`` as ``header``; the default ``0``
            uses the first line of the file as column names.

    Returns:
        The parsed file as a ``pandas.DataFrame``.
    """
    tsv_name = f"id_{feature}_mmsr.tsv"
    tsv_path = os.path.join("..", "data", tsv_name)
    return pd.read_csv(tsv_path, sep="\t", header=h)
# Build the working table: song information, then genres, then one embedding
# column per feature file.
df = read("information", 0)

# add genre information for metric calculation
genres = read("genres", 0)
# The genre cell is text that is evaluated into a Python object and then
# converted to a set.
# NOTE(review): eval() executes whatever expression the TSV contains. If this
# file can come from an untrusted source, switch to ast.literal_eval.
genres["genre"] = genres["genre"].apply(eval).apply(set)
# Left join keeps every song from the information table, with or without genres.
df = df.merge(genres, on="id", how="left")

for text_feature in ["lyrics_tf-idf"]:
    stats = read(text_feature, 0)
    # "lyrics_tf-idf" -> "tf-idf": the part after the underscore is handed to
    # embed_and_merge as the feature name.
    df = embed_and_merge(df, stats, text_feature.split("_")[1])

visual_feature = "resnet"
stats = read(visual_feature, 0)
df = embed_and_merge(df, stats, visual_feature)

for audio_feature in ["ivec256", "musicnn"]:
    stats = read(audio_feature, 0)
    df = embed_and_merge(df, stats, audio_feature)

# data for task 2 does not include the item with id "03Oc9WeMEmyLLQbj"
# (row 5 of the current data files). Select it by id instead of by row label
# so the right song is excluded even if the row order of the inputs changes.
excluded_id = "03Oc9WeMEmyLLQbj"
df = df[df["id"] != excluded_id]
# reset_index() without drop=True keeps the old row labels in an "index" column.
df = df.reset_index()
df.head()

Unnamed: 0,index,id,artist,song,album_name,genre,tf-idf,resnet,ivec256,musicnn
0,0,01Yfj2T3YTwJ1Yfy,We As Human,Take The Bullets Away (feat. Lacey Sturm),We As Human,"{christian rock, rock}","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0818293914712727, ...","[0.0, 0.110133, 0.31062, 0.0, 0.003017, 0.1360...","[-0.4192236661911011, -1.262790322303772, -0.3...","[0.12903129, 0.0011226882, 0.0065768533, 0.082..."
1,1,01gyRHLquwXDlhkO,The Notorious B.I.G.,Somebody's Gotta Die,Life After Death (Remastered Edition),"{hip hop, grindcore, death metal, rap}","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.016343, 0.029984, 0.018722, 0.0, 0.17283, 0...","[1.4574661254882812, 0.619476318359375, -0.351...","[0.026824217, 0.00087343465, 0.009360876, 0.31..."
2,2,01rMxQv6vhyE1oQX,Against the Current,Chasing Ghosts,In Our Bones,"{rock, pop punk}","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.000348, 0.0, 1.073413, 0.0, 0.097732, 0.082...","[1.1996194124221802, -0.2549396753311157, 0.48...","[0.2518178, 0.0043474026, 0.07843659, 0.056584..."
3,3,02RGE9FNH65RtMS7,Barthezz,Infected,Trance - The Early Years (1997-2002),"{techno, trance, progressive trance}","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1.747856, 1.07442, 0.618729, 0.03181, 0.09682...","[-0.6352253556251526, 0.6530497074127197, -1.6...","[0.00045163534, 0.0013956887, 0.002990173, 0.8..."
4,4,02ZnlCGZEbkfCDxo,Laura Pausini,Tra Te E Il Mare,The Best of Laura Pausini - E Ritorno Da Te,"{pop, world, latin, latin pop, water, europop,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.2413163920156013, ...","[0.586726, 2.380147, 2.270726, 0.009559, 0.062...","[-0.1336851567029953, 0.2777222692966461, -0.2...","[0.21950364, 0.011686802, 0.054231934, 0.06058..."


In [42]:
# For every song, retrieve its nearest neighbours under cosine similarity in
# three feature spaces and cache the neighbour ids in "cached_results.csv".
# Each cell of the result holds the retrieved ids joined with ";".

# Number of results requested per query.
# NOTE(review): max_rows was defined but unused while retrieve() received a
# literal 10; presumably this is what it was meant for - confirm.
max_rows = 10

# The retrieval systems depend only on `df`, the metric and the feature column,
# so they are built once instead of once per song.
# NOTE(review): assumes retrieve() keeps no per-query state - confirm against
# task1.retrieval_system.
rs_tfidf = RetrievalSystem(
    df=df,
    sim_metric=cosine_similarity,
    # The text embedding column is named "tf-idf". This system previously used
    # "ivec256", which made the "tfidf" results a copy of the "ivec256" ones.
    sim_feature="tf-idf",
)
rs_resnet = RetrievalSystem(
    df=df,
    sim_metric=cosine_similarity,
    sim_feature="resnet",
)
rs_ivec256 = RetrievalSystem(
    df=df,
    sim_metric=cosine_similarity,
    sim_feature="ivec256",
)
systems_by_column = {
    "tfidf": rs_tfidf,
    "resnet": rs_resnet,
    "ivec256": rs_ivec256,
}

# Collect plain dicts and build the frame once; appending with
# res.loc[len(res)] re-allocates the frame on every row.
records = []
for row in df.itertuples():
    songInfo = SongInfo(title=row.song, artist=row.artist)
    record = {"id": row.id}
    for column, system in systems_by_column.items():
        record[column] = ";".join(system.retrieve(songInfo, max_rows)["id"])
    records.append(record)

# Explicit columns keep the header and column order stable even when df is empty.
res = pd.DataFrame(records, columns=["id", "tfidf", "resnet", "ivec256"])
res.to_csv("cached_results.csv", index=False)
    
  