In [7]:
import pandas as pd
import numpy as np


In [8]:
# Input CSVs, given relative to the notebook's working directory, listed in the
# same order as the names they are unpacked into below.
csv_paths = (
    '../data/images/images_dataset.csv',   # image_path + stringified embeddings
    '../data/text/genre_embeddings.csv',   # presumably per-genre text embeddings (unused in the cells shown)
    '../data/music/audio_dataset.csv',     # audio_path + stringified embeddings
    '../data/images_genre_scores.csv',     # one (image_path, genre, score) row per image/genre pair
)
images_df, genre_df, audio_df, images_genre_df = (
    pd.read_csv(csv_path) for csv_path in csv_paths
)
images_genre_df.head()

Unnamed: 0,image_path,genre,score
0,a-mishra_expression-of-sadness-i.jpg,metal,0.167402
1,a-mishra_expression-of-sadness-i.jpg,rock,0.156467
2,a-mishra_expression-of-sadness-i.jpg,blues,0.107376
3,a-mishra_expression-of-sadness-i.jpg,hip-Hop,0.08061
4,a-mishra_expression-of-sadness-i.jpg,country,0.062826


In [9]:
# Attach each image's embedding to its per-genre score rows (inner join on
# image_path). Only the key and score columns are taken from the right-hand
# frame, so re-running this cell after images_genre_df has already been merged
# yields the same frame instead of accumulating embeddings_x / embeddings_y
# duplicates.
images_genre_df = images_df.merge(
    images_genre_df[['image_path', 'genre', 'score']],
    on='image_path',
)
images_genre_df.head()

Unnamed: 0,image_path,embeddings,genre,score
0,a-mishra_expression-of-sadness-i.jpg,"[0.0019837517756968737, -0.030643584206700325,...",metal,0.167402
1,a-mishra_expression-of-sadness-i.jpg,"[0.0019837517756968737, -0.030643584206700325,...",rock,0.156467
2,a-mishra_expression-of-sadness-i.jpg,"[0.0019837517756968737, -0.030643584206700325,...",blues,0.107376
3,a-mishra_expression-of-sadness-i.jpg,"[0.0019837517756968737, -0.030643584206700325,...",hip-Hop,0.08061
4,a-mishra_expression-of-sadness-i.jpg,"[0.0019837517756968737, -0.030643584206700325,...",country,0.062826


# Metrics

In [10]:
# NOTE(review): importing `map` from metrics.genre rebinds the name of Python's
# built-in map() for every cell executed after this one; consider an alias
# (e.g. `map as mean_average_precision`) once all call sites are known.
# ImageBindEmbedder is imported but not used in the cells visible here.
from metrics.genre import top_k_genre_accuracy, map
from embeddings.scorer import EmbeddingScorer
from embeddings.embedder import ImageBindEmbedder

# Scorer instance whose find_topk() is used to retrieve images for each audio embedding.
scorer = EmbeddingScorer()

In [11]:
import ast

# The CSVs store every embedding as a stringified Python list; decode all image
# embeddings once and collect them into a single array, kept in images_df row
# order so that a position returned by the scorer can be looked up with .iloc.
image_embs = np.array(
    [np.array(ast.literal_eval(raw_embedding)) for raw_embedding in images_df['embeddings']]
)

# Column-oriented accumulator: one entry per (audio, retrieved image) pair.
results = {"audio_id": [], "image_id": [], "audio_path": [], "image_path": [], "score": []}

for audio_id, audio_row in audio_df.iterrows():
    # The query is passed to the scorer as a single-row 2-D array.
    emb = np.array(ast.literal_eval(audio_row['embeddings'])).reshape(1, -1)
    top_k, scores = scorer.find_topk(query=emb, targets=image_embs, top_k=3)

    for image_pos, similarity in zip(top_k, scores):
        hit = {
            "audio_id": audio_id,
            "image_id": image_pos,
            "audio_path": audio_row["audio_path"],
            "image_path": images_df.iloc[image_pos]["image_path"],
            "score": similarity,
        }
        for column, value in hit.items():
            results[column].append(value)

In [12]:
res_df = pd.DataFrame(results)
# The audio genre is the name of the file's parent folder. Normalise Windows
# back-slashes to forward slashes before splitting so the lookup works for
# paths written with either separator (splitting on "\\" alone raises
# IndexError for paths that only contain "/").
res_df["genre"] = res_df["audio_path"].apply(
    lambda x: x.replace("\\", "/").split("/")[-2]
)
res_df.head(3)

Unnamed: 0,audio_id,image_id,audio_path,image_path,score,genre
0,0,293,../data/music\blues\Andrew Christopher Smith -...,lilla-cabot-perry_the-cellist.jpg,0.34319,blues
1,0,91,../data/music\blues\Andrew Christopher Smith -...,danielson-gambogi-elin_pianospelare-1907.jpg,0.271812,blues
2,0,491,../data/music\blues\Andrew Christopher Smith -...,william-merritt-chase_the-song.jpg,0.21392,blues


In [13]:
# Join every retrieved (audio, image) pair with that image's per-genre scores
# from images_genre_df. Columns present on both sides (genre, score) receive
# pandas' default _x (res_df) / _y (images_genre_df) suffixes.
final_df = pd.merge(res_df, images_genre_df, on='image_path')

In [14]:
# Give the merge-suffixed columns descriptive names (*_x came from res_df, i.e.
# the audio side; *_y from images_genre_df, i.e. the image side) and keep only
# the columns needed for evaluation.
# Renaming first and selecting by the new names keeps the cell re-runnable:
# rename() ignores labels that are no longer present, whereas selecting
# 'genre_x' a second time would raise KeyError. Chaining also avoids an
# in-place rename on a column subset.
final_df = final_df.rename(
    columns={
        'genre_x': 'audio_genre',
        'score_x': 'audio_score',
        'genre_y': 'image_genre',
        'score_y': 'image_score',
    }
)[
    [
        'audio_id',
        'image_id',
        'audio_path',
        'image_path',
        'audio_genre',
        'audio_score',
        'image_genre',
        'image_score',
    ]
]

In [15]:
# Order the rows so that, within each audio_id, the image with the highest
# retrieval score (audio_score) comes first and, within one image, its genres
# are ranked by image_score. All three keys are sorted in descending order.
final_df = final_df.sort_values(
    by=['audio_id', 'audio_score', 'image_score'],
    ascending=[False, False, False],
)

In [16]:
# For each cut-off k, keep the first k rows of every audio_id (in final_df's
# current row order) and compare the audio genre with the image genres.
# NOTE(review): audio_genre is passed as y_pred and image_genre as y_true;
# confirm this matches the argument contract of top_k_genre_accuracy.
rows_by_audio = final_df.groupby('audio_id')
for k in (1, 2, 3, 5):
    grouped_df = rows_by_audio.head(k)
    y_true = grouped_df['image_genre'].to_numpy()
    y_pred = grouped_df['audio_genre'].to_numpy()
    accuracy = top_k_genre_accuracy(y_true, y_pred, k)
    print(f"Top-{k} genre accuracy: {accuracy}")

Top-1 genre accuracy: 0.24
Top-2 genre accuracy: 0.42
Top-3 genre accuracy: 0.44
Top-5 genre accuracy: 0.58
