In [None]:
import ast
import os

import pandas as pd
import numpy as np
from scipy.optimize import linear_sum_assignment
from sklearn.metrics.pairwise import cosine_similarity

from scripts.create_embeddings_dataset import batch_create_images_dataset, create_audio_dataset, \
    batch_create_audio_embeddings

# Images dataset

In [None]:
# Directory for the images section: it is scanned for "images_dataset*.pkl"
# batch files and receives the concatenated images pickle.
# NOTE: `output_dir` is rebound to the music folder in the audio section, so
# re-run this cell before re-running any images cell.
output_dir = "../images"

In [None]:
# Create image embedding batches from the source folder; results land in "../images".
# count/offset select which slice of the images is processed — presumably
# `count` items starting at `offset`; confirm against the helper.
batch_create_images_dataset(
    images_dir='../images/imagesf2',
    output_dir="../images",
    batch_size=10,
    count=1000,
    offset=4500,
)

In [None]:
# Merge every per-batch images pickle found in `output_dir` into one dataset.
concat_name = "images_dataset_concat.pkl"

# The merged file itself matches the "images_dataset*.pkl" filter, so it is
# excluded explicitly: otherwise re-running this cell would read its own
# previous output back in as if it were a batch.
# os.listdir returns entries in arbitrary order; sorting makes the batch order
# (and therefore which duplicate row is kept below) reproducible.
batches = sorted(
    pklf
    for pklf in os.listdir(output_dir)
    if "images_dataset" in pklf and pklf.endswith(".pkl") and pklf != concat_name
)
if not batches:
    # pd.concat([]) would fail with a much less helpful message.
    raise FileNotFoundError(f"No images_dataset batch pickles found in {output_dir!r}")

dfs = [pd.read_pickle(os.path.join(output_dir, batch)) for batch in batches]

# Keep a single row per image (the first occurrence in batch order).
images_df = pd.concat(dfs, ignore_index=True).drop_duplicates(subset=["image_path"])
images_df.to_pickle(os.path.join(output_dir, concat_name))

# Audio dataset

In [None]:
# Folder of audio files passed to create_audio_dataset.
audio_dir = "../music/fma_small"
# Directory scanned for "audio_dataset*.pkl" batch files and where the
# concatenated audio pickle is written.
# NOTE: this rebinds the same `output_dir` name used by the images section.
output_dir = "../music/"

In [None]:
# Build the audio table from `audio_dir` and preview its first rows.
# The table is presumably also persisted to `output_path` — confirm in the helper.
music_df = create_audio_dataset(
    audio_dir,
    output_path="../music/music_df.csv",
    count=10000,
)
music_df.head()

In [None]:
# Compute audio embeddings in batches for the tracks listed in the CSV above;
# results land in "../music".
batch_create_audio_embeddings(
    audio_df_path="../music/music_df.csv",
    output_dir="../music",
    batch_size=25,
    count=2000,
    offset=1700,
)

In [None]:
# Merge every per-batch audio pickle found in `output_dir` into one dataset.
concat_name = "audio_dataset_concat.pkl"

# The merged file itself matches the "audio_dataset*.pkl" filter, so it is
# excluded explicitly: otherwise re-running this cell would read its own
# previous output back in as if it were a batch.
# os.listdir returns entries in arbitrary order; sorting makes the batch order
# (and therefore which duplicate row is kept below) reproducible.
batches = sorted(
    pklf
    for pklf in os.listdir(output_dir)
    if "audio_dataset" in pklf and pklf.endswith(".pkl") and pklf != concat_name
)
if not batches:
    # pd.concat([]) would fail with a much less helpful message.
    raise FileNotFoundError(f"No audio_dataset batch pickles found in {output_dir!r}")

dfs = [pd.read_pickle(os.path.join(output_dir, batch)) for batch in batches]

# Stored as `audio_df` (not `images_df`) so the images dataframe built earlier
# is not overwritten with audio rows. Keep a single row per audio file (the
# first occurrence in batch order).
audio_df = pd.concat(dfs, ignore_index=True).drop_duplicates(subset=["audio_path"])
audio_df.to_pickle(os.path.join(output_dir, concat_name))

# Match images and audio

In [None]:
# Reload both concatenated embedding datasets from disk, so the matching
# section works from the saved pickles rather than from in-memory leftovers.
images_dataset_path = "../images/images_dataset_concat.pkl"
images_df = pd.read_pickle(images_dataset_path)

music_dataset_path = "../music/audio_dataset_concat.pkl"
music_df = pd.read_pickle(music_dataset_path)

# Quick sanity check on how many rows each side contributes to the matching.
print("Images dataset size: ", len(images_df))
print("Audio dataset size: ", len(music_df))

In [None]:
def match_datasets(image_embeddings_df: pd.DataFrame, music_embeddings_df: pd.DataFrame) -> pd.DataFrame:
    """Pair images with audio tracks one-to-one by embedding cosine similarity.

    Builds the image-by-track cosine-similarity matrix and solves the linear
    assignment problem on ``1 - similarity``, i.e. it picks the pairing whose
    total similarity is highest. Every image and every track is used at most
    once, so when the two frames differ in length only
    ``min(n_images, n_tracks)`` pairs are produced.

    Args:
        image_embeddings_df: Frame with an ``image_path`` column and an
            ``embeddings`` column holding one vector per row.
        music_embeddings_df: Frame with an ``audio_path`` column and an
            ``embeddings`` column holding one vector per row. Vectors are
            assumed to have the same length as the image vectors (needed for
            the similarity computation).

    Returns:
        DataFrame with columns ``image_path``, ``audio_path``,
        ``music_embedding``, ``image_embedding`` and ``score`` (cosine
        similarity of the pair), one row per matched pair. If either input
        has no rows, an empty frame with those columns is returned.
    """
    columns = ["image_path", "audio_path", "music_embedding", "image_embedding", "score"]

    # Nothing to match: the similarity computation cannot run on zero rows,
    # so return an empty result with the usual schema instead of crashing.
    if len(image_embeddings_df) == 0 or len(music_embeddings_df) == 0:
        return pd.DataFrame(columns=columns)

    # Stack the per-row vectors into 2-D (n_rows, dim) matrices.
    image_embeddings = np.array(image_embeddings_df["embeddings"].tolist())
    music_embeddings = np.array(music_embeddings_df["embeddings"].tolist())

    similarity_matrix = cosine_similarity(image_embeddings, music_embeddings)

    # linear_sum_assignment minimises total cost, so similarity is flipped
    # into a cost to obtain the maximum-similarity pairing.
    image_ind, music_ind = linear_sum_assignment(1 - similarity_matrix)

    matched_images = image_embeddings_df.iloc[image_ind]
    matched_music = music_embeddings_df.iloc[music_ind]

    return pd.DataFrame({
        "image_path": matched_images["image_path"].values,
        "audio_path": matched_music["audio_path"].values,
        "music_embedding": matched_music["embeddings"].values,
        "image_embedding": matched_images["embeddings"].values,
        # Read the score straight from the similarity matrix rather than
        # recomputing 1 - (1 - s), which only adds floating-point rounding.
        "score": similarity_matrix[image_ind, music_ind],
    }, columns=columns)

In [None]:
# Run the matching, then rank the pairs best-first and renumber rows 0..n-1.
matched_df = (
    match_datasets(images_df, music_df)
    .sort_values(by="score", ascending=False)
    .reset_index(drop=True)
)

In [None]:
# Show the first rows of the matched-pairs table.
matched_df.head()