# Computing distances with RBF Kernel

### We load the dataset and cut it to make it computable. Visualizing in EDA it seems that they represent the complete dataset in somewhat representative way.
### Let's gets the first 20000 entries

In [6]:
# Loading dataset
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from pathlib import Path

# Loading dataframe

In [7]:
DATAFRAME_PATH = Path("./dataset")
DF_FILENAME = "song_info_release_dataset_29986_entries_filtered.csv"
dataframe = pd.read_csv(DATAFRAME_PATH / DF_FILENAME)
dataframe

Unnamed: 0,id,artist,song,album_name,genres,popularity,release,duration_ms
0,0009fFIM1eYThaPg,Cheryl,Rain on Me,3 Words,pop,12.0,2009,229947
1,0010xmHR6UICBOYT,Oddisee,After Thoughts,The Beauty in All,underground hip hop,46.0,2013,325096
2,002Jyd0vN4HyCpqL,Blue Öyster Cult,ME 262,Secret Treaties,"hard rock,rock,classic rock",31.0,1974,285693
3,006TYKNjNxWjfKjy,Rhapsody,Flames of Revenge,Legendary Years (Re-Recorded),"symphonic metal,power metal,symphonic power metal",33.0,2017,332867
4,007LIJOPQ4Sb98qV,The Chameleons,Nostalgia,What Does Anything Mean? Basically (2009 Remas...,"post-punk,new wave",19.0,2009,326067
...,...,...,...,...,...,...,...,...
29981,H1sPgVsyvDpq0Te6,Vulfpeck,Outro,Vollmilch,"funk,soul",42.0,2012,275849
29982,H1sqqdwmIzskpKKy,Scar Symmetry,The Anomaly,The Unseen Empire,"melodic death metal,melodic death metal,progre...",36.0,2011,230887
29983,H1tA6joyPNVBQ8JL,TRF,BOY MEETS GIRL,WORKS -THE BEST OF TRF-,j-pop,40.0,1998,288560
29984,H1yNQyucqwOzKgpi,La casa azul,Los Chicos Hoy Saltarán A La Pista,La Polinesia Meridional,"easy listening,soft rock,latin,pop,electropop",40.0,2011,202972


In [8]:
dataframe_sorted = dataframe.copy(deep=True)

In [9]:
# Fix wrong release data for some songs
dataframe_sorted['release'].replace(1013, 2013, inplace=True)

# Sort the dataset by date
dataframe_sorted = dataframe_sorted.sort_values(by=['release'])

dataframe_sorted

Unnamed: 0,id,artist,song,album_name,genres,popularity,release,duration_ms
5987,3MEb9LZbB80nQ1a8,Louis Armstrong,St. James Infirmary,The Complete Hot Five And Hot Seven Recordings...,"jazz,blues",29.0,1928,191867
24349,DqO2fLBqdVsERa1Z,Louis Armstrong,Mack the Knife,The Great American Songbook,"jazz,swing,jazz,blues,swing",43.0,1929,201467
2841,1Z7Pb158yANCZ7zN,Billie Holiday,Georgia On My Mind,Lady Day: The Complete Billie Holiday On Colum...,"jazz,vocal jazz,blues",24.0,1933,198560
822,0SI6oF0XlACvZdQT,Billie Holiday,All Of Me,Lady Day: The Complete Billie Holiday On Colum...,"jazz,vocal jazz,blues,jazz,blues",54.0,1933,181440
15583,8rCzU7kVpoJ0Z37D,Billie Holiday,A Fine Romance,Lady Day: The Complete Billie Holiday On Colum...,"jazz,jazz,blues",24.0,1933,171467
...,...,...,...,...,...,...,...,...
18420,ASqcyaIPBKyLiZPZ,Meghan Trainor,Marry Me,THE LOVE TRAIN,"pop,soul,jazz",47.0,2019,215982
9857,5YWnfGKCEtigHqqg,Talos,Far Out Dust,Far Out Dust,ambient,43.0,2019,282044
6636,3juSIghV4YtniyYj,Said the Whale,Love Always,Cascadia,"indie rock,indie pop",26.0,2019,167707
6673,3l5khhGHMUJ0ROPu,Salvador Sobral,Anda Estragar-me Os Planos,"Paris, Lisboa","jazz,soul,eurovision",29.0,2019,166400


In [10]:
# Loading Transfer Learning Features
transfer_learning_features = np.load("dataset/features.npy")
print("Shape: ", np.shape(transfer_learning_features))

# The transfer learning features have another ordenation, so we have to map our df to that
# Open list of files.txt
list_of_files = []
with open("dataset/list_of_files.txt", "r") as files_list:
    list_of_files = files_list.read().split(sep="\n")

print(list_of_files[:5])

# Get only the ids of the files so i can search by ID and get the row index
only_ids = []
for file_name in list_of_files:
    temp_path = Path(file_name)
    only_ids.append(temp_path.stem) # Gets only the filename without the extension .type

# Get every ID of our sorted dataframe
ids_sorted = dataframe_sorted["id"].to_numpy()

# Now i only need to create a list of indexes and add as a column
mapping_of_indexes = []

# Make the mapping of the indexes
for song_id in ids_sorted:
    mapping_of_indexes.append(only_ids.index(song_id))

# Adding as a column
dataframe_sorted["mapping_to_fv_index"] = mapping_of_indexes

# Reseting the index so that iloc works
df_sorted_reset_index = dataframe_sorted.reset_index()
df_sorted_reset_index

Shape:  (109269, 160)
['/home/giovanni-server/dev/college/TCC/dataset/music4all/audios/0YPssy4JykwvVcCR.mp3', '/home/giovanni-server/dev/college/TCC/dataset/music4all/audios/rkwibLwcBO99jzZL.mp3', '/home/giovanni-server/dev/college/TCC/dataset/music4all/audios/0ymAoxuOvi75xaY3.mp3', '/home/giovanni-server/dev/college/TCC/dataset/music4all/audios/0YMdOC9wMLrw2Wen.mp3', '/home/giovanni-server/dev/college/TCC/dataset/music4all/audios/RKwkKPcuPrqoJquj.mp3']


Unnamed: 0,index,id,artist,song,album_name,genres,popularity,release,duration_ms,mapping_to_fv_index
0,5987,3MEb9LZbB80nQ1a8,Louis Armstrong,St. James Infirmary,The Complete Hot Five And Hot Seven Recordings...,"jazz,blues",29.0,1928,191867,71045
1,24349,DqO2fLBqdVsERa1Z,Louis Armstrong,Mack the Knife,The Great American Songbook,"jazz,swing,jazz,blues,swing",43.0,1929,201467,20950
2,2841,1Z7Pb158yANCZ7zN,Billie Holiday,Georgia On My Mind,Lady Day: The Complete Billie Holiday On Colum...,"jazz,vocal jazz,blues",24.0,1933,198560,91346
3,822,0SI6oF0XlACvZdQT,Billie Holiday,All Of Me,Lady Day: The Complete Billie Holiday On Colum...,"jazz,vocal jazz,blues,jazz,blues",54.0,1933,181440,65996
4,15583,8rCzU7kVpoJ0Z37D,Billie Holiday,A Fine Romance,Lady Day: The Complete Billie Holiday On Colum...,"jazz,jazz,blues",24.0,1933,171467,108063
...,...,...,...,...,...,...,...,...,...,...
29981,18420,ASqcyaIPBKyLiZPZ,Meghan Trainor,Marry Me,THE LOVE TRAIN,"pop,soul,jazz",47.0,2019,215982,64357
29982,9857,5YWnfGKCEtigHqqg,Talos,Far Out Dust,Far Out Dust,ambient,43.0,2019,282044,103866
29983,6636,3juSIghV4YtniyYj,Said the Whale,Love Always,Cascadia,"indie rock,indie pop",26.0,2019,167707,79436
29984,6673,3l5khhGHMUJ0ROPu,Salvador Sobral,Anda Estragar-me Os Planos,"Paris, Lisboa","jazz,soul,eurovision",29.0,2019,166400,36509


# Loading all features in a feature matrix

In [11]:
FEATURES_PATH = Path("../../dataset/dataset_mfcc")
FILE_ENDING = "_mfcc.npy"

def get_mfcc_feature_vector(df):
    """ Load and append the feature vector extracted to a variable """
    feature_vector = []
    ids_list = df['id'].to_list()
    for song_id in ids_list:
        file_name = song_id + FILE_ENDING
        file_path = FEATURES_PATH / file_name
        feature_vector.append(np.load(file_path).tolist())
    return feature_vector

mfcc_feature_vector = get_mfcc_feature_vector(dataframe_sorted)
print(np.shape(mfcc_feature_vector))

(29986, 120)


In [12]:
def get_transfer_learning_feature_vector(df, transfer_learning_feature_vector):
    feature_vector = []
    indexes_list = df['mapping_to_fv_index'].to_list()
    for index in indexes_list:
        feature_vector.append(transfer_learning_feature_vector[index])
    return feature_vector

transfer_learning_feature_vector = get_transfer_learning_feature_vector(df_sorted_reset_index, transfer_learning_features)
print(np.shape(transfer_learning_feature_vector))

(29986, 160)


# Export to avoid computations again

In [13]:
def export_feature_vector(feature_vector, name):
    """ Save computation next time! """
    size = np.shape(feature_vector)[0]
    np.save(f"{name}_feature_vector_{size}_samples.npy", feature_vector)
    print(f"Saving {name} feature_vector of {size} samples complete!")

export_feature_vector(transfer_learning_feature_vector, "transfer_learning")
export_feature_vector(mfcc_feature_vector, "mfcc")

Saving transfer_learning feature_vector of 29986 samples complete!
Saving mfcc feature_vector of 29986 samples complete!


# Computing the similarity matrix using the RBF Kernel

In [14]:
from sklearn.metrics.pairwise import rbf_kernel

try: # Try to free memory when running for a second time
    del similarity_matrix
    import gc
    gc.collect()
except NameError:
    pass

gamma = 0.1
similarity_matrix = rbf_kernel(transfer_learning_feature_vector, gamma=gamma)
np.fill_diagonal(similarity_matrix, 0)

In [15]:
def save_similarity_matrix(gamma, matrix):
    size = np.shape(matrix)[0]
    np.save(f"similarity_matrix_{size}_samples_gamma_{gamma}.npy", matrix)
    print(f"Saving similarity matrix of size {size} complete!")

save_similarity_matrix(gamma, similarity_matrix)

Saving similarity matrix of size 29986 complete!


In [17]:
# Just checking how many relevant similarities we get
rows, _ = np.where(similarity_matrix > 0.6)
len(rows)

587025378

# Helper functions to help analyze/filter the dataset 

In [16]:
from math import ceil

def show_similarity(df, similarity_matrix, song_index, song_two_index):
    """ Shows the similarity of two entries of a similarity matrix """
    print()
    print(df.iloc[song_index])
    print(df.iloc[song_two_index])
    print()
    similarity = similarity_matrix[song_index, song_two_index]
    print(f"Similarity: {similarity}")
    
def get_songs_list(df, similarity_matrix, similarity_threshold, limit=10):
    """ Get the list of song similarities for a similarity threshold """
    rows, columns = np.where(similarity_matrix > similarity_threshold)
    print(len(similarity_matrix[rows, columns]))
    songs_list = np.array((rows, columns)).T
    songs_list = songs_list[:ceil(len(songs_list)/2)]
    for song in songs_list[:limit]:
        show_similarity(df, similarity_matrix, song[0], song[1])

def find_empty_songs_export_dataset(df, similarity_matrix):
    """ Function to remove and export the dataset without the empty songs
        needs to be ran after the similarity matrix has been computed """
    def get_df_size(df):
        return np.shape(df)[0]
        
    def find_madonna_song_index(df):
        """ Finds a song that we know that doesn't have audio """
        return df.query("song == 'Candy Shop Medley (Live)'").index
    
    madonna_song_index = find_madonna_song_index(df)
    # the [0] its because it is a 2d array containing only 1 list
    madonna_row = similarity_matrix[madonna_song_index][0] 
    empty_samples_to_exclude_list = np.where(madonna_row > 0.9)[0]
    np.append(empty_samples_to_exclude_list, madonna_song_index)

    print(f"There are {len(empty_samples_to_exclude_list)} empty songs")

    # find their ids
    empty_samples_to_exclude_list_ids = df['id'][empty_samples_to_exclude_list].to_list()
    
    print("Removing empty songs from dataset")
    df = df[~df['id'].isin(empty_samples_to_exclude_list_ids)]
    
    print(f"Exporting dataset of size {get_df_size(df)}")
    df.to_csv(f"song_info_dataset_{get_df_size(df)}_entries_filtered.csv", index=False)

# Exporting the dataset filtered

In [12]:
find_empty_songs_export_dataset(dataframe_sorted, similarity_matrix)

There are 0 empty songs
Removing empty songs from dataset
Exporting dataset of size 29986


In [19]:
import sys
import gc

def sizeof_fmt(num, suffix='B'):
    ''' by Fred Cirera,  https://stackoverflow.com/a/1094933/1870254, modified'''
    for unit in ['','Ki','Mi','Gi','Ti','Pi','Ei','Zi']:
        if abs(num) < 1024.0:
            return "%3.1f %s%s" % (num, unit, suffix)
        num /= 1024.0
    return "%.1f %s%s" % (num, 'Yi', suffix)

for name, size in sorted(((name, sys.getsizeof(value)) for name, value in locals().items()),
                            key= lambda x: -x[1])[:10]:
    print("{:>30}: {:>8}".format(name, sizeof_fmt(size)))

#del similarity_matrix
gc.collect()


             similarity_matrix:  3.3 GiB
    transfer_learning_features: 66.7 MiB
                            __: 11.7 MiB
         df_sorted_reset_index: 11.7 MiB
                           _10: 11.7 MiB
                           ___: 11.7 MiB
              dataframe_sorted: 11.7 MiB
                            _9: 11.7 MiB
                     dataframe: 11.2 MiB
                            _7: 11.2 MiB


0

# Showing some songs with high similarity

### When using gamma = 0.01, only masters and remasters of the same songs show as similar

In [14]:
get_songs_list(dataframe, similarity_matrix, 0.8)

140

id                                             06SC55N0G2GjCNuS
artist                                              David Bowie
song                                         Station to Station
album_name         Station To Station (2016 Remastered Version)
genres        rock,progressive rock,classic rock,art rock,ro...
Name: 159, dtype: object
id                                        7JmWTjyaKLzHmKBo
artist                                         David Bowie
song          Station To Station (2016 Remastered Version)
album_name    Station To Station (2016 Remastered Version)
genres                         glam rock,rock,classic rock
Name: 12886, dtype: object

Similarity: 0.9999999999995635

id                    0EvGCDSvdzIZdcQ3
artist                      Kid Abelha
song                      Peito Aberto
album_name                   Pega Vida
genres        mpb,pop nacional,mpb,pop
Name: 403, dtype: object
id            6xboh6SVR8Z0qHXj
artist              Kid Abelha
song         

# Showing the same information but for the beatles

In [15]:
beatles_df = dataframe.query("artist == 'The Beatles'")
print(len(beatles_df))
beatles_feature_vector = get_feature_vector(beatles_df)
beatles_similarity_matrix = rbf_kernel(beatles_feature_vector, gamma=0.001)
np.fill_diagonal(beatles_similarity_matrix, 0)
get_songs_list(beatles_df, beatles_similarity_matrix, 0.7)

37
2

id                                      6NRxkIqKybHZZGvZ
artist                                       The Beatles
song          Why Don't We Do It In The Road? - 2018 Mix
album_name                                   The Beatles
genres                             classic rock,rock,pop
Name: 11252, dtype: object
id                                             BD1AtNm4DiD3tjQ0
artist                                              The Beatles
song          Why Don't We Do It In The Road? - Remastered 2009
album_name                             The Beatles (Remastered)
genres                   rock,classic rock,pop,psychedelic rock
Name: 19734, dtype: object

Similarity: 1.0


In [62]:
np.shape(similarity_matrix)

(19995, 19995)