# Computing distances with RBF Kernel

### This are the steps to calculate the disruption:
1. Extract the feature representation of the audio
2. Calculate the "distance" for each song against any other song and store this result (in a matrix)
3. Use this "distance" or "similarity" matrix to build the network

As the dataset is too big to use in its entirety, we use 1/3 of it (as it is the limit I can use to compute)

Ps. When visualizing in EDA it seems that the cut dataset still is a good representation of the complete dataset

### Loading dataset

The first step is to load the dataset

In [2]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
from pathlib import Path

This treated dataset contains the id, artists, song, album name, genres list, popularity, release and duration of each song

In [3]:
dataset_size = 30000
feat_type = "transfer_learning"

DATASETS_FOLDER = Path("./dataset")
DATAFRAME_PATH = DATASETS_FOLDER / "cleaned_datasets" 
DF_FILENAME = f"cleaned_song_info_{dataset_size}_entries.csv"

dataframe = pd.read_csv(DATAFRAME_PATH / DF_FILENAME)
print(f"columns: {dataframe.columns}\nsize: {len(dataframe)}")

columns: Index(['id', 'artist', 'song', 'album_name', 'genres', 'spotify_id',
       'popularity', 'release', 'danceability', 'energy', 'key', 'mode',
       'valence', 'tempo', 'duration_ms'],
      dtype='object')
size: 109178


To work with our dataframe a good practice is to make a copy of it so we do not modify the original one

In [4]:
# Copy the dataframe
working_dataframe = dataframe.copy(deep=True)

We'll have to fix some wrong release dates before we work with this dataset

In [4]:
# Sort the dataframe by release date (as this is going to be important when generating the similarity matrix)
dataframe_sorted = working_dataframe.sort_values(by=['release'])
dataframe_sorted.head()

Unnamed: 0,id,artist,song,album_name,genres,spotify_id,popularity,release,danceability,energy,key,mode,valence,tempo,duration_ms
5986,3MEb9LZbB80nQ1a8,Louis Armstrong,St. James Infirmary,The Complete Hot Five And Hot Seven Recordings...,"jazz,blues",7fAa9rz4UmwuB4AGh50Gmp,29.0,1928,0.693,0.182,5.0,0.0,0.588,116.508,191867
24346,DqO2fLBqdVsERa1Z,Louis Armstrong,Mack the Knife,The Great American Songbook,"jazz,swing,jazz,blues,swing",0RNxWy0PC3AyH4ThH3aGK6,43.0,1929,0.673,0.377,0.0,1.0,0.713,88.973,201467
822,0SI6oF0XlACvZdQT,Billie Holiday,All Of Me,Lady Day: The Complete Billie Holiday On Colum...,"jazz,vocal jazz,blues,jazz,blues",1LGqJ3nvxpVXDWpEzq4DJD,54.0,1933,0.504,0.0644,2.0,0.0,0.403,106.994,181440
2840,1Z7Pb158yANCZ7zN,Billie Holiday,Georgia On My Mind,Lady Day: The Complete Billie Holiday On Colum...,"jazz,vocal jazz,blues",2JrkYswtbsdxOATXvGTlNf,24.0,1933,0.489,0.0571,7.0,0.0,0.273,80.172,198560
15581,8rCzU7kVpoJ0Z37D,Billie Holiday,A Fine Romance,Lady Day: The Complete Billie Holiday On Colum...,"jazz,jazz,blues",2dvty7OgXD51fJGj9Hn0HG,24.0,1933,0.596,0.182,5.0,1.0,0.711,123.961,171467


## Mapping our dataframe to the transfer learning features

The transfer learning features have the same ordering of the files of the folder they were extracted, which is not the same as the dataframe now (and the dataframe is now ordered)

That means to use them we have to map each song to its corresponding index in the feature dataframe

In [5]:
# Loading Transfer Learning Features
print("Loading Transfer Learning Features...")
transfer_learning_features = np.load(DATASETS_FOLDER / "input" / "extracted_features" / "transfer_learning" / "features.npy")
print("Shape of the transfer learning features: ", np.shape(transfer_learning_features))

# Open list of files.txt
print("Open list of files to make the mapping...")
list_of_files = []
with open(DATASETS_FOLDER / "input" / "extracted_features" / "transfer_learning" / "list_of_files.txt", "r") as files_list:
    # split by line ending, each path is a line in this file
    list_of_files = files_list.read().split(sep="\n")

# The common information we have is the ID, so we can use it to map to our dataset.
print("Getting only the IDs from file paths...")
only_ids = []
# For all paths in the files list, get only the file name (which is the ID!)
for file_name in tqdm(list_of_files):
    temp_path = Path(file_name)
    only_ids.append(temp_path.stem) # returns only the filename without the extension

# Get every ID of our sorted dataframe
print("Creating the mappings of song to the feature vector indexes ")
ids_sorted = dataframe_sorted["id"].to_numpy()

# Now we only need to create a new column containing the indexes corresponding to the feature vector
mapping_of_indexes = []
# Make the mapping of the indexes
for song_id in tqdm(ids_sorted):
    mapping_of_indexes.append(only_ids.index(song_id))

# Adding as a column
print("Adding the new column with the mapping")
dataframe_sorted["mapping_to_fv_index"] = mapping_of_indexes

# Reseting the index so that iloc works
print("Reseting the index so that iloc works in the sorted dataframe...")
df_sorted_reset_index = dataframe_sorted.reset_index()
print("All done!")
df_sorted_reset_index.head()

Loading Transfer Learning Features...
Shape of the transfer learning features:  (109269, 160)
Open list of files to make the mapping...
Getting only the IDs from file paths...


100%|██████████| 109269/109269 [00:02<00:00, 52573.12it/s]


Creating the mappings of song to the feature vector indexes 


100%|██████████| 30000/30000 [00:44<00:00, 674.03it/s]

Adding the new column with the mapping
Reseting the index so that iloc works in the sorted dataframe...
All done!





Unnamed: 0,index,id,artist,song,album_name,genres,spotify_id,popularity,release,danceability,energy,key,mode,valence,tempo,duration_ms,mapping_to_fv_index
0,5986,3MEb9LZbB80nQ1a8,Louis Armstrong,St. James Infirmary,The Complete Hot Five And Hot Seven Recordings...,"jazz,blues",7fAa9rz4UmwuB4AGh50Gmp,29.0,1928,0.693,0.182,5.0,0.0,0.588,116.508,191867,71045
1,24346,DqO2fLBqdVsERa1Z,Louis Armstrong,Mack the Knife,The Great American Songbook,"jazz,swing,jazz,blues,swing",0RNxWy0PC3AyH4ThH3aGK6,43.0,1929,0.673,0.377,0.0,1.0,0.713,88.973,201467,20950
2,822,0SI6oF0XlACvZdQT,Billie Holiday,All Of Me,Lady Day: The Complete Billie Holiday On Colum...,"jazz,vocal jazz,blues,jazz,blues",1LGqJ3nvxpVXDWpEzq4DJD,54.0,1933,0.504,0.0644,2.0,0.0,0.403,106.994,181440,65996
3,2840,1Z7Pb158yANCZ7zN,Billie Holiday,Georgia On My Mind,Lady Day: The Complete Billie Holiday On Colum...,"jazz,vocal jazz,blues",2JrkYswtbsdxOATXvGTlNf,24.0,1933,0.489,0.0571,7.0,0.0,0.273,80.172,198560,91346
4,15581,8rCzU7kVpoJ0Z37D,Billie Holiday,A Fine Romance,Lady Day: The Complete Billie Holiday On Colum...,"jazz,jazz,blues",2dvty7OgXD51fJGj9Hn0HG,24.0,1933,0.596,0.182,5.0,1.0,0.711,123.961,171467,108063


Exporting this dataset

In [6]:
df_sorted_reset_index.to_csv(DATASETS_FOLDER / "input" / "csvs" / f"sorted_song_info_{len(df_sorted_reset_index)}.csv")

## Loading all features in a feature matrix

This next step consists in appending the feature vectors in a list, correspoding to the new orderning of the dataset.
That way we'll have a 1 to 1 mapping of song to its corresponding feature vector

This way we'll have a 2 arrays of feature vectors:
- One of MFCC features 
- One of Transfer Learning features 

And then they will be ready to use to make a `similarity matrix`

In [7]:
MFCC_FEATURES_PATH = Path("../../dataset/dataset_mfcc")
FILE_ENDING = "_mfcc.npy"

def get_mfcc_feature_vector(df):
    """ Load and append the feature vector extracted to a variable 
    This function is slower because every npy is in its separate file, that means heavy IO usage.
    """
    feature_vector = []
    ids_list = df['id'].to_list()
    print("Loading MFCC Features...")
    for song_id in tqdm(ids_list):
        file_name = song_id + FILE_ENDING
        file_path = MFCC_FEATURES_PATH / file_name
        feature_vector.append(np.load(file_path).tolist())
    print("All done!")
    return feature_vector

def get_transfer_learning_feature_vector(df, transfer_learning_feature_vector):
    feature_vector = []
    print("Loading Transfer Learning Features...")
    indexes_list = df['mapping_to_fv_index'].to_list()
    for index in tqdm(indexes_list):
        feature_vector.append(transfer_learning_feature_vector[index])
    print("All done!")
    return feature_vector

def get_feature_vector(df, feature_type, transfer_learning_feature_vector=None):
    if feature_type.lower() == "mfcc":
        return get_mfcc_feature_vector(df)
    elif feature_type.lower() == "transfer_learning":
        if type(transfer_learning_feature_vector) == None:
            raise ValueError("transfer_learning_feature_vector cannot be empty!")
        return get_transfer_learning_feature_vector(df, transfer_learning_feature_vector)
    else:
        raise TypeError("Not a valid feature vector type!")

In [8]:
#mfcc_feature_vector = get_feature_vector(df_sorted_reset_index, "mfcc")
#print(np.shape(mfcc_feature_vector))

In [9]:
transfer_learning_feature_vector = get_feature_vector(df_sorted_reset_index, "transfer_learning", transfer_learning_features)
print(np.shape(transfer_learning_feature_vector))

Loading Transfer Learning Features...


100%|██████████| 30000/30000 [00:00<00:00, 1088628.46it/s]

All done!
(30000, 160)





## Export to avoid computations again

In [10]:
def export_feature_vector(feature_vector, feat_type):
    """ Saves a feature vector to avoid making this list again """
    size = np.shape(feature_vector)[0]
    np.save( DATASETS_FOLDER / "input" / "feature_vectors" / feat_type / f"{feat_type}_feature_vector_{size}_samples.npy", feature_vector)
    print(f"Saving {feat_type} feature vector of {size} samples complete!")

export_feature_vector(transfer_learning_feature_vector, feat_type="transfer_learning")
# export_feature_vector(mfcc_feature_vector, feat_type="mfcc")

Saving transfer_learning feature vector of 30000 samples complete!


## Computing the similarity matrix using the RBF Kernel

In [11]:
from sklearn.metrics.pairwise import rbf_kernel

gamma = 0.1
similarity_matrix = rbf_kernel(transfer_learning_feature_vector, gamma=gamma)
np.fill_diagonal(similarity_matrix, 0) # Just a measure to avoid comparing to itself when generating the network

In [14]:
def save_similarity_matrix(gamma, matrix, feat_type):
    size = np.shape(matrix)[0]
    np.save(DATASETS_FOLDER / "input" / "similarity_matrices" / feat_type / f"{feat_type}_{size}_samples_{gamma}_gamma.npy", matrix)
    print(f"Saving similarity matrix of size {size} complete!")

save_similarity_matrix(gamma, similarity_matrix, "transfer_learning")

Saving similarity matrix of size 30000 complete!
