In [90]:
%load_ext autoreload
%autoreload 2

In [138]:
## Take a sample track and reduce its dimensionality
import os

import dotenv
import joblib
import numpy as np
import pandas as pd

from helper.spotify_api import get_spotify_token, search_track_id_by_name, get_audio_features_by_id
from helper.dimension_reduction import PCA, AutoEncoder
from helper.cluster import Kmeans, Kmedians

In [139]:
# Load variables from a local .env file into the process environment so the
# os.environ lookups for the Spotify credentials further down can find them.
dotenv.load_dotenv()

True

In [140]:
# --- Query and model selection: edit these to try another song or setup ---
song_name, artist_name = "Bohemian Rhapsody", "Queen"
dimension_reduction, dimension = "PCA", 3
clustering, clusters = "KMeans", 4
distance = "Euclidean"

# Each table maps a display name to the short "key" used by later cells (the
# dimension-reduction and clustering keys become part of the `trained/...` paths).
# NOTE(review): "min"/"max" presumably bound the allowed component / cluster
# counts; nothing in this notebook enforces them - confirm where they are used.
dimension_reduction_options = {
    "AutoEncoder": {
        "key": "auto",
        "min": 1,
        "max": 3,
    },
    "PCA": {
        "key": "pca",
        "min": 1,
        "max": 12,
    },
    "Base": {
        "key": "base",
    },
}
clustering_options = {
    "KMeans": {
        "key": "kmeans",
        "min": 2,
        "max": 5,
    },
    "KMedians": {
        "key": "kmedians",
        "min": 1,
        "max": 5,
    },
}
distance_options = {
    "Euclidean": {
        "key": "euclidean",
    },
    "Manhattan": {
        "key": "manhattan",
    },
}

In [101]:
# Read both Spotify credentials from the environment (a KeyError is raised if
# either variable is unset) and hand them to get_spotify_token; later cells pass
# the result along as `access_token`.
spotify_id = os.environ["SPOTIFY_ID"]
spotify_token = os.environ["SPOTIFY_TOKEN"]
access_token = get_spotify_token(spotify_id, spotify_token)

In [103]:
# Look the track up by title and artist.
song_response = search_track_id_by_name(
    access_token=access_token,
    song_name=song_name,
    artist_name=artist_name,
)
# NOTE(review): element 1 of the response is indexed as a mapping with an "id"
# entry; what element 0 holds is not visible here - confirm against the helper.
track_match = song_response[1]
track_id = track_match["id"]

In [113]:
# Fetch the audio-feature payload for the resolved track; the next cell treats
# the result as a dict (it calls .items() on it).
audio_features = get_audio_features_by_id(access_token=access_token, track_id=track_id)

In [124]:
# Drop the identifier / link fields so only the feature values remain.
metadata_keys = ("type", "id", "uri", "track_href", "analysis_url")
audio_features_val = {
    name: value
    for name, value in audio_features.items()
    if name not in metadata_keys
}
# Re-key in sorted name order so the feature vector built from this dict has a
# deterministic column order. Keys are unique, so sorting the (key, value) pairs
# never has to compare values.
audio_features_sort = dict(sorted(audio_features_val.items()))


In [127]:
# Feature vector for the query song, in the sorted-key order established above.
np_sample = np.array(list(audio_features_sort.values()))

In [134]:
# Translate the display names chosen in the config cell into the short keys
# that appear in the `trained/...` artifact paths.
dim_option = dimension_reduction_options[dimension_reduction]
picked_dim = dim_option["key"]
clus_option = clustering_options[clustering]
picked_clus = clus_option["key"]

In [144]:
# Load the fitted dimension-reduction model for the chosen method / size.
# joblib.load unpickles the file, which can execute arbitrary code - only load
# artifacts from a trusted source.
dim_red_model_path = f"trained/{picked_dim}_c{dimension}/{picked_dim}.pkl"
dim_red_model = joblib.load(dim_red_model_path)

In [145]:
# Map the raw feature vector into the reduced space using the loaded model's
# project() method.
reduced_sample = dim_red_model.project(np_sample)

In [148]:
# Load the clustering model stored for this reduction / cluster-count combination.
# joblib.load unpickles the file - only load artifacts from a trusted source.
clustering_model_path = f"trained/{picked_dim}_c{dimension}/{picked_clus}_c{clusters}/cluster.pkl"
clustering_model = joblib.load(clustering_model_path)

In [155]:
# predict() is handed a single-row 2-D array, so reshape the sample first.
sample_row = reduced_sample.reshape(1, -1)
prediction = clustering_model.predict(sample_row)

In [196]:
# Reduced representation of the training data for the chosen method / size.
reduced_data_path = f"trained/{picked_dim}_c{dimension}/data.npy"
arr_data = np.load(reduced_data_path)

# Track ids stored alongside it. allow_pickle=True unpickles the file contents,
# so only load this file from a trusted source.
arr_id = np.load("trained/audio_index.npy", allow_pickle=True)

# Column vector so the ids can be concatenated next to arr_data.
arr_id_reshaped = arr_id.reshape(-1, 1)

In [198]:
# Append the id column to the right of the reduced features. The slice displayed
# further down shows dtype=object, so numeric columns must be cast back to float
# before doing arithmetic on them.
combined_array = np.concatenate([arr_data, arr_id_reshaped], axis=1)

In [207]:
# Keep only the rows whose stored cluster label equals the label predicted for
# the query song.
same_cluster = clustering_model.labels == prediction[0]
index_mask = np.where(same_cluster)[0]
filtered_arr = combined_array[index_mask]


In [211]:
# Preview the reduced-feature columns of the same-cluster rows. Slice by the
# configured `dimension` rather than a literal 3 so the preview keeps matching
# the model when the number of components is changed in the config cell.
filtered_arr[:, :dimension]

array([[0.035943967838919705, 0.70469404866273, 0.5570870001332336],
       [1.5640388327239463, 1.4259073639147242, -2.031124789434123],
       [0.5367195238380779, 0.9093334318138804, 0.7919016883991309],
       ...,
       [0.17154562092179693, 1.3783395123491464, 0.1184101086454909],
       [1.114747888293611, 0.998000286321577, 0.15022199539093864],
       [0.15189237952884171, 0.6119756328077779, -0.2745819008724956]],
      dtype=object)

In [213]:
# Distance from the query song to every candidate in its cluster, measured in
# the reduced feature space. Only the first `dimension` columns are features
# (the trailing column of filtered_arr is the id column), and the array is
# object-dtype because of that column, hence the cast to float.
candidate_features = filtered_arr[:, :dimension].astype(float)
deltas = candidate_features - reduced_sample

# Honour the metric chosen in the config cell instead of always using Euclidean.
# Cosine distance is not offered in distance_options, so it is not handled here.
picked_distance = distance_options[distance]["key"]
if picked_distance == "euclidean":
    distances = np.linalg.norm(deltas, axis=1)
elif picked_distance == "manhattan":
    # ord=1 along axis=1 is the per-row sum of absolute differences.
    distances = np.linalg.norm(deltas, ord=1, axis=1)
else:
    raise ValueError(f"Unsupported distance metric: {picked_distance!r}")

# Column vector so it can be concatenated next to filtered_arr.
distances = distances.reshape(-1, 1)


In [216]:
# Attach the distance as one more trailing column, after the id column.
with_distance = np.concatenate([filtered_arr, distances], axis=1)

In [218]:
# Reorder the candidates by ascending distance (last column), nearest first.
distance_order = with_distance[:, -1].argsort()
closest_songs = with_distance[distance_order]

In [222]:
# Song table used to turn neighbour ids into readable rows; the next cell
# matches on its "track_id" column. (pandas is imported in the notebook's import
# cell rather than here, so every dependency is declared in one place.)
df_songs = pd.read_csv("dataset.csv")

In [226]:
# Ids (second-to-last column) of rows 1..89 of the distance-sorted candidates.
# NOTE(review): row 0 is skipped, presumably because the nearest candidate is
# expected to be the query song itself - confirm the query is in the dataset.
# NOTE(review): isin() is a membership filter, so the rows below come back in
# df_songs order, not nearest-first.
neighbour_ids = closest_songs[1:90, -2]
is_neighbour = df_songs["track_id"].isin(neighbour_ids)
df_songs_filter = df_songs.loc[is_neighbour]

In [228]:
# Keep the first row for each (track_name, artists) pair; the same song can
# appear under several track ids.
filtered_df = df_songs_filter.drop_duplicates(subset=["track_name", "artists"])

In [229]:
# Display the de-duplicated rows for the selected neighbours.
filtered_df

Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
8202,8202,4mn2kNTqiGLwaUR8JdhJ1l,The Animals,The Singles Plus,House of the Rising Sun,75,269906,False,0.316,0.484,...,-9.11,0,0.0308,0.000334,0.00445,0.0912,0.299,117.363,3,blues
8713,8713,3TETmhGNWOcquUTtcr6RIR,The Animals,The House of the Rising Sun: Best of The Animals,The House of the Rising Sun,58,269906,False,0.316,0.484,...,-9.11,0,0.0308,0.000334,0.00445,0.0912,0.299,117.363,3,blues
11853,11853,1f133Oqpwft2mGNp5cWqaS,Lower Than Atlantis,Safe In Sound,I Don't Want to Be Here Anymore,43,250813,False,0.286,0.35,...,-8.427,1,0.0301,0.0165,0.000319,0.112,0.0871,129.796,4,british
32166,32166,76IijT19KtStPt9ij4nNk5,Multani,Old World Songs in a New World Order,Madira,33,79830,False,0.542,0.575,...,-11.764,1,0.0808,0.129,0.687,0.272,0.0747,170.135,4,electronic
34120,34120,0FTfaxNtqgYbqPPcrWQ1IS,Nusrat Fateh Ali Khan,Shahen-Shah (Real World Gold),Kali Kali Zulfon Ke Phande Nah Dalo,48,706866,False,0.436,0.674,...,-12.834,1,0.0368,0.664,0.0044,0.266,0.752,138.674,3,folk
40827,40827,12dGdIwsVKWoi9llS1dwNA,Gabriel Guedes de Almeida,Eterno Presente (Ao Vivo),Eu Creio - Ao Vivo,39,320200,False,0.294,0.451,...,-8.162,1,0.0304,0.246,0.000122,0.0997,0.141,146.228,4,gospel
40978,40978,5URQ1xSOkuIhyP9jjd4tXk,Samuel Messias,Gerado no Altar (Ao Vivo),Gerado no Altar (Ao Vivo),38,357954,False,0.328,0.474,...,-7.972,1,0.0659,0.449,0.0,0.0772,0.236,174.799,4,gospel
42148,42148,5KSNy2mfWAKxKhwp6WdmvG,Mortician,Chainsaw Dismemberment,Dark Sanity,18,87333,False,0.322,0.715,...,-10.5,1,0.0475,4e-06,0.946,0.117,0.4,124.999,4,grindcore
54119,54119,5NC4CFZUNpnMlE8OEidYxc,Ochre,A Midsummer Nice Dream,Yugen,15,339906,False,0.725,0.73,...,-10.994,1,0.0769,0.302,0.877,0.369,0.111,98.997,4,idm
56608,56608,1170VohRSx6GwE6QDCHPPH,The Backseat Lovers,When We Were Friends,Kilby Girl,78,282205,False,0.329,0.444,...,-9.973,1,0.0417,0.0578,0.0352,0.113,0.225,162.279,4,indie-pop
