In [45]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import pandas as pd
import re
import numpy as np
import six
import sys
sys.modules['sklearn.externals.six'] = six
import mlrose
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
import math

In [2]:
# Function to grab audio features
def audio_features_to_grab(track_uri, spotify_client, list_of_audio_features):
    audio_features_to_return = {}
    all_audio_features = spotify_client.audio_features(track_uri)[0]

    for feature in list_of_audio_features:
        audio_features_to_return[feature] = all_audio_features[feature]

    return audio_features_to_return

In [3]:
# Function to get track info, including the audio features
def get_track_info(track_dict:dict, spotify_client, list_of_audio_features=['danceability','energy','key','loudness','mode','speechiness','acousticness','instrumentalness','liveness','valence','tempo','time_signature']):
    d = {}
    
    # track_info = track_dict['track'] 
    track_uri = track_dict['uri']
    track_name = track_dict['name']
    track_duration = track_dict['duration_ms']
    track_popularity = track_dict['popularity']
    track_is_explicit = track_dict['explicit']
    
    first_artist_info = track_dict['artists'][0]
    first_artist_info_name = first_artist_info['name']

    d['uri'] = track_dict['uri']
    d['first_artist_name'] = first_artist_info['name']
    d['track_name'] = track_dict['name']
    d['track_duration_ms'] = track_dict['duration_ms']
    d['track_popularity'] = track_dict['popularity']
    d['is_track_explicit'] = track_dict['explicit']


    track_audio_features = audio_features_to_grab(track_uri=track_uri, spotify_client=spotify_client, list_of_audio_features=list_of_audio_features)
    
    d.update(track_audio_features)

    return d

In [4]:
# Function used to calculate the distance between two points
def distance_calc(point_1: list, point_2: list):
    x_difference = (point_2[0] - point_1[0]) ** 2
    y_difference = (point_2[1] - point_1[1]) ** 2
    final = math.sqrt(x_difference + y_difference)

    return round(final, 0)

In [5]:
# Function to get distance of all attributes between two songs
def distance_calc_all_variables(row1, row2, attributes:list):
    num_of_attributes = len(attributes)
    added_attr_differences = 0

    for i in range(num_of_attributes):
        attr = attributes[i]
        attr_difference = (row2[attr] - row1[attr]) ** 2
        added_attr_differences += attr_difference

    final = math.sqrt(added_attr_differences)

    return final

In [6]:
with open('hide/client_id.txt') as f:
    client_id = f.read()

with open('hide/client_password.txt') as f:
    client_password = f.read()

In [7]:
#Authentication - without user
client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_password)
sp = spotipy.Spotify(client_credentials_manager = client_credentials_manager)

In [8]:
playlist_link = "https://open.spotify.com/playlist/37i9dQZEVXbNG2KDcFcKOF?si=1333723a6eff4b7f"
playlist_uri = playlist_link.split("/")[-1].split("?")[0]
track_uris = [x["track"]["uri"] for x in sp.playlist_tracks(playlist_uri)["items"]]

In [9]:
sp.playlist_items(playlist_uri)['items'][0]['track'].keys()

dict_keys(['album', 'artists', 'available_markets', 'disc_number', 'duration_ms', 'episode', 'explicit', 'external_ids', 'external_urls', 'href', 'id', 'is_local', 'name', 'popularity', 'preview_url', 'track', 'track_number', 'type', 'uri'])

In [10]:
playlist_dict = {}

for track in sp.playlist_items(playlist_uri)['items']:
    track_dict = track['track']
    track_info = get_track_info(track_dict=track_dict, spotify_client=sp)

    
    dict_key_str = track_dict['name']

    playlist_dict[dict_key_str] = track_info
    

In [11]:
playlist_df = pd.DataFrame(data=playlist_dict).T

In [12]:
track_rating_list = [x for x in range(51)][1:]
playlist_df['top_hit_placement'] = track_rating_list.copy()

In [13]:
playlist_df.head()

Unnamed: 0,uri,first_artist_name,track_name,track_duration_ms,track_popularity,is_track_explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,top_hit_placement
Flowers,spotify:track:4DHcnVTT87F0zZhRPYmZ3B,Miley Cyrus,Flowers,200600,89,False,0.707,0.681,0,-4.325,1,0.0668,0.0632,5e-06,0.0322,0.646,117.999,4,1
TQG,spotify:track:0DWdj2oZMBFSzRsi2Cvfzf,KAROL G,TQG,197933,97,True,0.72,0.63,4,-3.547,0,0.277,0.673,0.0,0.0936,0.607,179.974,4,2
Kill Bill,spotify:track:1Qrg8KqiBpW07V7PNxwwwL,SZA,Kill Bill,153946,95,False,0.644,0.735,8,-5.747,1,0.0391,0.0521,0.144,0.161,0.418,88.98,4,3
Boy's a liar Pt. 2,spotify:track:6AQbmUe0Qwf5PZnt4HmTXv,PinkPantheress,Boy's a liar Pt. 2,131013,97,False,0.696,0.809,5,-8.254,1,0.05,0.252,0.000128,0.248,0.857,132.962,4,4
BESO,spotify:track:609E1JCInJncactoMmkDon,ROSALÍA,BESO,194543,92,False,0.768,0.644,5,-6.671,0,0.136,0.736,0.000837,0.173,0.53,95.05,4,5


## Optimizing the playlist order using 2 Principal Components and the traveling salesmen optimization

In [14]:
top_hits_attributes_df = playlist_df[['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'time_signature']]

In [15]:
top_hits_attributes_df.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
Flowers,0.707,0.681,0,-4.325,1,0.0668,0.0632,5e-06,0.0322,0.646,117.999,4
TQG,0.72,0.63,4,-3.547,0,0.277,0.673,0.0,0.0936,0.607,179.974,4
Kill Bill,0.644,0.735,8,-5.747,1,0.0391,0.0521,0.144,0.161,0.418,88.98,4
Boy's a liar Pt. 2,0.696,0.809,5,-8.254,1,0.05,0.252,0.000128,0.248,0.857,132.962,4
BESO,0.768,0.644,5,-6.671,0,0.136,0.736,0.000837,0.173,0.53,95.05,4


In [16]:
pca = PCA(n_components=2)
pca.fit(top_hits_attributes_df)
attributes_pca = pca.transform(top_hits_attributes_df)

In [17]:
pca_coordinates = pd.DataFrame(attributes_pca).rename(columns={0:'x_coordinate', 1:'y_coordinate'})

In [18]:
top_hits_attributes_df[['x_coordinate', 'y_coordinate']] = pca_coordinates[['x_coordinate', 'y_coordinate']].values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_hits_attributes_df[['x_coordinate', 'y_coordinate']] = pca_coordinates[['x_coordinate', 'y_coordinate']].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_hits_attributes_df[['x_coordinate', 'y_coordinate']] = pca_coordinates[['x_coordinate', 'y_coordinate']].values


In [19]:
top_hits_attributes_df.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,x_coordinate,y_coordinate
Flowers,0.707,0.681,0,-4.325,1,0.0668,0.0632,5e-06,0.0322,0.646,117.999,4,-5.071431,-4.938246
TQG,0.72,0.63,4,-3.547,0,0.277,0.673,0.0,0.0936,0.607,179.974,4,56.971198,-2.107521
Kill Bill,0.644,0.735,8,-5.747,1,0.0391,0.0521,0.144,0.161,0.418,88.98,4,-33.948639,3.66177
Boy's a liar Pt. 2,0.696,0.809,5,-8.254,1,0.05,0.252,0.000128,0.248,0.857,132.962,4,9.95321,0.14538
BESO,0.768,0.644,5,-6.671,0,0.136,0.736,0.000837,0.173,0.53,95.05,4,-27.939964,0.671787


#### This section between the lines isn't actually needed. I did it thinking it was needed, but realized later MLRose doesn't need the distances calculated like this. So if you're looking at this notebook, can disregard. Or just enjoy it. Whatever.

-------------------------------

In [20]:
distances_df = pd.DataFrame()

Calculating the distances between every song

In [21]:
for i in range(len(top_hits_attributes_df)):
    i_coordinates = list(top_hits_attributes_df.iloc[i][['x_coordinate', 'y_coordinate']])

    for j in range(len(top_hits_attributes_df)):
        j_coordinates = list(top_hits_attributes_df.iloc[j][['x_coordinate', 'y_coordinate']])

        if i == j:
            distances_df.loc[i,j] = 0
        else:
            distances_df.loc[i,j] = distance_calc(i_coordinates, j_coordinates)

--------------------------

In [22]:
coordinates_list = [tuple(x) for x in top_hits_attributes_df[['x_coordinate', 'y_coordinate']].values]

In [23]:
# Defining fitness class
fitness_coordinates = mlrose.TravellingSales(coords = coordinates_list)

In [24]:
# Defining optimization problem
problem_fit = mlrose.TSPOpt(length=len(coordinates_list), fitness_fn=fitness_coordinates, maximize=False)

In [25]:
# Solving using the genetic algorithm
best_state, best_fitness = mlrose.genetic_alg(problem=problem_fit, 
                                              mutation_prob=0.2,
                                              max_attempts=100,
                                              random_state=2)

Below is the optimal ordering of songs to minimize distance between all of them in a row. This is the playlist "optimized" to go from one song to the next that sounds the most like it, and so on.

In [26]:
best_state

array([ 9,  6, 39,  7, 45, 42, 32, 31,  1, 23, 18, 13, 20, 41, 19,  2, 26,
       38, 47, 17, 36, 37,  5, 33, 16, 40, 34, 25, 43, 29, 27, 44, 28, 10,
       22,  4, 15, 46, 30, 14, 35, 21,  0, 24, 49,  3, 12,  8, 48, 11])

This is now the optimal, reordered playlist.

In [27]:
playlist_optimal_order = playlist_df.reset_index().drop(columns='index').reindex(best_state)

In [28]:
playlist_optimal_order.head()

Unnamed: 0,uri,first_artist_name,track_name,track_duration_ms,track_popularity,is_track_explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,top_hit_placement
9,spotify:track:2UW7JaomAMuX9pZrjVpHAU,Yng Lvcas,La Bebe - Remix,234352,90,True,0.812,0.479,2,-5.678,0,0.333,0.213,1e-06,0.0756,0.559,169.922,4,10
6,spotify:track:4Dvkj6JhhA12EX05fT7y2e,Harry Styles,As It Was,167303,92,False,0.52,0.731,6,-5.338,0,0.0557,0.342,0.00101,0.311,0.662,173.93,4,7
39,spotify:track:4fYte8ZvTK14NEhAOZocBi,OneRepublic,I Ain't Worried,148120,52,False,0.697,0.802,0,-6.412,1,0.0456,0.0988,3.6e-05,0.0614,0.822,139.951,4,40
7,spotify:track:4nrPB8O7Y7wsOCJdgXkthe,Bizarrap,"Shakira: Bzrp Music Sessions, Vol. 53",214945,97,False,0.778,0.632,2,-5.6,0,0.0493,0.274,0.0,0.0915,0.498,122.104,4,8
45,spotify:track:4iZ4pt7kvcaH6Yo8UoZ4s2,SZA,Snooze,201800,92,True,0.559,0.551,5,-7.231,1,0.132,0.141,0.0,0.11,0.392,143.008,4,46


## Optimizing the playlist using all attributes and traveling salesman optimization

In [29]:
distances_list_all_variables = []

This is calculating the distance of all attributes for every combination of song, not permutation, and appending to a list. 
EX: (0, 5, 45) is the distance between songs indexed 0 and 5 in the list, and a distance of 50. (5, 0, 50) won't be in this list  though, because the combination of 0 and 5 already exists.

In [30]:
top_hits_attributes_no_coordinates = top_hits_attributes_df.drop(columns=['x_coordinate', 'y_coordinate'])

In [31]:
for i in range(len(top_hits_attributes_df)):

    for j in range(len(top_hits_attributes_df)):

        if i >= j:
            pass
        else:
            distance = distance_calc_all_variables(top_hits_attributes_no_coordinates.iloc[i],
                                                   top_hits_attributes_no_coordinates.iloc[j],
                                                   list(top_hits_attributes_no_coordinates.columns))

            song1_song2_distance = (i, j, distance)

            distances_list_all_variables.append(song1_song2_distance)

In [32]:
top_hits_attributes_no_coordinates.columns

Index(['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'time_signature'],
      dtype='object')

In [33]:
fitness_dists = mlrose.TravellingSales(distances=distances_list_all_variables)

In [34]:
problem_fit_distances = mlrose.TSPOpt(length=len(top_hits_attributes_no_coordinates),
                                      fitness_fn=fitness_dists,
                                      maximize=False)

This one will take much longer than the PCA optimization

In [35]:
# Solving using genetic algorithm, and distances, not coordinates
best_state_distances, best_fitness_distances = mlrose.genetic_alg(problem=problem_fit_distances,
                                                                  mutation_prob=0.2,
                                                                  max_attempts=100,
                                                                  random_state=2)

In [36]:
best_state_distances == best_state

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True])

In [37]:
best_fitness_distances

1191.9443322679617

In [38]:
best_fitness

1174.0389537509218

### Scaling the audio features then running TSP

In [41]:
list_of_audio_features=['danceability','energy','key','loudness','mode','speechiness','acousticness','instrumentalness','liveness','valence','tempo','time_signature']

Converting the audio features to numeric datatype

In [42]:
playlist_df[list_of_audio_features] = playlist_df[list_of_audio_features].apply(pd.to_numeric)

In [50]:
playlist_df[list_of_audio_features].head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
Flowers,0.707,0.681,0,-4.325,1,0.0668,0.0632,5e-06,0.0322,0.646,117.999,4
TQG,0.72,0.63,4,-3.547,0,0.277,0.673,0.0,0.0936,0.607,179.974,4
Kill Bill,0.644,0.735,8,-5.747,1,0.0391,0.0521,0.144,0.161,0.418,88.98,4
Boy's a liar Pt. 2,0.696,0.809,5,-8.254,1,0.05,0.252,0.000128,0.248,0.857,132.962,4
BESO,0.768,0.644,5,-6.671,0,0.136,0.736,0.000837,0.173,0.53,95.05,4


Applying the MinMaxScaler to the audio features. This is used so when the distances are calculated on the features, some features aren't treated as more important than others. Applying the scaler makes sure all of the features are treated equal.

In [71]:
scaler = MinMaxScaler()
scaled_playlist_df = playlist_df.copy()
scaled_playlist_df[list_of_audio_features] = scaler.fit_transform(scaled_playlist_df[list_of_audio_features])

In [72]:
scaled_playlist_df.head()

Unnamed: 0,uri,first_artist_name,track_name,track_duration_ms,track_popularity,is_track_explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,top_hit_placement
Flowers,spotify:track:4DHcnVTT87F0zZhRPYmZ3B,Miley Cyrus,Flowers,200600,89,False,0.517578,0.611491,0.0,0.924717,1.0,0.122268,0.078373,8e-06,0.0,0.618469,0.37276,1.0,1
TQG,spotify:track:0DWdj2oZMBFSzRsi2Cvfzf,KAROL G,TQG,197933,97,True,0.542969,0.541724,0.363636,0.995283,0.0,0.74306,0.86572,0.0,0.173055,0.571081,0.826039,1.0,2
Kill Bill,spotify:track:1Qrg8KqiBpW07V7PNxwwwL,SZA,Kill Bill,153946,95,False,0.394531,0.685363,0.727273,0.795737,1.0,0.040461,0.064041,0.214925,0.363021,0.341434,0.160518,1.0,3
Boy's a liar Pt. 2,spotify:track:6AQbmUe0Qwf5PZnt4HmTXv,PinkPantheress,Boy's a liar Pt. 2,131013,97,False,0.496094,0.786594,0.454545,0.568345,1.0,0.072652,0.322143,0.000191,0.60823,0.874848,0.482198,1.0,4
BESO,spotify:track:609E1JCInJncactoMmkDon,ROSALÍA,BESO,194543,92,False,0.636719,0.560876,0.454545,0.711927,0.0,0.326639,0.947063,0.001249,0.396843,0.477521,0.204913,1.0,5


In [73]:
scaled_distances_list_all_variables = []

In [74]:
scaled_attributes_df = scaled_playlist_df[list_of_audio_features]

In [75]:
for i in range(len(scaled_attributes_df)):

    for j in range(len(scaled_attributes_df)):

        if i >= j:
            pass
        else:
            distance = distance_calc_all_variables(scaled_attributes_df.iloc[i],
                                                   scaled_attributes_df.iloc[j],
                                                   list(scaled_attributes_df.columns))

            song1_song2_distance = (i, j, distance)

            scaled_distances_list_all_variables.append(song1_song2_distance)

In [77]:
scaled_fitness_dists = mlrose.TravellingSales(distances=scaled_distances_list_all_variables)

scaled_problem_fit_distances = mlrose.TSPOpt(length=len(scaled_attributes_df),
                                      fitness_fn=scaled_fitness_dists,
                                      maximize=False)

In [78]:
# Solving using genetic algorithm, and distances, not coordinates
scaled_best_state_distances, scaled_best_fitness_distances = mlrose.genetic_alg(problem=scaled_problem_fit_distances,
                                                                  mutation_prob=0.2,
                                                                  max_attempts=100,
                                                                  random_state=2)

In [79]:
scaled_best_state_distances == best_state_distances

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
        True, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False])

### Now to run the model with MinMaxScaler, PCA, and mlrose

In [81]:
pca = PCA(n_components=2)
pca.fit(scaled_attributes_df)
scaled_attributes_pca = pca.transform(scaled_attributes_df)

In [82]:
scaled_pca_coordinates = pd.DataFrame(scaled_attributes_pca).rename(columns={0:'x_coordinate', 1:'y_coordinate'})

In [85]:
scaled_coordinates_list = [tuple(x) for x in scaled_pca_coordinates[['x_coordinate', 'y_coordinate']].values]

In [86]:
# Defining fitness class
scaled_pca_fitness_coordinates = mlrose.TravellingSales(coords = scaled_coordinates_list)

In [88]:
# Defining optimization problem
scaled_pca_problem_fit = mlrose.TSPOpt(length=len(scaled_coordinates_list), fitness_fn=scaled_pca_fitness_coordinates, maximize=False)

In [89]:
# Solving using the genetic algorithm
scaled_pca_best_state, scaled_pca_best_fitness = mlrose.genetic_alg(problem=scaled_pca_problem_fit, 
                                              mutation_prob=0.2,
                                              max_attempts=100,
                                              random_state=2)

In [90]:
scaled_pca_best_state == scaled_best_state_distances

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False,  True, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False])

### Now to reorder the playlist with the different implementations

Playlist reordered using the best state from non-scaled PCA distances.

In [91]:
pca_playlist_optimal_order = playlist_df.reset_index().drop(columns='index').reindex(best_state)
pca_playlist_optimal_order.head()

Unnamed: 0,uri,first_artist_name,track_name,track_duration_ms,track_popularity,is_track_explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,top_hit_placement
9,spotify:track:2UW7JaomAMuX9pZrjVpHAU,Yng Lvcas,La Bebe - Remix,234352,90,True,0.722656,0.335157,0.181818,0.801995,0.0,0.908447,0.271788,2e-06,0.122322,0.512758,0.75252,1.0,10
6,spotify:track:4Dvkj6JhhA12EX05fT7y2e,Harry Styles,As It Was,167303,92,False,0.152344,0.679891,0.545455,0.832834,0.0,0.089486,0.438347,0.001507,0.785795,0.63791,0.781834,1.0,7
39,spotify:track:4fYte8ZvTK14NEhAOZocBi,OneRepublic,I Ain't Worried,148120,52,False,0.498047,0.777018,0.0,0.73542,1.0,0.059657,0.124338,5.3e-05,0.0823,0.832321,0.533315,1.0,40
7,spotify:track:4nrPB8O7Y7wsOCJdgXkthe,Bizarrap,"Shakira: Bzrp Music Sessions, Vol. 53",214945,97,False,0.65625,0.54446,0.181818,0.80907,0.0,0.070585,0.350549,0.0,0.167136,0.438639,0.402784,1.0,8
45,spotify:track:4iZ4pt7kvcaH6Yo8UoZ4s2,SZA,Snooze,201800,92,True,0.228516,0.433653,0.454545,0.661134,1.0,0.314826,0.178825,0.0,0.219278,0.309842,0.555673,1.0,46


Playlist reordered using the non-scaled, raw attributes. Produced the same ordering as the non-scaled, PCA component ordering.

In [93]:
raw_attrs_playlist_optimal_order = playlist_df.reset_index().drop(columns='index').reindex(best_state_distances)
raw_attrs_playlist_optimal_order.head()

Unnamed: 0,uri,first_artist_name,track_name,track_duration_ms,track_popularity,is_track_explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,top_hit_placement
9,spotify:track:2UW7JaomAMuX9pZrjVpHAU,Yng Lvcas,La Bebe - Remix,234352,90,True,0.722656,0.335157,0.181818,0.801995,0.0,0.908447,0.271788,2e-06,0.122322,0.512758,0.75252,1.0,10
6,spotify:track:4Dvkj6JhhA12EX05fT7y2e,Harry Styles,As It Was,167303,92,False,0.152344,0.679891,0.545455,0.832834,0.0,0.089486,0.438347,0.001507,0.785795,0.63791,0.781834,1.0,7
39,spotify:track:4fYte8ZvTK14NEhAOZocBi,OneRepublic,I Ain't Worried,148120,52,False,0.498047,0.777018,0.0,0.73542,1.0,0.059657,0.124338,5.3e-05,0.0823,0.832321,0.533315,1.0,40
7,spotify:track:4nrPB8O7Y7wsOCJdgXkthe,Bizarrap,"Shakira: Bzrp Music Sessions, Vol. 53",214945,97,False,0.65625,0.54446,0.181818,0.80907,0.0,0.070585,0.350549,0.0,0.167136,0.438639,0.402784,1.0,8
45,spotify:track:4iZ4pt7kvcaH6Yo8UoZ4s2,SZA,Snooze,201800,92,True,0.228516,0.433653,0.454545,0.661134,1.0,0.314826,0.178825,0.0,0.219278,0.309842,0.555673,1.0,46


Playlist reordered using the scaled, raw attributes.

In [94]:
scaled_pca_playlist_optimal_order = playlist_df.reset_index().drop(columns='index').reindex(scaled_best_state_distances)
scaled_pca_playlist_optimal_order.head()

Unnamed: 0,uri,first_artist_name,track_name,track_duration_ms,track_popularity,is_track_explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,top_hit_placement
49,spotify:track:54ipXppHLA8U4yqpOFTUhr,Imagine Dragons,Bones,165264,86,False,0.644531,0.705882,0.454545,0.984127,0.0,0.059362,0.022724,0.0,0.117813,0.54678,0.343958,1.0,50
13,spotify:track:0WtM2NBVQNNJLh6scP13H8,Rema,Calm Down (with Selena Gomez),239317,95,False,0.701172,0.78249,1.0,0.844807,1.0,0.037507,0.489994,0.000999,0.230552,0.808019,0.292307,1.0,14
40,spotify:track:2CeKVsFFXG4QzA415QygGb,Feid,Feliz Cumpleaños Ferxxo,155960,89,False,0.826172,0.463748,0.454545,0.81805,1.0,0.125222,0.108199,0.0,0.768884,0.517618,0.20454,1.0,41
34,spotify:track:5mHdCZtVyb4DcJw8799hZp,RAYE,Escapism.,272373,81,True,0.1875,0.694938,0.181818,0.831293,1.0,0.261666,0.174952,7e-05,0.172492,0.137303,0.212644,1.0,35
16,spotify:track:0V3wPSX9ygBnCm8psDIegu,Taylor Swift,Anti-Hero,200690,94,False,0.380859,0.559508,0.363636,0.720998,1.0,0.078263,0.164622,3e-06,0.30947,0.481166,0.219234,1.0,17


Playlist reordered using the scaled, PCA components.

In [95]:
scaled_pca_playlist_optimal_order = playlist_df.reset_index().drop(columns='index').reindex(scaled_pca_best_state)
scaled_pca_playlist_optimal_order.head()

Unnamed: 0,uri,first_artist_name,track_name,track_duration_ms,track_popularity,is_track_explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,top_hit_placement
6,spotify:track:4Dvkj6JhhA12EX05fT7y2e,Harry Styles,As It Was,167303,92,False,0.152344,0.679891,0.545455,0.832834,0.0,0.089486,0.438347,0.001507,0.785795,0.63791,0.781834,1.0,7
14,spotify:track:3Ua0m0YmEjrMi9XErKcNiR,Jimin,Like Crazy,212241,89,False,0.365234,0.682627,0.636364,0.823129,1.0,0.04873,0.0,0.0,0.915445,0.27339,0.387403,1.0,15
5,spotify:track:4W4fNrZYkobj539TOWsLO2,The Weeknd,Die For You (with Ariana Grande) - Remix,232857,83,False,0.259766,0.363885,0.090909,0.628209,0.0,0.133786,0.289864,0.0,0.853439,0.44836,0.0,1.0,6
47,spotify:track:3Kw7zkALCVxY4wmlnh2IWC,FIFTY FIFTY,Cupid - Twin Ver.,174253,87,False,0.664062,0.4829,1.0,0.564172,0.0,0.020673,0.549387,3e-06,0.859076,0.756987,0.387695,1.0,48
25,spotify:track:1lRtH4FszTrwwlK5gTSbXO,Natanael Cano,AMG,174942,93,True,0.644531,0.678523,1.0,0.713197,0.0,0.21205,0.193028,0.00016,0.681511,0.788578,0.505698,0.0,26


It is interesting that the non-scaled PCA components and non-scaled attributes produced the same ordering, while the scaled attributes and the scaled PCA components produced different ordering than the others. 

In the case of the non-scaled, it appears that PCA is probably able to capture most of the information in the first and second component, so this is why PCA and raw attributes produce the same ordering. Also though, the scale of each variable is quite different, with many of the attributes having a very small range of values, so they don't really contribute much to the distance calculation when running the algorithm on the raw attributes. I believe that optimizing the playlist using the scaled raw attributes and scaled PCA components produces "better" results because the attributes are all treated equally when scaled, but I don't know about best practices of using PCA and scaled attributes. Something to read up on.

In [96]:
print(f"raw attributes distance: {best_fitness_distances}")
print(f"PCA attributes distance: {best_fitness}")
print(f"Scaled raw attributes distance: {scaled_best_fitness_distances}")
print(f"Scaled PCA attributes distance: {scaled_pca_best_fitness}")


raw attributes distance: 1191.9443322679617
PCA attributes distance: 1174.0389537509218
Scaled raw attributes distance: 58.804940627892016
Scaled PCA attributes distance: 29.913332210036884


In [97]:
(best_fitness_distances - best_fitness) / best_fitness_distances

0.015021992246039404

In [98]:
(scaled_best_fitness_distances - scaled_pca_best_fitness) / scaled_best_fitness_distances

0.4913126024678177