In [45]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import pandas as pd
import re
import numpy as np
import six
import sys
sys.modules['sklearn.externals.six'] = six
import mlrose
from sklearn.decomposition import PCA
import math

# sys.path.append("/Users/alexmcgraw/Documents/Side_Projects/spotify_summarizer/scripts")
# print(sys.path)
# from scripts.spotipy_functions import get_track_info, distance_calc, distance_calc_all_variables

In [46]:
# Function to grab audio features
def audio_features_to_grab(track_uri, spotify_client, list_of_audio_features):
    audio_features_to_return = {}
    all_audio_features = spotify_client.audio_features(track_uri)[0]

    for feature in list_of_audio_features:
        audio_features_to_return[feature] = all_audio_features[feature]

    return audio_features_to_return

In [47]:
# Function to get track info, including the audio features
def get_track_info(track_dict:dict, spotify_client, list_of_audio_features=['danceability','energy','key','loudness','mode','speechiness','acousticness','instrumentalness','liveness','valence','tempo','time_signature']):
    d = {}
    
    # track_info = track_dict['track'] 
    track_uri = track_dict['uri']
    track_name = track_dict['name']
    track_duration = track_dict['duration_ms']
    track_popularity = track_dict['popularity']
    track_is_explicit = track_dict['explicit']
    
    first_artist_info = track_dict['artists'][0]
    first_artist_info_name = first_artist_info['name']

    d['uri'] = track_dict['uri']
    d['first_artist_name'] = first_artist_info['name']
    d['track_name'] = track_dict['name']
    d['track_duration_ms'] = track_dict['duration_ms']
    d['track_popularity'] = track_dict['popularity']
    d['is_track_explicit'] = track_dict['explicit']


    track_audio_features = audio_features_to_grab(track_uri=track_uri, spotify_client=spotify_client, list_of_audio_features=list_of_audio_features)
    
    d.update(track_audio_features)

    return d

In [48]:
# Function used to calculate the distance between two points
def distance_calc(point_1: list, point_2: list):
    x_difference = (point_2[0] - point_1[0]) ** 2
    y_difference = (point_2[1] - point_1[1]) ** 2
    final = math.sqrt(x_difference + y_difference)

    return round(final, 0)

In [49]:
# Function to get distance of all attributes between two songs
def distance_calc_all_variables(row1, row2, attributes:list):
    num_of_attributes = len(attributes)
    added_attr_differences = 0

    for i in range(num_of_attributes):
        attr = attributes[i]
        attr_difference = (row2[attr] - row1[attr]) ** 2
        added_attr_differences += attr_difference

    final = math.sqrt(added_attr_differences)

    return final

In [50]:
with open('../hide/client_id.txt') as f:
    client_id = f.read()

with open('../hide/client_password.txt') as f:
    client_password = f.read()

In [51]:
#Authentication - without user
client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_password)
sp = spotipy.Spotify(client_credentials_manager = client_credentials_manager)

In [52]:
playlist_link = "https://open.spotify.com/playlist/37i9dQZEVXbNG2KDcFcKOF?si=1333723a6eff4b7f"
playlist_uri = playlist_link.split("/")[-1].split("?")[0]
track_uris = [x["track"]["uri"] for x in sp.playlist_tracks(playlist_uri)["items"]]

In [53]:
sp.playlist_items(playlist_uri)['items'][0]['track'].keys()

dict_keys(['album', 'artists', 'available_markets', 'disc_number', 'duration_ms', 'episode', 'explicit', 'external_ids', 'external_urls', 'href', 'id', 'is_local', 'name', 'popularity', 'preview_url', 'track', 'track_number', 'type', 'uri'])

In [54]:
playlist_dict = {}

for track in sp.playlist_items(playlist_uri)['items']:
    track_dict = track['track']
    track_info = get_track_info(track_dict=track_dict, spotify_client=sp)

    
    dict_key_str = track_dict['name']

    playlist_dict[dict_key_str] = track_info
    

In [55]:
playlist_df = pd.DataFrame(data=playlist_dict).T

In [56]:
track_rating_list = [x for x in range(51)][1:]
playlist_df['top_hit_placement'] = track_rating_list.copy()

In [57]:
playlist_df.head()

Unnamed: 0,uri,first_artist_name,track_name,track_duration_ms,track_popularity,is_track_explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,top_hit_placement
Flowers,spotify:track:4DHcnVTT87F0zZhRPYmZ3B,Miley Cyrus,Flowers,200600,89,False,0.707,0.681,0,-4.325,1,0.0668,0.0632,5e-06,0.0322,0.646,117.999,4,1
TQG,spotify:track:0DWdj2oZMBFSzRsi2Cvfzf,KAROL G,TQG,197933,97,True,0.72,0.63,4,-3.547,0,0.277,0.673,0.0,0.0936,0.607,179.974,4,2
Kill Bill,spotify:track:1Qrg8KqiBpW07V7PNxwwwL,SZA,Kill Bill,153946,95,False,0.644,0.735,8,-5.747,1,0.0391,0.0521,0.144,0.161,0.418,88.98,4,3
Boy's a liar Pt. 2,spotify:track:6AQbmUe0Qwf5PZnt4HmTXv,PinkPantheress,Boy's a liar Pt. 2,131013,97,False,0.696,0.809,5,-8.254,1,0.05,0.252,0.000128,0.248,0.857,132.962,4,4
BESO,spotify:track:609E1JCInJncactoMmkDon,ROSALÍA,BESO,194543,91,False,0.768,0.644,5,-6.671,0,0.136,0.736,0.000837,0.173,0.53,95.05,4,5


## Optimizing the playlist order using 2 Principal Components and the traveling salesmen optimization

In [58]:
top_hits_attributes_df = playlist_df[['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'time_signature']]

In [59]:
top_hits_attributes_df.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
Flowers,0.707,0.681,0,-4.325,1,0.0668,0.0632,5e-06,0.0322,0.646,117.999,4
TQG,0.72,0.63,4,-3.547,0,0.277,0.673,0.0,0.0936,0.607,179.974,4
Kill Bill,0.644,0.735,8,-5.747,1,0.0391,0.0521,0.144,0.161,0.418,88.98,4
Boy's a liar Pt. 2,0.696,0.809,5,-8.254,1,0.05,0.252,0.000128,0.248,0.857,132.962,4
BESO,0.768,0.644,5,-6.671,0,0.136,0.736,0.000837,0.173,0.53,95.05,4


In [60]:
pca = PCA(n_components=2)
pca.fit(top_hits_attributes_df)
attributes_pca = pca.transform(top_hits_attributes_df)

In [61]:
pca_coordinates = pd.DataFrame(attributes_pca).rename(columns={0:'x_coordinate', 1:'y_coordinate'})

In [62]:
top_hits_attributes_df[['x_coordinate', 'y_coordinate']] = pca_coordinates[['x_coordinate', 'y_coordinate']].values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_hits_attributes_df[['x_coordinate', 'y_coordinate']] = pca_coordinates[['x_coordinate', 'y_coordinate']].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_hits_attributes_df[['x_coordinate', 'y_coordinate']] = pca_coordinates[['x_coordinate', 'y_coordinate']].values


In [63]:
top_hits_attributes_df.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,x_coordinate,y_coordinate
Flowers,0.707,0.681,0,-4.325,1,0.0668,0.0632,5e-06,0.0322,0.646,117.999,4,-5.071431,-4.938246
TQG,0.72,0.63,4,-3.547,0,0.277,0.673,0.0,0.0936,0.607,179.974,4,56.971198,-2.107521
Kill Bill,0.644,0.735,8,-5.747,1,0.0391,0.0521,0.144,0.161,0.418,88.98,4,-33.948639,3.66177
Boy's a liar Pt. 2,0.696,0.809,5,-8.254,1,0.05,0.252,0.000128,0.248,0.857,132.962,4,9.95321,0.14538
BESO,0.768,0.644,5,-6.671,0,0.136,0.736,0.000837,0.173,0.53,95.05,4,-27.939964,0.671787


In [64]:
distances_df = pd.DataFrame()

Calculating the distances between every song

#### distances_df doesn't end up being used in mlrose

In [65]:
for i in range(len(top_hits_attributes_df)):
    i_coordinates = list(top_hits_attributes_df.iloc[i][['x_coordinate', 'y_coordinate']])

    for j in range(len(top_hits_attributes_df)):
        j_coordinates = list(top_hits_attributes_df.iloc[j][['x_coordinate', 'y_coordinate']])

        if i == j:
            distances_df.loc[i,j] = 0
        else:
            distances_df.loc[i,j] = distance_calc(i_coordinates, j_coordinates)

In [66]:
coordinates_list = [tuple(x) for x in top_hits_attributes_df[['x_coordinate', 'y_coordinate']].values]

In [67]:
# Defining fitness class
fitness_coordinates = mlrose.TravellingSales(coords = coordinates_list)

In [68]:
# Defining optimization problem
problem_fit = mlrose.TSPOpt(length=len(coordinates_list), fitness_fn=fitness_coordinates, maximize=False)

In [69]:
# Solving using the genetic algorithm
best_state, best_fitness = mlrose.genetic_alg(problem=problem_fit, 
                                              mutation_prob=0.2,
                                              max_attempts=100,
                                              random_state=2)

Below is the optimal ordering of songs to minimize distance between all of them in a row. This is the playlist "optimized" to go from one song to the next that sounds the most like it, and so on.

In [70]:
best_state

array([ 9,  6, 39,  7, 45, 42, 32, 31,  1, 23, 18, 13, 20, 41, 19,  2, 26,
       38, 47, 17, 36, 37,  5, 33, 16, 40, 34, 25, 43, 29, 27, 44, 28, 10,
       22,  4, 15, 46, 30, 14, 35, 21,  0, 24, 49,  3, 12,  8, 48, 11])

This is now the optimal, reordered playlist.

In [71]:
playlist_optimal_order = playlist_df.reset_index().drop(columns='index').reindex(best_state)

In [72]:
playlist_optimal_order.head()

Unnamed: 0,uri,first_artist_name,track_name,track_duration_ms,track_popularity,is_track_explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,top_hit_placement
9,spotify:track:2UW7JaomAMuX9pZrjVpHAU,Yng Lvcas,La Bebe - Remix,234352,90,True,0.812,0.479,2,-5.678,0,0.333,0.213,1e-06,0.0756,0.559,169.922,4,10
6,spotify:track:4Dvkj6JhhA12EX05fT7y2e,Harry Styles,As It Was,167303,92,False,0.52,0.731,6,-5.338,0,0.0557,0.342,0.00101,0.311,0.662,173.93,4,7
39,spotify:track:4fYte8ZvTK14NEhAOZocBi,OneRepublic,I Ain't Worried,148120,52,False,0.697,0.802,0,-6.412,1,0.0456,0.0988,3.6e-05,0.0614,0.822,139.951,4,40
7,spotify:track:4nrPB8O7Y7wsOCJdgXkthe,Bizarrap,"Shakira: Bzrp Music Sessions, Vol. 53",214945,97,False,0.778,0.632,2,-5.6,0,0.0493,0.274,0.0,0.0915,0.498,122.104,4,8
45,spotify:track:4iZ4pt7kvcaH6Yo8UoZ4s2,SZA,Snooze,201800,91,True,0.559,0.551,5,-7.231,1,0.132,0.141,0.0,0.11,0.392,143.008,4,46


## Optimizing the playlist using all attributes and traveling salesman optimization

In [91]:
distances_list_all_variables = []

This is calculating the distance of all attributes for every combination of song, not permutation, and appending to a list. 
EX: (0, 5, 45) is the distance between songs indexed 0 and 5 in the list, and a distance of 50. (5, 0, 50) won't be in this list  though, because the combination of 0 and 5 already exists.

### Not sure why it isn't working correctly now. Worked at first once removed the coordinates, but stopped working now.

In [92]:
top_hits_attributes_no_coordinates = top_hits_attributes_df.drop(columns=['x_coordinate', 'y_coordinate'])

In [93]:
for i in range(len(top_hits_attributes_df)):

    for j in range(len(top_hits_attributes_df)):

        if i >= j:
            pass
        else:
            distance = distance_calc_all_variables(top_hits_attributes_df.iloc[i],
                                                   top_hits_attributes_df.iloc[j],
                                                   list(top_hits_attributes_no_coordinates.columns))

            song1_song2_distance = (i, j, distance)

            distances_list_all_variables.append(song1_song2_distance)

Attributes being used for distance calculation: ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature']
Attributes being used for distance calculation: ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature']
Attributes being used for distance calculation: ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature']
Attributes being used for distance calculation: ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature']
Attributes being used for distance calculation: ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'ti

In [94]:
fitness_dists = mlrose.TravellingSales(distances=distances_list_all_variables)

In [95]:
problem_fit_distances = mlrose.TSPOpt(length=len(top_hits_attributes_no_coordinates),
                                      fitness_fn=fitness_dists,
                                      maximize=False)

This one will take much longer than the PCA optimization

In [96]:
# Solving using genetic algorithm, and distances, not coordinates
best_state_distances, best_fitness_distances = mlrose.genetic_alg(problem=problem_fit_distances,
                                                                  mutation_prob=0.2,
                                                                  max_attempts=100,
                                                                  random_state=2)

In [97]:
best_state_distances == best_state

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True])

In [98]:
best_fitness_distances

1191.9443322679617

In [99]:
best_fitness

1174.0389537509218

Reordering the playlist to the optimal ordering, as determined by the traveling salesman algorithm with the original attributes.

In [102]:
playlist_optimal_order_original_attrs = playlist_df.reset_index().drop(columns='index').reindex(best_state_distances)
playlist_optimal_order_original_attrs.head()

Unnamed: 0,uri,first_artist_name,track_name,track_duration_ms,track_popularity,is_track_explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,top_hit_placement
9,spotify:track:2UW7JaomAMuX9pZrjVpHAU,Yng Lvcas,La Bebe - Remix,234352,90,True,0.812,0.479,2,-5.678,0,0.333,0.213,1e-06,0.0756,0.559,169.922,4,10
6,spotify:track:4Dvkj6JhhA12EX05fT7y2e,Harry Styles,As It Was,167303,92,False,0.52,0.731,6,-5.338,0,0.0557,0.342,0.00101,0.311,0.662,173.93,4,7
39,spotify:track:4fYte8ZvTK14NEhAOZocBi,OneRepublic,I Ain't Worried,148120,52,False,0.697,0.802,0,-6.412,1,0.0456,0.0988,3.6e-05,0.0614,0.822,139.951,4,40
7,spotify:track:4nrPB8O7Y7wsOCJdgXkthe,Bizarrap,"Shakira: Bzrp Music Sessions, Vol. 53",214945,97,False,0.778,0.632,2,-5.6,0,0.0493,0.274,0.0,0.0915,0.498,122.104,4,8
45,spotify:track:4iZ4pt7kvcaH6Yo8UoZ4s2,SZA,Snooze,201800,91,True,0.559,0.551,5,-7.231,1,0.132,0.141,0.0,0.11,0.392,143.008,4,46
