In [1]:
# Import the required libraries and dependencies
import pandas as pd
import hvplot.pandas
from pathlib import Path
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# The function for calculate and convert playlist with tracks to playlist mean 
def  playlist_mean(df):   
    df_avg=pd.DataFrame()
    df_avg.at[0,'danceability']=df['danceability'].mean()
    df_avg.at[0,'energy']=df['energy'].mean()
    df_avg.at[0,'loudness']=df['loudness'].mean()
    df_avg.at[0,'speechiness']=df['speechiness'].mean()
    df_avg.at[0,'acousticness']=df['acousticness'].mean()
    df_avg.at[0,'instrumentalness']=df['instrumentalness'].mean()
    df_avg.at[0,'liveness']=df['liveness'].mean()
    df_avg.at[0,'valence']=df['valence'].mean()
    df_avg.at[0,'tempo']=df['tempo'].mean()
    df_avg.at[0,'duration_ms']=df['duration_ms'].mean()
    df_avg.at[0,'key']=df['key'].mode().iloc[0]
    df_avg.at[0,'mode']=df['mode'].mode().iloc[0]
    df_avg.at[0,'time_signature']=df['time_signature'].mode().iloc[0]


    df_avg=df_avg.astype({"key":'int',"mode":'int',"time_signature":'int'})     # Does not remove the decimal ".0" even if it is an integer!
    df_avg['key']=df_avg['key'].astype(str)     # adding this line seems to convert "key", "mode", and "time_signature" to objects...


    df_avg=df_avg.assign(mode_minor=0,mode_major=0,\
                key_none=0,key_0=0,key_1=0,key_2=0,key_3=0,key_4=0,key_5=0,key_6=0,key_7=0,key_8=0,key_9=0,key_10=0,key_11=0,\
                time_signature_0=0,time_signature_1=0,time_signature_2=0,time_signature_3=0,time_signature_4=0,time_signature_5=0,time_signature_6=0,time_signature_7=0,)



    if df_avg.iloc[0]['key']==-1:
        df_avg.at[0,'key_none']=1
    else:
        col_name='key_'+str(df_avg.iloc[0]['key']) 
        df_avg.at[0,col_name]=1

    col_name='time_signature_'+str(df_avg.iloc[0]['time_signature']) 
    df_avg.at[0,col_name]=1

    if df_avg.iloc[0]['mode']==0:
        df_avg.at[0,'mode_minor']=1
    else:
        df_avg.at[0,'mode_major']=1   

    return df_avg

## Creating Song Vectors

In [3]:
# Read `tracks.csv` file
tracks_df = pd.read_csv(Path("Resources/tracks.csv"))


In [4]:
# Check out the df
tracks_df

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,...,mode_minor,mode_major,time_signature_0,time_signature_1,time_signature_2,time_signature_3,time_signature_4,time_signature_5,time_signature_6,time_signature_7
0,0.418,0.81600,1,-7.200,1,0.1240,0.00371,0.001640,0.2640,0.8340,...,0,1,0,0,0,0,1,0,0,0
1,0.366,0.03850,1,-29.403,1,0.1600,0.57500,0.000016,0.1050,0.0398,...,0,1,0,0,0,0,1,0,0,0
2,0.649,0.60500,7,-10.186,1,0.0330,0.00945,0.842000,0.3440,0.5310,...,0,1,0,0,0,0,1,0,0,0
3,0.528,0.00676,5,-30.605,1,0.0593,0.99500,0.938000,0.0651,0.0381,...,0,1,0,0,0,1,0,0,0,0
4,0.835,0.56400,0,-11.545,1,0.0673,0.05490,0.000000,0.0879,0.9640,...,0,1,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2214987,0.866,0.48800,1,-14.534,1,0.1010,0.00238,0.000104,0.1290,0.4780,...,0,1,0,0,0,0,1,0,0,0
2214988,0.359,0.38700,1,-11.947,1,0.0325,0.76900,0.000003,0.3290,0.2910,...,0,1,0,0,0,0,1,0,0,0
2214989,0.752,0.54200,8,-7.178,0,0.0653,0.42900,0.000000,0.1040,0.9080,...,1,0,0,0,0,0,1,0,0,0
2214990,0.772,0.41200,4,-11.682,1,0.0305,0.37800,0.000199,0.1840,0.6050,...,0,1,0,0,0,0,1,0,0,0


In [5]:
# Check the tracks columns
tracks_df.columns

Index(['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'id', 'track_uri', 'track_href', 'analysis_url', 'duration_ms',
       'time_signature', 'key_none', 'key_0', 'key_1', 'key_2', 'key_3',
       'key_4', 'key_5', 'key_6', 'key_7', 'key_8', 'key_9', 'key_10',
       'key_11', 'mode_minor', 'mode_major', 'time_signature_0',
       'time_signature_1', 'time_signature_2', 'time_signature_3',
       'time_signature_4', 'time_signature_5', 'time_signature_6',
       'time_signature_7'],
      dtype='object')

In [6]:
# Keep the wanted columns for vectorization
tracks_features_df = tracks_df[['track_uri', 'danceability', 'energy', 'loudness', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'duration_ms', 'key_none', 'key_0', 'key_1', 'key_2', 'key_3',
       'key_4', 'key_5', 'key_6', 'key_7', 'key_8', 'key_9', 'key_10',
       'key_11', 'mode_minor', 'mode_major', 'time_signature_0',
       'time_signature_1', 'time_signature_2', 'time_signature_3',
       'time_signature_4', 'time_signature_5', 'time_signature_6',
       'time_signature_7']]
# Check out df
tracks_features_df

Unnamed: 0,track_uri,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,...,mode_minor,mode_major,time_signature_0,time_signature_1,time_signature_2,time_signature_3,time_signature_4,time_signature_5,time_signature_6,time_signature_7
0,spotify:track:37u0UvJxXK8EGWOjbsS2Em,0.418,0.81600,-7.200,0.1240,0.00371,0.001640,0.2640,0.8340,184.122,...,0,1,0,0,0,0,1,0,0,0
1,spotify:track:1qV82Jq1kXV8AS1qimr1JS,0.366,0.03850,-29.403,0.1600,0.57500,0.000016,0.1050,0.0398,168.049,...,0,1,0,0,0,0,1,0,0,0
2,spotify:track:4R0J5oREX8vuljW1OYg8nU,0.649,0.60500,-10.186,0.0330,0.00945,0.842000,0.3440,0.5310,144.155,...,0,1,0,0,0,0,1,0,0,0
3,spotify:track:5drWUoTthqxB0tEb3lLogI,0.528,0.00676,-30.605,0.0593,0.99500,0.938000,0.0651,0.0381,69.314,...,0,1,0,0,0,1,0,0,0,0
4,spotify:track:08YAU8YEzjXbPMBhuwwNjr,0.835,0.56400,-11.545,0.0673,0.05490,0.000000,0.0879,0.9640,109.999,...,0,1,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2214987,spotify:track:2qTuN39E6Js6TQ734h8kqZ,0.866,0.48800,-14.534,0.1010,0.00238,0.000104,0.1290,0.4780,101.854,...,0,1,0,0,0,0,1,0,0,0
2214988,spotify:track:633yVO5nYOnszN6b9yR1Eu,0.359,0.38700,-11.947,0.0325,0.76900,0.000003,0.3290,0.2910,88.005,...,0,1,0,0,0,0,1,0,0,0
2214989,spotify:track:6haw7Ma9ebcubusijZ69BO,0.752,0.54200,-7.178,0.0653,0.42900,0.000000,0.1040,0.9080,75.546,...,1,0,0,0,0,0,1,0,0,0
2214990,spotify:track:4Z6wHGFWv1uxPGxuthqQav,0.772,0.41200,-11.682,0.0305,0.37800,0.000199,0.1840,0.6050,108.352,...,0,1,0,0,0,0,1,0,0,0


## Creating Playlist Vector
* Split the data to feed and validate

In [7]:
# Process the input playlist data
# Read the csv file into a pandas DataFrame
input_playlist_df = pd.read_csv(Path("Resources/tracks_features_lisa2_techno_features.csv"))

# Review the DataFrame
input_playlist_df

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,...,key_10,key_11,time_signature_0,time_signature_1,time_signature_2,time_signature_3,time_signature_4,time_signature_5,time_signature_6,time_signature_7
0,0.635,0.681,2,-8.729,1,0.1120,0.011100,0.816,0.1040,0.2350,...,0,0,0,0,0,0,1,0,0,0
1,0.595,0.960,10,-7.336,1,0.0675,0.001330,0.926,0.1300,0.0999,...,1,0,0,0,0,1,0,0,0,0
2,0.674,0.669,0,-8.747,0,0.0518,0.033200,0.716,0.1010,0.0834,...,0,0,0,0,0,0,1,0,0,0
3,0.457,0.995,7,-7.801,1,0.0454,0.000051,0.865,0.3540,0.6850,...,0,0,0,0,0,0,1,0,0,0
4,0.731,0.873,11,-7.746,0,0.0518,0.000268,0.851,0.1000,0.6710,...,0,1,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
222,0.824,0.992,1,-6.926,1,0.3280,0.000392,0.855,0.0991,0.0348,...,0,0,0,0,0,0,1,0,0,0
223,0.732,0.823,7,-11.652,1,0.0831,0.002410,0.204,0.0964,0.5900,...,0,0,0,0,0,0,1,0,0,0
224,0.656,0.977,1,-5.185,1,0.0812,0.000209,0.885,0.0917,0.3390,...,0,0,0,0,0,0,1,0,0,0
225,0.677,0.985,7,-5.278,1,0.1150,0.045300,0.868,0.1050,0.0720,...,0,0,0,0,0,0,1,0,0,0


In [8]:
# Check the columns of `input_playlist_df`
input_playlist_df.columns

Index(['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'uri', 'duration_ms', 'time_signature', 'mode_minor', 'mode_major',
       'key_none', 'key_0', 'key_1', 'key_2', 'key_3', 'key_4', 'key_5',
       'key_6', 'key_7', 'key_8', 'key_9', 'key_10', 'key_11',
       'time_signature_0', 'time_signature_1', 'time_signature_2',
       'time_signature_3', 'time_signature_4', 'time_signature_5',
       'time_signature_6', 'time_signature_7'],
      dtype='object')

In [9]:
# Split the `input_playlist_df` into `input_playlist_feed_df` and `input_playlist_validate_df`
# splitting dataframe in a particular size
input_playlist_feed_df = input_playlist_df.sample(frac=0.8,random_state=200)
input_playlist_feed_df.reset_index()


Unnamed: 0,index,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,...,key_10,key_11,time_signature_0,time_signature_1,time_signature_2,time_signature_3,time_signature_4,time_signature_5,time_signature_6,time_signature_7
0,39,0.599,0.928,0,-7.291,1,0.0426,0.000164,0.821,0.297,...,0,0,0,0,0,0,1,0,0,0
1,145,0.773,0.973,3,-3.801,0,0.0903,0.009020,0.787,0.278,...,0,0,0,0,0,0,1,0,0,0
2,188,0.652,0.950,1,-8.164,0,0.1770,0.058100,0.847,0.111,...,0,0,0,0,0,0,1,0,0,0
3,112,0.680,0.999,9,-5.167,1,0.0663,0.038300,0.767,0.281,...,0,0,0,0,0,0,1,0,0,0
4,217,0.744,0.978,0,-5.741,1,0.0547,0.073400,0.915,0.110,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
177,90,0.468,0.975,7,-8.029,1,0.0985,0.000198,0.790,0.391,...,0,0,0,0,0,0,1,0,0,0
178,216,0.583,0.986,8,-5.826,1,0.0745,0.000494,0.839,0.306,...,0,0,0,0,0,0,1,0,0,0
179,161,0.681,0.932,10,-6.652,0,0.0709,0.015600,0.837,0.094,...,1,0,0,0,0,0,1,0,0,0
180,3,0.457,0.995,7,-7.801,1,0.0454,0.000051,0.865,0.354,...,0,0,0,0,0,0,1,0,0,0


In [10]:
# Get the rest of the df as `input_playlist_validate_df`
input_playlist_validate_df = input_playlist_df[~input_playlist_df['uri'].isin(input_playlist_feed_df['uri'].values)]
input_playlist_validate_df.reset_index()

Unnamed: 0,index,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,...,key_10,key_11,time_signature_0,time_signature_1,time_signature_2,time_signature_3,time_signature_4,time_signature_5,time_signature_6,time_signature_7
0,1,0.595,0.96,10,-7.336,1,0.0675,0.00133,0.926,0.13,...,1,0,0,0,0,1,0,0,0,0
1,7,0.449,0.993,7,-7.256,1,0.0515,9.7e-05,0.895,0.15,...,0,0,0,0,0,0,1,0,0,0
2,14,0.479,0.992,9,-5.71,0,0.0683,0.00686,0.952,0.0759,...,0,0,0,0,0,0,1,0,0,0
3,16,0.536,0.915,3,-7.767,0,0.298,0.131,0.624,0.0765,...,0,0,0,0,0,0,1,0,0,0
4,20,0.553,0.931,5,-8.196,0,0.0427,0.00204,0.413,0.238,...,0,0,0,0,0,0,1,0,0,0
5,23,0.627,0.998,1,-5.066,1,0.0922,0.00996,0.8,0.0578,...,0,0,0,0,0,0,1,0,0,0
6,26,0.636,0.994,11,-8.834,0,0.0623,0.00397,0.856,0.11,...,0,1,0,0,0,0,1,0,0,0
7,35,0.642,0.987,5,-7.145,1,0.0878,0.0936,0.799,0.316,...,0,0,0,0,0,1,0,0,0,0
8,42,0.66,0.904,5,-7.648,0,0.0948,0.0106,0.358,0.0691,...,0,0,0,0,0,0,1,0,0,0
9,51,0.582,0.969,1,-7.426,1,0.0438,0.00492,0.871,0.106,...,0,0,0,0,0,0,1,0,0,0


In [11]:
# Check the columns of `input_playlist_feed_df`
input_playlist_feed_df.columns

Index(['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'uri', 'duration_ms', 'time_signature', 'mode_minor', 'mode_major',
       'key_none', 'key_0', 'key_1', 'key_2', 'key_3', 'key_4', 'key_5',
       'key_6', 'key_7', 'key_8', 'key_9', 'key_10', 'key_11',
       'time_signature_0', 'time_signature_1', 'time_signature_2',
       'time_signature_3', 'time_signature_4', 'time_signature_5',
       'time_signature_6', 'time_signature_7'],
      dtype='object')

In [12]:
# Aggregate the mean of `input_playlist_feed_df` 
input_playlist_feed_mean_df = playlist_mean(input_playlist_feed_df.drop(columns=['uri']))
input_playlist_feed_mean_df

Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,...,key_10,key_11,time_signature_0,time_signature_1,time_signature_2,time_signature_3,time_signature_4,time_signature_5,time_signature_6,time_signature_7
0,0.628121,0.937555,-6.441813,0.112445,0.027696,0.66253,0.202684,0.290204,151.093819,306780.813187,...,0,0,0,0,0,0,1,0,0,0


In [13]:
# Keep the wanted features and reorder the columns
input_playlist_feed_mean_nokey_df = input_playlist_feed_mean_df[['danceability', 'energy', 'loudness', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
        'duration_ms', 'key_none', 'key_0', 'key_1', 'key_2', 'key_3', 'key_4', 'key_5',
       'key_6', 'key_7', 'key_8', 'key_9', 'key_10', 'key_11', 'mode_minor', 'mode_major',
       'time_signature_0', 'time_signature_1', 'time_signature_2',
       'time_signature_3', 'time_signature_4', 'time_signature_5',
       'time_signature_6', 'time_signature_7']]
# Check out the df
input_playlist_feed_mean_nokey_df

Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,...,mode_minor,mode_major,time_signature_0,time_signature_1,time_signature_2,time_signature_3,time_signature_4,time_signature_5,time_signature_6,time_signature_7
0,0.628121,0.937555,-6.441813,0.112445,0.027696,0.66253,0.202684,0.290204,151.093819,306780.813187,...,0,1,0,0,0,0,1,0,0,0


## Preparing for data for comparing similarity
* Compare aggregated input playlist features to song features that are not in the playlist `new_songs_feature`

## Scaling

In [14]:
# Scale the data
tracks_scaler = StandardScaler()
tracks_scaler.fit(tracks_features_df.drop('track_uri',axis=1))

In [15]:
# Scale the data
scaled_tracks_features = tracks_scaler.transform(tracks_features_df.drop('track_uri',axis=1))
scaled_tracks_features_df = pd.DataFrame(scaled_tracks_features, columns=tracks_features_df.drop('track_uri',axis=1).columns)

# Check out the df
scaled_tracks_features_df


Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,...,mode_minor,mode_major,time_signature_0,time_signature_1,time_signature_2,time_signature_3,time_signature_4,time_signature_5,time_signature_6,time_signature_7
0,-0.717590,0.868764,0.437295,0.301916,-0.990444,-0.627309,0.289102,1.327155,2.143501,-0.085811,...,-0.725791,0.725791,-0.03769,-0.103107,0.0,-0.335807,0.389518,-0.137138,0.0,0.0
1,-0.999385,-2.053931,-3.508668,0.614474,0.621576,-0.631957,-0.548502,-1.614593,1.606274,0.429796,...,-0.725791,0.725791,-0.03769,-0.103107,0.0,-0.335807,0.389518,-0.137138,0.0,0.0
2,0.534229,0.075596,-0.093383,-0.488160,-0.974247,1.777799,0.710538,0.204832,0.807637,-0.086604,...,-0.725791,0.725791,-0.03769,-0.103107,0.0,-0.335807,0.389518,-0.137138,0.0,0.0
3,-0.121486,-2.173244,-3.722290,-0.259819,1.806698,2.052551,-0.758693,-1.620889,-1.693863,-0.083086,...,-0.725791,0.725791,-0.03769,-0.103107,0.0,2.977897,-2.567278,-0.137138,0.0,0.0
4,1.542188,-0.078527,-0.334907,-0.190362,-0.846000,-0.632003,-0.638584,1.808681,-0.334000,0.017834,...,-0.725791,0.725791,-0.03769,-0.103107,0.0,-0.335807,0.389518,-0.137138,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2214987,1.710181,-0.364218,-0.866119,0.102227,-0.994197,-0.631705,-0.422071,0.008517,-0.606240,-0.088797,...,-0.725791,0.725791,-0.03769,-0.103107,0.0,-0.335807,0.389518,-0.137138,0.0,0.0
2214988,-1.037319,-0.743887,-0.406352,-0.492501,1.168990,-0.631995,0.631518,-0.684138,-1.069131,0.926736,...,-0.725791,0.725791,-0.03769,-0.103107,0.0,-0.335807,0.389518,-0.137138,0.0,0.0
2214989,1.092400,-0.161227,0.441205,-0.207726,0.209605,-0.632003,-0.553770,1.601254,-1.485563,-0.017122,...,1.377808,-1.377808,-0.03769,-0.103107,0.0,-0.335807,0.389518,-0.137138,0.0,0.0
2214990,1.200783,-0.649909,-0.359255,-0.509866,0.065697,-0.631433,-0.132334,0.478930,-0.389049,-0.193126,...,-0.725791,0.725791,-0.03769,-0.103107,0.0,-0.335807,0.389518,-0.137138,0.0,0.0


In [16]:
# Scale the `input_playlist_feed_mean` with the `tracks_scaler`
scaled_input_playlist_feed_mean = tracks_scaler.transform(input_playlist_feed_mean_nokey_df)
scaled_input_playlist_feed_mean_df = pd.DataFrame(scaled_input_playlist_feed_mean, columns=input_playlist_feed_mean_nokey_df.columns)

# Check out the df
scaled_input_playlist_feed_mean_df


Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,...,mode_minor,mode_major,time_signature_0,time_signature_1,time_signature_2,time_signature_3,time_signature_4,time_signature_5,time_signature_6,time_signature_7
0,0.421083,1.325701,0.572042,0.201594,-0.922763,1.264157,-0.033908,-0.687087,1.039561,0.379453,...,-0.725791,0.725791,-0.03769,-0.103107,0.0,-0.335807,0.389518,-0.137138,0.0,0.0


## Calculating Similarity

In [17]:
# Using cosine similarity
tracks_features_df['similarity'] = cosine_similarity(scaled_tracks_features_df.values, scaled_input_playlist_feed_mean_df.values)
# Get rid of the tracks that already exist in input playlist
recommend_tracks_df = tracks_features_df[~tracks_features_df['track_uri'].isin(input_playlist_df['uri'].values)]
recommend_tracks_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tracks_features_df['similarity'] = cosine_similarity(scaled_tracks_features_df.values, scaled_input_playlist_feed_mean_df.values)


Unnamed: 0,track_uri,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,...,mode_major,time_signature_0,time_signature_1,time_signature_2,time_signature_3,time_signature_4,time_signature_5,time_signature_6,time_signature_7,similarity
0,spotify:track:37u0UvJxXK8EGWOjbsS2Em,0.418,0.81600,-7.200,0.1240,0.00371,0.001640,0.2640,0.8340,184.122,...,1,0,0,0,0,1,0,0,0,0.148095
1,spotify:track:1qV82Jq1kXV8AS1qimr1JS,0.366,0.03850,-29.403,0.1600,0.57500,0.000016,0.1050,0.0398,168.049,...,1,0,0,0,0,1,0,0,0,-0.133138
2,spotify:track:4R0J5oREX8vuljW1OYg8nU,0.649,0.60500,-10.186,0.0330,0.00945,0.842000,0.3440,0.5310,144.155,...,1,0,0,0,0,1,0,0,0,0.859433
3,spotify:track:5drWUoTthqxB0tEb3lLogI,0.528,0.00676,-30.605,0.0593,0.99500,0.938000,0.0651,0.0381,69.314,...,1,0,0,0,1,0,0,0,0,-0.222104
4,spotify:track:08YAU8YEzjXbPMBhuwwNjr,0.835,0.56400,-11.545,0.0673,0.05490,0.000000,0.0879,0.9640,109.999,...,1,0,0,0,0,1,0,0,0,-0.066515
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2214987,spotify:track:2qTuN39E6Js6TQ734h8kqZ,0.866,0.48800,-14.534,0.1010,0.00238,0.000104,0.1290,0.4780,101.854,...,1,0,0,0,0,1,0,0,0,-0.032875
2214988,spotify:track:633yVO5nYOnszN6b9yR1Eu,0.359,0.38700,-11.947,0.0325,0.76900,0.000003,0.3290,0.2910,88.005,...,1,0,0,0,0,1,0,0,0,-0.213124
2214989,spotify:track:6haw7Ma9ebcubusijZ69BO,0.752,0.54200,-7.178,0.0653,0.42900,0.000000,0.1040,0.9080,75.546,...,0,0,0,0,0,1,0,0,0,-0.285021
2214990,spotify:track:4Z6wHGFWv1uxPGxuthqQav,0.772,0.41200,-11.682,0.0305,0.37800,0.000199,0.1840,0.6050,108.352,...,1,0,0,0,0,1,0,0,0,-0.125773


In [18]:
# Get the same number of songs as the validate_df dataset
tracks_features_df_top = recommend_tracks_df.sort_values('similarity', ascending=False).head(len(input_playlist_validate_df))
tracks_features_df_top.head(45)

Unnamed: 0,track_uri,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,...,mode_major,time_signature_0,time_signature_1,time_signature_2,time_signature_3,time_signature_4,time_signature_5,time_signature_6,time_signature_7,similarity
489376,spotify:track:1HDKet4smP66Q5rL8rXgaC,0.596,0.918,-6.827,0.0833,0.000218,0.654,0.209,0.383,139.933,...,1,0,0,0,0,1,0,0,0,0.988483
825931,spotify:track:7fxy1ikXBnPGbL1D2rRGNR,0.596,0.918,-6.827,0.0833,0.000218,0.654,0.209,0.383,139.933,...,1,0,0,0,0,1,0,0,0,0.988483
503106,spotify:track:6s7mGQj7DWONZxgXKcTyoe,0.632,0.893,-7.603,0.0654,0.0509,0.602,0.144,0.269,144.976,...,1,0,0,0,0,1,0,0,0,0.987067
272293,spotify:track:7HiIcy7Ed9hZIbRsFgnnFE,0.619,0.962,-7.667,0.114,0.00114,0.762,0.158,0.275,139.996,...,1,0,0,0,0,1,0,0,0,0.986511
828999,spotify:track:2ojtvFlHln9bYvk9qTIoHa,0.615,0.911,-4.134,0.0931,0.00271,0.87,0.193,0.216,160.032,...,1,0,0,0,0,1,0,0,0,0.982917
354184,spotify:track:7x0wcgMxVmoGLlvhw9P15j,0.654,0.927,-5.378,0.092,1.7e-05,0.901,0.234,0.243,166.094,...,1,0,0,0,0,1,0,0,0,0.982369
96242,spotify:track:6A9tv9syCliTuOyCuW0HWz,0.624,0.948,-6.324,0.0878,0.00272,0.766,0.306,0.275,140.032,...,1,0,0,0,0,1,0,0,0,0.982035
1825102,spotify:track:2D86FuBgs3yx6eLbgvoPam,0.654,0.976,-4.84,0.157,0.0139,0.665,0.0957,0.183,149.975,...,1,0,0,0,0,1,0,0,0,0.980148
143472,spotify:track:3krehk8mqui3G2AeotY0HV,0.641,0.95,-6.522,0.0606,5.8e-05,0.788,0.28,0.244,140.047,...,1,0,0,0,0,1,0,0,0,0.98004
1664938,spotify:track:45jzrVvCD5gwdYfaVbTrNB,0.629,0.997,-4.593,0.168,0.00728,0.669,0.223,0.379,150.05,...,1,0,0,0,0,1,0,0,0,0.979933


## Validation

In [19]:
# Check out `input_playlist_validate_df` df
input_playlist_validate_df.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,...,key_10,key_11,time_signature_0,time_signature_1,time_signature_2,time_signature_3,time_signature_4,time_signature_5,time_signature_6,time_signature_7
1,0.595,0.96,10,-7.336,1,0.0675,0.00133,0.926,0.13,0.0999,...,1,0,0,0,0,1,0,0,0,0
7,0.449,0.993,7,-7.256,1,0.0515,9.7e-05,0.895,0.15,0.402,...,0,0,0,0,0,0,1,0,0,0
14,0.479,0.992,9,-5.71,0,0.0683,0.00686,0.952,0.0759,0.48,...,0,0,0,0,0,0,1,0,0,0
16,0.536,0.915,3,-7.767,0,0.298,0.131,0.624,0.0765,0.0626,...,0,0,0,0,0,0,1,0,0,0
20,0.553,0.931,5,-8.196,0,0.0427,0.00204,0.413,0.238,0.0397,...,0,0,0,0,0,0,1,0,0,0


In [20]:
# Use function to calculate the `input_playlist_validate_df` mean
input_playlist_mean = playlist_mean(input_playlist_validate_df)
input_playlist_mean

Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,...,key_10,key_11,time_signature_0,time_signature_1,time_signature_2,time_signature_3,time_signature_4,time_signature_5,time_signature_6,time_signature_7
0,0.623222,0.965867,-6.438689,0.094027,0.012971,0.721767,0.225256,0.308336,150.224911,294861.688889,...,0,0,0,0,0,0,1,0,0,0


In [21]:
# Add the 'key','model','time_signature' back from `tracks_df`
reverse_tracks_features_df_top = tracks_features_df_top.merge(tracks_df,on='track_uri', how='inner')
reverse_tracks_features_df_top

Unnamed: 0,track_uri,danceability_x,energy_x,loudness_x,speechiness_x,acousticness_x,instrumentalness_x,liveness_x,valence_x,tempo_x,...,mode_minor_y,mode_major_y,time_signature_0_y,time_signature_1_y,time_signature_2_y,time_signature_3_y,time_signature_4_y,time_signature_5_y,time_signature_6_y,time_signature_7_y
0,spotify:track:1HDKet4smP66Q5rL8rXgaC,0.596,0.918,-6.827,0.0833,0.000218,0.654,0.209,0.383,139.933,...,0,1,0,0,0,0,1,0,0,0
1,spotify:track:7fxy1ikXBnPGbL1D2rRGNR,0.596,0.918,-6.827,0.0833,0.000218,0.654,0.209,0.383,139.933,...,0,1,0,0,0,0,1,0,0,0
2,spotify:track:6s7mGQj7DWONZxgXKcTyoe,0.632,0.893,-7.603,0.0654,0.0509,0.602,0.144,0.269,144.976,...,0,1,0,0,0,0,1,0,0,0
3,spotify:track:7HiIcy7Ed9hZIbRsFgnnFE,0.619,0.962,-7.667,0.114,0.00114,0.762,0.158,0.275,139.996,...,0,1,0,0,0,0,1,0,0,0
4,spotify:track:2ojtvFlHln9bYvk9qTIoHa,0.615,0.911,-4.134,0.0931,0.00271,0.87,0.193,0.216,160.032,...,0,1,0,0,0,0,1,0,0,0
5,spotify:track:7x0wcgMxVmoGLlvhw9P15j,0.654,0.927,-5.378,0.092,1.7e-05,0.901,0.234,0.243,166.094,...,0,1,0,0,0,0,1,0,0,0
6,spotify:track:6A9tv9syCliTuOyCuW0HWz,0.624,0.948,-6.324,0.0878,0.00272,0.766,0.306,0.275,140.032,...,0,1,0,0,0,0,1,0,0,0
7,spotify:track:2D86FuBgs3yx6eLbgvoPam,0.654,0.976,-4.84,0.157,0.0139,0.665,0.0957,0.183,149.975,...,0,1,0,0,0,0,1,0,0,0
8,spotify:track:3krehk8mqui3G2AeotY0HV,0.641,0.95,-6.522,0.0606,5.8e-05,0.788,0.28,0.244,140.047,...,0,1,0,0,0,0,1,0,0,0
9,spotify:track:45jzrVvCD5gwdYfaVbTrNB,0.629,0.997,-4.593,0.168,0.00728,0.669,0.223,0.379,150.05,...,0,1,0,0,0,0,1,0,0,0


In [22]:
# Reverse the columns of `tracks_features_df_top` in order to fit the playlist_mean function
reverse_tracks_features_df_top = reverse_tracks_features_df_top[['track_uri', 'danceability_x', 'energy_x', 'loudness_x',
       'speechiness_x', 'acousticness_x', 'instrumentalness_x', 'liveness_x',
       'valence_x', 'tempo_x', 'duration_ms_x', 'key_none_x', 'key_0_x',
       'key_1_x', 'key_2_x', 'key_3_x', 'key_4_x', 'key_5_x', 'key_6_x',
       'key_7_x', 'key_8_x', 'key_9_x', 'key_10_x', 'key_11_x', 'mode_minor_x',
       'mode_major_x', 'time_signature_0_x', 'time_signature_1_x',
       'time_signature_2_x', 'time_signature_3_x', 'time_signature_4_x',
       'time_signature_5_x', 'time_signature_6_x', 'time_signature_7_x',
       'similarity', 'key', 'mode','time_signature']]
reverse_tracks_features_df_top.columns = ['track_uri', 'danceability', 'energy', 'loudness',
       'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'duration_ms', 'key_none', 'key_0',
       'key_1', 'key_2', 'key_3', 'key_4', 'key_5', 'key_6',
       'key_7', 'key_8', 'key_9', 'key_10', 'key_11', 'mode_minor',
       'mode_major', 'time_signature_0', 'time_signature_1',
       'time_signature_2', 'time_signature_3', 'time_signature_4',
       'time_signature_5', 'time_signature_6', 'time_signature_7',
       'similarity', 'key', 'mode','time_signature']

In [23]:
# Calculate the mean of the recommend playlist with playlist_mean function
recommend_playlist_mean = playlist_mean(reverse_tracks_features_df_top)
recommend_playlist_mean

Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,...,key_10,key_11,time_signature_0,time_signature_1,time_signature_2,time_signature_3,time_signature_4,time_signature_5,time_signature_6,time_signature_7
0,0.630489,0.926756,-5.847956,0.095556,0.007737,0.741422,0.177682,0.280222,148.420667,303789.6,...,0,0,0,0,0,0,1,0,0,0


In [24]:
# Expand all columns for view
pd.set_option('display.max_columns', None)

In [25]:
# Concate recommend and validate data together for better comparison
validation_df = pd.concat([recommend_playlist_mean,input_playlist_mean])
validation_df['dataset'] = ['recommend','validation']
validation_df = validation_df[['dataset', 'danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms',
       'key', 'mode', 'time_signature', 'mode_minor', 'mode_major', 'key_none',
       'key_0', 'key_1', 'key_2', 'key_3', 'key_4', 'key_5', 'key_6', 'key_7',
       'key_8', 'key_9', 'key_10', 'key_11', 'time_signature_0',
       'time_signature_1', 'time_signature_2', 'time_signature_3',
       'time_signature_4', 'time_signature_5', 'time_signature_6',
       'time_signature_7']]
validation_df

Unnamed: 0,dataset,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,key,mode,time_signature,mode_minor,mode_major,key_none,key_0,key_1,key_2,key_3,key_4,key_5,key_6,key_7,key_8,key_9,key_10,key_11,time_signature_0,time_signature_1,time_signature_2,time_signature_3,time_signature_4,time_signature_5,time_signature_6,time_signature_7
0,recommend,0.630489,0.926756,-5.847956,0.095556,0.007737,0.741422,0.177682,0.280222,148.420667,303789.6,7,1,4,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0
0,validation,0.623222,0.965867,-6.438689,0.094027,0.012971,0.721767,0.225256,0.308336,150.224911,294861.688889,7,1,4,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0
