# Clustering Playlists

## Processing Data

In [1]:
# Import libraries and dependencies
import pandas as pd
import hvplot.pandas
from pathlib import Path

In [3]:
# Read the csv file into a pandas DataFrame
playlist_weight_df = pd.read_csv(Path("Resources/playlist_stat_mean_std_mode_bin.csv"))

# Review the DataFrame
playlist_weight_df.head(5)

Unnamed: 0,pid,tracks_found,danceability,danceability_std,energy,energy_std,loudness,loudness_std,speechiness,speechiness_std,...,mode_major,time_signature,time_signature_0,time_signature_1,time_signature_2,time_signature_3,time_signature_4,time_signature_5,time_signature_6,time_signature_7
0,5121,24,0.610625,0.157655,0.664625,0.137219,-8.727708,2.255706,0.056533,0.038166,...,1,4,0,0,0,0,1,0,0,0
1,5122,29,0.661655,0.132079,0.620138,0.153483,-6.476448,2.121357,0.111762,0.117234,...,1,4,0,0,0,0,1,0,0,0
2,5123,19,0.650632,0.183358,0.660842,0.162967,-5.246421,1.502051,0.119905,0.126137,...,1,4,0,0,0,0,1,0,0,0
3,5124,38,0.458263,0.123866,0.753395,0.195714,-6.276289,2.614154,0.052226,0.044282,...,1,4,0,0,0,0,1,0,0,0
4,5125,107,0.691075,0.120466,0.691374,0.151745,-6.160477,1.905478,0.190026,0.143879,...,1,4,0,0,0,0,1,0,0,0


In [4]:
# Check dataframe columns
playlist_weight_df.columns

Index(['pid', 'tracks_found', 'danceability', 'danceability_std', 'energy',
       'energy_std', 'loudness', 'loudness_std', 'speechiness',
       'speechiness_std', 'acousticness', 'acousticness_std',
       'instrumentalness', 'instrumentalness_std', 'liveness', 'liveness_std',
       'valence', 'valence_std', 'tempo', 'tempo_std', 'duration_ms',
       'duration_ms_std', 'key', 'key_none', 'key_0', 'key_1', 'key_2',
       'key_3', 'key_4', 'key_5', 'key_6', 'key_7', 'key_8', 'key_9', 'key_10',
       'key_11', 'mode', 'mode_minor', 'mode_major', 'time_signature',
       'time_signature_0', 'time_signature_1', 'time_signature_2',
       'time_signature_3', 'time_signature_4', 'time_signature_5',
       'time_signature_6', 'time_signature_7'],
      dtype='object')

In [5]:
# Retain the useful columns to fit the model
playlist_weight_df = playlist_weight_df[['pid', 'danceability', 'danceability_std', 'energy',
       'energy_std', 'loudness', 'loudness_std', 'speechiness',
       'speechiness_std', 'acousticness', 'acousticness_std',
       'instrumentalness', 'instrumentalness_std', 'liveness', 'liveness_std',
       'valence', 'valence_std', 'tempo', 'tempo_std', 'duration_ms',
       'duration_ms_std', 'key', 'key_none', 'key_0', 'key_1', 'key_2',
       'key_3', 'key_4', 'key_5', 'key_6', 'key_7', 'key_8', 'key_9', 'key_10',
       'key_11', 'mode', 'mode_minor', 'mode_major', 'time_signature',
       'time_signature_0', 'time_signature_1', 'time_signature_2',
       'time_signature_3', 'time_signature_4', 'time_signature_5',
       'time_signature_6', 'time_signature_7']]
# Check out `playlist_mean_df`
playlist_weight_df.head(5)

Unnamed: 0,pid,danceability,danceability_std,energy,energy_std,loudness,loudness_std,speechiness,speechiness_std,acousticness,...,mode_major,time_signature,time_signature_0,time_signature_1,time_signature_2,time_signature_3,time_signature_4,time_signature_5,time_signature_6,time_signature_7
0,5121,0.610625,0.157655,0.664625,0.137219,-8.727708,2.255706,0.056533,0.038166,0.133017,...,1,4,0,0,0,0,1,0,0,0
1,5122,0.661655,0.132079,0.620138,0.153483,-6.476448,2.121357,0.111762,0.117234,0.172197,...,1,4,0,0,0,0,1,0,0,0
2,5123,0.650632,0.183358,0.660842,0.162967,-5.246421,1.502051,0.119905,0.126137,0.230037,...,1,4,0,0,0,0,1,0,0,0
3,5124,0.458263,0.123866,0.753395,0.195714,-6.276289,2.614154,0.052226,0.044282,0.103376,...,1,4,0,0,0,0,1,0,0,0
4,5125,0.691075,0.120466,0.691374,0.151745,-6.160477,1.905478,0.190026,0.143879,0.130277,...,1,4,0,0,0,0,1,0,0,0


## Scaling

In [6]:
# Import libraries and dependencies
from sklearn.preprocessing import StandardScaler

In [7]:
# Creating `scaler` 
scaler = StandardScaler()
# Scale the `playlist_mean_df` and drop the 'pid', 'key','mode','time_signature' columns
scaler.fit(playlist_weight_df.drop(['pid','key','mode','time_signature'],axis=1))
scaled_playlist_weight = scaler.transform(playlist_weight_df.drop(['pid','key','mode','time_signature'],axis=1))

In [8]:
# Create `scaled_playlist_mean_df` for the scaled_playlist_mean
scaled_playlist_weight_df = pd.DataFrame(scaled_playlist_weight,columns=playlist_weight_df.drop(['pid','key','mode','time_signature'],axis=1).columns, index=playlist_weight_df.index)
# Check out `scaled_playlist_mean_df`
scaled_playlist_weight_df.head()


Unnamed: 0,danceability,danceability_std,energy,energy_std,loudness,loudness_std,speechiness,speechiness_std,acousticness,acousticness_std,...,mode_minor,mode_major,time_signature_0,time_signature_1,time_signature_2,time_signature_3,time_signature_4,time_signature_5,time_signature_6,time_signature_7
0,0.052642,1.068008,0.213927,-0.700676,-0.534153,-0.389743,-0.651185,-0.789438,-0.622047,-0.226146,...,-0.442894,0.442894,-0.005099,-0.003162,0.0,-0.066555,0.06697,-0.004359,0.0,0.0
1,0.579708,0.056448,-0.121494,-0.293096,0.367402,-0.516287,0.328348,1.091248,-0.39816,0.191659,...,-0.442894,0.442894,-0.005099,-0.003162,0.0,-0.066555,0.06697,-0.004359,0.0,0.0
2,0.465851,2.084598,0.185405,-0.055414,0.859986,-1.099622,0.472775,1.303022,-0.067641,0.232899,...,-0.442894,0.442894,-0.005099,-0.003162,0.0,-0.066555,0.06697,-0.004359,0.0,0.0
3,-1.521029,-0.268359,0.883228,0.76524,0.447559,-0.052115,-0.727574,-0.643949,-0.791429,-0.031478,...,-0.442894,0.442894,-0.005099,-0.003162,0.0,-0.066555,0.06697,-0.004359,0.0,0.0
4,0.883569,-0.402855,0.415606,-0.336635,0.493938,-0.719627,1.716434,1.72502,-0.637704,-0.24608,...,-0.442894,0.442894,-0.005099,-0.003162,0.0,-0.066555,0.06697,-0.004359,0.0,0.0


## Finding K Value
* Elbow Method vs. Silhouette Score

In [9]:
# Import libraries and dependencies
from sklearn.cluster import KMeans, AgglomerativeClustering, Birch
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import hvplot.pandas

In [10]:
# Set up the KMeans model where k is the number for clusters ranging from 1 to 10
kmeans_per_k = [KMeans(n_clusters=k, n_init=10, random_state=42).fit(scaled_playlist_weight_df)
                for k in range(1, 20)]
# Create comprehensive list for inertias
inertias = [model.inertia_ for model in kmeans_per_k]

In [11]:
# Create elbow_df and sil_score_df
elbow_df = pd.DataFrame({
    "k": range(1, 20),
    "inertia": inertias
})

In [12]:
# Plot the DataFrame
elbow_df.hvplot.line(
    x="k", 
    y="inertia", 
    title="Elbow Curve", 
    xticks=range(1, 20)
)

## Clustering 

In [13]:
# Initialize the K-Means model with n_clusters=6
model = KMeans(n_clusters=17)

# Fit the model for the scaled_playlist_df
model.fit(scaled_playlist_weight_df)

# Save the predicted model clusters to a new DataFrame.
playlist_clusters = model.predict(scaled_playlist_weight_df)

# View the playlist clusters
print(playlist_clusters)



[15 14 11 ...  4 16  4]


In [14]:
# Create a copy of the concatenated DataFrame
clustered_scaled_playlist_df = scaled_playlist_weight_df.copy()

# Create a new column in the copy of the concatenated DataFrame with the predicted clusters
clustered_scaled_playlist_df["playlist_clusters"] = playlist_clusters

# Review the DataFrame
clustered_scaled_playlist_df.head()

Unnamed: 0,danceability,danceability_std,energy,energy_std,loudness,loudness_std,speechiness,speechiness_std,acousticness,acousticness_std,...,mode_major,time_signature_0,time_signature_1,time_signature_2,time_signature_3,time_signature_4,time_signature_5,time_signature_6,time_signature_7,playlist_clusters
0,0.052642,1.068008,0.213927,-0.700676,-0.534153,-0.389743,-0.651185,-0.789438,-0.622047,-0.226146,...,0.442894,-0.005099,-0.003162,0.0,-0.066555,0.06697,-0.004359,0.0,0.0,15
1,0.579708,0.056448,-0.121494,-0.293096,0.367402,-0.516287,0.328348,1.091248,-0.39816,0.191659,...,0.442894,-0.005099,-0.003162,0.0,-0.066555,0.06697,-0.004359,0.0,0.0,14
2,0.465851,2.084598,0.185405,-0.055414,0.859986,-1.099622,0.472775,1.303022,-0.067641,0.232899,...,0.442894,-0.005099,-0.003162,0.0,-0.066555,0.06697,-0.004359,0.0,0.0,11
3,-1.521029,-0.268359,0.883228,0.76524,0.447559,-0.052115,-0.727574,-0.643949,-0.791429,-0.031478,...,0.442894,-0.005099,-0.003162,0.0,-0.066555,0.06697,-0.004359,0.0,0.0,8
4,0.883569,-0.402855,0.415606,-0.336635,0.493938,-0.719627,1.716434,1.72502,-0.637704,-0.24608,...,0.442894,-0.005099,-0.003162,0.0,-0.066555,0.06697,-0.004359,0.0,0.0,4


In [15]:
# Add the 'pid' column to `clustered_scaled_playlist_df`
clustered_scaled_playlist_df["pid"] = playlist_weight_df['pid']
clustered_scaled_playlist_df

Unnamed: 0,danceability,danceability_std,energy,energy_std,loudness,loudness_std,speechiness,speechiness_std,acousticness,acousticness_std,...,time_signature_0,time_signature_1,time_signature_2,time_signature_3,time_signature_4,time_signature_5,time_signature_6,time_signature_7,playlist_clusters,pid
0,0.052642,1.068008,0.213927,-0.700676,-0.534153,-0.389743,-0.651185,-0.789438,-0.622047,-0.226146,...,-0.005099,-0.003162,0.0,-0.066555,0.06697,-0.004359,0.0,0.0,15,5121
1,0.579708,0.056448,-0.121494,-0.293096,0.367402,-0.516287,0.328348,1.091248,-0.398160,0.191659,...,-0.005099,-0.003162,0.0,-0.066555,0.06697,-0.004359,0.0,0.0,14,5122
2,0.465851,2.084598,0.185405,-0.055414,0.859986,-1.099622,0.472775,1.303022,-0.067641,0.232899,...,-0.005099,-0.003162,0.0,-0.066555,0.06697,-0.004359,0.0,0.0,11,5123
3,-1.521029,-0.268359,0.883228,0.765240,0.447559,-0.052115,-0.727574,-0.643949,-0.791429,-0.031478,...,-0.005099,-0.003162,0.0,-0.066555,0.06697,-0.004359,0.0,0.0,8,5124
4,0.883569,-0.402855,0.415606,-0.336635,0.493938,-0.719627,1.716434,1.725020,-0.637704,-0.246080,...,-0.005099,-0.003162,0.0,-0.066555,0.06697,-0.004359,0.0,0.0,4,5125
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,-0.531497,-1.359872,1.293767,-0.973276,1.329750,-0.785043,-0.777181,-1.044317,-0.544760,-0.448365,...,-0.005099,-0.003162,0.0,-0.066555,0.06697,-0.004359,0.0,0.0,15,39
999996,-0.609674,3.458063,0.800007,-0.450466,0.781950,-1.176564,0.467361,0.205818,-0.819008,-1.394414,...,-0.005099,-0.003162,0.0,-0.066555,0.06697,-0.004359,0.0,0.0,4,927783
999997,-0.044884,-0.572394,0.836030,-0.270647,0.549304,-0.537157,-0.580016,-0.337445,-0.626378,-0.251163,...,-0.005099,-0.003162,0.0,-0.066555,0.06697,-0.004359,0.0,0.0,4,927784
999998,0.365403,-0.252314,0.509430,-0.616146,0.708808,-0.834908,-0.049170,0.317493,-0.582120,-0.631821,...,-0.005099,-0.003162,0.0,-0.066555,0.06697,-0.004359,0.0,0.0,16,927785


In [16]:
# Add the cluster column to `playlist_weight_df` - this one is not scaled
playlist_weight_df["playlist_clusters"] = playlist_clusters
playlist_weight_df

Unnamed: 0,pid,danceability,danceability_std,energy,energy_std,loudness,loudness_std,speechiness,speechiness_std,acousticness,...,time_signature,time_signature_0,time_signature_1,time_signature_2,time_signature_3,time_signature_4,time_signature_5,time_signature_6,time_signature_7,playlist_clusters
0,5121,0.610625,0.157655,0.664625,0.137219,-8.727708,2.255706,0.056533,0.038166,0.133017,...,4,0,0,0,0,1,0,0,0,15
1,5122,0.661655,0.132079,0.620138,0.153483,-6.476448,2.121357,0.111762,0.117234,0.172197,...,4,0,0,0,0,1,0,0,0,14
2,5123,0.650632,0.183358,0.660842,0.162967,-5.246421,1.502051,0.119905,0.126137,0.230037,...,4,0,0,0,0,1,0,0,0,11
3,5124,0.458263,0.123866,0.753395,0.195714,-6.276289,2.614154,0.052226,0.044282,0.103376,...,4,0,0,0,0,1,0,0,0,8
4,5125,0.691075,0.120466,0.691374,0.151745,-6.160477,1.905478,0.190026,0.143879,0.130277,...,4,0,0,0,0,1,0,0,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,39,0.554069,0.096269,0.807845,0.126341,-4.073379,1.836028,0.049429,0.027450,0.146542,...,4,0,0,0,0,1,0,0,0,15
999996,927783,0.546500,0.218084,0.742357,0.147203,-5.441286,1.420364,0.119600,0.080008,0.098549,...,4,0,0,0,0,1,0,0,0,4
999997,927784,0.601183,0.116179,0.747135,0.154378,-6.022222,2.099200,0.060546,0.057168,0.132259,...,4,0,0,0,0,1,0,0,0,4
999998,927785,0.640906,0.124272,0.703818,0.140592,-5.623927,1.783088,0.090477,0.084703,0.140004,...,4,0,0,0,0,1,0,0,0,16


In [17]:
# Store data for later use as a csv
playlist_weight_df.to_csv("./Validation_test/playlist_clusters17.csv", index=False)

OSError: Cannot save file into a non-existent directory: 'Validation_test'

In [None]:
# Store data for later use as a csv
clustered_scaled_playlist_df.to_csv("./Validation_test/scaled_playlist_clusters17.csv", index=False)

## Save The Model

In [13]:
import pickle
from pickle import dump

In [None]:
# save the song classification model as a pickle file
model_pkl_file = "./Resources/playlist_std_mode_cluster17_model.pkl"  

with open(model_pkl_file, 'wb') as file:  
    pickle.dump(model, file)

In [None]:
# Save and export the scaler
scalerfile = './Resources/scaler_playlist_std_mode_cluster17.sav'
pickle.dump(scaler, open(scalerfile, 'wb'))

## Visualization Of Clustering

In [14]:
# Import libraries and dependencies
from sklearn.manifold import TSNE
import seaborn as sns
import time

### Cluster:17

In [15]:
# Create a copy of `clustered_scaled_tracks_df`
visual_cluster_df = clustered_scaled_playlist_df.copy()

In [16]:
# Create a TSNE model
m = TSNE(learning_rate='auto')

In [None]:
# Fit the data
start_time = time.time()

tsne_features = m.fit_transform(scaled_playlist_weight_df)

print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
# Check out the first 3 rows of the coordinates of the data
tsne_features[1:4,:]

In [None]:
# Create the X and y for the data
visual_cluster_df["X"] = tsne_features[:,0]
visual_cluster_df["y"] = tsne_features[:,1]

In [None]:
# Plot the clustering result
start_time = time.time()

sns.scatterplot(x="X", y="y", hue='playlist_clusters', palette='Spectral', data=visual_cluster_df,legend='full')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

print("--- %s seconds ---" % (time.time() - start_time))

## Clustering Input and Recommendation

In [18]:
# Create function which converts a playlist into its weighted average via stddev without touching the categorical variables
def playlist_mean_std(df):   
    df_avg=pd.DataFrame()
    df_avg.at[0,'danceability']=df['danceability'].mean()
    df_avg.at[0,'danceability_std']=df['danceability'].std()
    df_avg.at[0,'energy']=df['energy'].mean()
    df_avg.at[0,'energy_std']=df['energy'].std()
    df_avg.at[0,'loudness']=df['loudness'].mean()
    df_avg.at[0,'loudness_std']=df['loudness'].std()
    df_avg.at[0,'speechiness']=df['speechiness'].mean()
    df_avg.at[0,'speechiness_std']=df['speechiness'].std()
    df_avg.at[0,'acousticness']=df['acousticness'].mean()
    df_avg.at[0,'acousticness_std']=df['acousticness'].std()
    df_avg.at[0,'instrumentalness']=df['instrumentalness'].mean()
    df_avg.at[0,'instrumentalness_std']=df['instrumentalness'].std()
    df_avg.at[0,'liveness']=df['liveness'].mean()
    df_avg.at[0,'liveness_std']=df['liveness'].std()
    df_avg.at[0,'valence']=df['valence'].mean()
    df_avg.at[0,'valence_std']=df['valence'].std()
    df_avg.at[0,'tempo']=df['tempo'].mean()
    df_avg.at[0,'tempo_std']=df['tempo'].std()
    df_avg.at[0,'duration_ms']=df['duration_ms'].mean()
    df_avg.at[0,'duration_ms_std']=df['duration_ms'].std()
    df_avg.at[0,'key']=df['key'].mode().iloc[0]
    df_avg.at[0,'mode']=df['mode'].mode().iloc[0]
    df_avg.at[0,'time_signature']=df['time_signature'].mode().iloc[0]


    df_avg=df_avg.astype({"key":'int',"mode":'int',"time_signature":'int'})     # Does not remove the decimal ".0" even if it is an integer!
    df_avg['key']=df_avg['key'].astype(str)     # adding this line seems to convert "key", "mode", and "time_signature" to objects...


    df_avg=df_avg.assign(key_none=0,key_0=0,key_1=0,key_2=0,key_3=0,key_4=0,key_5=0,key_6=0,key_7=0,key_8=0,key_9=0,key_10=0,key_11=0,\
                mode_minor=0,mode_major=0,\
                     time_signature_0=0,time_signature_1=0,time_signature_2=0,time_signature_3=0,time_signature_4=0,time_signature_5=0,time_signature_6=0,time_signature_7=0)


    if df_avg.iloc[0]['key']==-1:
        df_avg.at[0,'key_none']=1
    else:
        col_name='key_'+str(df_avg.iloc[0]['key']) 
        df_avg.at[0,col_name]=1

    col_name='time_signature_'+str(df_avg.iloc[0]['time_signature']) 
    df_avg.at[0,col_name]=1

    if df_avg.iloc[0]['mode']==0:
        df_avg.at[0,'mode_minor']=1
    else:
        df_avg.at[0,'mode_major']=1   

    return df_avg

In [24]:
# Process the input playlist data
# Read the csv file into a pandas DataFrame
input_playlist_df = pd.read_csv(Path("Resources/tracks_features_lisa2_techno_features.csv"))

# Review the DataFrame
input_playlist_df

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,...,key_10,key_11,time_signature_0,time_signature_1,time_signature_2,time_signature_3,time_signature_4,time_signature_5,time_signature_6,time_signature_7
0,0.635,0.681,2,-8.729,1,0.1120,0.011100,0.816,0.1040,0.2350,...,0,0,0,0,0,0,1,0,0,0
1,0.595,0.960,10,-7.336,1,0.0675,0.001330,0.926,0.1300,0.0999,...,1,0,0,0,0,1,0,0,0,0
2,0.674,0.669,0,-8.747,0,0.0518,0.033200,0.716,0.1010,0.0834,...,0,0,0,0,0,0,1,0,0,0
3,0.457,0.995,7,-7.801,1,0.0454,0.000051,0.865,0.3540,0.6850,...,0,0,0,0,0,0,1,0,0,0
4,0.731,0.873,11,-7.746,0,0.0518,0.000268,0.851,0.1000,0.6710,...,0,1,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
222,0.824,0.992,1,-6.926,1,0.3280,0.000392,0.855,0.0991,0.0348,...,0,0,0,0,0,0,1,0,0,0
223,0.732,0.823,7,-11.652,1,0.0831,0.002410,0.204,0.0964,0.5900,...,0,0,0,0,0,0,1,0,0,0
224,0.656,0.977,1,-5.185,1,0.0812,0.000209,0.885,0.0917,0.3390,...,0,0,0,0,0,0,1,0,0,0
225,0.677,0.985,7,-5.278,1,0.1150,0.045300,0.868,0.1050,0.0720,...,0,0,0,0,0,0,1,0,0,0


In [25]:
# Split the `input_playlist_df` into `input_playlist_feed_df` and `input_playlist_validate_df`
# splitting dataframe in a particular size
input_playlist_feed_df = input_playlist_df.sample(frac=0.8,random_state=200)

# Get the rest of the df as `input_playlist_validate_df`
input_playlist_validate_df = input_playlist_df[~input_playlist_df['uri'].isin(input_playlist_feed_df['uri'].values)]


In [29]:
input_playlist_feed_df.columns

Index(['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'uri', 'duration_ms', 'time_signature', 'mode_minor', 'mode_major',
       'key_none', 'key_0', 'key_1', 'key_2', 'key_3', 'key_4', 'key_5',
       'key_6', 'key_7', 'key_8', 'key_9', 'key_10', 'key_11',
       'time_signature_0', 'time_signature_1', 'time_signature_2',
       'time_signature_3', 'time_signature_4', 'time_signature_5',
       'time_signature_6', 'time_signature_7'],
      dtype='object')

In [30]:
# Calculate the mean using the function for `input_playlist_validate_df`
input_playlist_feed_mean_df = playlist_mean_std(input_playlist_feed_df)
input_playlist_feed_mean_df

Unnamed: 0,danceability,danceability_std,energy,energy_std,loudness,loudness_std,speechiness,speechiness_std,acousticness,acousticness_std,...,mode_minor,mode_major,time_signature_0,time_signature_1,time_signature_2,time_signature_3,time_signature_4,time_signature_5,time_signature_6,time_signature_7
0,0.628121,0.085795,0.937555,0.076242,-6.441813,2.123678,0.112445,0.105227,0.027696,0.067988,...,0,1,0,0,0,0,1,0,0,0


In [34]:
input_playlist_feed_mean_df.columns

Index(['danceability', 'danceability_std', 'energy', 'energy_std', 'loudness',
       'loudness_std', 'speechiness', 'speechiness_std', 'acousticness',
       'acousticness_std', 'instrumentalness', 'instrumentalness_std',
       'liveness', 'liveness_std', 'valence', 'valence_std', 'tempo',
       'tempo_std', 'duration_ms', 'duration_ms_std', 'key', 'mode',
       'time_signature', 'key_none', 'key_0', 'key_1', 'key_2', 'key_3',
       'key_4', 'key_5', 'key_6', 'key_7', 'key_8', 'key_9', 'key_10',
       'key_11', 'mode_minor', 'mode_major', 'time_signature_0',
       'time_signature_1', 'time_signature_2', 'time_signature_3',
       'time_signature_4', 'time_signature_5', 'time_signature_6',
       'time_signature_7'],
      dtype='object')

In [37]:
# Keep the wanted features and reorder the columns
input_playlist_feed_nokey_df = input_playlist_feed_mean_df[['danceability', 'danceability_std', 'energy', 'energy_std', 'loudness',
       'loudness_std', 'speechiness', 'speechiness_std', 'acousticness',
       'acousticness_std', 'instrumentalness', 'instrumentalness_std',
       'liveness', 'liveness_std', 'valence', 'valence_std', 'tempo',
       'tempo_std', 'duration_ms','duration_ms_std', 'key_none', 'key_0', 'key_1', 'key_2', 'key_3', 'key_4', 'key_5',
       'key_6', 'key_7', 'key_8', 'key_9', 'key_10', 'key_11', 'mode_minor', 'mode_major',
       'time_signature_0', 'time_signature_1', 'time_signature_2',
       'time_signature_3', 'time_signature_4', 'time_signature_5',
       'time_signature_6', 'time_signature_7']]
# Check out the df
input_playlist_feed_nokey_df

Unnamed: 0,danceability,danceability_std,energy,energy_std,loudness,loudness_std,speechiness,speechiness_std,acousticness,acousticness_std,...,mode_minor,mode_major,time_signature_0,time_signature_1,time_signature_2,time_signature_3,time_signature_4,time_signature_5,time_signature_6,time_signature_7
0,0.628121,0.085795,0.937555,0.076242,-6.441813,2.123678,0.112445,0.105227,0.027696,0.067988,...,0,1,0,0,0,0,1,0,0,0


In [38]:
input_playlist_feed = scaler.transform(input_playlist_feed_nokey_df)
# Create `scaled_playlist_mean_df` for the scaled_playlist_mean
input_playlist_feed_df = pd.DataFrame(input_playlist_feed,columns=input_playlist_feed_nokey_df.columns, index=input_playlist_feed_nokey_df.index)
# Check out `scaled_playlist_mean_df`
input_playlist_feed_df


Unnamed: 0,danceability,danceability_std,energy,energy_std,loudness,loudness_std,speechiness,speechiness_std,acousticness,acousticness_std,...,mode_minor,mode_major,time_signature_0,time_signature_1,time_signature_2,time_signature_3,time_signature_4,time_signature_5,time_signature_6,time_signature_7
0,0.233349,-1.774142,2.271748,-2.228765,0.381272,-0.514101,0.340461,0.805666,-1.223891,-1.860043,...,-0.442894,0.442894,-0.005099,-0.003162,0.0,-0.066555,0.06697,-0.004359,0.0,0.0


In [39]:
# Predict the label of the new playlist
input_playlist_cluster = model.predict(input_playlist_feed_nokey_df)
clustered_input_playlist_feed_nokey_df = input_playlist_feed_nokey_df.copy()
clustered_input_playlist_feed_nokey_df['cluster'] = input_playlist_cluster
clustered_input_playlist_feed_nokey_df

Unnamed: 0,danceability,danceability_std,energy,energy_std,loudness,loudness_std,speechiness,speechiness_std,acousticness,acousticness_std,...,mode_major,time_signature_0,time_signature_1,time_signature_2,time_signature_3,time_signature_4,time_signature_5,time_signature_6,time_signature_7,cluster
0,0.628121,0.085795,0.937555,0.076242,-6.441813,2.123678,0.112445,0.105227,0.027696,0.067988,...,1,0,0,0,0,1,0,0,0,6


In [40]:
# Count the highest appearance of cluster
recommend_cluster_df = clustered_input_playlist_feed_nokey_df['cluster'].value_counts().to_frame()
recommend_cluster_df

Unnamed: 0_level_0,count
cluster,Unnamed: 1_level_1
6,1


In [41]:
# Get the most counted cluster as `recommend_cluster`
recommend_cluster = recommend_cluster_df.index[0]
recommend_cluster

6

In [46]:
# Get the recommend playlists based on the recommend cluster
recommend_playlists_df = clustered_scaled_playlist_df[clustered_scaled_playlist_df['playlist_clusters']==recommend_cluster]
recommend_playlists_df

Unnamed: 0,danceability,danceability_std,energy,energy_std,loudness,loudness_std,speechiness,speechiness_std,acousticness,acousticness_std,...,time_signature_0,time_signature_1,time_signature_2,time_signature_3,time_signature_4,time_signature_5,time_signature_6,time_signature_7,playlist_clusters,pid
4731,-0.310147,-1.605248,-2.339372,0.060581,-3.672096,2.892876,4.997821,6.967929,0.45455,0.391139,...,-0.005099,-0.003162,0.0,-0.066555,-14.931985,229.413554,0.0,0.0,6,4716
30616,-3.513707,0.399869,-3.771391,-2.38547,-4.679231,2.122836,-0.917815,-1.520812,4.056962,-1.987839,...,-0.005099,-0.003162,0.0,-0.066555,-14.931985,229.413554,0.0,0.0,6,190329
196898,-4.584555,-3.026204,1.817921,1.154028,-6.178699,2.831257,-0.172745,-1.044358,-0.138854,0.294485,...,-0.005099,-0.003162,0.0,-0.066555,-14.931985,229.413554,0.0,0.0,6,356624
219834,0.24538,1.821979,-1.924532,-0.027465,-2.492779,2.834642,1.463613,2.098353,1.542128,1.1093,...,-0.005099,-0.003162,0.0,-0.066555,-14.931985,229.413554,0.0,0.0,6,379560
251030,0.250692,-1.736783,0.307231,-1.233337,0.529209,-0.914031,1.414814,-0.034693,-0.079738,0.889488,...,-0.005099,-0.003162,0.0,-0.066555,-14.931985,229.413554,0.0,0.0,6,410756
270981,0.974164,-0.183247,0.90693,-2.034157,0.85116,-0.98719,1.941489,0.133102,-0.438494,-1.361734,...,-0.005099,-0.003162,0.0,-0.066555,-14.931985,229.413554,0.0,0.0,6,430707
529327,-4.095547,-1.104129,-1.408135,2.08253,-2.866827,8.052093,-0.760675,-1.475293,0.803473,0.472727,...,-0.005099,-0.003162,0.0,-0.066555,-14.931985,229.413554,0.0,0.0,6,689053
605245,-4.526961,-0.962413,0.542123,3.23159,-7.073333,4.021664,-0.175498,-1.154141,0.740468,1.74059,...,-0.005099,-0.003162,0.0,-0.066555,-14.931985,229.413554,0.0,0.0,6,764971
685505,-3.043588,5.2587,-0.837867,3.750145,-5.426618,8.906332,-0.511219,-1.135563,0.651772,2.096453,...,-0.005099,-0.003162,0.0,-0.066555,-14.931985,229.413554,0.0,0.0,6,845231
721428,-3.849728,-1.789048,-3.328281,0.850845,-3.982126,2.758231,-0.71669,-1.470963,2.069318,1.109611,...,-0.005099,-0.003162,0.0,-0.066555,-14.931985,229.413554,0.0,0.0,6,881154


In [47]:
recommend_playlists_df['pid']

4731        4716
30616     190329
196898    356624
219834    379560
251030    410756
270981    430707
529327    689053
605245    764971
685505    845231
721428    881154
731753    891479
793011    957007
889325     54190
907597     72466
921583     86452
951073    115942
971997    136866
988430    153299
997391    162272
Name: pid, dtype: int64

## These are all the recommended playlists' 'pid' number which can be used to find all the tracks from our database.


## Please refer to the `Model_Validation_Playlist_model.csv` file for further steps on how to get the recommended tracks as well as validating the model.