In [6]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import neighbors, tree, naive_bayes
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

In [2]:
# Capture the current working directory as a plain string; the data paths in
# later cells are built by concatenating onto it.
# os.getcwd() gives the same value as the IPython-only `%pwd` magic, but also
# works when the notebook is exported and run as a regular Python script.
working_dir = os.getcwd()

In [12]:
# Load the k-means output from disk. We start with the 13 clusters discovered
# by clustering our PCA results.
kmeans_labels_path = working_dir + "/Data/spotify-classlabels-kmeans.csv"
spotify = pd.read_csv(kmeans_labels_path, sep=",")

#### First we will split the data into 80% train and 20% test

In [13]:
# Separate the cluster assignment (the classification target) from the
# remaining columns, then display what is left.
# NOTE: this overwrites `spotify`, so re-running the cell without reloading the
# CSV raises KeyError because 'cluster' has already been dropped.
target_labels = spotify.loc[:, 'cluster']
spotify = spotify.drop('cluster', axis=1)

spotify

Unnamed: 0,artists,id,name,year,mode_0,acousticness,explicit_1,instrumentalness,key,valence
0,['Mamie Smith'],0cS0A1fUEUd1EW3FcF8AEI,Keep A Song In Your Soul,1920s,1,0.991000,0,0.000522,0.454545,0.6340
1,"[""Screamin' Jay Hawkins""]",0hbkKFIJm7Z05H8Zl9w30f,I Put A Spell On You,1920s,1,0.643000,0,0.026400,0.454545,0.9500
2,['Mamie Smith'],11m7laMUgmOKqI3oYzuhne,Golfing Papa,1920s,0,0.993000,0,0.000018,0.000000,0.6890
3,['Oscar Velazquez'],19Lc5SfJJ5O1oaxY0fpwfh,True House Music - Xavier Santos & Carlos Gomi...,1920s,0,0.000173,0,0.801000,0.181818,0.0422
4,['Mixe'],2hJjbsLCytGsnAHfdsLejp,Xuniverxe,1920s,1,0.295000,1,0.000246,0.909091,0.2990
...,...,...,...,...,...,...,...,...,...,...
174384,"['DJ Combo', 'Sander-7', 'Tony T']",46LhBf6TvYjZU2SMvGZAbn,The One,2020s,1,0.009170,0,0.000060,0.545455,0.1860
174385,['Alessia Cara'],7tue2Wemjd0FZzRtDrQFZd,A Little More,2020s,0,0.795000,0,0.000000,0.363636,0.2280
174386,['Roger Fly'],48Qj61hOdYmUCFJbpQ29Ob,Together,2020s,1,0.806000,0,0.920000,0.363636,0.7140
174387,['Taylor Swift'],1gcyHQpBQ1lfXGdhZmWrHP,champagne problems,2020s,0,0.920000,1,0.000000,0.000000,0.3200


In [14]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split
# identical from run to run.
train, test, target_train, target_test = train_test_split(
    spotify,
    target_labels,
    test_size=0.2,
    random_state=33,
)

In [24]:
# Feature columns fed to the classifiers.
pca_names = np.array([
    'mode_0',
    'acousticness',
    'explicit_1',
    'instrumentalness',
    'key',
    'valence',
])

# Restrict both splits to those feature columns.
train_numeric, test_numeric = train[pca_names], test[pca_names]

In [38]:
# Find the best k for KNN: fit one classifier per candidate k and score it on
# the held-out split.
# NOTE(review): k is selected on the same split that is used to report
# accuracy, so the reported figures are optimistic; a separate validation
# split or cross-validation would avoid that.

k_accuracy_records = []

for n in range(30, 55):
    knn_classifier = neighbors.KNeighborsClassifier(n_neighbors=n)
    knn_classifier.fit(train_numeric, target_train)

    acc = knn_classifier.score(test_numeric, target_test)

    # Collect plain records and build the frame once after the loop:
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame on
    # every call.
    k_accuracy_records.append({'K': n, 'Accuracy': acc})

    print(f'acc for k={n}: {acc}')

# K is cast to float so the frame matches what this cell produced before
# (e.g. 37.0), keeping the merged results and the saved CSV unchanged.
df_k_accuracies_highest = pd.DataFrame(
    k_accuracy_records, columns=['K', 'Accuracy']
).astype({'K': 'float64'})

acc for k=30: 0.9924594300131888
acc for k=31: 0.9925741154882735
acc for k=32: 0.9925454441195023
acc for k=33: 0.9924307586444177
acc for k=34: 0.9922300590630196
acc for k=35: 0.9923734159068754
acc for k=36: 0.9923447445381043
acc for k=37: 0.9926027868570445
acc for k=38: 0.992287401800562
acc for k=39: 0.9922300590630196
acc for k=40: 0.9922013876942485
acc for k=41: 0.9922587304317908
acc for k=42: 0.9919720167440793
acc for k=43: 0.9922300590630196
acc for k=44: 0.9923447445381043
acc for k=45: 0.9924020872756465
acc for k=46: 0.9921440449567063
acc for k=47: 0.9918860026377659
acc for k=48: 0.9916853030563679
acc for k=49: 0.9919433453753083
acc for k=50: 0.9919433453753083
acc for k=51: 0.9920293594816216
acc for k=52: 0.9921153735879351
acc for k=53: 0.9922300590630196
acc for k=54: 0.9919433453753083


In [39]:
# Merge this sweep with the results of any earlier sweeps and rank them by
# accuracy (best first, at most 50 rows).
# NOTE(review): df_k_accuracies is not created by any visible cell; fall back
# to this sweep alone so the notebook still runs after a kernel restart.
try:
    k_result_frames = [df_k_accuracies, df_k_accuracies_highest]
except NameError:
    k_result_frames = [df_k_accuracies_highest]

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# Dropping repeated K values makes the cell idempotent: re-running it no longer
# stacks duplicate rows for the same k (the most recent result is kept).
df_k_accuracies = (
    pd.concat(k_result_frames)
    .drop_duplicates(subset='K', keep='last')
    .sort_values('Accuracy', ascending=False)
    .nlargest(50, 'Accuracy')
)
df_k_accuracies

Unnamed: 0,K,Accuracy
7,37.0,0.992603
1,31.0,0.992574
2,32.0,0.992545
3,24.0,0.992517
0,30.0,0.992459
3,33.0,0.992431
5,26.0,0.992402
15,45.0,0.992402
5,35.0,0.992373
14,44.0,0.992345


In [67]:
# Persist the ranked k values to disk; the frame's index is not written.
knn_results_path = working_dir + "/Data/knn-1.csv"
df_k_accuracies.to_csv(knn_results_path, index=False)

### Walking through one example: classify a single sample with our best K of 37 to find its cluster, then draw more songs from that same cluster to generate a playlist

In [45]:
# Refit the classifier on the training split using the chosen k.
n_neighbors = 37
knn_classifier = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors).fit(
    train_numeric, target_train
)
# fit() returns the estimator itself, so displaying it shows the same repr.
knn_classifier

KNeighborsClassifier(n_neighbors=37)

In [63]:
# Draw one held-out track and predict its cluster.
# The fixed random_state makes the walkthrough reproducible: without it every
# run picks a different track, so the prediction changes on each execution.
test_sample_full = test.sample(1, random_state=33)
# Keep only the feature columns the classifier was trained on.
test_sample = test_sample_full[pca_names]

prediction = knn_classifier.predict(test_sample)
print(prediction)

[7]


In [64]:
# Reload the labelled data: the 'cluster' column was dropped from `spotify`
# in an earlier cell and is needed again to filter by cluster.
cluster_labels_path = working_dir + "/Data/spotify-classlabels-kmeans.csv"
spotify = pd.read_csv(cluster_labels_path, sep=",")

In [65]:
# Build a playlist from tracks that share the predicted cluster.
cluster_tracks = spotify.loc[spotify['cluster'] == prediction[0]]

# Cap the request at the cluster size: DataFrame.sample raises ValueError when
# asked for more rows than exist (sampling without replacement).
# The fixed random_state keeps the playlist reproducible across runs.
playlist = cluster_tracks.sample(min(20, len(cluster_tracks)), random_state=33)
playlist

Unnamed: 0,artists,id,name,year,mode_0,acousticness,explicit_1,instrumentalness,key,valence,cluster
80518,"['Richard Wagner', 'Josef Greindl', 'Philharmo...",0TaHJD1g6zUjsq7G9MFLm3,"Wagner: Tristan und Isolde, WWV 90, Act 3 Scen...",1950s,0,0.913,0,0.663,0.636364,0.0388,7
156961,['Frank Ocean'],0GAE689kVy7sFT2QWZH1Xa,Start,2010s,0,0.74,0,0.931,0.636364,0.0316,7
61418,['Rajkumari'],7I2ruUPSZlvX3WGPwGFkqq,"Jo Ham Pe Guzarti Hai (From ""Panna"")",1940s,0,0.995,0,0.911,0.545455,0.757,7
68715,"['Johann Sebastian Bach', 'Glenn Gould']",7gAYYUV8mC5Vfsp825sqiS,"Goldberg Variations, BWV 988: Variation 12 Can...",1980s,0,0.984,0,0.821,0.636364,0.975,7
43760,"['Felix Mendelssohn', 'Arturo Toscanini']",1tob591iJj5K6IpeC2V7Zc,"Octet in E-Flat Major, Op. 20: Scherzo. Allegr...",1940s,0,0.779,0,0.945,0.636364,0.559,7
129686,"['Johannes Brahms', 'Herbert von Karajan', 'Ph...",1jukvC9dcM8oSXH1yhycT0,"Brahms: Symphony No. 1 in C Minor, Op. 68: IV....",1950s,0,0.961,0,0.914,0.0,0.0589,7
86910,['Rob Whitesides-Woo'],7lxjfrhWFOaOOTRF0KK8F5,Cradle Song,1980s,0,0.967,0,0.915,0.0,0.306,7
160397,['Lata Mangeshkar'],2kD3Q8qaFOPNbqrafVHJ1z,Dard-E-Jigar,1950s,0,0.993,0,0.946,1.0,0.402,7
154293,"['Franz Liszt', 'Philip Thomson']",72avKXjk0UTESGbVsxmjwG,"Consolations, S172/R12: No. 2. Un poco piu mosso",1990s,0,0.992,0,0.795,1.0,0.0395,7
2736,"['Francisco Canaro', 'Ernesto Fama']",0Zss1Cp31Mkd4Qa0eZYw51,Justicia Baturra - Remasterizado,1930s,0,0.995,0,0.907,0.181818,0.53,7


In [66]:
# Display the full row (all columns, not just the features) of the sampled test track.
test_sample_full

Unnamed: 0,artists,id,name,year,mode_0,acousticness,explicit_1,instrumentalness,key,valence
8711,['Thelonious Monk'],6kLp24Op7cOu8iQkU46sBv,I'm Confessin' (That I Love You),1960s,0,0.965,0,0.905,0.090909,0.206


### Now finding our best KNN classifier for the non-pca cluster set, where there are 3 main clusters