In [18]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
from sklearn import neighbors, tree, naive_bayes
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

In [19]:
# Assign %pwd to working_dir
working_dir = %pwd

In [20]:
# Reading kmeans output From Disk- start with the 13 clusters we discovered from clustering our PCA results
spotify = pd.read_csv((working_dir)+"/Data/spotify-classlabels-kmeans.csv", delimiter = ",")

# First we will develop a KNN classifier based on our cluster data set where we used PCA and Kmeans. Class labels are the cluster numbers.

In [21]:
target_labels = spotify['cluster']
spotify = spotify.drop(columns=['cluster'])

spotify

Unnamed: 0,artists,id,name,year,mode_0,acousticness,explicit_1,instrumentalness,key,valence
0,['Mamie Smith'],0cS0A1fUEUd1EW3FcF8AEI,Keep A Song In Your Soul,1920s,1,0.991000,0,0.000522,0.454545,0.6340
1,"[""Screamin' Jay Hawkins""]",0hbkKFIJm7Z05H8Zl9w30f,I Put A Spell On You,1920s,1,0.643000,0,0.026400,0.454545,0.9500
2,['Mamie Smith'],11m7laMUgmOKqI3oYzuhne,Golfing Papa,1920s,0,0.993000,0,0.000018,0.000000,0.6890
3,['Oscar Velazquez'],19Lc5SfJJ5O1oaxY0fpwfh,True House Music - Xavier Santos & Carlos Gomi...,1920s,0,0.000173,0,0.801000,0.181818,0.0422
4,['Mixe'],2hJjbsLCytGsnAHfdsLejp,Xuniverxe,1920s,1,0.295000,1,0.000246,0.909091,0.2990
...,...,...,...,...,...,...,...,...,...,...
174384,"['DJ Combo', 'Sander-7', 'Tony T']",46LhBf6TvYjZU2SMvGZAbn,The One,2020s,1,0.009170,0,0.000060,0.545455,0.1860
174385,['Alessia Cara'],7tue2Wemjd0FZzRtDrQFZd,A Little More,2020s,0,0.795000,0,0.000000,0.363636,0.2280
174386,['Roger Fly'],48Qj61hOdYmUCFJbpQ29Ob,Together,2020s,1,0.806000,0,0.920000,0.363636,0.7140
174387,['Taylor Swift'],1gcyHQpBQ1lfXGdhZmWrHP,champagne problems,2020s,0,0.920000,1,0.000000,0.000000,0.3200


In [22]:
train, test, target_train, target_test = train_test_split(spotify, target_labels, test_size=0.2, random_state=33)

In [23]:
pca_names = np.array(
    ['mode_0', 
     'acousticness', 
     'explicit_1', 
     'instrumentalness', 
     'key', 
     'valence'])
pca_train_numeric = train[pca_names]

pca_test_numeric = test[pca_names]

### Run the classifier against different Ks, put accuracies in Dataframe and write to disk.

In [None]:
#find best k for knn

df_k_accuracies_highest = pd.DataFrame(columns=['K','Accuracy'])

for n in range(30, 55):
    n_neighbors = n
    knn_classifier = neighbors.KNeighborsClassifier(n_neighbors)
    knn_classifier.fit(pca_train_numeric, target_train)

    acc = knn_classifier.score(pca_test_numeric, target_test)
    
    df_k_accuracies_highest = df_k_accuracies_highest.append({'K': n, 'Accuracy': acc}, ignore_index=True)
    
    print(f'acc for k={n}: {acc}')

In [None]:
df_k_accuracies = df_k_accuracies.append(df_k_accuracies_highest)
df_k_accuracies = df_k_accuracies.sort_values('Accuracy', ascending=False).nlargest(50, 'Accuracy')
df_k_accuracies

In [None]:
#write k values to disk

df_k_accuracies.to_csv((working_dir)+"/Data/knn-1.csv",index = False)

## From running the accuracies we find a K of 37 to be the best for our PCA-kmeans clusters dataset

### Next, we will walk through an instance of classifying a sample to find the cluster with our best K of 37, then using that to get more instances of the same cluster for playlist generation. This is a prototype for what will later be implemented in app.py

In [24]:
n_neighbors = 37
pca_best_knn_classifier = neighbors.KNeighborsClassifier(n_neighbors)
pca_best_knn_classifier.fit(pca_train_numeric, target_train)

KNeighborsClassifier(n_neighbors=37)

In [25]:
test_sample_full = test.sample(1)
test_sample = test_sample_full[pca_names]

prediction = pca_best_knn_classifier.predict(test_sample)
print(prediction)

[5]


In [26]:
spotify = pd.read_csv((working_dir)+"/Data/spotify-classlabels-kmeans.csv", delimiter = ",")

In [27]:
playlist = spotify.loc[spotify['cluster'] == prediction[0]].sample(20)
playlist

Unnamed: 0,artists,id,name,year,mode_0,acousticness,explicit_1,instrumentalness,key,valence,cluster
16714,['Eminem'],561jH07mF1jHuk7KlaeF0s,Mockingbird,2000s,0,0.209,1,0.0,0.0,0.254,5
171471,"['Boosie Badazz', 'Webbie']",2TosX3xYACy6rOe1HGC85K,Wipe Me Down (feat. Foxx),2000s,0,0.000153,1,0.0,0.090909,0.648,5
2163,['Эрих Мария Ремарк'],0cOOKx82INQTzj3fZGQPsN,Часть 4.6 - Обратный путь,1930s,0,0.828,1,0.0,0.0,0.333,5
24399,['Эрих Мария Ремарк'],07ZDTJzRs217gxSfbxAA7Y,Часть 145.3 & Часть 146.1 - Триумфальная арка,1940s,0,0.105,1,0.0,0.0,0.772,5
36185,"['Madvillain', 'Madlib', 'MF DOOM']",7dZanttu013xDQlAJprAZC,Money Folder,2000s,0,0.524,1,2.4e-05,0.727273,0.874,5
170077,['Kittie'],76g4Ybt7SW4PvloFDIzsSX,Charlotte,2000s,0,0.000568,1,0.672,0.454545,0.364,5
31877,['Misfits'],66mQWFirjSr6uqdaiACfp7,Hatebreeders,1980s,0,0.00479,1,0.0,1.0,0.522,5
142487,['YNW Melly'],1sSb4ATG8E30IGDdjAvlxh,Catchin Feelings,2010s,0,0.32,1,0.00791,0.818182,0.263,5
173311,"['Lil Yachty', 'Quavo', 'Skippa Da Flippa', 'Y...",4XkOcWt0C2JX1s2RXybosk,Minnesota,2010s,0,0.0951,1,0.0,0.181818,0.246,5
59472,['Эрих Мария Ремарк'],4vhvA2d1TgsmV8iGdjEh2d,Часть 1.3 - Обратный путь,1930s,0,0.865,1,0.0,0.181818,0.457,5


In [28]:
test_sample_full

Unnamed: 0,artists,id,name,year,mode_0,acousticness,explicit_1,instrumentalness,key,valence
58501,['Lady Gaga'],5JdzwFG2M4wSKEBU09zls6,Bad Romance,2020s,0,0.00314,1,5.3e-05,0.0,0.714


## Now finding our next KNN classifier for the non-pca cluster set, where there are 3 main clusters

In [29]:
spotify = pd.read_csv((working_dir)+"/Data/spotify-classlabels-kmeans-without_pca.csv", delimiter = ",")
spotify

Unnamed: 0,acousticness,artists,danceability,duration_ms,energy,id,instrumentalness,key,liveness,loudness,...,popularity,speechiness,tempo,valence,year,explicit_0,explicit_1,mode_0,mode_1,cluster
0,0.991000,['Mamie Smith'],0.598,0.030637,0.224,0cS0A1fUEUd1EW3FcF8AEI,0.000522,0.454545,0.3790,0.741868,...,0.12,0.0936,0.615900,0.6340,1920s,1,0,1,0,2
1,0.643000,"[""Screamin' Jay Hawkins""]",0.852,0.027237,0.517,0hbkKFIJm7Z05H8Zl9w30f,0.026400,0.454545,0.0809,0.825918,...,0.07,0.0534,0.356823,0.9500,1920s,1,0,1,0,2
2,0.993000,['Mamie Smith'],0.647,0.029792,0.186,11m7laMUgmOKqI3oYzuhne,0.000018,0.000000,0.5190,0.750168,...,0.04,0.1740,0.400810,0.6890,1920s,1,0,0,1,1
3,0.000173,['Oscar Velazquez'],0.730,0.078215,0.798,19Lc5SfJJ5O1oaxY0fpwfh,0.801000,0.181818,0.1280,0.825135,...,0.17,0.0425,0.525640,0.0422,1920s,1,0,0,1,0
4,0.295000,['Mixe'],0.704,0.030054,0.707,2hJjbsLCytGsnAHfdsLejp,0.000246,0.909091,0.4020,0.845102,...,0.02,0.0768,0.501324,0.2990,1920s,0,1,1,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
174384,0.009170,"['DJ Combo', 'Sander-7', 'Tony T']",0.792,0.026752,0.866,46LhBf6TvYjZU2SMvGZAbn,0.000060,0.545455,0.1780,0.859933,...,0.00,0.0356,0.517324,0.1860,2020s,1,0,1,0,2
174385,0.795000,['Alessia Cara'],0.429,0.026209,0.211,7tue2Wemjd0FZzRtDrQFZd,0.000000,0.363636,0.1960,0.756949,...,0.00,0.0360,0.388942,0.2280,2020s,1,0,0,1,1
174386,0.806000,['Roger Fly'],0.671,0.039977,0.589,48Qj61hOdYmUCFJbpQ29Ob,0.920000,0.363636,0.1130,0.745549,...,0.00,0.0282,0.443757,0.7140,2020s,1,0,1,0,2
174387,0.920000,['Taylor Swift'],0.462,0.044824,0.240,1gcyHQpBQ1lfXGdhZmWrHP,0.000000,0.000000,0.1130,0.750497,...,0.69,0.0377,0.703549,0.3200,2020s,0,1,0,1,1


In [30]:
target_labels = spotify['cluster']
spotify = spotify.drop(columns=['cluster'])

In [31]:
non_pca_train, non_pca_test, non_pca_target_train, non_pca_target_test = train_test_split(spotify, target_labels, test_size=0.2, random_state=33)

In [32]:
#drop the non-numeric attrs
non_pca_train = non_pca_train.drop(columns=['artists', 'id', 'name', 'year'])
non_pca_test = non_pca_test.drop(columns=['artists', 'id', 'name', 'year'])

### Similar to our last KNN classifier, we will test different Ks to find accuracies, write these findings to disk.

In [None]:
#find best k for knn

df_k_accuracies_highest = pd.DataFrame(columns=['K','Accuracy'])

for n in range(1, 50):
    n_neighbors = n
    knn_classifier = neighbors.KNeighborsClassifier(n_neighbors)
    knn_classifier.fit(non_pca_train, non_pca_target_train)

    acc = knn_classifier.score(non_pca_test, non_pca_target_test)
    
    df_k_accuracies_highest = df_k_accuracies_highest.append({'K': n, 'Accuracy': acc}, ignore_index=True)
    
    print(f'acc for k={n}: {acc}')

In [None]:
df_k_accuracies_highest = df_k_accuracies_highest.append(df_k_accuracies_highest)
df_k_accuracies_highest = df_k_accuracies_highest.sort_values('Accuracy', ascending=False).nlargest(50, 'Accuracy')
df_k_accuracies_highest

In [None]:
#write k values to disk

df_k_accuracies_highest.to_csv((working_dir)+"/Data/knn-2.csv",index = False)

In [34]:
non_pca_best_knn = neighbors.KNeighborsClassifier(21)
non_pca_best_knn.fit(non_pca_train, non_pca_target_train)

KNeighborsClassifier(n_neighbors=21)

## Classifier evaluations

In [35]:
# performance measure function from class notebook "ensemble-classification1.ipynb"
from sklearn import metrics

def measure_performance(X, y, clf, show_accuracy=True, show_classification_report=True, show_confussion_matrix=True):
    y_pred = clf.predict(X)   
    if show_accuracy:
         print("Accuracy:{0:.3f}".format(metrics.accuracy_score(y, y_pred)),"\n")
    if show_classification_report:
        print("Classification report")
        print(metrics.classification_report(y, y_pred),"\n")
      
    if show_confussion_matrix:
        print("Confusion matrix")
        print(metrics.confusion_matrix(y, y_pred),"\n")

In [36]:
measure_performance(pca_train_numeric, target_train, pca_best_knn_classifier)

Accuracy:0.993 

Classification report
              precision    recall  f1-score   support

           0       0.99      0.99      0.99     15654
           1       0.99      1.00      0.99     12501
           2       0.99      1.00      1.00     16163
           3       0.99      0.99      0.99     11809
           4       1.00      0.99      1.00     14738
           5       1.00      1.00      1.00      5617
           6       1.00      1.00      1.00      3829
           7       0.99      1.00      0.99     13359
           8       0.99      0.99      0.99      6380
           9       0.99      0.99      0.99     17503
          10       0.99      0.99      0.99     12157
          11       0.99      0.99      0.99      5795
          12       0.99      0.99      0.99      4006

    accuracy                           0.99    139511
   macro avg       0.99      0.99      0.99    139511
weighted avg       0.99      0.99      0.99    139511
 

Confusion matrix
[[15541     0    41  

In [37]:
measure_performance(non_pca_train, non_pca_target_train, non_pca_best_knn)

Accuracy:0.993 

Classification report
              precision    recall  f1-score   support

           0       0.99      0.99      0.99     49514
           1       0.99      0.99      0.99     48543
           2       1.00      1.00      1.00     41454

    accuracy                           0.99    139511
   macro avg       0.99      0.99      0.99    139511
weighted avg       0.99      0.99      0.99    139511
 

Confusion matrix
[[48993   521     0]
 [  443 48100     0]
 [    0     0 41454]] 



### The above metrics show that the two KNN classifiers based off the different cluster data are somewhat similar in accuracy and in some classification metrics but very different in the confusion matrix. This is clearly due to the far greater amount of clusters in the pca instance of the data, and it appears that there are less failed predictions across clusters in the pca version. There are a large amount of false predictions in the first two clusters of the non-pca version, 521 mispredictions in the first cluster and 443 in the second, compared to much less in the pca data set. This might lead us to the conclusion that the KNN with pca might be most effective in our case. Thinking about our data, this makes sense as well as there certainly are more than 3 types/genres/groups of music.  


### This notebook has shown that effective KNN classifiers can be developed using our clusters found from previous experiments. The first KNN was using our PCA-Kmeans clusters, and the most effective K was 37. The second KNN classifier was with all features and clusters using Kmeans, and the best K was found to be 21. KNN with our DBSCAN or HAC clusters will not be implemented for this project. 