In [2]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

In [3]:
# Directory the kernel is running in; data paths below are built relative to it.
# os.getcwd() returns the same string as the %pwd magic but is plain Python,
# so the cell also works when the notebook is exported to a script.
working_dir = os.getcwd()

In [4]:
# Load the k-means output from disk: one row per track plus its 'cluster' label
# (the 13 clusters found when clustering the PCA results).
# NOTE(review): the rendered frame has an 'Unnamed: 0' column, which looks like a
# saved index -- consider index_col=0 if it is not needed.
spotify = pd.read_csv(
    filepath_or_buffer=working_dir + "/Data/spotify-classlabels-kmeans.csv",
    delimiter=",",
)

##### Split data into training and testing

In [5]:
# Separate the cluster label (the classification target) from the rest of the
# frame. The guard makes the cell idempotent: once 'cluster' has been dropped,
# re-running it is a no-op instead of raising KeyError on the missing column.
if 'cluster' in spotify.columns:
    target_labels = spotify['cluster']
    spotify = spotify.drop(columns=['cluster'])

spotify

Unnamed: 0,artists,id,name,year,mode_0,acousticness,explicit_1,instrumentalness,key,valence
0,['Mamie Smith'],0cS0A1fUEUd1EW3FcF8AEI,Keep A Song In Your Soul,1920s,1,0.991000,0,0.000522,0.454545,0.6340
1,"[""Screamin' Jay Hawkins""]",0hbkKFIJm7Z05H8Zl9w30f,I Put A Spell On You,1920s,1,0.643000,0,0.026400,0.454545,0.9500
2,['Mamie Smith'],11m7laMUgmOKqI3oYzuhne,Golfing Papa,1920s,0,0.993000,0,0.000018,0.000000,0.6890
3,['Oscar Velazquez'],19Lc5SfJJ5O1oaxY0fpwfh,True House Music - Xavier Santos & Carlos Gomi...,1920s,0,0.000173,0,0.801000,0.181818,0.0422
4,['Mixe'],2hJjbsLCytGsnAHfdsLejp,Xuniverxe,1920s,1,0.295000,1,0.000246,0.909091,0.2990
...,...,...,...,...,...,...,...,...,...,...
174384,"['DJ Combo', 'Sander-7', 'Tony T']",46LhBf6TvYjZU2SMvGZAbn,The One,2020s,1,0.009170,0,0.000060,0.545455,0.1860
174385,['Alessia Cara'],7tue2Wemjd0FZzRtDrQFZd,A Little More,2020s,0,0.795000,0,0.000000,0.363636,0.2280
174386,['Roger Fly'],48Qj61hOdYmUCFJbpQ29Ob,Together,2020s,1,0.806000,0,0.920000,0.363636,0.7140
174387,['Taylor Swift'],1gcyHQpBQ1lfXGdhZmWrHP,champagne problems,2020s,0,0.920000,1,0.000000,0.000000,0.3200


In [6]:

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# repeatable between runs.
train, test, target_train, target_test = train_test_split(
    spotify,
    target_labels,
    test_size=0.2,
    random_state=33,
)

In [8]:
# Feature columns used as model input.
pca_names = np.array([
    'mode_0', 'acousticness', 'explicit_1',
    'instrumentalness', 'key', 'valence',
])

# Restrict both splits to those feature columns.
train_numeric, test_numeric = train[pca_names], test[pca_names]

In [8]:
# Hyperparameter tuning: run a grid search over the decision tree's settings (see the following cells).

In [9]:
import math
from sklearn.model_selection import GridSearchCV

In [10]:
# Base estimator handed to the grid search. Fixing random_state makes tree
# construction (and therefore the cross-validation scores) repeatable across
# kernel restarts; 33 matches the seed used for the train/test split.
tree_model = tree.DecisionTreeClassifier(random_state=33)

In [20]:
# Search space: both split criteria, leaf sizes 2-14 and tree depths 3-9.
parameters = dict(
    criterion=['gini', 'entropy'],
    min_samples_leaf=np.arange(2, 15),
    max_depth=np.arange(3, 10),
)

# Exhaustive search with 5-fold cross-validation; verbose=1 prints the fit count.
gs = GridSearchCV(
    estimator=tree_model,
    param_grid=parameters,
    cv=5,
    verbose=1,
)

In [21]:
# Run the search on the training features and their cluster labels.
gs.fit(X=train_numeric, y=target_train)

Fitting 5 folds for each of 182 candidates, totalling 910 fits


GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': array([3, 4, 5, 6, 7, 8, 9]),
                         'min_samples_leaf': array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])},
             verbose=1)

In [22]:
# Winning hyperparameter combination and its cross-validated score.
(gs.best_params_, gs.best_score_)

({'criterion': 'gini', 'max_depth': 9, 'min_samples_leaf': 2},
 0.9850262701213506)

In [25]:
# Train the final classifier on the training split using the hyperparameters
# selected by the grid search. Reading them from gs.best_params_ (rather than
# re-typing the values by hand) keeps this cell correct when the search is
# re-run on different data or a different grid. random_state pins the fit so
# the reported scores are reproducible.
tree_model = tree.DecisionTreeClassifier(**gs.best_params_, random_state=33)
tree_model = tree_model.fit(train_numeric, target_train)

In [27]:
# Predicted cluster for every row of the held-out split.
treepreds_test = tree_model.predict(X=test_numeric)
print(treepreds_test)

[7 0 8 ... 2 9 0]


In [29]:
# Accuracy of the fitted tree on the held-out split.
print(tree_model.score(X=test_numeric, y=target_test))

0.9852055737140891


In [30]:
# Per-cluster precision / recall / F1 on the held-out split.
print(classification_report(y_true=target_test, y_pred=treepreds_test))

              precision    recall  f1-score   support

           0       0.98      0.99      0.98      3866
           1       0.99      0.99      0.99      3186
           2       0.99      0.99      0.99      4058
           3       0.98      0.97      0.97      3020
           4       0.99      0.99      0.99      3652
           5       1.00      1.00      1.00      1454
           6       1.00      1.00      1.00       952
           7       0.99      0.99      0.99      3341
           8       0.99      0.99      0.99      1662
           9       0.98      0.99      0.98      4219
          10       0.98      0.97      0.97      3027
          11       0.98      0.98      0.98      1446
          12       0.99      0.98      0.99       995

    accuracy                           0.99     34878
   macro avg       0.99      0.99      0.99     34878
weighted avg       0.99      0.99      0.99     34878



In [47]:
# Draw one held-out track to act as the playlist seed. random_state pins the
# draw so that Restart & Run All reproduces the same seed track.
test_sample_full = test.sample(1, random_state=33)
test_sample = test_sample_full[pca_names]

# Cluster the classifier assigns to the seed track (one-element array).
prediction = tree_model.predict(test_sample)
print(prediction)

[0]


In [48]:
# Reload the labelled data so the 'cluster' column (dropped from `spotify`
# earlier in the notebook) is available again for the playlist lookup.
spotify = pd.read_csv(
    filepath_or_buffer=working_dir + "/Data/spotify-classlabels-kmeans.csv",
    delimiter=",",
)

In [49]:
# Build a playlist of up to 20 tracks that share the predicted cluster.
# - min(...) keeps .sample() from raising when a cluster has fewer than 20 rows.
# - random_state pins the draw so the playlist is reproducible on re-run.
cluster_tracks = spotify.loc[spotify['cluster'] == prediction[0]]
playlist = cluster_tracks.sample(min(20, len(cluster_tracks)), random_state=33)
playlist

Unnamed: 0,artists,id,name,year,mode_0,acousticness,explicit_1,instrumentalness,key,valence,cluster
137947,['The Mavericks'],0o0QiBgJqMJjaZLSR4aJFj,Foolish Heart,1990s,0,0.663,0,2e-06,0.181818,0.714,0
73609,['Edgar Jones'],53uGVJD1KAeZNrUWuIMkct,"You Know You Can Do It - Live, The Blow Up, Lo...",2020s,0,0.792,0,0.0,0.181818,0.763,0
15565,['Mijares'],3NGG1do9lT815M5REcCzzm,El Privilegio De Amar,1990s,0,0.692,0,1e-06,0.181818,0.319,0
54933,"['Amy Flurry', '2003 Jr Philadelphia Singers']",6sAGi4gx9flvCliLI4vdd0,Animal Kingdom,2010s,0,0.921,0,0.0,0.0,0.809,0
65572,"['Peter, Paul and Mary']",2WA9WuLFGcJrJEgZwLhFWm,Gilgarra Mountain,1960s,0,0.724,0,4e-06,0.090909,0.456,0
96190,['Hariram'],1ljivYBdnsquYYJMmRXupX,Gagar Gai Bharne Sakhi,1940s,0,0.849,0,1e-06,0.0,0.935,0
173991,['Sam Smith'],2ZTYlnhhV1UAReg7wIGolx,To Die For,2020s,0,0.832,0,5e-06,0.0,0.307,0
125207,['The High Kings'],6zX3HwSuoQThrabeoHJvCs,Irish Pub Song,2010s,0,0.724,0,0.0,0.0,0.968,0
146381,"['Jeff Alexander', 'Alfred Hitchcock']",1eTDy4YrFHNqAbi0udvucO,Body And Soul,1950s,0,0.894,0,0.454,0.090909,0.274,0
47261,['Astrud Gilberto'],5jMc7pAGFQ5lRqfaM7AsU5,O Morro (Nao Tem Vez),1960s,0,0.735,0,0.000713,0.0,0.608,0
