In [1]:
#imports
import csv
import io
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px

%matplotlib inline
from sklearn import svm
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

#spotify
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

In [2]:
# Read the per-track audio-feature table and preview its first rows.
# (df.info() can be run instead to inspect dtypes and null counts.)
df = pd.read_csv(filepath_or_buffer="top.csv")
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,genre,name,artist,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,key,tempo
0,0,classical,Handel / Orch. Hale: Keyboard Suite in D Minor...,George Frideric Handel,0.0939,0.0336,-24.041,0.0606,0.927,0.83,0.0954,0.0516,2,67.359
1,1,classical,"Goldberg Variations, BWV 988: Aria",Johann Sebastian Bach,0.454,0.0139,-29.966,0.0514,0.995,0.943,0.0736,0.244,4,130.253
2,2,classical,"Clair de Lune, L. 32",Claude Debussy,0.335,0.00532,-31.646,0.0376,0.994,0.912,0.0621,0.0397,1,65.832
3,3,classical,"Sonata No. 14 ""Moonlight"" in C-Sharp Minor"", O...",Ludwig van Beethoven,0.184,0.00527,-37.264,0.0432,0.995,0.887,0.173,0.151,1,170.612
4,4,classical,Miroirs: III. Une barque sur l'océan,Maurice Ravel,0.17,0.0479,-27.021,0.0438,0.981,0.906,0.0795,0.0304,2,75.664


In [3]:
#extracting columns we will use to base classifications
#dropping unnecessary columns

# UNCOMMENT FOR MEGADATASET
# df.drop(columns=['year', 'release_date', 'id', 'duration_ms', 'artists', 
#                  'name', 'explicit', 'popularity', 'mode', 'key', 'loudness'], inplace=True)

# df.drop(columns=['Unnamed: 0','year','dur','pop'], inplace= True)
# df

# Remove the 'Unnamed: 0' column (presumably a row index saved into the CSV).
# Reassigning instead of mutating in place, together with errors='ignore',
# keeps this cell idempotent: re-running it after the column is already gone
# is a no-op rather than a KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')
df

Unnamed: 0,genre,name,artist,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,key,tempo
0,classical,Handel / Orch. Hale: Keyboard Suite in D Minor...,George Frideric Handel,0.0939,0.03360,-24.041,0.0606,0.92700,0.830000,0.0954,0.0516,2,67.359
1,classical,"Goldberg Variations, BWV 988: Aria",Johann Sebastian Bach,0.4540,0.01390,-29.966,0.0514,0.99500,0.943000,0.0736,0.2440,4,130.253
2,classical,"Clair de Lune, L. 32",Claude Debussy,0.3350,0.00532,-31.646,0.0376,0.99400,0.912000,0.0621,0.0397,1,65.832
3,classical,"Sonata No. 14 ""Moonlight"" in C-Sharp Minor"", O...",Ludwig van Beethoven,0.1840,0.00527,-37.264,0.0432,0.99500,0.887000,0.1730,0.1510,1,170.612
4,classical,Miroirs: III. Une barque sur l'océan,Maurice Ravel,0.1700,0.04790,-27.021,0.0438,0.98100,0.906000,0.0795,0.0304,2,75.664
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,rock,Band On The Run - Remastered 2010,Wings,0.4790,0.60100,-8.806,0.0318,0.08320,0.001420,0.1100,0.6690,7,124.966
996,rock,Everlong,Foo Fighters,0.4130,0.88100,-5.541,0.0367,0.00006,0.000308,0.0805,0.3640,11,158.066
997,rock,Helplessly Hoping - 2005 Remaster,"Crosby, Stills & Nash",0.5670,0.15900,-13.648,0.0331,0.91400,0.000000,0.1110,0.4360,7,146.913
998,rock,Killing In The Name,Rage Against The Machine,0.4660,0.83300,-4.215,0.3040,0.02660,0.000000,0.0327,0.6610,7,88.785


In [None]:
# Disabled interactive scatter plot (plotly express) coloured by genre.
# The first variant refers to columns ('bpm', 'nrgy', 'top genre', 'title') that are
# not among the columns of the frame loaded from top.csv -- presumably it targets a
# different dataset; TODO confirm. The second variant uses the column names of the
# frame loaded above ('tempo', 'energy', 'genre', 'name', 'artist').
# fig = px.scatter(df, x='bpm', y='nrgy',color='top genre', hover_name='artist',hover_data=['title'])
# fig = px.scatter(df, x='tempo', y='energy',color='genre', hover_name='name', hover_data=['artist'])
# fig.show()

In [4]:
# Train six classifiers on the audio features and report their k-fold
# cross-validated accuracy, both overall ("Total") and per genre.
k = 5                   # number of cross-validation folds
knn_neighbors = 10      # neighbours considered by the KNN classifier

data = df

# display names, in the same order as the classifiers in `clfs`
models = ["Lin SVM", "Poly SVM", "RBF SVM", "KNN", "LR", "NN"]

X = data.iloc[:, 3:]    # data: every column from the fourth one onwards
y = data.iloc[:, 0]     # labels: first column

# use a min/max scaler for performance
# NOTE(review): the scaler is fit on all rows before the folds are created, so
# held-out rows influence the scaling range. Left unchanged because a later
# cell reuses `scaling` to transform a newly fetched song.
scaling = MinMaxScaler(feature_range=(-1, 1)).fit(X)
X = pd.DataFrame(scaling.transform(X))

# support vector machines with the linear, poly and rbf kernels, followed by
# k-nearest-neighbours, logistic regression and a multi-layer perceptron
clfs = [
    svm.SVC(kernel="linear"),
    svm.SVC(kernel="poly"),
    svm.SVC(kernel="rbf"),
    KNeighborsClassifier(n_neighbors=knn_neighbors),
    LogisticRegression(random_state=0, solver="sag", multi_class="ovr"),
    MLPClassifier(random_state=0, max_iter=200),
]

# seeded shuffle so the folds, and therefore the reported scores, are reproducible
kf = KFold(n_splits=k, shuffle=True, random_state=0)

results = []  # one entry per fold; each entry holds one accuracy frame per model

for train_index, test_index in kf.split(X):
    # kf.split yields positional indices, so select by position on X and on y
    X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    experiment_result = []      # per-model results for this fold

    for clf in clfs:
        clf.fit(X_train, y_train)
        predictions = clf.predict(X_test)

        # True where the predicted genre matches the actual one
        correct = pd.Series(predictions == y_test.to_numpy(), index=y_test.index)

        # per-genre accuracy (fraction correct among rows of that genre),
        # plus the overall accuracy under the "Total" key
        genre_results = correct.groupby(y_test).mean().to_dict()
        genre_results["Total"] = correct.mean()

        experiment_result.append(pd.DataFrame(data=genre_results, index=[0]))

    results.append(experiment_result)

# rows = folds, columns = models, cells = one-row accuracy frames
results = pd.DataFrame(results)
results.columns = models

for m in models:
    # print model
    print(m)

    # stack this model's per-fold frames: one row per fold
    results_df = pd.concat(results.loc[:, m].tolist()).reset_index(drop=True)

    # column means; mean() skips the NaN left when a genre is absent from a
    # fold's test split, where a plain sum/len would turn the average into NaN
    avg_accuracy = results_df.mean()
    results_df.loc["Avg"] = avg_accuracy

    # print results
    print(results_df)
    print()

# The loop above leaves every classifier trained on the last fold's training
# split only. Refit on the full data set so that later cells, which call
# clfs[i].predict on new songs, use models trained on all available rows.
for clf in clfs:
    clf.fit(X, y)



Lin SVM
     classical   country       edm      jazz     lo-fi     metal       pop  \
0     0.956522  0.500000  0.555556  0.600000  0.950000  0.789474  0.529412   
1     0.833333  0.470588  0.722222  0.727273  0.913043  0.818182  0.315789   
2     0.916667  0.600000  0.400000  0.650000  0.916667  0.750000  0.277778   
3     0.904762  0.705882  0.578947  0.500000  0.933333  0.750000  0.450000   
4     1.000000  0.450000  0.625000  0.640000  0.833333  0.782609  0.153846   
Avg   0.922257  0.545294  0.576345  0.623455  0.909275  0.778053  0.345365   

          r&b       rap      rock  Total  
0    0.380952  0.529412  0.666667  0.645  
1    0.263158  0.705882  0.240000  0.600  
2    0.294118  0.523810  0.350000  0.585  
3    0.370370  0.772727  0.333333  0.615  
4    0.500000  0.652174  0.315789  0.575  
Avg  0.361720  0.636801  0.381158  0.604  

Poly SVM
     classical   country       edm      jazz     lo-fi     metal       pop  \
0     0.913043  0.461538  0.481481  0.733333  0.950000  



# Spotify Scraper

In [8]:
# Spotify API credentials are read from the environment so that no secret is
# stored in the notebook. Set SPOTIPY_CLIENT_ID and SPOTIPY_CLIENT_SECRET before
# running this cell; a missing variable raises KeyError naming the variable.
# NOTE: any credentials that were previously committed in this notebook should
# be treated as compromised and rotated in the Spotify developer dashboard.
CLIENT_ID = os.environ["SPOTIPY_CLIENT_ID"]
CLIENT_SECRET = os.environ["SPOTIPY_CLIENT_SECRET"]

# client-credentials flow: app-level access, no user login involved
client_credentials_manager = SpotifyClientCredentials(client_id=CLIENT_ID, client_secret=CLIENT_SECRET)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [34]:
# Look up one track on Spotify and collect its audio features into `metadata`,
# a flat dict whose keys are 'genre', 'name', 'artist' followed by the audio features.
artist = 'Boomb'
track = 'Born This Way'
song = sp.search(q="artist:" + artist + " track:" + track, type="track")
# print('spotify:track:'+song['tracks']['items'][0]['id'])

# fail with a readable message instead of an IndexError when nothing matches
items = song['tracks']['items']
if not items:
    raise ValueError("No Spotify track found for artist={!r}, track={!r}".format(artist, track))

# audio_features returns a list with one entry per requested track;
# guard against an empty list or a missing (None) entry before reading it
meta_af = sp.audio_features(items[0]['id'])
if not meta_af or meta_af[0] is None:
    raise ValueError("No audio features available for artist={!r}, track={!r}".format(artist, track))

# audio features copied into the record, in this order
feature_names = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
                 'instrumentalness', 'liveness', 'valence', 'key', 'tempo']

# genre is left empty; name/artist are the search terms, not the values
# Spotify returned for the matched track
metadata = {'genre': '', 'name': track, 'artist': artist}
metadata.update((feature, meta_af[0][feature]) for feature in feature_names)

In [35]:
# Wrap the fetched song in a one-row frame and show it.
# NOTE: this rebinds df, X and y, which the training cell also uses; reload the
# dataset before re-running the training cell.
df = pd.DataFrame(data=metadata, index=[0])
display(df)

# labels: first column; data: every column from the fourth one onwards,
# rescaled with the scaler that was fitted in the training cell
y = df.iloc[:, 0]
X = pd.DataFrame(scaling.transform(df.iloc[:, 3:]))


Unnamed: 0,genre,name,artist,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,key,tempo
0,,Born This Way,Lady Gaga,0.587,0.828,-5.108,0.161,0.00327,0,0.331,0.494,11,123.907


In [36]:
# Ask every classifier for its genre prediction on the scaled song features
models = ['Lin SVM', 'Poly SVM', 'RBF SVM', 'KNN', 'LR', 'NN']
r = [clf.predict(X) for clf in clfs]

# One "<model> :  <prediction>" line per classifier, in `models` order
for idx, guess in enumerate(r):
    print(models[idx] + " : ", guess)

Lin SVM :  ['edm']
Poly SVM :  ['edm']
RBF SVM :  ['edm']
KNN :  ['edm']
LR :  ['edm']
NN :  ['edm']


{'href': 'https://api.spotify.com/v1/search?query=artist%3AKaty+Perry+track%3AHot+N+Cold&type=track&offset=0&limit=10',
 'items': [{'album': {'album_type': 'album',
    'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/6jJ0s89eD6GaHleKKya26X'},
      'href': 'https://api.spotify.com/v1/artists/6jJ0s89eD6GaHleKKya26X',
      'id': '6jJ0s89eD6GaHleKKya26X',
      'name': 'Katy Perry',
      'type': 'artist',
      'uri': 'spotify:artist:6jJ0s89eD6GaHleKKya26X'}],
    'available_markets': ['CA', 'JP', 'US'],
    'external_urls': {'spotify': 'https://open.spotify.com/album/0r2BUyPTmpbfuz4rR39mLl'},
    'href': 'https://api.spotify.com/v1/albums/0r2BUyPTmpbfuz4rR39mLl',
    'id': '0r2BUyPTmpbfuz4rR39mLl',
    'images': [{'height': 640,
      'url': 'https://i.scdn.co/image/ab67616d0000b273cd3978ebe35d93a07249b97f',
      'width': 640},
     {'height': 300,
      'url': 'https://i.scdn.co/image/ab67616d00001e02cd3978ebe35d93a07249b97f',
      'width': 300},
     {'he