# SPOTIFY GENRE CLASSIFIER VIA KNN CLASSIFIER

### IMPORT ALL NECESSARY LIBRARIES ##

In [57]:

import numpy as np
import pandas as pd
import matplotlib as plt
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

### READ CSV AND DROP IRRELEVANT FEATURES

In [58]:
df = pd.read_csv("spotify_songs.csv")
df.head()

Unnamed: 0,track_id,track_name,track_artist,track_popularity,track_album_id,track_album_name,track_album_release_date,playlist_name,playlist_id,playlist_genre,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
0,6f807x0ima9a1j3VPbc7VN,I Don't Care (with Justin Bieber) - Loud Luxur...,Ed Sheeran,66,2oCs0DGTsRO98Gh5ZSl2Cx,I Don't Care (with Justin Bieber) [Loud Luxury...,2019-06-14,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,6,-2.634,1,0.0583,0.102,0.0,0.0653,0.518,122.036,194754
1,0r7CVbZTWZgbTCYdfa2P31,Memories - Dillon Francis Remix,Maroon 5,67,63rPSO264uRjW1X5E6cWv6,Memories (Dillon Francis Remix),2019-12-13,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,11,-4.969,1,0.0373,0.0724,0.00421,0.357,0.693,99.972,162600
2,1z1Hg7Vb0AhHDiEmnDE79l,All the Time - Don Diablo Remix,Zara Larsson,70,1HoSmj2eLcsrR0vE9gThr4,All the Time (Don Diablo Remix),2019-07-05,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,1,-3.432,0,0.0742,0.0794,2.3e-05,0.11,0.613,124.008,176616
3,75FpbthrwQmzHlBJLuGdC7,Call You Mine - Keanu Silva Remix,The Chainsmokers,60,1nqYsOef1yKKuGOVchbsk6,Call You Mine - The Remixes,2019-07-19,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,7,-3.778,1,0.102,0.0287,9e-06,0.204,0.277,121.956,169093
4,1e8PAfcKUYoKkxPhrHqw4x,Someone You Loved - Future Humans Remix,Lewis Capaldi,69,7m7vv9wlQ4i0LFuJiE2zsQ,Someone You Loved (Future Humans Remix),2019-03-05,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,1,-4.672,1,0.0359,0.0803,0.0,0.0833,0.725,123.976,189052


In [59]:
df = df.drop(["track_album_id","track_artist","track_id","track_name","track_popularity","track_album_release_date","playlist_id","track_album_name","duration_ms"],axis=1)
df.head()

Unnamed: 0,playlist_name,playlist_genre,playlist_subgenre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,Pop Remix,pop,dance pop,0.748,0.916,6,-2.634,1,0.0583,0.102,0.0,0.0653,0.518,122.036
1,Pop Remix,pop,dance pop,0.726,0.815,11,-4.969,1,0.0373,0.0724,0.00421,0.357,0.693,99.972
2,Pop Remix,pop,dance pop,0.675,0.931,1,-3.432,0,0.0742,0.0794,2.3e-05,0.11,0.613,124.008
3,Pop Remix,pop,dance pop,0.718,0.93,7,-3.778,1,0.102,0.0287,9e-06,0.204,0.277,121.956
4,Pop Remix,pop,dance pop,0.65,0.833,1,-4.672,1,0.0359,0.0803,0.0,0.0833,0.725,123.976


In [60]:
# printing crucial info about no. of categories for genre and subgenre along with 
# the number of playlists.
print(df["playlist_genre"].unique())
print(df["playlist_name"].nunique())
print(df["playlist_subgenre"].unique())

['pop' 'rap' 'rock' 'latin' 'r&b' 'edm']
449
['dance pop' 'post-teen pop' 'electropop' 'indie poptimism' 'hip hop'
 'southern hip hop' 'gangster rap' 'trap' 'album rock' 'classic rock'
 'permanent wave' 'hard rock' 'tropical' 'latin pop' 'reggaeton'
 'latin hip hop' 'urban contemporary' 'hip pop' 'new jack swing'
 'neo soul' 'electro house' 'big room' 'pop edm'
 'progressive electro house']


### Playlist names do provide helpful insight but genre and subgenre already provide comprehensive classifications

In [61]:
df.drop("playlist_name",inplace=True,axis=1)
encoded_df = pd.get_dummies(df,columns=['playlist_genre','playlist_subgenre']).astype(float)
encoded_df.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,...,playlist_subgenre_new jack swing,playlist_subgenre_permanent wave,playlist_subgenre_pop edm,playlist_subgenre_post-teen pop,playlist_subgenre_progressive electro house,playlist_subgenre_reggaeton,playlist_subgenre_southern hip hop,playlist_subgenre_trap,playlist_subgenre_tropical,playlist_subgenre_urban contemporary
0,0.748,0.916,6.0,-2.634,1.0,0.0583,0.102,0.0,0.0653,0.518,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.726,0.815,11.0,-4.969,1.0,0.0373,0.0724,0.00421,0.357,0.693,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.675,0.931,1.0,-3.432,0.0,0.0742,0.0794,2.3e-05,0.11,0.613,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.718,0.93,7.0,-3.778,1.0,0.102,0.0287,9e-06,0.204,0.277,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.65,0.833,1.0,-4.672,1.0,0.0359,0.0803,0.0,0.0833,0.725,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Shortcut to recognise all the one-hot encoded columns
label_cols = [col for col in encoded_df.columns if "playlist_subgenre_" in col]

# Find which subgenre is 1 for each row using argmax (index of max value)
# takes the exploded subgenre one-hot encoded columns and then converts to numpy matrix
subgenre_indices = encoded_df[label_cols].values.argmax(axis=1)
y = pd.Series([label_cols[i].replace("playlist_subgenre_", "") for i in subgenre_indices])
#separates the labels from dataset to define training features
X = encoded_df.drop(columns=label_cols)

print("Total songs:", y.shape)
print("Number of classes:", y.nunique())
print(y.value_counts())

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=67)

#fit the transform on train set and tranform test set.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


#tracking variables for the best score and no. of neighbors
best_score = 0
best_n = 1
#Total number of neightbors to check in each subsequent run
for n in [1, 3, 5, 9, 15, 20]:
    model = KNeighborsClassifier(n_neighbors=n)
    model.fit(X_train_scaled, y_train)
    score = model.score(X_test_scaled, y_test)
    print(f"n={n} has Accuracy= {score:.4f}")
    if score > best_score:
        best_score = score
        best_n = n

print(f"\nBest amt of neighbors: {best_n} with accuracy: {best_score:.4f}")


    


Total songs: (32833,)
Number of classes: 24
progressive electro house    1809
southern hip hop             1675
indie poptimism              1672
latin hip hop                1656
neo soul                     1637
pop edm                      1517
electro house                1511
hard rock                    1485
gangster rap                 1458
electropop                   1408
urban contemporary           1405
hip hop                      1322
dance pop                    1298
classic rock                 1296
trap                         1291
tropical                     1288
latin pop                    1262
hip pop                      1256
big room                     1206
new jack swing               1133
post-teen pop                1129
permanent wave               1105
album rock                   1065
reggaeton                     949
Name: count, dtype: int64
n=1 has Accuracy= 0.3405
n=3 has Accuracy= 0.3636
n=5 has Accuracy= 0.3828
n=9 has Accuracy= 0.4014
n=15 has Accur

In [63]:
# Same approach for main genres using argmax
genre_cols = [col for col in encoded_df.columns if "playlist_genre_" in col]
genre_indices = encoded_df[genre_cols].values.argmax(axis=1)
y_main_genre = pd.Series([genre_cols[i].replace("playlist_genre_", "") for i in genre_indices])

# split dataset into train and test
X_train_mg, X_test_mg, y_train_mg, y_test_mg = train_test_split(X, y_main_genre, test_size=0.2, random_state=67)

# fit scaler transform to the train set and transform test
X_train_mg_scaled = scaler.fit_transform(X_train_mg)
X_test_mg_scaled = scaler.transform(X_test_mg)

print("\nMain genre distribution:", y_main_genre.value_counts().to_dict())

#Total number of neightbors to check in each subsequent run
for n in [1, 3, 5, 9,12]:
    model = KNeighborsClassifier(n_neighbors=n)
    model.fit(X_train_mg_scaled, y_train_mg)
    score = model.score(X_test_mg_scaled, y_test_mg)
    print(f"n={n} has Accuracy= {score:.4f}")



Main genre distribution: {'edm': 6043, 'rap': 5746, 'pop': 5507, 'r&b': 5431, 'latin': 5155, 'rock': 4951}
n=1 has Accuracy= 1.0000
n=3 has Accuracy= 0.9998
n=5 has Accuracy= 0.9998
n=9 has Accuracy= 1.0000
n=12 has Accuracy= 1.0000
