In [3]:
# math and dataframes
import pandas as pd
import numpy as np

# machine learning
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score
from sklearn.model_selection import GridSearchCV

# jupyter notebook full-width display
# `display` and `HTML` are imported from the public IPython.display module;
# importing them from IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# pandas display settings: 3-decimal floats, show every column, cap at 200 rows
pd.options.display.float_format = '{:.3f}'.format
pd.options.display.max_columns = None
pd.options.display.max_rows = 200

In [4]:
# import data
# all songs with audio features
# NOTE(review): unpickling can execute arbitrary code, so only load this file
# if it comes from a trusted source.
df_10M = pd.read_pickle('df_10M.pickle')

### PIPELINE OUTLINE

* remove outliers
    * sets that aren't music
    * by inspection using songs less than 1min or more than 10min (+ more extreme)
* OPTIONAL: encode categorical columns (key, mode, time_signature)
    * alt: drop them when clustering
* transform data using min max scaler
* cluster based on audio features (AF), not genre
    * optimise based on silhouette 
* examine clusters by inspecting genres and popularity

### OUTLIERS

In [5]:
%%time
# REMOVE OUTLIERS
# thresholds come from the outlier analysis (domain knowledge + inspection of percentiles)

# every row starts unassigned; flagged rows get the pseudo-cluster 'outlier'
df_10M['cluster'] = pd.NA

# a row is an audio-feature extreme if it trips any one of these rules
is_extreme = (
    (df_10M['duration_ms'] < 60_000)          # shorter than 1 minute
    | (df_10M['duration_ms'] > 600_000)       # longer than 10 minutes
    | (df_10M['valence'] == 0)
    | (df_10M['speechiness'] > 0.947000)
    | (df_10M['tempo'] == 0)
    | (df_10M['loudness'] < -34.668999)
    | (df_10M['danceability'] < 0.064400)
)
df_10M.loc[is_extreme, 'cluster'] = 'outlier'

# genres that aren't music
genres_to_exclude = {
    'sleep', 'football', 'halloween', 'birthday', 'lullaby', 'ringtone', 'fan chant',
    'sound effects', 'spoken word', 'bible', 'prank', 'wrestling', 'language', 'oratory',
    'erotica', 'tone', 'vintage radio show', 'sound', 'quran', 'islamic recitation',
    'reading', 'asmr', 'mindfulness', 'meditation', 'guided meditation', 'workout product',
    'theme', 'environmental', 'motivation',
}
is_non_music = df_10M['genre'].isin(genres_to_exclude)
df_10M.loc[is_non_music, 'cluster'] = 'outlier'

# feature set to cluster on: every row not flagged above, with a fresh 0..n-1 index
not_outlier = df_10M['cluster'] != 'outlier'
X = df_10M.loc[not_outlier].reset_index(drop=True)

Wall time: 4.15 s


In [6]:
# rows kept for clustering, and the percentage flagged as outliers (about 6%)
len(X), (1 - len(X) / len(df_10M)) * 100

(8827719, 6.404280009393792)

### DROP AND ENCODE COLUMNS

In [7]:
# columns left out of the clustering features
# drop_columns: removed outright
# encode_columns: candidates for encoding, but currently dropped as well
drop_columns = ['song', 'artist', 'genre', 'release_date', 'cluster', 'in_B100']
encode_columns = ['key', 'mode', 'time_signature']
X = X.drop(columns=drop_columns + encode_columns).set_index('id')

### TRANSFORM DATA

In [8]:
## min-max scale the listed columns so they range from 0 to 1
attributes_to_transform = ['duration_ms', 'loudness', 'tempo']

for attribute in attributes_to_transform:
    column = X[attribute]
    lowest, highest = column.min(), column.max()
    X[attribute] = (column - lowest) / (highest - lowest)

In [None]:
# X.to_pickle('X_precluster.pickle')

### CLUSTER DATA

In [2]:
# Environment check: sklearn was raising unexplained errors on version 0.24.2.
# NOTE(review): the output recorded below shows a newer version, presumably
# captured after sklearn was upgraded — confirm.
import sklearn 
sklearn.__version__

'1.1.3'

In [1]:
# Environment check: errors persisted, so threadpoolctl needed upgrading too.
# Displays the installed threadpoolctl version.
import threadpoolctl
threadpoolctl.__version__

'3.1.0'

In [9]:
%%time
# smoke test: fit a 2-cluster KMeans on the first 10,000 rows to confirm
# that the environment works
X_small = X.head(10000)

# KMeans initialisation is random; a fixed seed makes the fit reproducible
kmeans = KMeans(n_clusters=2, random_state=42).fit(X_small)


Wall time: 142 ms


In [10]:
%%time
# silhouette sweep over candidate cluster counts on a 10,000-row subset
# NOTE(review): head() takes the first rows, not a random sample — confirm the
# row order of X is not biased before relying on these scores
X_small = X.head(10000)

n_clusters = [2, 4, 6, 8, 10, 15, 20]
results = {}  # maps number of clusters -> silhouette score

for n in n_clusters:
    # KMeans initialisation is random; a fixed seed keeps the scores
    # reproducible across runs
    kmeans = KMeans(n_clusters=n, random_state=42).fit(X_small)
    score = silhouette_score(X_small, kmeans.labels_)
    results[n] = score

Wall time: 11.1 s


In [11]:
results

{2: 0.3507026408894679,
 4: 0.3031840804271992,
 6: 0.2111607965642215,
 8: 0.2017013530402336,
 10: 0.1987264503297648,
 15: 0.15470800079119035,
 20: 0.1426658514993804}

In [None]:
%%time

# fit KMeans for the first candidate cluster count on a 10,000-row subset
X_small = X.head(10000)

n_clusters = [2, 4, 6, 8, 10, 15, 20]
n = n_clusters[0]
# KMeans initialisation is random; a fixed seed makes the fit reproducible
kmeans = KMeans(n_clusters=n, random_state=42).fit(X_small)

In [None]:
# silhouette score of the current `kmeans` fit; `kmeans` must have been fitted
# on this same `X_small` so that the labels line up with the rows
silhouette_score(X_small, kmeans.labels_)

In [None]:
## cluster based on AF (not genre)
## optimise based on silhouette 

def silhouette_scorer(estimator, features, y=None):
    """Score a fitted clusterer by the silhouette of its assignments on ``features``.

    GridSearchCV calls a ``scoring`` callable as ``scorer(estimator, X, y)``,
    whereas ``silhouette_score`` expects ``(X, labels)``. Passing
    ``silhouette_score`` directly therefore fails, so this adapter bridges the
    two signatures.

    Parameters
    ----------
    estimator : fitted clusterer exposing ``predict``
    features : samples to assign to clusters and score
    y : ignored; present only to match the scorer signature

    Returns
    -------
    float
        Silhouette score of the predicted labels on ``features``.
    """
    return silhouette_score(features, estimator.predict(features))


# initial clusters to try, refine in steps
params = {
    'n_clusters': [2, 4, 6, 8, 10, 15, 20]
}

# fixed seed so that every candidate is reproducible across runs
kmeans = KMeans(random_state=42)

# GridSearchCV cross-validates by default: each candidate is fitted on the
# training folds and scored by silhouette_scorer on the held-out fold
grid = GridSearchCV(
    estimator=kmeans,
    param_grid=params,
    scoring=silhouette_scorer
)



In [None]:
## examine performance by inspecting genres and popularity

