In [1]:
import spotipy
import re

# math and dataframes
import pandas as pd
import numpy as np

# outliers, encoding, and normalisation
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.compose import make_column_transformer

# machine learning
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score
from sklearn.model_selection import GridSearchCV

# plotting
import matplotlib.pyplot as plt
import matplotlib as mpl
import time
import seaborn as sns
sns.set_theme()

# jupyter notebook full-width display
# `display` and `HTML` are imported from the public IPython.display module:
# importing them from IPython.core.display is deprecated and warns (or fails)
# on recent IPython releases.
from IPython.display import display, HTML
# NOTE(review): the `.container` selector presumably targets the classic
# Notebook page layout — confirm it has an effect in the front end in use.
display(HTML("<style>.container { width:100% !important; }</style>"))

# pandas display options:
#   - floats rendered with three decimals
#   - no limit on the number of columns shown
#   - at most 200 rows shown
pd.options.display.float_format = '{:.3f}'.format
pd.options.display.max_columns = None
pd.options.display.max_rows = 200

In [2]:
# import data
# all songs with audio features
# NOTE(review): unpickling can execute arbitrary code — only load this file
# if it comes from a trusted source.
df_10M = pd.read_pickle('df_10M.pickle')

### PIPELINE OUTLINE

* remove outliers
    * genres that aren't music (e.g. sleep sounds, spoken word)
    * by inspection: songs shorter than 1 min or longer than 10 min, plus other extreme feature values
* OPTIONAL: encode `key`, `mode` and `time_signature`
    * alt: drop them when clustering
* transform data using min max scaler
* cluster based on audio features (AF), not genre
    * optimise based on silhouette score
* examine clusters by inspecting genres and popularity

In [3]:
%%time
# REMOVE OUTLIERS
# thresholds are based on the details in the outlier analysis

# (re)create the 'cluster' column on df_10M; rows flagged below are labelled 'outlier'
df_10M['cluster'] = pd.NA

# extrema to exclude (domain knowledge + inspection of percentiles)
dur_min = df_10M['duration_ms'].lt(60_000)      # shorter than 1 min
dur_max = df_10M['duration_ms'].gt(600_000)     # longer than 10 min
val_0 = df_10M['valence'].eq(0)
sp_min = df_10M['speechiness'].gt(0.947000)     # NB: an upper bound, despite the name
tempo_0 = df_10M['tempo'].eq(0)
loud_min = df_10M['loudness'].lt(-34.668999)
dance_min = df_10M['danceability'].lt(0.064400)

# genres that aren't music
genres_to_exclude = {
    'sleep', 'football', 'halloween', 'birthday', 'lullaby', 'ringtone', 'fan chant',
    'sound effects', 'spoken word', 'bible', 'prank', 'wrestling', 'language', 'oratory',
    'erotica', 'tone', 'vintage radio show', 'sound', 'quran', 'islamic recitation',
    'reading', 'asmr', 'mindfulness', 'meditation', 'guided meditation', 'workout product',
    'theme', 'environmental', 'motivation',
}

# a row is an outlier when any extreme-value test fires or its genre is excluded
is_extreme = dur_min | dur_max | val_0 | sp_min | tempo_0 | loud_min | dance_min
is_non_music = df_10M['genre'].isin(genres_to_exclude)
df_10M.loc[is_extreme | is_non_music, 'cluster'] = 'outlier'

# create a feature set to cluster on: every row not labelled 'outlier'
X = df_10M.loc[df_10M['cluster'].ne('outlier')]

Wall time: 3.33 s


In [4]:
# rows kept for clustering, and the percentage of df_10M dropped as outliers (about 6%)
n_kept = len(X)
n_kept, 100 * (1 - n_kept / len(df_10M))

(8827719, 6.404280009393792)

In [5]:
# eyeball ten of the rows labelled 'outlier';
# random_state pins the draw so a Restart & Run All shows the same rows each time
df_10M[df_10M.cluster == 'outlier'].sample(10, random_state=42)

Unnamed: 0,id,song,artist,genre,release_date,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,in_B100,cluster
2273773,0ekXjGKN44zTHHno5WZOvt,"Dann press doch selber, Frau Dokta! - Aus dem ...",Dr. Josephine Chaos,lesen,2013-04-25,0.398,0.684,188453,0.333,0.0,2,0.495,-12.268,1,0.954,98.625,1,0.61,False,outlier
8735517,2wet1wSKm335OtWttvaLLb,Hit and Orch Rise,Troels Brun Folmann,,NaT,0.935,0.0,7947,0.09,0.991,1,0.0,-14.81,1,0.0,0.0,0,0.0,False,outlier
7565145,2lJQrCDEhxwlUrFAlP5Dkp,Bottling Plant,Sound Effects,,NaT,0.109,0.254,35707,0.869,0.968,6,0.311,-19.476,0,0.062,78.227,3,0.101,False,outlier
6498375,7qy5pcvFt94eEPBJtbG0wC,Pixie Williams Comments,Pixie Williams,classic nz pop,2011-07-10,0.649,0.75,22427,0.063,0.0,0,0.163,-20.082,0,0.649,84.038,4,0.0,False,outlier
3339260,2DGRRgKI7xudjFxOh8uFkP,Zombieland,Halloween Scary Sounds and Sound Effects,sound effects,2011-08-07,0.975,0.498,92137,0.116,0.234,6,0.124,-32.637,1,0.17,128.487,4,0.034,False,outlier
1593453,0bZUD0db8H8hweVN1a8Om7,Gun Oil & Stache Wax,Clayton Bush,,2010-09-16,0.889,0.263,32133,0.377,0.95,1,0.144,-12.016,0,0.054,155.372,4,0.355,False,outlier
5069233,0DbQ9VbsEsswq5eFvx26oP,"Beethoven: Sonata No. 32 in C Minor, Op. 111: ...",Ludwig van Beethoven,classical,2012-05-11,0.972,0.266,1009253,0.083,0.907,0,0.087,-22.801,1,0.038,63.057,4,0.041,False,outlier
2024500,65SdzEcUz8ExzemEvzb2Da,The Antigua Hotel - 1960,Death By Chocolate,twee pop,2012-02-07,0.749,0.646,48240,0.253,0.002,2,0.118,-13.722,1,0.05,132.123,4,0.052,False,outlier
8146146,7aGhOb6JfV50ftOgfMhZp1,Sue City Sue,The Fouryo's,vintage dutch pop,1961-10-25,0.905,0.607,33560,0.388,0.0,7,0.506,-16.378,0,0.04,129.299,4,0.961,False,outlier
8689954,6Wm0CIzFhneQUldZAg6R2o,Ein Jäger jagt ein wildes Schwein,Traditional,sleep,2010-01-01,0.949,0.751,102987,0.235,0.0,5,0.403,-13.706,1,0.039,136.352,4,0.945,False,outlier


In [None]:
# OPTIONAL: encode
# columns that the pipeline outline says to either encode or drop when clustering
drop_or_encode = ['key', 'mode', 'time_signature']

In [None]:
## transform to normalise data
# only the columns listed here go through the min-max scaler;
# remainder='passthrough' keeps every other column unchanged
scaled_columns = ['duration_ms', 'loudness', 'tempo']
scale = MinMaxScaler()
ct = make_column_transformer((scale, scaled_columns), remainder='passthrough')


In [None]:
## cluster based on AF (not genre)
## optimise based on silhouette 




In [None]:
## examine performance by inspecting genres and popularity

