In [4]:
import pandas as pd
import numpy as np
df = pd.read_csv('../data/final/wrangled/pca.csv', index_col=0)

In [63]:
from sklearn.cluster import KMeans, DBSCAN
from kneed import KneeLocator

import matplotlib.pyplot as plt
import altair as alt

# Extract the features for clustering
X = df[['PC1', 'PC2']]

# Use the elbow method to find the optimal number of clusters
inertia = []
for n in range(1, 11):
    kmeans = KMeans(n_clusters=n, random_state=42)
    kmeans.fit(X)
    inertia.append(kmeans.inertia_)

# Based on the elbow plot, choose the optimal number of clusters
# Use KneeLocator to find the elbow point
kneedle = KneeLocator(range(1, 11), inertia, curve='convex', direction='decreasing')
optimal_clusters = kneedle.elbow
kmeans = KMeans(n_clusters=5, random_state=42) # to align with DBSCAn
df['kmeans_labels'] = kmeans.fit_predict(X)

# Apply DBSCAN clustering
dbscan = DBSCAN(eps=0.4, min_samples=5)
df['dbscan_labels'] = dbscan.fit_predict(X)

# Create a scatter plot of the PCA results
scatter2 = alt.Chart(df.reset_index()).mark_circle().encode(
    x='PC1',
    y='PC2',
    color=alt.Color('instability', scale=alt.Scale(scheme='redblue', reverse=True)),
    tooltip=['year', 'PC1', 'PC2', 'instability']
)

scatter1 = alt.Chart(df.reset_index()).mark_circle().encode(
    x='PC1',
    y='PC2',
    color=alt.Color('kmeans_labels:N', legend=None),
    tooltip=['year', 'PC1', 'PC2', 'instability']
).properties(
    title='K-Means Clustering'
)

scatter3 = alt.Chart(df.reset_index()).mark_circle().encode(
    x='PC1',
    y='PC2',
    color=alt.condition(
        alt.datum.dbscan_labels == -1,
        alt.value('grey'),
        alt.Color('dbscan_labels:N', legend=None)
    ),
    tooltip=['year', 'PC1', 'PC2', 'instability'],
).properties(
    title='DBSCAN Clustering'
)


scatter1 | scatter2 | scatter3

In [64]:
from sklearn.metrics import silhouette_score

# Calculate silhouette score for KMeans
silhouette_kmeans = silhouette_score(X, df['kmeans_labels'])
print(f'Silhouette Score for KMeans: {silhouette_kmeans}')

# Calculate silhouette score for DBSCAN
silhouette_dbscan = silhouette_score(X, df['dbscan_labels'])
print(f'Silhouette Score for DBSCAN: {silhouette_dbscan}')

Silhouette Score for KMeans: 0.6216742140448069
Silhouette Score for DBSCAN: 0.5006554255105404


but does the clustering actually have meaning? The simple Silhouette score is not enough to determine the quality of the clustering. We will analyze this more in the next chapter.