# Spectral Clustering

In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

from plotly.subplots import make_subplots
from sklearn.cluster import SpectralClustering, MiniBatchKMeans
from sklearn.preprocessing import StandardScaler, normalize 
from sklearn.decomposition import PCA 
from sklearn.metrics import silhouette_score 
from sklearn import datasets
from itertools import combinations, product

# Seed NumPy's global RNG once, up front, so a Restart & Run All starts from the same state.
# NOTE(review): the scikit-learn calls below that do not pass random_state are presumably
# meant to pick up this seed - confirm, and keep cells in top-to-bottom order when re-running.
np.random.seed(0)

## Toy examples with SK-Learn 

We will apply Spectral Clustering to some synthetic datasets, following the [scikit-learn clustering comparison example](https://scikit-learn.org/stable/auto_examples/cluster/plot_cluster_comparison.html#sphx-glr-auto-examples-cluster-plot-cluster-comparison-py).

In [2]:
# --- Synthetic toy datasets; each one is kept as a (points, labels) tuple ---
n_samples = 1500
noisy_circles = datasets.make_circles(n_samples=n_samples, factor=0.5, noise=0.05)
noisy_moons = datasets.make_moons(n_samples=n_samples, noise=0.05)
blobs = datasets.make_blobs(n_samples=n_samples, random_state=8)

# Anisotropic blobs: draw ordinary blobs, then push the points through a fixed 2x2 linear map.
X, y = datasets.make_blobs(n_samples=n_samples, random_state=170)
transformation = [[0.6, -0.6], [-0.4, 0.8]]
X_aniso = X.dot(transformation)
aniso = (X_aniso, y)

# Palette: position k is the colour used for cluster id k.
colours = ["#377eb8", "#ff7f00", "#4daf4a"]

# cluster_results[method][dataset] -> array with one colour string per point.
# The "true" entry is coloured by the labels returned by the generators.
cluster_results = {
    "true": {
        name: np.choose(dataset[1], colours)
        for name, dataset in (
            ('circles', noisy_circles),
            ('moons', noisy_moons),
            ('blobs', blobs),
            ('aniso', aniso),
        )
    }
}

In [3]:
def get_spectral_clusters_as_colour_labels(X, n_clusters):
    """Cluster ``X`` with spectral clustering and return one colour per sample.

    Parameters
    ----------
    X : array-like
        Data handed to ``SpectralClustering.fit``.
    n_clusters : int
        Number of clusters to request.

    Returns
    -------
    numpy.ndarray
        Entry ``i`` is ``colours[k]`` where ``k`` is the cluster id assigned to
        sample ``i`` (``colours`` is the notebook-level palette).
    """
    model = SpectralClustering(
        affinity="nearest_neighbors",
        eigen_solver="arpack",
        n_clusters=n_clusters,
    )
    # fit() hands back the fitted estimator, so labels_ can be read off the call directly.
    cluster_ids = model.fit(X).labels_
    return np.choose(cluster_ids, colours)

def get_k_means_clusters_as_colour_labels(X, n_clusters):
    """Cluster ``X`` with mini-batch K-means and return one colour per sample.

    Parameters
    ----------
    X : array-like
        Data handed to ``MiniBatchKMeans.fit``.
    n_clusters : int
        Number of clusters to request.

    Returns
    -------
    numpy.ndarray
        Entry ``i`` is ``colours[k]`` where ``k`` is the cluster id assigned to
        sample ``i`` (``colours`` is the notebook-level palette).
    """
    mini_batch_model = MiniBatchKMeans(n_clusters=n_clusters)
    # fit() hands back the fitted estimator, so labels_ can be read off the call directly.
    cluster_ids = mini_batch_model.fit(X).labels_
    return np.choose(cluster_ids, colours)

# (dataset name, (points, labels) tuple, number of clusters to request)
_toy_datasets = (
    ('circles', noisy_circles, 2),
    ('moons', noisy_moons, 2),
    ('blobs', blobs, 3),
    ('aniso', aniso, 3),
)

# Spectral clustering of every toy dataset (circles, moons, blobs, anisotropic blobs)
cluster_results['spectral'] = {
    _name: get_spectral_clusters_as_colour_labels(_data[0], _k)
    for _name, _data, _k in _toy_datasets
}

# Mini-batch K-means on the same datasets, for comparison
cluster_results['kmeans'] = {
    _name: get_k_means_clusters_as_colour_labels(_data[0], _k)
    for _name, _data, _k in _toy_datasets
}




In [4]:
# 4 x 3 grid: one row per toy dataset, one column per labelling
# (generator labels, spectral clustering, mini-batch K-means).
fig = make_subplots(
    rows=4, cols=3,
    subplot_titles=(
        "True Clusters", "Spectral", "MiniBatch\nKMeans"
    ),
    vertical_spacing=0.04
)

datapoints = [
    noisy_circles[0],
    noisy_moons[0],
    blobs[0],
    aniso[0]
]

# Pair points with colour arrays by dataset name rather than by position, so a
# re-ordered or incomplete results dict raises a KeyError instead of silently
# drawing one dataset's colours over another dataset's points.
_points_by_dataset = dict(zip(('circles', 'moons', 'blobs', 'aniso'), datapoints))
_row_of_dataset = {_name: _r for _r, _name in enumerate(_points_by_dataset, start=1)}

# Column order is spelled out so that it always matches subplot_titles above.
for _col, result_set in enumerate(('true', 'spectral', 'kmeans'), start=1):
    for _dataset_name, _points in _points_by_dataset.items():
        fig.add_trace(
            go.Scatter(
                x=_points[:, 0],
                y=_points[:, 1],
                mode="markers",
                marker=dict(color=cluster_results[result_set][_dataset_name]),
            ),
            row=_row_of_dataset[_dataset_name], col=_col
        )

fig.update(layout_showlegend=False)
fig.update_layout(title_text="Synthetic Data", height=1000, width=1000)
fig.show()

Spectral clustering takes the relative position of data points into account.

## Real Data - Iris 

In [5]:
# Load the iris sample dataset that ships with plotly express.
df = px.data.iris()
# Bare last expression so the notebook renders the frame as this cell's output.
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,species_id
0,5.1,3.5,1.4,0.2,setosa,1
1,4.9,3.0,1.4,0.2,setosa,1
2,4.7,3.2,1.3,0.2,setosa,1
3,4.6,3.1,1.5,0.2,setosa,1
4,5.0,3.6,1.4,0.2,setosa,1
...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica,3
146,6.3,2.5,5.0,1.9,virginica,3
147,6.5,3.0,5.2,2.0,virginica,3
148,6.2,3.4,5.4,2.3,virginica,3


### Visualizing Data 

With 4 features we have 6 possible 2D scatter plots

In [6]:
features = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
# Every unordered pair of features: C(4, 2) = 6 panels.
feature_combinations = list(combinations(features, 2))

_panel_titles = [f"{_x_name} by {_y_name}" for _x_name, _y_name in feature_combinations]
fig = make_subplots(
    rows=2, cols=3,
    vertical_spacing=0.10,
    horizontal_spacing=0.04,
    subplot_titles=_panel_titles,
)

# species_id starts at 1 in the data; shift it to 0-based to index the palette.
_species_colours = np.choose(df['species_id'].to_numpy() - 1, colours)

# Fill the 2 x 3 grid row by row: panels 0-2 go in row 1, panels 3-5 in row 2.
for _panel_idx, (_x_name, _y_name) in enumerate(feature_combinations):
    _grid_row, _grid_col = divmod(_panel_idx, 3)
    fig.add_trace(
        go.Scatter(x=df[_x_name], y=df[_y_name], mode="markers", marker=dict(color=_species_colours)),
        row=_grid_row + 1, col=_grid_col + 1,
    )

fig.update(layout_showlegend=False)
fig.update_layout(title_text="Feature plots", height=700, width=1200)
fig.show()

### Clustering with Principal components

#### Data preprocessing 

In [15]:
# Feature matrix: the four numeric iris measurements
_measurement_columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
X = df[_measurement_columns]

# Standardise each feature column
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Normalise the standardised data (normalize() is called with its defaults,
# i.e. each sample/row is rescaled to unit L2 norm)
X_normalized = pd.DataFrame(normalize(X_scaled))

# Reduce the four features to three principal components (P1-P3);
# the cells below cluster on the first two or on all three.
pca = PCA(n_components=3)
X_principal = pd.DataFrame(
    pca.fit_transform(X_normalized),
    columns=['P1', 'P2', 'P3'],
)

#### Clustering with 2 Principal Components

In [19]:
# Spectral clustering (RBF affinity) on the first two principal components
spectral_clusters = SpectralClustering(n_clusters=3, affinity='rbf')
pc2_labels = spectral_clusters.fit_predict(X_principal[['P1', 'P2']])

# Side-by-side scatter in the P1/P2 plane: true species (left) vs. cluster assignment (right).
# Cluster ids are arbitrary, so a cluster's colour need not match the colour of the species it recovers.
fig = make_subplots(
    rows=1, cols=2,
    vertical_spacing=0.10,
    horizontal_spacing=0.04,
    subplot_titles=("True Classes", "Spectral Cluster on PC1 & PC2")  # title typo "Spectal" fixed
)

# species_id starts at 1 in the data; shift it to 0-based to index the palette.
_panel_colours = (
    np.choose(df['species_id'].to_numpy() - 1, colours),
    np.choose(pc2_labels, colours),
)

for _col, _point_colours in enumerate(_panel_colours, start=1):
    fig.add_trace(
        go.Scatter(x=X_principal['P1'], y=X_principal['P2'], mode="markers", marker=dict(color=_point_colours)),
        row=1, col=_col,
    )

fig.update(layout_showlegend=False)
fig.update_layout(height=700, width=1200)
fig.show()

#### Clustering with 3 Principal Components 

In [23]:
# Spectral clustering (RBF affinity) on all three principal components
spectral_clusters = SpectralClustering(n_clusters=3, affinity='rbf')
pc3_labels = spectral_clusters.fit_predict(X_principal[['P1', 'P2', 'P3']])

# Side-by-side 3-D scatter: true species (left) vs. cluster assignment (right).
# Both panels need "scene" specs because they hold Scatter3d traces.
# Cluster ids are arbitrary, so a cluster's colour need not match the colour of the species it recovers.
fig = make_subplots(
    rows=1, cols=2,
    vertical_spacing=0.10,
    horizontal_spacing=0.04,
    subplot_titles=("True Classes", "Spectral Cluster on PC 1,2&3"),  # title typo "Spectal" fixed
    specs=[[{"type": "scene"}, {"type": "scene"}]]
)

# species_id starts at 1 in the data; shift it to 0-based to index the palette.
_panel_colours = (
    np.choose(df['species_id'].to_numpy() - 1, colours),
    np.choose(pc3_labels, colours),
)

for _col, _point_colours in enumerate(_panel_colours, start=1):
    fig.add_trace(
        go.Scatter3d(
            x=X_principal['P1'], y=X_principal['P2'], z=X_principal['P3'],
            mode="markers", marker=dict(color=_point_colours),
        ),
        row=1, col=_col,
    )

fig.update(layout_showlegend=False)
fig.update_layout(height=700, width=1200)
fig.show()

### Evaluating Feature sets and Affinity with Silhouette Score 

The silhouette score is a metric used to measure the quality of a clustering.

In [25]:
# Candidate inputs to cluster on: (display name, feature table)
subset = [
    ("Raw Data", X),
    ("Normalized Data", X_normalized),
    ("2 PCs", X_principal[['P1', 'P2']]),
    ("3 PCs", X_principal[['P1', 'P2', 'P3']])
]

affinity_type = ['rbf', 'nearest_neighbors']

s_scores = []      # (experiment name, silhouette score) pairs
label_sets = {}    # experiment name -> cluster labels

# Try every (feature table, affinity) pairing; feature tables vary slowest.
for name, _feature_table in subset:
    for _affinity in affinity_type:
        _experiment = f"{name}-{_affinity}"
        spectral_clusters = SpectralClustering(n_clusters=3, affinity=_affinity)
        labels = spectral_clusters.fit_predict(_feature_table)
        # The score is always computed against X (the raw features),
        # whichever feature table the labels were fitted on.
        s_scores.append((_experiment, silhouette_score(X, labels)))
        label_sets[_experiment] = labels


Graph is not fully connected, spectral embedding may not work as expected.



In [14]:
# Tabulate the (experiment, score) pairs and rank them best-first
s_scores = pd.DataFrame(s_scores, columns=['experiment', 'score'])
s_scores = s_scores.sort_values(by=['score'], ascending=False)

# Bare last expression so the notebook renders the bar chart
px.bar(s_scores, x='experiment', y='score')

In [29]:
# plotting true and clustered points

fig = make_subplots(
    rows=1, cols=3,
    vertical_spacing=0.10,
    horizontal_spacing=0.04,
    subplot_titles=("True Classes", 'Raw Data-rbf', "Spectal Cluster on PC1 & PC2")
)

fig.add_trace(
            go.Scatter(x=X_principal['P1'], y=X_principal['P2'], mode="markers", marker=dict(color=np.choose(df['species_id'].to_numpy()-1, colours))),
            row=1, col=1,
        )

fig.add_trace(
            go.Scatter(x=X_principal['P1'], y=X_principal['P2'], mode="markers", marker=dict(color=np.choose(pc2_labels, colours))),
            row=1, col=3,
        )

fig.add_trace(
            go.Scatter(x=X_principal['P1'], y=X_principal['P2'], mode="markers", marker=dict(color=np.choose(label_sets['Raw Data-rbf'], colours))),
            row=1, col=2,
        )
    
fig.update(layout_showlegend=False)
fig.update_layout(height=700, width=1200)
fig.show()