## KMeans

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import MiniBatchKMeans

In [None]:
# Load the spectra table, order rows by TIME, and re-express TIME as an
# offset from the first (smallest) value after sorting. The offset is
# downcast to the smallest integer dtype pandas can use, then becomes the
# row index.
df = (
    pd.read_csv('spectra.csv')
    .sort_values(by='TIME', ascending=True)
    .assign(
        TIME=lambda frame: pd.to_numeric(
            frame['TIME'] - frame['TIME'].iloc[0], downcast='integer'
        )
    )
    .set_index('TIME')
)

# The remaining column labels are read from the CSV header as strings;
# convert them to numbers so they can be used directly as plot coordinates.
df.columns = pd.to_numeric(df.columns, downcast='integer')

In [None]:
# Cluster counts to compare; one subplot is drawn per entry.
# Other ranges that were tried:
# n_clusters = np.arange(3, 10, 1)
# n_clusters = np.arange(3, 5, 1)
n_clusters = np.array([4, 5])

# squeeze=False makes subplots() always return a 2-D array of axes, so the
# indexing below also works when n_clusters holds a single value.
fig, axs = plt.subplots(1, len(n_clusters), figsize=(25, 5), squeeze=False)
axs = axs[0]  # the single row of axes, as a 1-D array

# Fit mini-batch k-means for each cluster count and scatter every column of
# df, colouring each point by the cluster label assigned to its row.
for i, n in enumerate(n_clusters):
    kmeans = MiniBatchKMeans(n_clusters=n, batch_size=10_000, max_iter=100)
    y_pred = kmeans.fit_predict(df)
    # The title does not depend on the column, so set it once per subplot.
    axs[i].set_title(f'{n} clusters')
    for column in df.columns:
        # x: the column label repeated for every row; y: that column's values.
        axs[i].scatter(np.full(len(df), column), df[column], c=y_pred, s=1)
    # NOTE(review): an unconditional `break` used to end the loop here, which
    # left every subplot after the first one empty in the saved figure. It
    # looked like a debugging leftover and was removed - confirm.

fig.savefig('kmeans-comparison-job.jpeg')

## Using pandas groupby with mean cluster values

In [None]:
# Cluster counts to evaluate (alternatives that were tried are kept below).
# n_clusters = np.arange(3, 10, 1)
# n_clusters = np.arange(3, 5, 1)
# n_clusters = np.array([4, 5])
n_clusters = np.arange(4, 20, 4)

# For each cluster count: fit mini-batch k-means on df, average the rows that
# share a cluster label, and plot one line per cluster on log-log axes.
# Each iteration produces its own figure via DataFrame.plot().
for i, n in enumerate(n_clusters):
    kmeans = MiniBatchKMeans(max_iter=100, batch_size=10_000, n_clusters=n)
    y_pred = kmeans.fit_predict(df)
    # Transpose so the original columns run along the x axis and every
    # cluster becomes a separate line.
    axes = df.groupby(y_pred).mean().T.plot()
    axes.set(
        yscale='log',
        xscale='log',
        xlabel='log PI',
        ylabel='log mean counts',
        title=f'{n} clusters',
    )
# Bare expression: in a notebook this displays the last axes created.
axes