# 1. Practicing k Means

In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from sklearn.datasets import make_blobs

# Synthetic data set: 200 two-dimensional points drawn around 4 centres.
# make_blobs returns a (points, true_labels) pair, so data[0] holds the points
# and data[1] the label of the blob each point was drawn from.
data = make_blobs(
    n_samples=200,
    n_features=2,
    centers=4,
    cluster_std=1.8,
    random_state=101,
)
print(f'{data[0].shape=}')
# Peek at the first ten points
data[0][:10]

In [3]:
# Scatter the points (columns of data[0] are the x and y coordinates), coloured by their true blob label
plt.scatter(*data[0].T, c=data[1], cmap='rainbow')

In [4]:
from sklearn.cluster import KMeans

# n_init is passed explicitly so the behaviour does not depend on the installed
# scikit-learn version's default (and no FutureWarning is raised on 1.2/1.3).
# random_state is fixed so the centroid initialisation, and therefore the
# resulting cluster labels, are the same on every run.
kmeans = KMeans(n_clusters=4, n_init='auto', random_state=101)
kmeans.fit(data[0])

In [5]:
# Coordinates of the cluster centres found by the fit above (one row per cluster)
kmeans.cluster_centers_

In [6]:
# Cluster index that k-means assigned to each sample, in the same order as the rows of data[0]
kmeans.labels_

In [7]:
# Side-by-side comparison: k-means assignment (left) vs. the true blob labels (right).
# The colours of the two panels are not expected to match one-to-one, because the
# numbering of the k-means clusters is arbitrary.
xs, ys = data[0][:, 0], data[0][:, 1]
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(10, 6), sharey=True)

ax1.scatter(xs, ys, c=kmeans.labels_, cmap='rainbow')
ax1.set_title('K Means')

ax2.set_title('Original')
ax2.scatter(xs, ys, c=data[1], cmap='rainbow')

# 2. K Means with Spatial Location
The spatial location of the ROIs should be weighted higher than their signal values.

In [63]:
# init
import numpy as np
from src.utils.ROI import ROI
import pandas as pd

# Number of frames (time steps) in every generated ROI signal
n_frames = 10_000

In [64]:
# generate signals
# Seeded generator so the random signals, and every distance / cluster derived
# from them, are identical on each Restart & Run All.
rng = np.random.default_rng(seed=101)

# 4 x 4 grid of ROIs; the first constructor argument varies slowest,
# i.e. the list is ROI(0, 0), ROI(0, 1), ..., ROI(3, 3)
rois = [ROI(x, y) for x in range(4) for y in range(4)]
# One uniformly distributed signal in [-50, 50) with n_frames samples per ROI
roi_signals = {roi: rng.uniform(low=-50, high=50, size=n_frames) for roi in rois}
# Columns are the ROIs, rows are the frames
signals_df = pd.DataFrame(roi_signals)
print(f'{signals_df.shape=}')
signals_df.head()

In [65]:
# add spatial indexes
# ROI size is configured on the class (presumably read by center_pixels() below — TODO confirm)
ROI.WIDTH_PIXELS = 16
ROI.HEIGHT_PIXELS = 16

# One column per ROI; the two rows hold the x and y value reported by center_pixels()
spatial_idxs = pd.DataFrame(
    {roi: [roi.center_pixels().x, roi.center_pixels().y] for roi in rois},
    index=['x_center', 'y_center'],
)

# Scale all signals by the largest absolute value found anywhere in signals_df,
# so every normalized value lies in [-1, 1]
per_roi_peak = signals_df.abs().max()
max_absolute_signal_value = per_roi_peak.max()
normalized_signals = signals_df.div(max_absolute_signal_value)

# Stack the two coordinate rows on top of the frame rows
signals_indexes = pd.concat([spatial_idxs, normalized_signals], axis=0)

# The stacked index mixes str labels ('x_center', 'y_center') with the integer frame
# numbers; KMeans doesn't accept mixed labels, so convert all of them to str
signals_indexes.index = signals_indexes.index.astype(str)

print(f'{signals_indexes.shape=}')

signals_indexes

In [66]:
# testing distances
# Compare how far apart two diagonal-neighbour ROIs are under each representation.
from scipy.spatial.distance import euclidean

# rois is built with the first coordinate varying slowest over a 4 x 4 grid,
# so index 0 is ROI(0, 0) and index 5 is ROI(1, 1)
roi00 = rois[0]
roi11 = rois[5]

dists = {
    'Original Signals:': euclidean(signals_df[roi00], signals_df[roi11]),
    'Normalized Signals:': euclidean(normalized_signals[roi00], normalized_signals[roi11]),
    'Spatial Distance:': euclidean(spatial_idxs[roi00], spatial_idxs[roi11]),
    'Normalized Signals with spatial indexes:': euclidean(signals_indexes[roi00], signals_indexes[roi11]),
}

# The keys already end with ':', so no extra colon is added here
for key, value in dists.items():
    print(f'{key} {value:.2f}')

# Same numbers as one markdown table row, ready to paste into the results table
row_cells = ' | '.join(f'{value:.2f}' for value in dists.values())
print(f'| {n_frames} | {row_cells} |')


Distances:

| n_frames | Original Signals: | Normalized Signals: | Spatial Distance: | Normalized Signals with spatial indexes: |
|----------|-------------------|---------------------|-------------------|------------------------------------------|
| 10000    | 4088.13           | 81.76               | 22.63             | 84.84                                    |
| 1000     | 1239.32           | 24.79               | 22.63             | 33.56                                    |
| 100      | 414.84            | 8.30                | 22.63             | 24.10                                    |
| 20       | 162.93            | 3.27                | 22.63             | 22.86                                    |
| 20       | 188.51            | 3.77                | 22.63             | 22.94                                    |
| 3        | 39.95             | 0.82                | 22.63             | 22.64                                    |

-> The final distance depends heavily on the number of frames. The more frames there are, the more the signals weigh and the less the spatial indexes weigh.
We don't want this: the signal distance should not carry more weight just because there are more frames.
-> Hence, we need to weigh/normalize based on the dimensions of the image, the amplitude of the signal, and the number of frames (time steps).

In [19]:
from sklearn.cluster import KMeans

# random_state is fixed so the centroid initialisation, and therefore the
# cluster label of every ROI, is the same on every run
kmeans = KMeans(n_clusters=4, n_init='auto', random_state=101)
# IMPORTANT: signals_indexes needs to be transposed here because a format of (n_samples, n_features) is expected
# n_samples is the number of ROIs and n_features is the number of frames + the 2 spatial indexes
kmeans.fit(signals_indexes.T)

In [20]:
# Cluster index assigned to each fitted sample, i.e. one entry per row of signals_indexes.T (one per ROI)
kmeans.labels_

In [67]:
# Look-up table ROI -> cluster label.
# Pairs the i-th ROI with the i-th label, so it assumes the samples given to
# kmeans.fit were in the same order as rois — TODO confirm after re-running the cells in order.
roi_cluster_dict = {roi: label for roi, label in zip(rois, kmeans.labels_)}

# 3. Weighted K Means

We need to weigh/normalize based on the dimensions of the image, the amplitude of the signal, and the number of frames (time steps).

We want to introduce a factor that lets us weigh the spatial indexes more than the signals.


In [28]:
from collections import namedtuple
# init
import numpy as np
from src.utils.ROI import ROI
import pandas as pd

# Lightweight (width, height) record used to describe the size of an image
Dimensions = namedtuple('Dimensions', 'width height')

In [35]:
# Vars
n_frames = 4

# ROI grid layout: number of ROIs along each axis
ROI.N_VERTICAL = 4
ROI.N_HORIZONTAL = 4

# Size of a single ROI
ROI.HEIGHT_PIXELS = 32
ROI.WIDTH_PIXELS = 32

# Full image size = size of one ROI times the number of ROIs along that axis
img_dims = Dimensions(
    width=ROI.WIDTH_PIXELS * ROI.N_HORIZONTAL,
    height=ROI.HEIGHT_PIXELS * ROI.N_VERTICAL,
)
print(f'{img_dims=}')

In [36]:
# generate signals
# Seeded generator so the random signals are identical on each Restart & Run All
rng = np.random.default_rng(seed=101)

# 4 x 4 grid of ROIs; the first constructor argument varies slowest
rois = [ROI(x, y) for x in range(4) for y in range(4)]
# One uniformly distributed signal in [-50, 50) with n_frames samples per ROI;
# columns are the ROIs, rows are the frames
signals_df = pd.DataFrame({roi: rng.uniform(low=-50, high=50, size=n_frames) for roi in rois})
print(f'{signals_df.shape=}')

In [38]:
# generate indexes
# For every ROI, collect the x and y value reported by center_pixels()
roi_centers = {roi: [roi.center_pixels().x, roi.center_pixels().y] for roi in rois}
# One column per ROI, one row per coordinate
spatial_indexes = pd.DataFrame(roi_centers, index=['x_center', 'y_center'])
spatial_indexes

In [39]:
# normalize signals based on the spatial to signal ratio (how much the spatial indexes should weigh)
# NOTE(review): in the lines visible here only the weight is defined and signals_df is displayed;
# no normalization using spatial_weight is applied yet — TODO confirm / implement
spatial_weight = 0.2
signals_df