In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import ipywidgets as widgets
from IPython.core.debugger import set_trace
from scipy.stats import rankdata
import matplotlib.colors as mcolors
import itertools as it

In [2]:
%matplotlib widget

In [4]:
def step_kmeans(params: np.ndarray, x: np.ndarray):
    """Make one k-means step and return the updated centroids.

    Parameters
    ----------
    params : np.ndarray
        Current centroids, one row per cluster; the broadcasting below
        requires shape ``(k, d)``.
    x : np.ndarray
        Samples, shape ``(n, d)``.

    Returns
    -------
    np.ndarray
        Float array of shape ``(k, d)``: the mean of the samples assigned to
        each centroid.  A centroid that attracts no sample keeps its previous
        position instead of turning into NaN.
    """
    # Squared distances are enough to find the nearest centroid: (k, n).
    sq_dist = np.sum((x - params[:, np.newaxis]) ** 2, axis=-1)
    # argmin assigns every sample to exactly one centroid; an exact tie goes
    # to the lowest centroid index rather than dropping the sample.
    labels = np.argmin(sq_dist, axis=0)

    new_params = np.array(params, dtype=float)
    for k in range(len(new_params)):
        members = x[labels == k]
        if len(members):
            new_params[k] = members.mean(axis=0)
    return new_params

In [3]:
def kmeans(x, params):
    """Endlessly yield successive k-means centroid estimates.

    1-D inputs are treated as a column of scalar samples / centroids and are
    reshaped to ``(n, 1)``.  The first value yielded is the (reshaped) initial
    ``params``; every following value is the result of one more
    ``step_kmeans`` update.  The generator never terminates on its own.

    Because this is a generator, the validation below runs on the first
    ``next()`` call, not when ``kmeans`` is called.

    Raises
    ------
    ValueError
        If ``x`` or ``params`` has more than two (or zero) dimensions.
    """
    if x.ndim == 1:
        x = x[:, np.newaxis]
    elif x.ndim != 2:
        raise ValueError(f"x should have 2 dims not {x.ndim}")
    if params.ndim == 1:
        params = params[:, np.newaxis]
    elif params.ndim != 2:
        # Report the argument that is actually at fault.
        raise ValueError(f"params should have 2 dims not {params.ndim}")

    while True:
        yield params
        params = step_kmeans(params, x)
    

In [247]:
def run_until_eps(x, params, epsilon=0.01):
    """Iterate k-means until the centroids move by at most ``epsilon``.

    The movement is the mean Euclidean distance between each centroid's
    previous and updated position.  Returns the last centroid array produced.
    """
    snapshots = kmeans(x, params)
    current = next(snapshots)
    # Start above the threshold so that the loop body is entered.
    shift = epsilon + 1
    while shift > epsilon:
        updated = next(snapshots)
        displacement = current - updated
        shift = np.mean(np.sqrt((displacement ** 2).sum(axis=-1)))
        current = updated
    return current
        

In [169]:
def snap_until_eps(x, params, epsilon=0.0):
    """Yield every k-means snapshot until the centroids stop moving.

    The first value yielded is the initial centroid array as normalised by
    ``kmeans``; each later value is the result of one more update.  Iteration
    ends once the mean Euclidean displacement of the centroids between two
    consecutive snapshots is no longer greater than ``epsilon``.
    """
    solutions = kmeans(x, params)
    # Use the snapshot returned by ``kmeans`` as the baseline: for 1-D input
    # it has been reshaped to (k, 1), whereas the raw argument would broadcast
    # against the next snapshot and give a meaningless displacement.
    params = next(solutions)
    yield params
    error = epsilon + 1
    while error > epsilon:
        new_params = next(solutions)
        yield new_params
        error = np.mean(np.sqrt(np.sum(
            (params - new_params) ** 2, axis=-1
        )))
        params = new_params
        

In [243]:
def mean_distance(x, params):
    """Average Euclidean distance from each sample to its closest centroid.

    ``params[:, np.newaxis]`` broadcasts against ``x`` so that the
    intermediate distance table has one row per centroid and one column per
    sample.
    """
    offsets = x - params[:, np.newaxis]
    per_centroid = np.sqrt(np.square(offsets).sum(axis=-1))
    closest = per_centroid.min(axis=0)
    return closest.mean()

In [253]:
def elbow(x, ks, generator=None):
    """Yield ``(centroids, mean_distance)`` for every cluster count in ``ks``.

    ``generator(x, k)`` supplies the initial centroids for each ``k``;
    ``seedpp_max`` is used when no generator is given.  Each run is iterated
    to convergence with ``run_until_eps`` before it is scored.
    """
    init_fn = seedpp_max if generator is None else generator
    for k in ks:
        centroids = run_until_eps(x, init_fn(x, k))
        yield centroids, mean_distance(x, centroids)

In [292]:
def snd(x):
    """Return the second item (index 1) of ``x``."""
    second = x[1]
    return second


def fst(x):
    """Return the first item (index 0) of ``x``."""
    first = x[0]
    return first

In [237]:
def seedpp(x, no=3):
    """Pick ``no`` initial centroids from ``x`` with k-means++-style sampling.

    The first seed is a uniformly random row of ``x``.  Every further seed is
    a row drawn with probability proportional to its squared distance to the
    closest seed chosen so far.

    Parameters
    ----------
    x : np.ndarray
        Samples; the indexing below assumes shape ``(n, d)``.
    no : int
        Number of seeds to return.

    Returns
    -------
    np.ndarray
        Array of shape ``(no, d)`` whose rows are rows of ``x``.
    """
    seed = x[np.random.choice(len(x))]
    seed = seed[np.newaxis]

    while len(seed) < no:
        sq_dist = np.min(
            np.sum((x - seed[:, np.newaxis]) ** 2, axis=-1),
            axis=0
        )
        total = sq_dist.sum()
        if total > 0:
            # Out-of-place division: an in-place ``/=`` raises for integer
            # data because the float result cannot be stored in an int array.
            idx = np.random.choice(len(x), p=sq_dist / total)
        else:
            # Every sample coincides with an existing seed, so there is no
            # distance information left; fall back to a uniform draw instead
            # of passing NaN probabilities.
            idx = np.random.choice(len(x))
        seed = np.r_[seed, x[idx][np.newaxis]]
    return seed

In [201]:
def seedpp_max(x, no=3):
    """Pick ``no`` initial centroids by greedy farthest-point selection.

    The first seed is a uniformly random row of ``x``; each following seed is
    the row whose squared distance to its closest existing seed is largest.
    """
    start = np.random.choice(len(x))
    centres = x[start][np.newaxis]

    # One row is appended per pass, so the row count doubles as the counter.
    while len(centres) < no:
        sq_dist = ((x - centres[:, np.newaxis]) ** 2).sum(axis=-1)
        farthest = sq_dist.min(axis=0).argmax()
        centres = np.r_[centres, x[farthest][np.newaxis]]
    return centres

In [144]:
def get_voronoi(x=(-5, 20), y=(-5, 15)):
    """Return a 50 x 50 lattice of 2D points spanning the ranges ``x`` and ``y``.

    The result has shape ``(2500, 2)`` with columns ``(x, y)``; the x
    coordinate varies fastest.
    """
    xs = np.linspace(*x, 50)
    ys = np.linspace(*y, 50)
    grid_x, grid_y = np.meshgrid(xs, ys)
    # Give both coordinate planes a trailing axis and join them along it.
    stacked = np.r_['-1', grid_x[..., np.newaxis], grid_y[..., np.newaxis]]
    return stacked.reshape(-1, 2)

In [6]:
def take(i, iterable):
    """Return a list with at most the first ``i`` items of ``iterable``."""
    picked = []
    # zip stops as soon as the counter runs out, so nothing beyond the
    # i-th item is pulled from the iterable.
    for _, element in zip(range(i), iterable):
        picked.append(element)
    return picked

In [7]:
class Seq2D:
    """Plot successive centroid snapshots of a 2D clustering run on one axes.

    Parameters
    ----------
    ax :
        Object providing ``clear()`` and ``scatter(...)`` (a matplotlib axes
        in this notebook).
    snaps :
        Indexable sequence of centroid arrays, one per iteration.
    x :
        Samples to colour by nearest centroid.
    ba :
        One colour per cluster, paired with the centroids in order.
    grid :
        Background points coloured by nearest centroid to show each
        centroid's region.
    """

    def __init__(self, ax, snaps, x, ba, grid):
        self.ax = ax
        self.snaps = snaps
        self.x = x
        self.ba = ba
        self.grid = grid

    def plot(self, i):
        """Clear the axes and draw snapshot ``i``.

        For every cluster this draws its samples, its share of the background
        grid and the centroid itself (large ``x`` marker), all in the
        cluster's colour.
        """
        self.ax.clear()
        params = self.snaps[i]
        # Boolean membership masks, one row per centroid.
        indxs = compute_indx(self.x, params)
        grid_indxs = compute_indx(self.grid, params)
        clusters = zip(params, indxs, grid_indxs, self.ba)
        for num, (param, indx, grid_index, bz) in enumerate(clusters):
            self.ax.scatter(*self.x[indx].T, c=bz, alpha=.4)
            self.ax.scatter(*self.grid[grid_index].T, c=bz, s=5, alpha=.4, marker='.')
            self.ax.scatter(*param.T, c=bz, s=1000, marker='x', label=f'cluster{num}')


In [8]:
def compute_indx(x, centroids):
    """Return boolean cluster-membership masks for the samples in ``x``.

    Parameters
    ----------
    x : np.ndarray
        Samples, shape ``(n, d)``.
    centroids : np.ndarray
        Centroids, shape ``(k, d)``.

    Returns
    -------
    np.ndarray
        Boolean array of shape ``(k, n)``; entry ``[j, i]`` is True when
        centroid ``j`` is the nearest centroid of sample ``i``.  Every sample
        belongs to exactly one centroid: an exact tie goes to the lowest
        centroid index.
    """
    # Squared distances preserve the ordering, so the square root is skipped.
    sq_dist = np.sum((x - centroids[:, np.newaxis]) ** 2, axis=-1)
    nearest = np.argmin(sq_dist, axis=0)
    # Compare each sample's label with every centroid index -> (k, n) mask.
    return nearest == np.arange(len(centroids))[:, np.newaxis]

# Data Generation

In [227]:
# Fix the global NumPy RNG so that Restart & Run All regenerates the same
# samples (and the same random initialisations further down).
SEED = 0
np.random.seed(SEED)

# Six isotropic unit-variance Gaussian blobs on a 3 x 2 lattice of centres;
# the blob sizes differ on purpose (300 / 200 / 200 / 100 / 200 / 200).
UNIT_COV = [[1, 0], [0, 1]]
a = np.random.multivariate_normal([10, 10], UNIT_COV, 300)
b = np.random.multivariate_normal([5, 0], UNIT_COV, 200)
c = np.random.multivariate_normal([5, 10], UNIT_COV, 200)
d = np.random.multivariate_normal([10, 0], UNIT_COV, 100)
e = np.random.multivariate_normal([15, 0], UNIT_COV, 200)
f = np.random.multivariate_normal([15, 10], UNIT_COV, 200)
# All samples stacked row-wise into one (1200, 2) array.
x = np.r_[a, b, c, d, e, f]

no_cluster = 6

## Without seedpp inits

In [236]:
no_cluster = 6
# One base colour per cluster, cycling if there are more clusters than colours.
colors = it.cycle(list(mcolors.BASE_COLORS))
cms = take(no_cluster, colors)

# Naive initialisation: random samples as starting centroids.  Sample WITHOUT
# replacement so that the same point cannot be picked twice, which would
# start two clusters on identical centroids.
inits = x[np.random.choice(len(x), no_cluster, replace=False)]
kmeans_snaps = list(snap_until_eps(x, inits))
fig, ax = plt.subplots()
smth = Seq2D(ax, kmeans_snaps, x, cms, get_voronoi(x=(0, 20)))
# Slider over the recorded iterations.
_ = widgets.interact(smth.plot, i=(0, len(kmeans_snaps) - 1, 1))

FigureCanvasNbAgg()

interactive(children=(IntSlider(value=3, description='i', max=7), Output()), _dom_classes=('widget-interact',)…

## With seedpp inits

In [231]:
# One base colour per cluster, cycling if there are more clusters than colours.
colors = it.cycle(list(mcolors.BASE_COLORS))
cms = take(no_cluster, colors)

# Distance-weighted random seeding: after a random first seed, each further
# seed is drawn with probability proportional to its squared distance to the
# nearest seed already chosen.
inits = seedpp(x, no_cluster)
# Record every iteration until the centroids stop moving (epsilon defaults to 0).
kmeans_snaps = list(snap_until_eps(x, inits))
fig, ax = plt.subplots()
smth = Seq2D(ax, kmeans_snaps, x, cms, get_voronoi(x=(0, 20)))
# Slider over the recorded iterations.
_ = widgets.interact(smth.plot, i=(0, len(kmeans_snaps) - 1, 1))

FigureCanvasNbAgg()

interactive(children=(IntSlider(value=4, description='i', max=8), Output()), _dom_classes=('widget-interact',)…

## With seedpp_max inits

In [232]:
# One base colour per cluster, cycling if there are more clusters than colours.
colors = it.cycle(list(mcolors.BASE_COLORS))
cms = take(no_cluster, colors)

# Farthest-point seeding: after a random first seed, each further seed is the
# sample with the largest squared distance to its nearest existing seed.
inits = seedpp_max(x, no_cluster)
# Record every iteration until the centroids stop moving (epsilon defaults to 0).
kmeans_snaps = list(snap_until_eps(x, inits))
fig, ax = plt.subplots()
smth = Seq2D(ax, kmeans_snaps, x, cms, get_voronoi(x=(0, 20)))
# Slider over the recorded iterations.
_ = widgets.interact(smth.plot, i=(0, len(kmeans_snaps) - 1, 1))

FigureCanvasNbAgg()

interactive(children=(IntSlider(value=3, description='i', max=7), Output()), _dom_classes=('widget-interact',)…

# Elbow method for choosing the number of clusters

In [291]:
# Converged (centroids, mean distance) pairs for k = 2 .. 9.
ma = list(elbow(x, range(2, 10)))

# Keep only the mean sample-to-centroid distance of each run.
mdist = [snd(result) for result in ma]

# Mean distance against the number of clusters.
plt.figure()
plt.plot(list(range(2, 10)), mdist)
plt.show()

FigureCanvasNbAgg()

In [295]:
# Inspect the shape of the centroid array.
# NOTE(review): `params` is never assigned at notebook top level in the visible
# cells (only inside functions) — presumably left over from an earlier kernel
# session; assign it explicitly so this survives Restart & Run All.
params.shape

(6, 2)

In [294]:
# Shape of the stacked sample array `x` built in the data-generation cell.
x.shape

(1200, 2)

In [303]:
# Attempt to collect, for every centroid, the samples that rank it first
# (rank - 1 == 0) in a single apply_along_axis call.  This raises ValueError:
# each centroid selects a different number of samples, and apply_along_axis
# needs equally shaped results to assemble its output array.
# NOTE(review): relies on a top-level `params` that no visible cell assigns.
np.apply_along_axis(lambda ii: x[ii ==0], 1, rankdata(np.sum((x - params[:, np.newaxis]) ** 2, axis=-1), axis=0) - 1)

ValueError: could not broadcast input array from shape (103,2) into shape (119,2)

In [305]:
# Rank the centroids per sample by squared distance: one row per centroid,
# one column per sample, ranks starting at 1 for the nearest centroid.
# NOTE(review): relies on a top-level `params` that no visible cell assigns.
data = rankdata(np.sum((x - params[:, np.newaxis]) ** 2, axis=-1), axis=0)

In [316]:
# Pair every centroid with the samples that rank it first (rank 1 = nearest).
# A plain list is used because the member arrays have different lengths.
clusters = [(p, x[d == 1]) for p, d in zip(params, data)]

In [320]:
# Collect the centroid (first element) of every (centroid, members) pair
# back into a single array.
cross = np.array(list(map(fst, clusters)))

In [332]:
# Scratch check of np.where with a single condition argument: the result is a
# tuple of index arrays for the positions where the condition is True.
np.where(np.array([1,2,3]) == 2)

(array([1]),)

In [331]:
# Pairwise squared distances between centroids, ranked within each row:
# row j orders all centroids by closeness to centroid j (rank 1 is centroid j
# itself, at distance zero).
rankdata(np.sum((cross[:, np.newaxis] - cross) ** 2, axis=-1), axis=1)

array([[1., 6., 5., 2., 3., 4.],
       [6., 1., 2., 4., 5., 3.],
       [5., 2., 1., 3., 6., 4.],
       [2., 6., 4., 1., 5., 3.],
       [2., 5., 6., 4., 1., 3.],
       [5., 4., 6., 2., 3., 1.]])

In [312]:
# Change in mean sample-to-centroid distance between consecutive cluster
# counts; presumably used to spot where the improvement levels off (the elbow).
np.diff(np.array(mdist))

array([-1.01371786, -0.67854376, -0.58195245, -0.28579699, -0.04254842,
       -0.03883587, -0.07426073])

In [312]:
# NOTE(review): this cell repeats the np.diff computation of the preceding
# cell verbatim; one of the two can be deleted.
np.diff(np.array(mdist))

array([-1.01371786, -0.67854376, -0.58195245, -0.28579699, -0.04254842,
       -0.03883587, -0.07426073])

In [308]:
# `data` holds rankdata ranks, which start at 1: rank 1 marks the samples for
# which centroid 0 is the nearest, rank 2 those for which it is the second
# nearest.  Comparing against 0 can never match.
mask_st = data[0] == 1
mask_nd = data[0] == 2

In [307]:
# Check that no sample has centroid 0 as both nearest (rank 1) and second
# nearest (rank 2).  rankdata ranks start at 1, so testing against 0 would be
# trivially all False and verify nothing.
(data[0] == 1) & (data[0] == 2)

array([False, False, False, ..., False, False, False])

In [None]:
def get_clusters(x, params):
    