In [1]:
import numpy as np

In [2]:
def meanShiftClustering(data, window, stop_criteria, param):
    """Iteratively move every point to the weighted mean of its neighbourhood.

    On each pass, every point is replaced by the Gaussian-weighted mean of the
    points that lie strictly closer than ``window`` to it (Euclidean distance).
    All points are updated together from the positions of the previous pass,
    and the moved points are the sample used by the next pass. Iteration stops
    when the largest displacement in a pass drops below
    ``stop_criteria['epsilon']`` or after ``stop_criteria['max_iter']`` passes.

    Parameters
    ----------
    data : array-like, shape (n_points, n_features)
        Points to shift. The input is never modified. Non-floating input
        (e.g. integer arrays or nested lists) is converted to float so that
        squared coordinate differences cannot overflow a fixed-width integer.
    window : float
        Neighbourhood radius; only points with distance ``< window`` contribute.
    stop_criteria : dict
        Must provide ``'max_iter'`` (maximum number of passes) and
        ``'epsilon'`` (displacement threshold for convergence).
    param : dict
        Must provide ``'sigma'``, the bandwidth of the Gaussian kernel.

    Returns
    -------
    shifted : numpy.ndarray
        Point positions after the last pass, same shape as ``data``.
    converged : bool
        True if the displacement threshold was met (or there were no points to
        move), False if ``max_iter`` passes were exhausted first.
    """
    def euclidean_multiple_distance(p_array, p):
        # Row-wise Euclidean distance. `p` may be a single point (broadcast
        # against every row) or an array of the same shape as `p_array`.
        return np.sqrt(np.square(p_array - p).sum(axis=1))

    def gaussian_kernel(distance, param):
        # The normalisation constant cancels in the weighted mean below; it is
        # kept so the weights remain a proper Gaussian density.
        sigma = param['sigma']
        return np.exp(-0.5 * np.square(distance / sigma)) / (sigma * np.sqrt(2 * np.pi))

    def mean_shift_iteration(data, point, window, distance_multiple_func, kernel, param):
        distances = distance_multiple_func(data, point)
        in_window = distances < window
        weights = kernel(distances[in_window], param)
        total_weight = weights.sum()
        if total_weight == 0:
            # No neighbour carries any weight (e.g. window <= 0 excludes even
            # the point itself): leave the point where it is instead of
            # producing NaN from a 0/0 division.
            return point
        return (weights[:, np.newaxis] * data[in_window]).sum(axis=0) / total_weight

    distance_multiple_func = euclidean_multiple_distance
    kernel = gaussian_kernel

    max_iter = stop_criteria['max_iter']
    epsilon = stop_criteria['epsilon']

    # Work on a private copy so the caller's array is never touched.
    data = np.array(data)
    if not np.issubdtype(data.dtype, np.floating):
        # Integer arithmetic would wrap around when squaring large coordinate
        # differences and silently corrupt the distances.
        data = data.astype(float)

    if data.shape[0] == 0:
        # Nothing to shift; np.stack/np.max below cannot handle zero points.
        return data, True

    converged = False
    for _ in range(max_iter):
        data_new = np.stack([
            mean_shift_iteration(data, point, window, distance_multiple_func, kernel, param)
            for point in data
        ])
        max_shift = np.max(distance_multiple_func(data_new, data))
        data = data_new
        if max_shift < epsilon:
            converged = True
            break

    return data, converged

In [4]:
# Alternative input file, kept for reference:
# data = np.loadtxt('data/g2-txt/g2-32-60.txt')
data = np.loadtxt('data/s/s1.txt')

# Clustering settings: neighbourhood radius, stopping rule and kernel bandwidth.
# NOTE(review): sigma is tiny compared with the window, and the recorded output
# below shows integer-valued coordinates in the 1e5-1e6 range, which suggests
# that no point actually moved -- confirm this bandwidth is intended.
window = 200_000
stop_criteria = dict(max_iter=50, epsilon=0.1)
param = dict(sigma=5.0)

new_data, converged = meanShiftClustering(
    data=data,
    window=window,
    stop_criteria=stop_criteria,
    param=param,
)

In [5]:
# Show the convergence flag first, then the shifted point coordinates.
for result in (converged, new_data):
    print(result)

True
[[664159. 550946.]
 [665845. 557965.]
 [597173. 575538.]
 ...
 [650661. 861267.]
 [599647. 858702.]
 [684091. 842566.]]
