In [1]:
import numpy as np
# from src.serial import MatrixDBSCAN

In [57]:
# Status
UNKNOWN = -1
NOISE = -2

class DBSCAN(object):
    """
    Base Class of DBSCAN, please do NOT instantiate this Class
    """

    def __init__(self, dataset):
        """
        DBSCAN Classes should be instantiate with data point set
        """
        self.m, _ = (dataset, None)     # placeholder _ for future implementation of labels
        self.num_p = self.m.shape[0]
        self.tags = [UNKNOWN] * self.num_p

    def _get_dist(self, a, b, fast_mode: bool = False) -> float:
        """
        for float comparison, set all distance value precision to 5
        :param: a: int; index of given point in data matrix
        :param: b: same as a
        :param: fast_mode: bool -> if True, ignore sqrt() opration for distance
        """
        if fast_mode:
            result = np.power(self.m[b] - self.m[a], 2).sum()
        else:
            result = np.sqrt(np.power(self.m[b] - self.m[a], 2).sum())
        return round(result, 5)

    def _get_neighbours(self, p: int, eps: float, fast_mode=False) -> list:
        """
        return neighbours index of given point p in source data matrix
        :param: p: int; index of given point in data matrix
        :param: eps: float; the value of radius of density area
        """
        pass

    def _clustering(self, p, eps, min_pts, cluster_id, fast_mode=False):
        """
        tag given point p and all of its neighbours and sub-neighbours with the same cluster id
        :param: m: np.matrix; N * 2 matrix recoding all nodes' coordinates
        :param: eps: float; the value of radius of density area
        :param: min_pts: int; least neighbours should be in a density area
        :param: cluster_id: int; current id of cluster
        """
        pass
    
    def _find_core_pts(self, eps, min_pts):
        self.is_core = [0] * self.num_p
        for i in range(self.num_p):
            if len(self._get_neighbours(i, eps, min_pts)) > min_pts:
                self.is_core[i] = 1
        return self.is_core
        

    def predict(self, eps, min_pts, fast_mode=False) -> list:
        """
        return list of labels as the sequence in data matrix
        :param: m: np.matrix; N * 2 matrix recoding all nodes' coordinates
        :param: eps: float; the value of radius of density area
        :param: min_pts: int; least neighbours should be in a density area
        """
        self.eps = eps
        self.min_pts = min_pts

        cluster_id = 1
        for p_id in range(self.num_p):
            if self.tags[p_id] != UNKNOWN:
                continue
            if self._clustering(p_id, eps, min_pts, cluster_id, fast_mode):
                cluster_id += 1
        return np.array(self.tags)


class NaiveDBSCAN(DBSCAN):

    def __init__(self, dataset):
        super(NaiveDBSCAN, self).__init__(dataset)

    def _get_neighbours(self, p: int, eps: float, fast_mode=False) -> list:

        ngbs = []
        for idx in range(len(self.m)):
            if self._get_dist(p, idx, fast_mode) < eps:
                ngbs.append(idx)
        return ngbs

    def _clustering(self, p, eps, min_pts, cluster_id, fast_mode=False) -> bool:

        neighbours = self._get_neighbours(p, eps, fast_mode)
        if len(neighbours) < min_pts:
            self.tags[p] = NOISE
            return False
        else:
            self.tags[p] = cluster_id
            for idx in neighbours:
                self.tags[idx] = cluster_id
            while len(neighbours) > 0:
                sub_neighbours = self._get_neighbours(neighbours[0], eps, fast_mode)
                if len(sub_neighbours) >= min_pts:
                    for sub_n in sub_neighbours:
                        if self.tags[sub_n] < 0:
                            self.tags[sub_n] = cluster_id
                            if self.tags[sub_n] == UNKNOWN:
                                neighbours.append(sub_n)
                neighbours = neighbours[1:]
        return True
    

class MatrixDBSCAN(DBSCAN):

    def __init__(self, dataset):
        super(MatrixDBSCAN, self).__init__(dataset)
        self._get_distance_matrix()     # self.dist_m will be created
        del self.m

    def _get_distance_matrix(self):
        """
        Only once calculation will be on each point-pairs
        results will be stored in self.dist_m
        """

        self.dist_m = np.zeros((self.num_p, self.num_p))
        for p_id in range(self.num_p):
            for q_id in range(p_id, self.num_p):
                dist = self._get_dist(p_id, q_id)
                self.dist_m[q_id, p_id] = dist
                self.dist_m[p_id, q_id] = dist

    def _get_neighbours(self, p: int, eps: float, fast_mode=False) -> list:
        return np.nonzero(self.dist_m[p] < eps)[0]

    def _clustering(self, p, eps, min_pts, cluster_id, fast_mode=False) -> bool:
        """
        TODO: There should be some optimizations for this part, current code is too ugly
        """

        neighbours = self._get_neighbours(p, eps, fast_mode)
        if len(neighbours) < min_pts:
            self.tags[p] = NOISE
            return False
        else:
            self.tags[p] = cluster_id
            for idx in neighbours:
                self.tags[idx] = cluster_id
            while len(neighbours) > 0:
                sub_neighbours = self._get_neighbours(neighbours[0], eps, fast_mode)
                if len(sub_neighbours) >= min_pts:
                    for sub_n in sub_neighbours:
                        if self.tags[sub_n] < 0:
                            self.tags[sub_n] = cluster_id
                            if self.tags[sub_n] == UNKNOWN:
                                neighbours.append(sub_n)
                neighbours = neighbours[1:]
        return True

In [3]:
sc

In [31]:
test_file = '../../data/shape-sets/r15_600.txt'

In [32]:
def load_data_label(path):
    pts = sc.textFile(path).map(lambda x: x.strip().split()[:-1]).map(lambda x: tuple([float(i) for i in x]))
    return pts.collect()

In [69]:
dataset = load_data_label(test_file)
n_partitions = 4
eps = 0.7
min_pts = 12

In [64]:
# dataset

In [80]:
def partition(dataset, n_partitions, eps):
    
    # cut bins
    lower_bound = np.min(dataset, axis=0)
    upper_bound = np.max(dataset, axis=0)
    a = np.linspace(lower_bound, upper_bound, n_partitions+1, endpoint=True)
#     b = np.array([upper_bound])
#     tmp_bin = np.concatenate((a, b), axis=0)
    lower_bounds = [coordinates-eps for coordinates in a[:-1]]
    upper_bounds = [coordinates+eps for coordinates in a[1:]]
    print(lower_bounds)
    print(upper_bounds)
    
    # scatter points into bins with eps
    indexed_data = []
    for id_pts in range(len(dataset)):     # index of point in dataset
        for id_ptt in range(n_partitions):
            if not (dataset[id_pts] > lower_bounds[id_ptt]).all():
                continue
            if not (dataset[id_pts] < upper_bounds[id_ptt]).all():
                continue
            indexed_data.append([id_ptt, id_pts])
            
    res = sc.parallelize(indexed_data).groupByKey().map(lambda x: [x[0], list(x[1])])
    return res

def local_dbscan(partioned_rdd):
#     rdd_data = [data for data in partioned_rdd]
#     ids = [id_pts for id_pts in rdd_data[0]]
    dataset = np.array([b_dataset.value[idp] for idp in partioned_rdd])
    dbscan_obj = MatrixDBSCAN(dataset)
    dbscan_obj.predict(b_eps.value, b_min_pts.value)
    is_core_list = dbscan_obj._find_core_pts(b_eps.value, b_min_pts.value)
    
    return list(zip(zip(partioned_rdd, is_core_list), dbscan_obj.tags))

def merge(local_tags, dataset):
    global_tags = [UNKNOWN] * len(dataset)
    is_tagged = [0] * len(dataset)
    last_max_label = 0
    for local in local_tags:
        np_local = np.array(local[-1])
        np_local[:, -1] += last_max_label
        last_max_label = np.max(np_local[:, -1])
        
        # check and merge overlapped points
        tagged_indices = np.nonzero(is_tagged)[0]
        for tmp_i in range(len(np_local)):
            # should do tag check
            (p_id, is_core), label = np_local[tmp_i]
            if p_id in tagged_indices and is_core==1:
                np_local[-1][np_local[-1]==label] = global_tags[p_id]
        
        # update global tags
        for (p_id, is_core), label in np_local:
            if is_tagged[p_id]==1:
                continue
            global_tags[p_id] = label
            is_tagged[p_id] = 1
    return global_tags
            

In [81]:
b_dataset = sc.broadcast(dataset)
b_eps = sc.broadcast(eps)
b_min_pts = sc.broadcast(min_pts)

partitioned_rdd = partition(dataset, n_partitions, eps)
local_tags = partitioned_rdd.mapValues(lambda x: local_dbscan(x)).collect()
result_tags = merge(local_tags, dataset)

# partition(dataset, n_partitions, eps).collect()

[array([2.702, 2.478]), array([6.1325, 5.9365]), array([9.563, 9.395]), array([12.9935, 12.8535])]
[array([7.5325, 7.3365]), array([10.963, 10.795]), array([14.3935, 14.2535]), array([17.824, 17.712])]


In [None]:
[array([2.702, 2.478]), array([6.1325, 5.9365]), array([9.563, 9.395]), array([12.9935, 12.8535])]
[array([7.5325, 7.3365]), array([10.963, 10.795]), array([14.3935, 14.2535]), array([17.824, 17.712])]

In [82]:
result_tags

[2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 -1,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 9,
 -1,
 -1,
 9,
 9,
 9,
 -1,
 8,
 9,
 9,
 -1,
 9,
 -1,
 -1,
 9,
 9,
 -1,
 9,
 9,
 9,
 -1,
 9,
 -1,
 -1,
 -1,
 9,
 -1,
 9,
 -1,
 -1,
 9,
 9,
 -1,
 9,
 -1,
 9,
 9,
 -1,
 -1,
 8,
 -1,
 3,
 3,
 3,
 3,
 -1,
 -1,
 -1,
 3,
 3,
 3,
 -1,
 -1,
 -1,
 3,
 3,
 3,
 3,
 3,
 -1,
 -1,
 -1,
 3,
 3,
 -1,
 -1,
 -1,
 3,
 -1,
 -1,
 -1,
 -1,
 3,
 -1,
 -1,
 -1,
 3,
 -1,
 3,
 3,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 3,
 4,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
