In [5]:
import numpy as np
import time

In [11]:
# Status
UNKNOWN = -1
NOISE = -2

In [2]:
class DBSCAN(object):
    """
    Base Class of DBSCAN, please do NOT instantiate this Class
    """

    def __init__(self, path):
        """
        DBSCAN Classes should be instantiate with data file path
        """
        self.m, _ = self._load_data(path)
        self.num_p = self.m.shape[0]
        self.tags = [UNKNOWN] * self.num_p

    def _load_data(self, path: str):
        with open(path, 'r') as f:
            data = []
            label = []
            for l in f.readlines():
                source = l.strip().split()
                data.append([float(val) for val in source[:2]])
                label.append(int(source[-1]))
            return np.array(data), np.array(label)
        
    def _get_dist(self, a, b, fast_mode: bool = False) -> float:
        """
        for float comparison, set all distance value precision to 5
        :param: a: int; index of given point in data matrix
        :param: b: same as a
        :param: fast_mode: bool -> if True, ignore sqrt() opration for distance
        """
        if fast_mode:
            result = np.power(self.m[b] - self.m[a], 2).sum()
        else:
            result = np.sqrt(np.power(self.m[b] - self.m[a], 2).sum())
        return round(result, 5)
    
    def predict(self, eps, min_pts, fast_mode=False) -> list:
        """
        return list of labels as the sequence in data matrix
        :param: m: np.matrix; N * 2 matrix recoding all nodes' coordinates
        :param: eps: float; the value of radius of density area
        :param: min_pts: int; least neighbours should be in a density area
        """

        cluster_id = 1
        for p_id in range(self.num_p):
            if self.tags[p_id] != UNKNOWN:
                continue
            if self._clustering(p_id, eps, min_pts, cluster_id, fast_mode):
                cluster_id += 1
        return np.array(self.tags)

In [3]:
class NaiveDBSCAN(DBSCAN):

    def __init__(self, path):
        super(NaiveDBSCAN, self).__init__(path)

    def _get_neighbours(self, p: int, eps: float, fast_mode=False) -> list:

        ngbs = []
        for idx in range(len(self.m)):
            if self._get_dist(p, idx, fast_mode) < eps:
                ngbs.append(idx)
        return ngbs
    
    def _clustering(self, p, eps, min_pts, cluster_id, fast_mode=False) -> bool:

        neighbours = self._get_neighbours(p, eps, fast_mode)
        if len(neighbours) < min_pts:
            self.tags[p] = NOISE
            return False
        else:
            self.tags[p] = cluster_id
            for idx in neighbours:
                self.tags[idx] = cluster_id
            while len(neighbours) > 0:
                sub_neighbours = self._get_neighbours(neighbours[0], eps, fast_mode)
                if len(sub_neighbours) >= min_pts:
                    for sub_n in sub_neighbours:
                        if self.tags[sub_n] < 0:
                            self.tags[sub_n] = cluster_id
                            if self.tags[sub_n] == UNKNOWN:
                                neighbours.append(sub_n)
                neighbours = neighbours[1:]
        return True

In [207]:
def get_neighbours(instance, p: int, eps: float, fast_mode=False) -> list:

########### parallel #############

#     def ngbs_f(iterator):
#         ngbs = []
#         for i in iterator:
#             if instance._get_dist(p, i, fast_mode) < eps:
#                 ngbs.append(i)
#         return ngbs

#     idx_list = list(np.array(range(len(instance.m))))

#     idx_rdd = sc.parallelize(idx_list)

#     all_ngbs = idx_rdd.mapPartitions(ngbs_f).collect()

#     return all_ngbs


########### serial #############

    ngbs = []
    for idx in range(len(instance.m)):
        if instance._get_dist(p, idx, fast_mode) < eps:
            ngbs.append(idx)
    return ngbs

def clustering(instance, p, eps, min_pts, cluster_id, fast_mode=False) -> bool:
    
########### parallel #############

    def set_tag(x, cluster_id):
        instance.tags[x] = cluster_id
        return x

    neighbours = get_neighbours(instance, p, eps, fast_mode)
    if len(neighbours) < min_pts:
        instance.tags[p] = NOISE
        return False
    else:
        instance.tags[p] = cluster_id

        ngbs_rdd = sc.parallelize(neighbours)

        results = ngbs_rdd.map(lambda x: set_tag(x, cluster_id))

        while len(neighbours) > 0:
            sub_neighbours = get_neighbours(instance, neighbours[0], eps, fast_mode)
            if len(sub_neighbours) >= min_pts:
                for sub_n in sub_neighbours:
                    if instance.tags[sub_n] < 0:
                        instance.tags[sub_n] = cluster_id
                        if instance.tags[sub_n] == UNKNOWN:
                            neighbours.append(sub_n)
            neighbours = neighbours[1:]
    return True

########### serial #############

#     neighbours = get_neighbours(instance, p, eps, fast_mode)
#     if len(neighbours) < min_pts:
#         instance.tags[p] = NOISE
#         return False
#     else:
#         instance.tags[p] = cluster_id
#         for idx in neighbours:
#             instance.tags[idx] = cluster_id
#         while len(neighbours) > 0:
#             sub_neighbours = get_neighbours(instance, neighbours[0], eps, fast_mode)
#             if len(sub_neighbours) >= min_pts:
#                 for sub_n in sub_neighbours:
#                     if instance.tags[sub_n] < 0:
#                         instance.tags[sub_n] = cluster_id
#                         if instance.tags[sub_n] == UNKNOWN:
#                             neighbours.append(sub_n)
#             neighbours = neighbours[1:]
#     return True

def predict(instance, eps, min_pts, fast_mode=False) -> list:
    
    cluster_id = 1
    
#     p_id_list = list(np.array(range(instance.num_p)))
#     p_id_rdd = sc.parallelize(p_id_list)
#     results = p_id_rdd.foreach(each_clustering)
    
    for p_id in range(instance.num_p):
        if instance.tags[p_id] != UNKNOWN:
            continue
        if clustering(instance, p_id, eps, min_pts, cluster_id, fast_mode):
            cluster_id += 1
    return np.array(instance.tags)

##### parallel test

In [208]:
src = 'spiral.txt'
test = NaiveDBSCAN_parallel(src)

In [209]:
start_time = time.time()
print(predict(test, 2.5, 3))
end_time = time.time()
print('parallal run time:', end_time - start_time)

[ 1  1  1  1  1  2  2  2  2  2  3  3  3  3  3  4  4  4  4  4  5  5  5  5
  5  5  5  6  6  6  6  6  6  6  7  7  7  7  7  7  7  8  8  8  8  8  8  8
  8  9  9  9  9  9  9  9  9 10 10 10 10 10 10 10 10 10 11 11 11 11 11 11
 11 11 11 11 11 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 13 13 13 13
 13 13 13 13 13 13 13 13 13 13 14 14 14 14 14 15 15 15 15 15 16 16 16 16
 16 17 17 17 17 17 17 18 18 18 18 18 18 18 19 19 19 19 19 19 19 20 20 20
 20 20 20 20 21 21 21 21 21 21 21 22 22 22 22 22 22 22 22 23 23 23 23 23
 23 23 23 23 23 24 24 24 24 24 24 24 24 24 24 24 24 25 25 25 25 25 25 25
 25 25 25 25 25 25 25 25 26 26 26 26 26 26 26 27 27 27 27 27 28 28 28 28
 28 29 29 29 29 29 30 30 30 30 30 31 31 31 31 31 31 32 32 32 32 32 32 32
 33 33 33 33 33 33 33 34 34 34 34 34 34 34 35 35 35 35 35 35 35 35 35 36
 36 36 36 36 36 36 36 36 36 37 37 37 37 37 37 37 37 37 37 37 38 38 38 38
 38 38 38 38 38 38 38 38 38 38 38 39 39 39 39 39 39 39 39 39 39 39 39 39]
parallal run time: 1.3952338695526123


##### serial test

In [133]:
src = 'spiral.txt'

print('Naive DBSCAN:')
ndbscan = NaiveDBSCAN(src)

Naive DBSCAN:


In [134]:
start_time = time.time()
predictions = ndbscan.predict(2.5, 3)
print(predictions)
end_time = time.time()
print('parallal run time:', end_time - start_time)

[ 1  1  1  2  2  2  2  2  3  3  3  3  3  4  4  4  4  4  5  5  5  5  5  5
  6  6  6  6  6  6  6  7  7  7  7  7  7  7  8  8  8  8  8  8  8  9  9  9
  9  9  9  9  9 10 10 10 10 10 10 10 10 10 11 11 11 11 11 11 11 11 11 11
 12 12 12 12 12 12 12 12 12 12 12 12 13 13 13 13 13 13 13 13 13 13 13 13
 13 13 13 13 13 13 13 13 13 13 14 14 14 15 15 15 15 15 16 16 16 16 16 17
 17 17 17 17 18 18 18 18 18 18 18 19 19 19 19 19 19 19 20 20 20 20 20 20
 20 21 21 21 21 21 21 21 22 22 22 22 22 22 22 23 23 23 23 23 23 23 23 23
 24 24 24 24 24 24 24 24 24 24 24 25 25 25 25 25 25 25 25 25 25 25 25 25
 26 26 26 26 26 26 26 26 26 26 26 26 26 26 26 27 27 27 28 28 28 28 28 29
 29 29 29 29 30 30 30 30 30 31 31 31 31 31 31 32 32 32 32 32 32 33 33 33
 33 33 33 33 34 34 34 34 34 34 34 35 35 35 35 35 35 35 35 36 36 36 36 36
 36 36 36 36 37 37 37 37 37 37 37 37 37 37 37 38 38 38 38 38 38 38 38 38
 38 38 38 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39 39]
parallal run time: 1.2797307968139648


In [210]:
del ndbscan