In [1]:
import numpy as np
import time

# Status
UNKNOWN = -1
NOISE = -2


In [121]:
class DBSCAN(object):
    """
    Base Class of DBSCAN, please do NOT instantiate this Class
    """

    def __init__(self, path):
        """
        DBSCAN Classes should be instantiate with data file path
        """
        self.m, _ = self._load_data(path)
        self.num_p = self.m.shape[0]
        self.tags = [UNKNOWN] * self.num_p

    def _load_data(self, path: str):
        with open(path, 'r') as f:
            data = []
            label = []
            for l in f.readlines():
                source = l.strip().split()
                data.append([float(val) for val in source[:2]])
                label.append(int(source[-1]))
            return np.array(data), np.array(label)
        
    def _get_dist(self, a, b, fast_mode: bool = False) -> float:
        """
        for float comparison, set all distance value precision to 5
        :param: a: int; index of given point in data matrix
        :param: b: same as a
        :param: fast_mode: bool -> if True, ignore sqrt() opration for distance
        """
        if fast_mode:
            result = np.power(self.m[b] - self.m[a], 2).sum()
        else:
            result = np.sqrt(np.power(self.m[b] - self.m[a], 2).sum())
        return round(result, 5)
    
    def predict(self, eps, min_pts, fast_mode=False) -> list:
        """
        return list of labels as the sequence in data matrix
        :param: m: np.matrix; N * 2 matrix recoding all nodes' coordinates
        :param: eps: float; the value of radius of density area
        :param: min_pts: int; least neighbours should be in a density area
        """

        cluster_id = 1
        for p_id in range(self.num_p):
            if self.tags[p_id] != UNKNOWN:
                continue
            if self._clustering(p_id, eps, min_pts, cluster_id, fast_mode):
                cluster_id += 1
        return np.array(self.tags)

In [122]:
class MatrixDBSCAN(DBSCAN):

    def __init__(self, path):
        super(MatrixDBSCAN, self).__init__(path)
        self._get_distance_matrix()     # self.dist_m will be created
#         del self.m

    def _get_distance_matrix(self):
        """
        Only once calculation will be on each point-pairs
        results will be stored in self.dist_m
        """

        self.dist_m = np.zeros((self.num_p, self.num_p))
        for p_id in range(self.num_p):
            for q_id in range(p_id, self.num_p):
                dist = self._get_dist(p_id, q_id)
                self.dist_m[q_id, p_id] = dist
                self.dist_m[p_id, q_id] = dist

    def _get_neighbours(self, p: int, eps: float, fast_mode=False) -> list:
        return list(np.nonzero(self.dist_m[p] < eps))

    def _clustering(self, p, eps, min_pts, cluster_id, fast_mode=False) -> bool:
        """
        TODO: There should be some optimizations for this part, current code is too ugly
        """

        neighbours = self._get_neighbours(p, eps, fast_mode)[0]
        if len(neighbours) < min_pts:
            self.tags[p] = NOISE
            return False
        else:
            self.tags[p] = cluster_id
            for idx in neighbours:
                self.tags[idx] = cluster_id
                
            while len(neighbours) > 0:
                sub_neighbours = self._get_neighbours(neighbours[0], eps, fast_mode)
                if len(sub_neighbours) >= min_pts:
                    for sub_n in sub_neighbours:
                        if self.tags[sub_n] < 0:
                            self.tags[sub_n] = cluster_id
                            if self.tags[sub_n] == UNKNOWN:
                                neighbours.append(sub_n)
                neighbours = neighbours[1:]
        return True
    

In [123]:
#### serial #####

src = 'spiral.txt'

print('Matrix DBSCAN:')
start_time = time.time()
mdbscan = MatrixDBSCAN(src)
end_time = time.time()
print(mdbscan.predict(2.5, 3))
print('parallal run time:', end_time - start_time)
del mdbscan

Matrix DBSCAN:
[ 1  2  2  2  3  3  3  4  4  4  5  5  5  6  6  6  7  7  7  8  8  8  9  9
  9  9 10 10 10 10 11 11 11 11 12 12 12 12 13 13 13 13 14 14 14 14 15 15
 15 15 15 16 16 16 16 16 17 17 17 17 17 18 18 18 18 18 19 19 19 19 19 19
 20 20 20 20 20 20 21 21 21 21 21 21 21 22 22 22 22 22 22 22 22 22 22 22
 22 22 22 22 22 22 22 22 22 22 23 24 24 24 25 25 25 26 26 26 27 27 27 28
 28 28 29 29 29 30 30 30 30 31 31 31 31 32 32 32 32 33 33 33 33 34 34 34
 34 35 35 35 35 36 36 36 36 37 37 37 37 37 38 38 38 38 38 39 39 39 39 39
 40 40 40 40 40 40 41 41 41 41 41 41 42 42 42 42 42 42 42 43 43 43 43 43
 43 43 44 44 44 44 44 44 44 44 44 44 44 44 44 45 46 46 46 47 47 47 48 48
 48 49 49 49 50 50 50 51 51 51 52 52 52 53 53 53 53 54 54 54 54 55 55 55
 55 56 56 56 56 57 57 57 57 58 58 58 58 59 59 59 59 59 60 60 60 60 60 61
 61 61 61 61 62 62 62 62 62 62 63 63 63 63 63 63 64 64 64 64 64 64 64 65
 65 65 65 65 65 65 66 66 66 66 66 66 66 66 66 66 66 66 66 66 66 66 66 66]
parallal run time: 0.58263993263244

In [108]:
class MatrixDBSCAN_parallel(object):

    def __init__(self, path, iterator):

        self.m, self.indices = self._load_data(path, iterator)
        self.num_p = self.m.shape[0]
        self.tags = [UNKNOWN] * self.num_p
        self._get_distance_matrix()     

    def _load_data(self, path: str, iterator):
        indices = []
        for i in iterator:
            indices.append(i)
            
        with open(path, 'r') as f:
            data = []
            count = 0
            for l in f.readlines():
                if count not in indices:
                    count += 1
                    continue
                source = l.strip().split()
                data.append([float(val) for val in source[:2]])
                count += 1
        
        return np.array(data), indices
    
    def _get_distance_matrix(self):

        self.dist_m = np.zeros((self.num_p, self.num_p))
        for p_id in range(self.num_p):
            for q_id in range(p_id, self.num_p):
                dist = self._get_dist(p_id, q_id)
                self.dist_m[q_id, p_id] = dist
                self.dist_m[p_id, q_id] = dist
                
    def _get_dist(self, a, b, fast_mode: bool = False) -> float:

        if fast_mode:
            result = np.power(self.m[b] - self.m[a], 2).sum()
        else:
            result = np.sqrt(np.power(self.m[b] - self.m[a], 2).sum())
        return round(result, 5)

In [125]:
def test_f(iterator):
    
    instance = MatrixDBSCAN_parallel(src, iterator)
    
    print(predict(instance, 2.5, 3))  

    return list(zip(instance.indices, instance.tags))

In [153]:
def get_result():
    predictions = [UNKNOWN] * basic.num_p
    base = 0
    for local_result in local_results:

        tuples = sorted(local_result[1])

    #     print('tuples',tuples)

        combined_c = {}

        for local_pred in tuples:
            if predictions[local_pred[0]] != UNKNOWN:
                old_c = predictions[local_pred[0]]
                combined_c[local_pred[1]] = old_c
                ## 如果有两点在此区中分为一个cluster，但曾被分为不同cluster，则这两点会被更新为同一个cluster,这个cluster是靠后的点的old_cluster

    #     print('combined_c',combined_c)


        new_tuples = []
        next_c = 1
        flag = 0

        for local_pred in tuples:
            if local_pred[1] in combined_c.keys():
                print('combined ', local_pred)
                new_tuples.append((local_pred[0], combined_c[local_pred[1]]))
            else:

                new_c = local_pred[1]

                if flag == 0:
                    if new_c >= next_c:
                        new_tuples.append((local_pred[0], next_c))
                    else:
                        print('error')
                else:
                    if last_c  == new_c:
                        new_tuples.append((local_pred[0], next_c))
                    elif new_c >= last_c + 1:
                        next_c += 1
                        new_tuples.append((local_pred[0], next_c))
                    else:
                        print('error')

                last_c = new_c

                if flag == 0:
                    flag = 1


    #     print('new_tuples',new_tuples)

        max_c_num = 0

        for local_pred in new_tuples:
            local_c = local_pred[1]
            if predictions[local_pred[0]] == UNKNOWN:
                predictions[local_pred[0]] = local_c + base
                if local_c > max_c_num:
                    max_c_num = local_c


        base = max_c_num
        
    return predictions

In [155]:
print('Matrix DBSCAN:')
basic = MatrixDBSCAN(src)

start_time = time.time()

low = np.min(basic.m, axis=0)[0]
upper = np.max(basic.m, axis=0)[0]

n_partitions = 4

bins = np.linspace(low, upper, num=n_partitions, endpoint=False)

edges = np.array(range(basic.num_p))

edge_partitions = []

for e in edges: 
#     print(test.m[e])
    for i in range(n_partitions-1, -1, -1):
#         print(test.m[e][0], bins[i])
        if test.m[e][0]>=bins[i]-min_pts:
            edge_partitions.append((i, e))
            break

local_results = sc.parallelize(edge_partitions).groupByKey().mapValues(lambda xs: test_f(xs)).collect()

end_time = time.time()

print(get_result())

print('parallal run time:', end_time - start_time)

Matrix DBSCAN:
[20, 21, 21, 21, 22, 22, 22, 23, 23, 23, 24, 24, 24, 18, 19, 19, 19, 20, 20, 20, 20, 7, 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 11, 11, 11, 12, 12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 21, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 25, 26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 24, 24, 24, 25, 25, 25, 28, 29, 29, 29, 30, 30, 30, 31, 31, 31, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 36, 36, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 39, 39, 39, 39, 39, 26, 27, 27, 27, 27, 28, 28, 28, 28, 28, 28, 28, 16, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 29, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6, 6, 6, 21, 22, 22, 22, 23, 23, 23, 23, 23, 23, 31, 32, 32, 32, 32, 33, 33, 33, 33, 33, 40, 41, 41, 41, 41, 42, 42, 42,

In [None]:
del basic