In [1]:
%load_ext lab_black

In [2]:
import numpy as np
import math
import pandas as pd
import itertools
import copy
from sklearn.cluster import DBSCAN

In [3]:
from random import choice

In [4]:
np.random.seed(42)

# Constants and Data Loading

In [5]:
np.set_printoptions(precision=2)
np.set_printoptions(suppress=True)

In [6]:
max_number_of_tracks = 232
max_number_of_tracks_power_2 = 256
max_number_of_tracks_log_2 = 8
batch_size = 50
eps = 0.15

z0_file = "/media/lucas/QS/binaries-trk/OldKF_TTbar_170K_quality-1-trk-z0.bin"
pt_file = "/media/lucas/QS/binaries-trk/OldKF_TTbar_170K_quality-1-trk-pt.bin"
z0 = np.fromfile(z0_file, dtype=np.float32)
pt = np.fromfile(pt_file, dtype=np.float32)

### Class Initialization

In [7]:
class BatchedDBSCAN:
    def __init__(
        self,
        z0,
        pt,
        eps,
        batch_size,
        max_number_of_tracks,
        verbose: bool = False,
        save_intermediate: bool = False,
    ):

        self.eps = eps
        self.batch_size = batch_size
        self.verbose = verbose
        self.save_intermediate = save_intermediate
        self.z0_boundary = 21  # 21 cm is outside the detector acceptance
        self.pt_boundary = 0  # 0 pT won't contribute to the pT sum.
        self.minPts = 2  # This algorithm only works for a minimum number of 2 points

        self.max_number_of_tracks = int(max_number_of_tracks)
        self.n_batches = math.ceil(self.max_number_of_tracks / self.batch_size)

        # Max number of tracks including all batches
        self.max_n_tracks_batched = self.batch_size * self.n_batches
        self.max_n_clusters_batch = math.ceil(self.batch_size / self.minPts)
        self.max_n_clusters = math.ceil(self.max_n_tracks_batched / self.minPts)

        # Need to pad vectors to the max_number_of_tracks allowed so that it matches the fpga input
        n_pad = self.max_number_of_tracks - z0.shape[0]
        # if verbose:
        # print("original number of tracks: ", z0.shape)
        self.z0 = self.pad_vector(z0, n_pad, self.z0_boundary)
        self.pt = self.pad_vector(pt, n_pad, self.pt_boundary)

        # These are needed for the prefix sum
        self.max_number_of_tracks_power_2 = (
            1 << (self.max_number_of_tracks - 1).bit_length()
        )
        self.batch_size_power_2 = 1 << (self.batch_size - 1).bit_length()
        self.max_number_of_tracks_log_2 = np.log2(self.max_number_of_tracks_power_2)
        self.batch_size_log_2 = np.log2(self.batch_size_power_2)
        # self.n_batches = math.ceil(self.max_number_of_tracks / self.batch_size)

    def pad_vector(self, vec, n_pad, value):
        """pads vector to a set size with given value"""

        vec_to_pad = value * np.ones(n_pad)
        vec = np.append(vec, vec_to_pad)

        return vec

    def build_tracks(self, z0, pt):
        """Builds tracks batchess"""

        # Shape is determined by the size of batch, z0, pT and label (not used atm)
        track_batch = np.zeros((self.batch_size, 3))

        track_batch[:, 0] = z0
        track_batch[:, 1] = pt

        # sort the tracks by z0
        track_batch = track_batch[track_batch[:, 0].argsort()]

        return track_batch

### Prefix sum

In [8]:
def prefix_sum(self, arr):
    """
    Calculates the prefix sum of pT.
    Warning, requires array to be of size thats log base of 2.
    """
    size_log2 = int(np.log2(arr.shape[0]))

    # up-sweep
    for d in range(0, size_log2, 1):
        step_size = 2**d
        double_step_size = step_size * 2

        for i in range(0, arr.shape[0], double_step_size):
            arr[i + double_step_size - 1] += arr[i + step_size - 1]

    # down-sweep
    arr[arr.shape[0] - 1] = 0
    d = size_log2 - 1

    while d >= 0:
        step_size = 2**d
        double_step_size = step_size * 2
        for i in range(0, arr.shape[0], double_step_size):
            tmp = arr[i + step_size - 1]
            arr[i + step_size - 1] = arr[i + double_step_size - 1]
            arr[i + double_step_size - 1] += tmp
        d -= 1

    return arr


BatchedDBSCAN.prefix_sum = prefix_sum

### left boundaries

In [9]:
def find_left_boundaries(self, tracks):

    left_boundaries = np.zeros(self.batch_size, dtype=bool)

    # first value is always a left boundary
    left_boundaries[0] = 1

    for i in range(1, self.batch_size):
        _t = tracks[i]

        if _t[0] - tracks[i - 1][0] > self.eps:
            tracks[i][2] = -1
            left_boundaries[i] = 1
        else:
            left_boundaries[i] = 0

    self.left_boundaries = left_boundaries
    return left_boundaries


BatchedDBSCAN.find_left_boundaries = find_left_boundaries

### right boundaries

In [10]:
def find_right_boundaries(self, left_boundaries, rs, tracks):

    max_tracks = self.batch_size

    boundaries = np.zeros((max_tracks, 6))
    is_noise = np.zeros((max_tracks, 1))

    for i in range(max_tracks - 1):

        left_edge = left_boundaries[i] and not (left_boundaries[i + 1])  # 1, 0
        right_edge = not (left_boundaries[i]) and left_boundaries[i + 1]  # 0, 1
        check_noise = (left_boundaries[i] == 1) and (left_boundaries[i + 1] == 1)

        if left_edge or right_edge:
            boundaries[i][0] = i
            boundaries[i][1] = rs[i]
            boundaries[i][2] = rs[i + 1]
            boundaries[i][3] = rs[i + 1] - rs[i]
            boundaries[i][4] = tracks[i, 0]
            boundaries[i][5] = tracks[i + 1, 0]
        elif check_noise:
            boundaries[i][0] = i
            boundaries[i][1] = rs[i]
            boundaries[i][2] = rs[i + 1]
            boundaries[i][3] = rs[i + 1] - rs[i]
            boundaries[i][4] = tracks[i, 0]
            boundaries[i][5] = tracks[i, 0]
            is_noise[i] = 1
        else:
            boundaries[i][0] = max_tracks
            boundaries[i][1] = 0
            boundaries[i][2] = 0
            boundaries[i][3] = 0
            boundaries[i][4] = 21
            boundaries[i][5] = 21

    # Check for the last boundary
    if left_boundaries[max_tracks - 1]:
        boundaries[max_tracks - 1][0] = max_tracks
        boundaries[max_tracks - 1][1] = 0
        boundaries[max_tracks - 1][2] = 0
        boundaries[max_tracks - 1][3] = 0
        boundaries[max_tracks - 1][4] = 21
        boundaries[max_tracks - 1][5] = 21
    else:
        boundaries[max_tracks - 1][0] = max_tracks - 1
        boundaries[max_tracks - 1][1] = rs[max_tracks - 1]
        boundaries[max_tracks - 1][2] = rs[max_tracks]
        boundaries[max_tracks - 1][3] = rs[max_tracks] - rs[max_tracks - 1]
        boundaries[max_tracks - 1][4] = tracks[max_tracks - 1, 0]
        boundaries[max_tracks - 1][5] = tracks[max_tracks - 1, 0]

    # Sort boundaries by the index
    sort_idx = boundaries[:, 0].argsort()
    boundaries = boundaries[sort_idx]
    is_noise = is_noise[sort_idx]
    self.is_noise = is_noise

    return boundaries


BatchedDBSCAN.find_right_boundaries = find_right_boundaries

### Convert boundaries to cluster

In [11]:
def convert_boundaries_to_clusters(self, boundaries: np.array) -> np.array:
    n_boundaries = boundaries.shape[0]
    n_clusters = math.ceil(n_boundaries / 2)
    clusters = np.zeros((n_clusters, 6))
    j = 0
    for i in range(0, n_boundaries, 2):
        pt_low = boundaries[i, 1]
        pt_high = boundaries[i + 1, 2]
        pt_sum = pt_high - pt_low
        z0_low = boundaries[i, 4]
        z0_high = boundaries[i + 1, 5]

        clusters[j, 3] = pt_sum
        clusters[j, 4] = z0_low
        clusters[j, 5] = z0_high
        j += 1
    return clusters


BatchedDBSCAN.convert_boundaries_to_clusters = convert_boundaries_to_clusters

### get vertex

In [12]:
def get_vertex(self, cluster_of_tracks: np.array) -> float:
    """
    Calculates the median z0 of the cluster of tracks
    """

    n_size = cluster_of_tracks.shape[0]

    if n_size % 2 == 0:
        return 0.5 * (
            cluster_of_tracks[n_size // 2] + cluster_of_tracks[n_size // 2 - 1]
        )
    else:
        return cluster_of_tracks[n_size // 2]


BatchedDBSCAN.get_vertex = get_vertex

### merge clusters

In [13]:
def merge_clusters(self, clusters: np.array) -> np.array:

    n_clusters = clusters.shape[0]
    if self.n_batches == 1:
        self.max_pt_i = np.argmax(clusters[:, 3])
        self.max_pt = clusters[self.max_pt_i, 3]
        self.merge_count = 0
        return clusters

    else:
        max_pt = 0
        max_pt_i = 0
        merge_count = 0

        comb = list(itertools.combinations(range(n_clusters), 2))

        for i, j in comb:
            if clusters[i, 4] >= 21:
                continue

            if max_pt < clusters[i, 3]:
                max_pt = clusters[i, 3]
                max_pt_i = i

            if clusters[j, 4] >= 21:
                continue

            case1 = (clusters[i, 4] - self.eps) <= clusters[j, 5]
            case2 = (clusters[i, 5] + self.eps) >= clusters[j, 4]

            if case1 and case2:
                c1 = copy.copy(clusters[i, :])
                c2 = copy.copy(clusters[j, :])

                merge_count += 1
                # Expand boundaries of cluster after merging
                if clusters[j, 4] < clusters[i, 4]:
                    clusters[i, 4] = clusters[j, 4]
                if clusters[j, 5] > clusters[i, 5]:
                    clusters[i, 5] = clusters[j, 5]
                clusters[i, 3] += clusters[j, 3]
                clusters[i, 2] += clusters[j, 2]
                if max_pt < clusters[i, 3]:
                    max_pt = clusters[i, 3]
                    max_pt_i = i

                clusters[j, 3] = 0
                clusters[j, 4] = 21
                clusters[j, 5] = 21

                print(
                    f"""merging cluster [{round(c1[4],2), round(c1[5],2), round(c1[3],2)}] and [{round(c2[4],2), round(c2[5],2), round(c2[3],2)}] --> [{round(clusters[i,4],2), round(clusters[i,5], 2), round(clusters[i,3],2)}]"""
                )

        self.max_pt = max_pt
        self.max_pt_i = max_pt_i
        self.merge_count = merge_count
        return clusters


BatchedDBSCAN.merge_clusters = merge_clusters

### initialize clusters

In [14]:
def initialize_clusters(self, max_n_clusters: int) -> np.array:

    clusters = np.zeros((max_n_clusters, 6))
    clusters[:, 4] = 21
    clusters[:, 5] = 21

    return clusters


BatchedDBSCAN.initialize_clusters = initialize_clusters

### fit sklearn for test

In [15]:
def fitsklearn(self):
    start_idx = 0
    end_idx = start_idx + self.batch_size
    n_pad = (self.n_batches * self.batch_size) - self.z0.shape[0]
    self.z0 = self.pad_vector(self.z0, n_pad, 21)
    self.pt = self.pad_vector(self.pt, n_pad, 0)

    clusters_df = pd.DataFrame({})
    clusters = self.initialize_clusters(self.max_n_clusters)

    for i in range(self.n_batches):
        start_idx = i * self.batch_size
        end_idx = (i + 1) * self.batch_size
        z0_batch = self.z0[start_idx:end_idx]
        pt_batch = self.pt[start_idx:end_idx]

        _db = DBSCAN(eps=0.15, min_samples=2).fit(z0_batch.reshape(-1, 1))

        _results = pd.DataFrame({"z0": z0_batch, "pt": pt_batch, "label": _db.labels_})
        max_label = _results.label.max()
        n_noise = _results[_results.label == -1].shape[0]

        _results.loc[_results.label == -1, "label"] = np.arange(n_noise) + max_label + 1

        clusters_batch = _results.groupby(["label"]).agg(
            {"z0": [np.min, np.max], "pt": [np.sum, "count"]}
        )
        clusters_batch.columns = ["z0_min", "z0_max", "pt_sum", "ntracks"]
        clusters_df = pd.concat([clusters_df, clusters_batch])

    n_clusters = clusters_df.shape[0]
    clusters[0:n_clusters, 2] = clusters_df["ntracks"]
    clusters[0:n_clusters, 3] = clusters_df["pt_sum"]
    clusters[0:n_clusters, 4] = clusters_df["z0_min"]
    clusters[0:n_clusters, 5] = clusters_df["z0_max"]

    self.clusters_merged = self.merge_clusters(clusters)


BatchedDBSCAN.fitsklearn = fitsklearn

### fit

In [16]:
def fit(self):

    np.set_printoptions(precision=2)
    np.set_printoptions(suppress=True)

    start_idx = 0
    end_idx = start_idx + self.batch_size
    # Need to pad vectors to match the size of n_batches*batch_size
    n_pad = (self.n_batches * self.batch_size) - self.z0.shape[0]
    self.z0 = self.pad_vector(self.z0, n_pad, 21)
    self.pt = self.pad_vector(self.pt, n_pad, 0)

    clusters = self.initialize_clusters(self.max_n_clusters)

    pv_cluster = np.zeros((1, 6))
    merge_count = 0
    for i in range(self.n_batches):

        start_idx = i * self.batch_size
        end_idx = (i + 1) * self.batch_size

        z0_batch = self.z0[start_idx:end_idx]
        pt_batch = self.pt[start_idx:end_idx]

        track_batch = self.build_tracks(z0_batch, pt_batch)
        self.tracks = track_batch

        rs_batch = self.pad_vector(
            track_batch[:, 1], self.batch_size_power_2 - self.batch_size, 0
        )

        # rs_batch = np.cumsum(rs_batch)
        rs_batch = self.prefix_sum(rs_batch)
        self.rs = rs_batch
        if self.save_intermediate:
            np.save("rs_batch.npy", rs_batch)
            np.save("pt_batch.npy", pt_batch)
            np.save("track_batch.npy", track_batch)

        left_boundaries = self.find_left_boundaries(track_batch)
        if self.save_intermediate:
            np.save("left_boundaries_b.npy", left_boundaries)

        boundaries = self.find_right_boundaries(left_boundaries, rs_batch, track_batch)

        self.boundaries = boundaries

        if self.save_intermediate:
            np.save("right_boundaries_b.npy", boundaries)

        clusters_batch = self.convert_boundaries_to_clusters(boundaries)

        clusters[
            i * self.max_n_clusters_batch : (i + 1) * self.max_n_clusters_batch, :
        ] = clusters_batch

        if track_batch[-1, 0] == 21:
            break

    clusters = self.merge_clusters(clusters)

    self.clusters = clusters

    # Find pv_cluster
    pv_cluster[0, :] = clusters[self.max_pt_i, :]

    print(self.max_pt, self.max_pt_i)
    print(f"Merged count: {self.merge_count}")

    pv_tracks = []

    for i in range(self.max_number_of_tracks):
        z0_trk = self.z0[i]

        if (z0_trk >= pv_cluster[0, 4]) and (z0_trk <= pv_cluster[0, 5]):
            pv_tracks.append(z0_trk)

    median_vertex = self.get_vertex(np.array(pv_tracks))
    self.z0_pv = np.median(pv_tracks)

    print(f"mean: {np.mean(pv_tracks)}")
    print(f"median: {np.median(pv_tracks)}")
    print(f"median2: {median_vertex}")


BatchedDBSCAN.fit = fit

### Testing

In [17]:
z0.shape

(139,)

In [46]:
batch_size = 100

In [47]:
db = BatchedDBSCAN(z0, pt, eps, batch_size, max_number_of_tracks, True, True)

db.fit()

merging cluster [(-9.2, -6.09, 5.04)] and [(-8.26, -4.16, 5.25)] --> [(-9.2, -4.16, 10.29)]
merging cluster [(-5.62, -5.1, 7.96)] and [(-5.1, -4.92, 4.66)] --> [(-5.62, -4.92, 12.62)]
merging cluster [(-5.62, -4.92, 12.62)] and [(-4.92, -4.69, 4.85)] --> [(-5.62, -4.69, 17.47)]
merging cluster [(-4.22, -4.04, 6.44)] and [(-4.04, -3.69, 4.33)] --> [(-4.22, -3.69, 10.77)]
merging cluster [(-4.22, -3.69, 10.77)] and [(-3.69, -3.16, 97.55)] --> [(-4.22, -3.16, 108.32)]
merging cluster [(-4.22, -3.16, 108.32)] and [(-3.16, -2.58, 14.88)] --> [(-4.22, -2.58, 123.2)]
merging cluster [(-4.22, -2.58, 123.2)] and [(-2.58, -1.05, 93.46)] --> [(-4.22, -1.05, 216.66)]
merging cluster [(-4.22, -1.05, 216.66)] and [(-1.05, -0.76, 5.1)] --> [(-4.22, -0.76, 221.76)]
merging cluster [(-4.22, -0.76, 221.76)] and [(-0.76, -0.59, 5.15)] --> [(-4.22, -0.59, 226.91)]
merging cluster [(-4.22, -0.59, 226.91)] and [(-3.81, -2.81, 7.36)] --> [(-4.22, -0.59, 234.27)]
merging cluster [(-4.22, -0.59, 234.27)] and [

In [48]:
track_batch = pd.DataFrame(np.load("track_batch.npy"), columns=["z0", "pt", "label"])

In [49]:
track_batch

Unnamed: 0,z0,pt,label
0,-8.261719,2.654077,0.0
1,-4.160156,2.60061,0.0
2,-3.808594,5.222015,0.0
3,-3.75,2.135001,0.0
4,-2.8125,2.4072,0.0
5,-2.695312,2.06357,0.0
6,-2.460938,3.462884,0.0
7,-2.285156,2.286245,0.0
8,-2.285156,2.94561,0.0
9,-1.757812,2.941708,0.0


In [50]:
rs_batch = np.load("rs_batch.npy")

In [51]:
rs_batch

array([  0.  ,   2.65,   5.25,  10.48,  12.61,  15.02,  17.08,  20.55,
        22.83,  25.78,  28.72,  39.36,  41.46,  43.77,  46.29,  49.34,
        51.85,  55.53,  57.7 ,  60.69,  63.24,  65.42,  67.51,  69.98,
        72.34,  74.31,  76.49,  78.96,  81.64,  83.59,  86.82,  91.01,
        93.99,  96.42, 100.49, 106.8 , 111.05, 113.8 , 116.05, 119.21,
       119.21, 119.21, 119.21, 119.21, 119.21, 119.21, 119.21, 119.21,
       119.21, 119.21, 119.21, 119.21, 119.21, 119.21, 119.21, 119.21,
       119.21, 119.21, 119.21, 119.21, 119.21, 119.21, 119.21, 119.21,
       119.21, 119.21, 119.21, 119.21, 119.21, 119.21, 119.21, 119.21,
       119.21, 119.21, 119.21, 119.21, 119.21, 119.21, 119.21, 119.21,
       119.21, 119.21, 119.21, 119.21, 119.21, 119.21, 119.21, 119.21,
       119.21, 119.21, 119.21, 119.21, 119.21, 119.21, 119.21, 119.21,
       119.21, 119.21, 119.21, 119.21, 119.21, 119.21, 119.21, 119.21,
       119.21, 119.21, 119.21, 119.21, 119.21, 119.21, 119.21, 119.21,
      

In [55]:
bound = pd.DataFrame(
    db.boundaries, columns=["idx", "pt_i", "pt_i+1", "pt_sum", "z0_i", "z0_i+1"]
)

In [56]:
bound["is_noise"] = db.is_noise

In [57]:
pd.set_option("display.max_rows", None)

In [58]:
pt[(z0 < -9.18) & (z0 > -9.20)]

array([2.81], dtype=float32)

In [59]:
df = pd.DataFrame({"z0": z0, "pt": pt})

In [60]:
track_batch.head(20)

Unnamed: 0,z0,pt,label
0,-8.261719,2.654077,0.0
1,-4.160156,2.60061,0.0
2,-3.808594,5.222015,0.0
3,-3.75,2.135001,0.0
4,-2.8125,2.4072,0.0
5,-2.695312,2.06357,0.0
6,-2.460938,3.462884,0.0
7,-2.285156,2.286245,0.0
8,-2.285156,2.94561,0.0
9,-1.757812,2.941708,0.0


In [61]:
bound.head(20)

Unnamed: 0,idx,pt_i,pt_i+1,pt_sum,z0_i,z0_i+1,is_noise
0,0.0,0.0,2.654077,2.654077,-8.261719,-8.261719,1.0
1,1.0,2.654077,5.254687,2.60061,-4.160156,-4.160156,1.0
2,2.0,5.254687,10.476703,5.222015,-3.808594,-3.75,0.0
3,3.0,10.476703,12.611704,2.135001,-3.75,-2.8125,0.0
4,4.0,12.611704,15.018903,2.4072,-2.8125,-2.695312,0.0
5,5.0,15.018903,17.082473,2.06357,-2.695312,-2.460938,0.0
6,6.0,17.082473,20.545357,3.462884,-2.460938,-2.460938,1.0
7,7.0,20.545357,22.831601,2.286245,-2.285156,-2.285156,0.0
8,8.0,22.831601,25.777212,2.94561,-2.285156,-1.757812,0.0
9,9.0,25.777212,28.718919,2.941708,-1.757812,-1.757812,0.0


In [62]:
def convert_boundaries_to_clusters(boundaries: np.array) -> np.array:
    n_boundaries = boundaries.shape[0]
    n_clusters = math.ceil(n_boundaries / 2)
    clusters = np.zeros((n_boundaries, 7))
    j = 0
    i = 0
    while i < n_boundaries:
        check_noise = boundaries[i, -1] == 1
        if check_noise:

            pt_low = boundaries[i, 1]
            pt_high = boundaries[i, 2]
            pt_sum = pt_high - pt_low
            z0_low = boundaries[i, 4]
            z0_high = boundaries[i, 5]

            clusters[j, 3] = pt_sum
            clusters[j, 4] = z0_low
            clusters[j, 5] = z0_high
            clusters[j, 6] = 1
            j += 1
            i += 1
        else:
            pt_low = boundaries[i, 1]
            pt_high = boundaries[i + 1, 2]
            pt_sum = pt_high - pt_low
            z0_low = boundaries[i, 4]
            z0_high = boundaries[i + 1, 4]

            clusters[j, 3] = pt_sum
            clusters[j, 4] = z0_low
            clusters[j, 5] = z0_high
            j += 1
            i += 2
    return clusters

In [63]:
clusters = pd.DataFrame(
    convert_boundaries_to_clusters(bound.values),
    columns=["i", "j", "k", "pt_sum", "z0_low", "z0_high", "noise"],
)

IndexError: index 100 is out of bounds for axis 0 with size 100

In [64]:
clusters

Unnamed: 0,i,j,k,pt_sum,z0_low,z0_high,noise
0,0.0,0.0,0.0,2.812771,-9.199219,-9.199219,1.0
1,0.0,0.0,0.0,2.654077,-8.261719,-8.261719,1.0
2,0.0,0.0,0.0,2.223069,-6.09375,-6.09375,1.0
3,0.0,0.0,0.0,7.957812,-5.625,-5.507812,0.0
4,0.0,0.0,0.0,2.698591,-5.097656,-5.097656,1.0
5,0.0,0.0,0.0,4.852124,-4.921875,-4.921875,0.0
6,0.0,0.0,0.0,1.958026,-4.6875,-4.6875,1.0
7,0.0,0.0,0.0,13.367491,-4.21875,-4.042969,0.0
8,0.0,0.0,0.0,104.907081,-3.808594,-3.398438,0.0
9,0.0,0.0,0.0,142.023399,-3.164062,-1.40625,0.0


In [34]:
def merge_clusters2(self, clusters: np.array) -> np.array:

    n_clusters = clusters.shape[0]
    if self.n_batches == 1:
        self.max_pt_i = np.argmax(clusters[:, 3])
        self.max_pt = clusters[self.max_pt_i, 3]
        self.merge_count = 0
        return clusters

    else:
        max_pt = 0
        max_pt_i = 0
        merge_count = 0

        comb = list(itertools.combinations(range(n_clusters), 2))

        for i, j in comb:
            if clusters[i, 4] >= 21:
                continue

            if max_pt < clusters[i, 3]:
                max_pt = clusters[i, 3]
                max_pt_i = i

            if clusters[j, 4] >= 21:
                continue

            case1 = (clusters[i, 4] - self.eps) <= clusters[j, 5]
            case2 = (clusters[i, 5] + self.eps) >= clusters[j, 4]

            if case1 and case2:
                c1 = copy.copy(clusters[i, :])
                c2 = copy.copy(clusters[j, :])
                c1_noise = clusters[i, -1]
                c2_noise = clusters[j, -1]

                if c1_noise:
                    clusters[i, -1] = 0

                merge_count += 1
                # Expand boundaries of cluster after merging
                if clusters[j, 4] < clusters[i, 4]:
                    clusters[i, 4] = clusters[j, 4]
                if clusters[j, 5] > clusters[i, 5]:
                    clusters[i, 5] = clusters[j, 5]
                clusters[i, 3] += clusters[j, 3]
                clusters[i, 2] += clusters[j, 2]
                if max_pt < clusters[i, 3]:
                    max_pt = clusters[i, 3]
                    max_pt_i = i

                clusters[j, 3] = 0
                clusters[j, 4] = 21
                clusters[j, 5] = 21

                print(
                    f"""merging cluster [{round(c1[4],2), round(c1[5],2), round(c1[3],2)}] and [{round(c2[4],2), round(c2[5],2), round(c2[3],2)}] --> [{round(clusters[i,4],2), round(clusters[i,5], 2), round(clusters[i,3],2)}]"""
                )

        self.max_pt = max_pt
        self.max_pt_i = max_pt_i
        self.merge_count = merge_count
        return clusters


BatchedDBSCAN.merge_clusters2 = merge_clusters2

In [41]:
c_merged = db.merge_clusters2(clusters.values)

In [45]:
pd.DataFrame(c_merged, columns=["i", "j", "k", "pt_sum", "z0_low", "z0_high", "noise"])

Unnamed: 0,i,j,k,pt_sum,z0_low,z0_high,noise
0,0.0,0.0,0.0,2.812771,-9.199219,-9.199219,1.0
1,0.0,0.0,0.0,2.654077,-8.261719,-8.261719,1.0
2,0.0,0.0,0.0,2.223069,-6.09375,-6.09375,1.0
3,0.0,0.0,0.0,7.957812,-5.625,-5.507812,0.0
4,0.0,0.0,0.0,2.698591,-5.097656,-5.097656,1.0
5,0.0,0.0,0.0,4.852124,-4.921875,-4.921875,0.0
6,0.0,0.0,0.0,1.958026,-4.6875,-4.6875,1.0
7,0.0,0.0,0.0,13.367491,-4.21875,-4.042969,0.0
8,0.0,0.0,0.0,104.907081,-3.808594,-3.398438,0.0
9,0.0,0.0,0.0,142.023399,-3.164062,-1.40625,0.0


# DEBUG

In [98]:
for i in range(0, 10, 2):
    print(i, i+1)

0 1
2 3
4 5
6 7
8 9


In [164]:
bound

Unnamed: 0,idx,pt_i,pt_i+1,pt_sum,z0_i,z0_i+1,is_noise
0,0.0,2.812771,2.812771,2.812771,-9.199219,-9.199219,1.0
1,1.0,5.466848,5.466848,5.466848,-8.261719,-8.261719,1.0
2,2.0,7.689916,7.689916,7.689916,-6.09375,-6.09375,1.0
3,3.0,12.516875,15.647728,3.130853,-5.625,-5.507812,0.0
4,4.0,15.647728,18.346319,2.698591,-5.507812,-5.097656,0.0
5,5.0,18.346319,18.346319,18.346319,-5.097656,-5.097656,1.0
6,6.0,20.305036,23.198443,2.893408,-4.921875,-4.921875,0.0
7,7.0,23.198443,25.156469,1.958026,-4.921875,-4.6875,0.0
8,8.0,25.156469,25.156469,25.156469,-4.6875,-4.6875,1.0
9,9.0,29.242595,31.592723,2.350128,-4.21875,-4.21875,0.0


In [166]:
df.sort_values(by='z0')

Unnamed: 0,z0,pt
93,-9.199219,2.812771
129,-8.261719,2.654077
98,-6.09375,2.223069
16,-5.625,4.826959
41,-5.507812,3.130853
9,-5.097656,2.698591
36,-4.921875,1.958716
26,-4.921875,2.893408
30,-4.6875,1.958026
4,-4.21875,2.350128


In [161]:
def convert_boundaries_to_clusters2(boundaries: np.array) -> np.array:
    n_boundaries = boundaries.shape[0]
    n_clusters = math.ceil(n_boundaries / 2)
    clusters = np.zeros((n_clusters, 6))
    j = 0
    for i in range(0, n_boundaries, 2):
        pt_low = boundaries[i, 1]
        pt_high = boundaries[i + 1, 2]
        pt_sum = pt_high - pt_low
        z0_low = boundaries[i, 4]
        z0_high = boundaries[i + 1, 5]

        clusters[j, 3] = pt_sum
        clusters[j, 4] = z0_low
        clusters[j, 5] = z0_high
        j += 1
    return clusters

In [217]:
pd.DataFrame(bound2)

Unnamed: 0,0,1,2,3,4,5,6
0,0.0,2.812771,2.812771,2.812771,-9.199219,-9.199219,1.0
1,1.0,5.466848,5.466848,5.466848,-8.261719,-8.261719,1.0
2,2.0,7.689916,7.689916,7.689916,-6.09375,-6.09375,1.0
3,3.0,12.516875,15.647728,3.130853,-5.625,-5.507812,0.0
4,4.0,15.647728,18.346319,2.698591,-5.507812,-5.097656,0.0
5,5.0,18.346319,18.346319,18.346319,-5.097656,-5.097656,1.0
6,6.0,20.305036,23.198443,2.893408,-4.921875,-4.921875,0.0
7,7.0,23.198443,25.156469,1.958026,-4.921875,-4.6875,0.0
8,8.0,25.156469,25.156469,25.156469,-4.6875,-4.6875,1.0
9,9.0,29.242595,31.592723,2.350128,-4.21875,-4.21875,0.0


In [162]:
bound2 = bound.values
n_boundaries = bound2.shape[0]
n_clusters = math.ceil(n_boundaries/2)
clusters = np.zeros((n_clusters, 6))
is_noise = db.is_noise

i = 0

for j in range(n_clusters):
    
    if is_noise[i]:
        z0_low = bound2[i, 4]
        z0_high = bound2[i, 5]
        pt = bound2[i, 1]
        i+=1
    else:
        z0_low = bound2[i, 4]
        z0_high = bound2[i+1, 5]
        pt = bound2[i+1, 2] - bound2[i, 1]
        i+=2
    clusters[j, 3] = pt
    clusters[j, 4] = z0_low
    clusters[j, 5] = z0_high



# j = 0 
# for i in range(0, n_boundaries, 2):
#     pt_low = bound2[i, 1]
#     pt_high = bound2[i+1, 2]
#     pt_sum = pt_high - pt_low
#     z0_low = bound2[i, 4]
#     z0_high =bound2[i+1, 5]
    
#     clusters[j, 3] = pt_sum
#     clusters[j, 4] = z0_low
#     clusters[j, 5] = z0_high
#     j += 1

In [168]:
df_c = pd.DataFrame(clusters[:, 3:], columns=["pt_sum", "z0_min", "z0_max"])

In [169]:
df_c

Unnamed: 0,pt_sum,z0_min,z0_max
0,2.812771,-9.199219,-9.199219
1,5.466848,-8.261719,-8.261719
2,7.689916,-6.09375,-6.09375
3,5.829444,-5.625,-5.097656
4,18.346319,-5.097656,-5.097656
5,4.851433,-4.921875,-4.6875
6,25.156469,-4.6875,-4.6875
7,14.50338,-4.21875,-3.808594
8,102.91615,-3.808594,-3.164062
9,140.995093,-3.164062,-1.054688


In [125]:
bound2

array([[  0.  ,   2.81,   5.47, ...,  -9.2 ,  -9.2 ,   1.  ],
       [  1.  ,   5.47,   7.69, ...,  -8.26,  -8.26,   1.  ],
       [  2.  ,   7.69,  12.52, ...,  -6.09,  -6.09,   1.  ],
       ...,
       [232.  ,   0.  ,   0.  , ...,  21.  ,  21.  ,   0.  ],
       [232.  ,   0.  ,   0.  , ...,  21.  ,  21.  ,   0.  ],
       [232.  ,   0.  ,   0.  , ...,  21.  ,  21.  ,   0.  ]])

In [127]:
df.sort_values(by='z0')

Unnamed: 0,z0,pt
93,-9.199219,2.812771
129,-8.261719,2.654077
98,-6.09375,2.223069
16,-5.625,4.826959
41,-5.507812,3.130853
9,-5.097656,2.698591
36,-4.921875,1.958716
26,-4.921875,2.893408
30,-4.6875,1.958026
4,-4.21875,2.350128


In [195]:
rs = np.cumsum(db.pt)

In [235]:
def find_right_boundaries2(left_boundaries, rs, tracks):

    max_tracks = left_boundaries.shape[0]

    boundaries = np.zeros((max_tracks, 7))
    is_noise = np.zeros((max_tracks, 1))

    for i in range(max_tracks - 1):

        check1 = left_boundaries[i] and not (left_boundaries[i + 1])
        check2 = not (left_boundaries[i]) and left_boundaries[i + 1]
        check_noise = (left_boundaries[i] == 1) and (left_boundaries[i + 1] == 1)

        if check1 or check2:
            boundaries[i][0] = i
            boundaries[i][1] = rs[i]
            boundaries[i][2] = rs[i + 1]
            boundaries[i][3] = rs[i + 1] - rs[i]
            boundaries[i][4] = tracks[i, 0]
            boundaries[i][5] = tracks[i + 1, 0]
            boundaries[i][6] = 0
        #         elif check_noise:
        #             boundaries[i][0] = i
        #             boundaries[i][1] = rs[i]
        #             boundaries[i][2] = rs[i + 1]
        #             boundaries[i][3] = rs[i + 1] - rs[i]
        #             boundaries[i][4] = tracks[i, 0]
        #             boundaries[i][5] = tracks[i, 0]
        #             boundaries[i][6] = 1

        #             boundaries[i + 1][0] = i
        #             boundaries[i + 1][1] = rs[i]
        #             boundaries[i + 1][2] = rs[i + 1]
        #             boundaries[i + 1][3] = rs[i + 1] - rs[i]
        #             boundaries[i + 1][4] = tracks[i, 0]
        #             boundaries[i + 1][5] = tracks[i, 0]
        #             boundaries[i + 1][6] = 1

        #             is_noise[i] = 1
        else:
            boundaries[i][0] = max_tracks
            boundaries[i][1] = 0
            boundaries[i][2] = 0
            boundaries[i][3] = 0
            boundaries[i][4] = 21
            boundaries[i][5] = 21

    # Check for the last boundary
    if left_boundaries[max_tracks - 1]:
        boundaries[max_tracks - 1][0] = max_tracks
        boundaries[max_tracks - 1][1] = 0
        boundaries[max_tracks - 1][2] = 0
        boundaries[max_tracks - 1][3] = 0
        boundaries[max_tracks - 1][4] = 21
        boundaries[max_tracks - 1][5] = 21
    else:
        boundaries[max_tracks - 1][0] = max_tracks - 1
        boundaries[max_tracks - 1][1] = rs[max_tracks - 1]
        boundaries[max_tracks - 1][2] = rs[max_tracks]
        boundaries[max_tracks - 1][3] = rs[max_tracks] - rs[max_tracks - 1]
        boundaries[max_tracks - 1][4] = tracks[max_tracks - 1, 0]
        boundaries[max_tracks - 1][5] = tracks[max_tracks - 1, 0]

    # Sort boundaries by the index
    sort_idx = boundaries[:, 0].argsort()
    boundaries = boundaries[sort_idx]
    is_noise = is_noise[sort_idx]

    return boundaries

In [236]:
b_t = find_right_boundaries2(db.left_boundaries, db.rs, db.tracks)

In [237]:
pd.DataFrame(
    b_t, columns=["idx", "pt_i", "pt_i+1", "pt_sum", "z0_min", "z0_max", "Noise"]
)

Unnamed: 0,idx,pt_i,pt_i+1,pt_sum,z0_min,z0_max,Noise
0,3.0,12.516875,15.647728,3.130853,-5.625,-5.507812,0.0
1,4.0,15.647728,18.346319,2.698591,-5.507812,-5.097656,0.0
2,6.0,20.305036,23.198443,2.893408,-4.921875,-4.921875,0.0
3,7.0,23.198443,25.156469,1.958026,-4.921875,-4.6875,0.0
4,9.0,29.242595,31.592723,2.350128,-4.21875,-4.21875,0.0
5,13.0,38.52396,43.745975,5.222015,-4.042969,-3.808594,0.0
6,14.0,43.745975,45.880976,2.135001,-3.808594,-3.75,0.0
7,29.0,143.431041,146.662125,3.231084,-3.398438,-3.164062,0.0
8,30.0,146.662125,148.667183,2.005058,-3.164062,-3.105469,0.0
9,72.0,285.45444,287.657218,2.202779,-1.40625,-1.054688,0.0


In [219]:
clusters

array([[  0.  ,   0.  ,   0.  ,   2.81,  -9.2 ,  -9.2 ],
       [  0.  ,   0.  ,   0.  ,   5.47,  -8.26,  -8.26],
       [  0.  ,   0.  ,   0.  ,   7.69,  -6.09,  -6.09],
       [  0.  ,   0.  ,   0.  ,   5.83,  -5.62,  -5.1 ],
       [  0.  ,   0.  ,   0.  ,  18.35,  -5.1 ,  -5.1 ],
       [  0.  ,   0.  ,   0.  ,   4.85,  -4.92,  -4.69],
       [  0.  ,   0.  ,   0.  ,  25.16,  -4.69,  -4.69],
       [  0.  ,   0.  ,   0.  ,  14.5 ,  -4.22,  -3.81],
       [  0.  ,   0.  ,   0.  , 102.92,  -3.81,  -3.16],
       [  0.  ,   0.  ,   0.  , 141.  ,  -3.16,  -1.05],
       [  0.  ,   0.  ,   0.  ,   8.03,  -1.05,  -0.76],
       [  0.  ,   0.  ,   0.  , 295.69,  -0.76,  -0.76],
       [  0.  ,   0.  ,   0.  ,  17.12,  -0.59,  -0.12],
       [  0.  ,   0.  ,   0.  ,  52.09,  -0.12,   0.59],
       [  0.  ,   0.  ,   0.  ,   6.97,   0.59,   0.88],
       [  0.  ,   0.  ,   0.  ,   6.85,   0.88,   1.29],
       [  0.  ,   0.  ,   0.  ,   5.47,   1.29,   1.64],
       [  0.  ,   0.  ,   0.  ,

In [220]:
inner_loop = 4
outer_loop = 3
batch_loop = 5

count_total = 0

for i in range(batch_loop):
    for j in range(outer_loop):
        for k in range(inner_loop):
            count_total += 1

In [221]:
count_total

60

In [222]:
5 * 4 * 3

60

In [223]:
inner_loop = 5 * 4
outer_loop = 5 * 3
count_total = 0
for i in range(outer_loop):
    for j in range(inner_loop):
        count_total += 1

In [224]:
count_total

300

In [226]:
(5**2) * 3 * 4

300

In [227]:
250 * 250

62500

In [228]:
batch_vector_size = 50
total_vector_size = 0
N_operations = 0
for i in range(5):
    ops = total_vector_size * batch_vector_size
    N_operations += ops
    total_vector_size += batch_vector_size

In [229]:
N_operations

25000

In [269]:
for i in range(10):
    i = i + 1
    print(i)

1
2
3
4
5
6
7
8
9
10
