In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
%load_ext lab_black

In [4]:
import numpy as np
import math
import pandas as pd
import itertools
import copy
from sklearn.cluster import DBSCAN

In [5]:
from random import choice

In [6]:
np.random.seed(42)

# Constants and Data Loading

In [7]:
# Compact array display: two decimals and no scientific notation.
np.set_printoptions(precision=2, suppress=True)

In [8]:
# Track-count limit used by the clustering cells further down.
max_number_of_tracks = 232
# 256 == 2**8 is the smallest power of two >= 232
# (presumably a padded buffer size — TODO confirm against acceleratedDBSCAN).
max_number_of_tracks_power_2 = 256
max_number_of_tracks_log_2 = 8
# NOTE(review): batch_size is not referenced by any cell visible in this notebook.
batch_size = 50
# 0.15 is also the eps value given to the DBSCAN / AccDBSCAN calls in later cells.
eps = 0.15

# NOTE(review): absolute, machine-specific paths — the notebook cannot run on
# another machine without editing these two lines.
z0_file = "/media/lucas/QS/binaries-trk/OldKF_TTbar_170K_quality-1-trk-z0.bin"
pt_file = "/media/lucas/QS/binaries-trk/OldKF_TTbar_170K_quality-1-trk-pt.bin"
# Both files are raw binary dumps, read here as flat float32 arrays.
z0 = np.fromfile(z0_file, dtype=np.float32)
pt = np.fromfile(pt_file, dtype=np.float32)

In [9]:
df = pd.DataFrame({"z0": z0, "pt": pt})

In [10]:
# Reference clustering along z0 only (one feature per track). The radius comes
# from the shared `eps` constant instead of a repeated literal, so changing the
# configuration cell changes every clustering run consistently.
db = DBSCAN(eps=eps, min_samples=2).fit(df.z0.values.reshape(-1, 1))

In [11]:
df["label"] = db.labels_

In [12]:
df.sort_values(by="z0", inplace=True)

In [13]:
df.reset_index(drop=True, inplace=True)

In [14]:
pd.set_option("display.max_rows", None)

In [15]:
# Flag DBSCAN noise points (label == -1) with 1 and every other track with 0.
df["is_noise"] = (df.label == -1).astype("int64")

In [16]:
n_noise = df["is_noise"].sum()

In [17]:
max_label = df.label.max()

In [18]:
# Give every noise point its own label, numbered upwards from max_label + 1,
# so that grouping by "label" puts each noise track in a group of its own
# instead of lumping them all together under -1.
df.loc[df.label == -1, "label"] = np.arange(n_noise) + max_label + 1

In [19]:
# Per-label summary: z0 extent, summed pt, and number of noise tracks.
# Named aggregation with string aggregators yields the flat column names
# directly and avoids the pandas FutureWarning raised when the builtins
# min / max / sum are passed as callables.
clusters = df.groupby("label").agg(
    z0_min=("z0", "min"),
    z0_max=("z0", "max"),
    pt_sum=("pt", "sum"),
    is_noise=("is_noise", "sum"),
)

In [20]:
clusters.sort_values(by="pt_sum", inplace=True, ascending=False)

In [21]:
print("hi")

hi


In [22]:
clusters

Unnamed: 0_level_0,z0_min,z0_max,pt_sum,is_noise
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,-3.164062,-1.40625,142.023392,0
6,-3.808594,-3.398438,104.907082,0
4,-0.117188,0.410156,51.442478,0
3,-0.585938,-0.292969,17.673368,0
11,3.457031,3.632812,17.044699,0
1,2.8125,3.105469,14.746803,0
2,-4.21875,-4.042969,13.367491,0
15,0.878906,1.054688,9.260561,0
14,5.742188,5.859375,8.528573,0
8,-1.054688,-0.9375,8.141763,0


In [23]:
clusters.to_pickle("real_clusters.pkl")

In [21]:
clusters.sort_values(by="z0_min")

Unnamed: 0_level_0,z0_min,z0_max,pt_sum,is_noise
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
20,-9.199219,-9.199219,2.812771,1
21,-8.261719,-8.261719,2.654077,1
22,-6.09375,-6.09375,2.223069,1
7,-5.625,-5.507812,7.957811,0
23,-5.097656,-5.097656,2.698591,1
10,-4.921875,-4.921875,4.852124,0
24,-4.6875,-4.6875,1.958026,1
2,-4.21875,-4.042969,13.367491,0
6,-3.808594,-3.398438,104.907082,0
0,-3.164062,-1.40625,142.023392,0


In [33]:
from acceleratedDBSCAN import AccDBSCAN

In [35]:
# Run the accelerated implementation with the shared configuration constants
# (eps, max_number_of_tracks) rather than repeating the literals 0.15 and 232.
# NOTE: this rebinds `db`, which held the scikit-learn DBSCAN model until now.
db = AccDBSCAN(z0, pt, eps, max_number_of_tracks, verbose=True, debug=True)
db.fit()

tracks built
(232, 3)
(256,)
prefix sum done
data initialized...
left boundaries found...
right boundaries found...
vertices found...
scan complete.


In [36]:
left_boundaries = pd.read_csv("left_boundaries.csv")

In [52]:
# Load the boundary table and discard the "Unnamed: 0" column
# (presumably an index that was saved into the CSV — confirm with the writer).
bound = pd.read_csv("boundaries.csv").drop(columns=["Unnamed: 0"])

In [53]:
bound.sort_values(by="idx", inplace=True)

In [55]:
bound.loc[bound.idx < 232]

Unnamed: 0,idx,pts,nextPts,diff,z0_min,z0_max
3,3.0,7.689916,12.516875,4.826959,-5.625,-5.507812
4,4.0,12.516875,15.647728,3.130853,-5.507812,-5.097656
6,6.0,18.346319,20.305036,1.958716,-4.921875,-4.921875
7,7.0,20.305036,23.198443,2.893408,-4.921875,-4.6875
9,9.0,25.156469,29.242595,4.086126,-4.21875,-4.21875
13,13.0,36.312845,38.52396,2.211114,-4.042969,-3.808594
14,14.0,38.52396,43.745975,5.222015,-3.808594,-3.75
29,29.0,138.714811,143.431041,4.71623,-3.398438,-3.164062
30,30.0,143.431041,146.662125,3.231084,-3.164062,-3.105469
72,72.0,282.996068,285.45444,2.458371,-1.40625,-1.054688


In [56]:
tracks = np.load("tracks.npy")

In [57]:
def find_vertices(boundaries, max_number_of_tracks, tracks) -> np.array:
    """
    Build one vertex candidate per (left, right) pair of boundary rows.

    Rows ``2k`` and ``2k + 1`` of ``boundaries`` are read as the left and right
    boundary of candidate ``k``. A pair whose two rows carry the same track
    index (column 0) is skipped and its output row stays all zeros.

    Parameters
    ----------
    boundaries : 2-D array
        Column 0 holds a track index; the candidate's pt sum is computed as
        ``right[2] - left[1]`` (presumably cumulative pt before/after the
        track — confirm against the code that writes the boundary table).
    max_number_of_tracks : int
        Upper bound on the boundary rows scanned. The result always has
        ``ceil(max_number_of_tracks / 2)`` rows.
    tracks : 2-D array
        Indexed by the track indices found in ``boundaries``; column 0 is
        copied into the z0-low / z0-high output columns.

    Returns
    -------
    np.ndarray of shape ``(ceil(max_number_of_tracks / 2), 6)``
        Columns: vertex z0, pt sum, left index, right index,
        ``tracks[left, 0]``, ``tracks[right, 0]``. Rows are ordered by pt sum,
        largest first.
    """
    max_tracks = max_number_of_tracks
    vertices = np.zeros((math.ceil(max_tracks / 2), 6))

    # A pair needs rows i and i + 1, so never start a pair on a row without a
    # partner. This covers an odd ``max_number_of_tracks`` as well as a
    # boundary table shorter than ``max_number_of_tracks`` (both used to raise
    # IndexError); the rows that cannot be filled simply stay zero.
    last_start = min(max_tracks, len(boundaries) - 1)

    label = 0
    for i in range(0, last_start, 2):
        left_boundary = boundaries[i]
        right_boundary = boundaries[i + 1]

        # Identical indices on both sides: nothing to record for this pair.
        if left_boundary[0] == right_boundary[0]:
            continue

        label += 1
        row = vertices[i // 2]  # view into ``vertices``; writes go through
        row[0] = find_vertex_and_label_clusters(
            tracks, left_boundary[0], right_boundary[0], label
        )
        row[1] = right_boundary[2] - left_boundary[1]
        row[2] = left_boundary[0]
        row[3] = right_boundary[0]
        row[4] = tracks[int(left_boundary[0]), 0]
        row[5] = tracks[int(right_boundary[0]), 0]

    # argsort is ascending; reverse it to get the largest pt sum first.
    order = vertices[:, 1].argsort()[::-1]
    return vertices[order]

In [58]:
def get_vertex(cluster_of_tracks: np.array) -> float:
    """
    Return the positional median of column 0 (z0) of a cluster of tracks.

    The middle row is picked by position, so the result is the true median
    only when the rows are already ordered by z0. For an even number of rows
    the two middle values are averaged.
    """
    count = cluster_of_tracks.shape[0]
    middle, is_odd = divmod(count, 2)

    upper_middle = cluster_of_tracks[middle][0]
    if is_odd:
        return upper_middle

    return 0.5 * (upper_middle + cluster_of_tracks[middle - 1][0])


def find_vertex_and_label_clusters(
    tracks: np.array, startIndex: int, endIndex: int, label: int
) -> float:
    """
    Return the vertex z0 of the tracks in rows ``startIndex..endIndex``.

    Both indices are truncated to int and the end index is inclusive. The
    vertex itself is computed by ``get_vertex`` on that slice.

    NOTE(review): ``label`` is accepted but not used — despite the name, no
    labelling happens here.
    """
    first, last = int(startIndex), int(endIndex)
    return get_vertex(tracks[first : last + 1])

In [60]:
# Use the shared track limit from the configuration cell instead of repeating
# the literal 232.
v = find_vertices(bound.values, max_number_of_tracks, tracks)

In [61]:
v

array([[ -2.23, 142.02,  30.  ,  72.  ,  -3.16,  -1.41],
       [ -3.6 , 104.91,  14.  ,  29.  ,  -3.81,  -3.4 ],
       [  0.26,  51.44,  83.  , 102.  ,  -0.12,   0.41],
       [ -0.47,  17.67,  77.  ,  82.  ,  -0.59,  -0.29],
       [  3.57,  17.04, 122.  , 125.  ,   3.46,   3.63],
       [  2.99,  14.75, 117.  , 121.  ,   2.81,   3.11],
       [ -4.16,  13.37,   9.  ,  13.  ,  -4.22,  -4.04],
       [  1.  ,   9.26, 105.  , 107.  ,   0.88,   1.05],
       [  5.8 ,   8.53, 132.  , 134.  ,   5.74,   5.86],
       [ -1.  ,   8.14,  73.  ,  75.  ,  -1.05,  -0.94],
       [ -5.57,   7.96,   3.  ,   4.  ,  -5.62,  -5.51],
       [  2.49,   7.43, 115.  , 116.  ,   2.46,   2.52],
       [  2.05,   6.75, 112.  , 113.  ,   1.99,   2.11],
       [  3.87,   6.42, 126.  , 127.  ,   3.87,   3.87],
       [  6.56,   6.15, 135.  , 137.  ,   6.5 ,   6.62],
       [  1.32,   5.26, 108.  , 109.  ,   1.29,   1.35],
       [ -4.92,   4.85,   6.  ,   7.  ,  -4.92,  -4.92],
       [  0.62,   4.71, 103.  ,

In [66]:
clusters_acc = pd.DataFrame(
    v, columns=["pv_z0", "pt_sum", "id1", "id2", "z0_low", "z0_high"]
)

In [87]:
bound.head(30)

Unnamed: 0,idx,pts,nextPts,diff,z0_min,z0_max
3,3.0,7.689916,12.516875,4.826959,-5.625,-5.507812
4,4.0,12.516875,15.647728,3.130853,-5.507812,-5.097656
6,6.0,18.346319,20.305036,1.958716,-4.921875,-4.921875
7,7.0,20.305036,23.198443,2.893408,-4.921875,-4.6875
9,9.0,25.156469,29.242595,4.086126,-4.21875,-4.21875
13,13.0,36.312845,38.52396,2.211114,-4.042969,-3.808594
14,14.0,38.52396,43.745975,5.222015,-3.808594,-3.75
29,29.0,138.714811,143.431041,4.71623,-3.398438,-3.164062
30,30.0,143.431041,146.662125,3.231084,-3.164062,-3.105469
72,72.0,282.996068,285.45444,2.458371,-1.40625,-1.054688


In [68]:
clusters_acc[0:20]

Unnamed: 0,pv_z0,pt_sum,id1,id2,z0_low,z0_high
0,-2.226562,142.023399,30.0,72.0,-3.164062,-1.40625
1,-3.603516,104.907081,14.0,29.0,-3.808594,-3.398438
2,0.263672,51.442479,83.0,102.0,-0.117188,0.410156
3,-0.46875,17.673368,77.0,82.0,-0.585938,-0.292969
4,3.574219,17.044699,122.0,125.0,3.457031,3.632812
5,2.988281,14.746804,117.0,121.0,2.8125,3.105469
6,-4.160156,13.367491,9.0,13.0,-4.21875,-4.042969
7,0.996094,9.260561,105.0,107.0,0.878906,1.054688
8,5.800781,8.528573,132.0,134.0,5.742188,5.859375
9,-0.996094,8.141763,73.0,75.0,-1.054688,-0.9375


In [85]:
clusters[clusters.is_noise != 1].reset_index(drop=True)

Unnamed: 0,z0_min,z0_max,pt_sum,is_noise
0,-3.164062,-1.40625,142.023392,0
1,-3.808594,-3.398438,104.907082,0
2,-0.117188,0.410156,51.442478,0
3,-0.585938,-0.292969,17.673368,0
4,3.457031,3.632812,17.044699,0
5,2.8125,3.105469,14.746803,0
6,-4.21875,-4.042969,13.367491,0
7,0.878906,1.054688,9.260561,0
8,5.742188,5.859375,8.528573,0
9,-1.054688,-0.9375,8.141763,0


In [91]:
bound.reset_index(drop=True, inplace=True)

In [97]:
for i in range(0, 232, 2):
    print(i, bound[i : i + 1 + 1])
    print("---------------------------------------------")

0    idx        pts    nextPts      diff    z0_min    z0_max
0  3.0   7.689916  12.516875  4.826959 -5.625000 -5.507812
1  4.0  12.516875  15.647728  3.130853 -5.507812 -5.097656
---------------------------------------------
2    idx        pts    nextPts      diff    z0_min    z0_max
2  6.0  18.346319  20.305036  1.958716 -4.921875 -4.921875
3  7.0  20.305036  23.198443  2.893408 -4.921875 -4.687500
---------------------------------------------
4     idx        pts    nextPts      diff    z0_min    z0_max
4   9.0  25.156469  29.242595  4.086126 -4.218750 -4.218750
5  13.0  36.312845  38.523960  2.211114 -4.042969 -3.808594
---------------------------------------------
6     idx         pts     nextPts      diff    z0_min    z0_max
6  14.0   38.523960   43.745975  5.222015 -3.808594 -3.750000
7  29.0  138.714811  143.431041  4.716230 -3.398438 -3.164062
---------------------------------------------
8     idx         pts     nextPts      diff    z0_min    z0_max
8  30.0  143.431041  146

In [None]:
def convert_boundaries_to_clusters(self, boundaries: np.array) -> np.array:
    """
    Collapse consecutive (left, right) boundary rows into one cluster row each.

    Rows ``2k`` and ``2k + 1`` of ``boundaries`` form pair ``k``. For each pair
    the output row gets:

    * column 3: ``right[2] - left[1]`` (stored as the pt sum),
    * column 4: ``left[4]``  (z0 low),
    * column 5: ``right[5]`` (z0 high).

    Columns 0-2 are left at zero.

    Parameters
    ----------
    boundaries : 2-D array with at least six columns.

    Returns
    -------
    np.ndarray of shape ``(ceil(n_boundaries / 2), 6)``. With an odd number of
    boundary rows the final, unpaired row cannot form a cluster; its output row
    stays all zeros (this used to raise IndexError).
    """
    n_boundaries = boundaries.shape[0]
    clusters = np.zeros((math.ceil(n_boundaries / 2), 6))

    # Only complete pairs can be converted.
    n_pairs = n_boundaries // 2
    if n_pairs == 0:
        return clusters

    left = boundaries[0 : 2 * n_pairs : 2]
    right = boundaries[1 : 2 * n_pairs : 2]

    clusters[:n_pairs, 3] = right[:, 2] - left[:, 1]
    clusters[:n_pairs, 4] = left[:, 4]
    clusters[:n_pairs, 5] = right[:, 5]
    return clusters


# Attach the function defined in this cell to BatchedDBSCAN as a method.
# NOTE(review): BatchedDBSCAN is not defined or imported in any cell visible
# in this notebook — confirm it is in scope before this cell runs, otherwise
# this line raises NameError on a fresh kernel.
BatchedDBSCAN.convert_boundaries_to_clusters = convert_boundaries_to_clusters