In [2]:
import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN
from math import isclose

np.random.seed(42)

pd.set_option("display.precision", 2)

def run_normal_dbscan(z0, pt, eps):
    dfb = pd.DataFrame({"z0": z0, "pt": pt})
    db = DBSCAN(eps=eps, min_samples=2).fit(dfb["z0"].values.reshape(-1, 1))
    dfb["label"] = db.labels_

    dfb.loc[dfb.label == -1, "pt"] = 0

    clusters = dfb.groupby("label").agg({"z0": [np.median], "pt": [np.sum]})
    clusters.columns = ["z0", "pt_sum"]

    clusters = clusters.sort_values(by="pt_sum", ascending=False)

    z0_pv = clusters.iloc[0]["z0"]
    pt_pv = clusters.iloc[0]["pt_sum"]

    return z0_pv, pt_pv, clusters


if __name__ == "__main__":

    max_number_of_tracks = 232
    max_number_of_tracks_power_2 = 256
    max_number_of_tracks_log_2 = 8
    batch_size = 50

    eps = 0.15
    verbose = False
    save_intermediate = False
    from batched_dbscan import BatchedDBSCAN

    z0_pvs = []
    z0_batched = []
    z0_batched_skl = []
    pt_pvs = []
    pt_batched = []
    pt_batched_skl = []
    file_i = 28

    store = "/home/kirby/data/binaries-trk-100/"
    z0_file = store + f"b-{file_i}-trk-z0.bin"
    pt_file = store + f"b-{file_i}-trk-pt.bin"
    z0 = np.fromfile(z0_file, dtype=np.float32)
    pt = np.fromfile(pt_file, dtype=np.float32)

    z0_pv, pt_pv, clusters = run_normal_dbscan(z0, pt, eps)

    db = BatchedDBSCAN(
        z0, pt, eps, batch_size, max_number_of_tracks, verbose, save_intermediate
    )

    db.fit()

    db_skl = BatchedDBSCAN(
        z0, pt, eps, batch_size, max_number_of_tracks, verbose, save_intermediate
    )

    db_skl.fitsklearn()

    z0_pvs.append(z0_pv)
    z0_batched.append(db.z0_pv)
    z0_batched_skl.append(db_skl.z0_pv_skl)
    pt_pvs.append(pt_pv)
    pt_batched.append(db.max_pt)
    pt_batched_skl.append(db_skl.max_pt)

    r = pd.DataFrame(
        {
            "z0_normal": z0_pvs,
            "z0_batched": z0_batched,
            "z0_batched_skl": z0_batched_skl,
            "pt_normal": pt_pvs,
            "pt_batched": pt_batched,
            "pt_batched_skl": pt_batched_skl,
        }
    )
    print(r)
    d = pd.DataFrame({})


    d['z0_diff'] = 100 * (r['z0_batched'] - r['z0_normal']) / r['z0_normal']
    d['pt_diff'] = 100 * (r['pt_batched'] - r['pt_normal']) / r['pt_normal']
    pd.set_option('display.max_columns', None)
    print(d)
    print(clusters)

    # print(d.describe())
    # print(
    #     f"file {i}: {db.z0_pv} ({db_skl.z0_pv_skl}), {db.max_pt} ({db_skl.max_pt})"
    # )
    # print(db.boundaries_batches[0])
    # print(db.z0_pv, db.max_pt)

    # import json
    # with open('merged_list.json', 'w') as f:
    #     json.dump(db.merged_list, f, indent=4)


(250,)
   z0_normal  z0_batched  z0_batched_skl  pt_normal  pt_batched  \
0        0.0         0.0             0.0     244.23      244.23   

   pt_batched_skl  
0          244.23  
   z0_diff   pt_diff
0      NaN -5.37e-07
         z0  pt_sum
label              
 5     0.00  244.23
 2     3.57   46.59
 1     5.68   42.40
 0    -6.09   31.37
 10    1.76   27.30
 6     4.34   23.89
 4    -5.33   19.19
 9    -2.29   18.78
 15   -3.63   16.41
 21    2.11   11.99
 7     2.87   10.94
 14   -4.86   10.89
 8    -2.81    9.47
 3     1.00    7.84
 11   -1.41    6.57
 18    1.35    6.45
 13    4.86    4.86
 16    8.00    4.80
 19   -7.62    4.77
 20    2.46    4.47
 12    3.08    4.32
 17   -3.90    4.18
 22   -1.79    3.95
-1    -5.62    0.00


In [3]:
db.boundaries_batches[0].shape

(50, 7)

In [4]:
def compare_boundaries(b_t, b_c):
    columns = ['idx','pts','nextPts','z0_low','z0_high','noise']

    for col in columns:
        for i in range(b_t.shape[0]):
            a = b_t[col].iloc[i]
            b = b_c[col].iloc[i]
            if not isclose(a, b, rel_tol=1e-5):
                print(f'column {col} at index {i} is different')
                print(f'batched: {b_t[col].iloc[i]}')
                print(f'c++: {b_c[col].iloc[i]}')
                print()

In [5]:
for i in range(5):
    print(f"--------------- batch number {i} ----------------")
    b_t = pd.DataFrame(db.boundaries_batches[i], columns = ['idx', 'pts','nextPts','rSum','z0_low','z0_high','noise'])
    b_t.drop(columns=['rSum'],inplace=True)
    b_c = pd.read_csv(f'file28/boundaries-{i}.txt', sep=', ', names = ['idx','pts','nextPts','z0_low','z0_high','noise'], engine ='python')
    compare_boundaries(b_t, b_c)
    

--------------- batch number 0 ----------------
column pts at index 12 is different
batched: 43.295854806900024
c++: 43.9405

column noise at index 36 is different
batched: 0.0
c++: 1

--------------- batch number 1 ----------------
column nextPts at index 13 is different
batched: 45.883562445640564
c++: 45.7556

--------------- batch number 2 ----------------
column pts at index 8 is different
batched: 25.87757134437561
c++: 26.1029

column nextPts at index 7 is different
batched: 20.630732536315918
c++: 21.1231

column nextPts at index 18 is different
batched: 57.11890172958374
c++: 56.1091

column nextPts at index 20 is different
batched: 64.86072564125061
c++: 63.9775

--------------- batch number 3 ----------------
column pts at index 15 is different
batched: 69.73574566841125
c++: 71.3885

column pts at index 17 is different
batched: 75.30391371250153
c++: 75.5334

column pts at index 29 is different
batched: 129.7069798707962
c++: 131.066

column nextPts at index 16 is different

In [6]:
rs0 = np.genfromtxt('file28/rs-0.txt', delimiter=' ')

In [7]:
np.array_equal(db.rs_batches[0], rs0)

False

In [8]:
for i in range(len(rs0)):
    a = rs0[i]
    b = db.rs_batches[0][i]
    if not isclose(a, b, rel_tol=1e-5):
        print(f'rs at index {i} is different')
        print(f'batched: {b}')
        print(f'c++: {a}')
        print()

rs at index 17 is different
batched: 43.295854806900024
c++: 43.9405



In [9]:
def compare_rs(r_t, r_c):
    for i in range(r_t.shape[0]):
        a = r_t[i]
        b = r_c[i]
        if not isclose(a, b, rel_tol=1e-5):
            print(f'rs at index {i} is different')
            print(f'batched: {b}')
            print(f'c++: {a}')
            print()

In [10]:
for i in range(5):
    print(f"--------------- batch number {i} ----------------")
    r_t = db.rs_batches[i]
    r_c = np.genfromtxt(f'file28/rs-{i}.txt', delimiter=' ')
    compare_rs(r_t, r_c)

--------------- batch number 0 ----------------
rs at index 17 is different
batched: 43.9405
c++: 43.295854806900024

--------------- batch number 1 ----------------
rs at index 16 is different
batched: 45.7556
c++: 45.883562445640564

rs at index 20 is different
batched: 58.8261
c++: 60.62415421009064

rs at index 25 is different
batched: 75.7804
c++: 84.01855981349945

rs at index 29 is different
batched: 95.535
c++: 99.03739178180695

rs at index 34 is different
batched: 112.11
c++: 110.67035496234894

--------------- batch number 2 ----------------
rs at index 8 is different
batched: 21.1231
c++: 20.630732536315918

rs at index 10 is different
batched: 26.1029
c++: 25.87757134437561

rs at index 21 is different
batched: 56.1091
c++: 57.11890172958374

rs at index 24 is different
batched: 63.9775
c++: 64.86072564125061

rs at index 25 is different
batched: 66.9231
c++: 67.41643571853638

rs at index 26 is different
batched: 69.3408
c++: 69.54733872413635

rs at index 27 is different

In [11]:
db.rs_batches[0]

array([  0.  ,   2.72,   4.71,   7.83,  10.58,  13.74,  15.8 ,  18.14,
        21.16,  23.13,  26.76,  29.71,  32.24,  34.39,  36.48,  38.65,
        41.04,  43.3 ,  46.19,  49.41,  52.63,  55.31,  57.57,  59.67,
        62.4 ,  75.73,  78.73,  81.26,  83.33,  85.95,  88.  ,  91.93,
        94.36,  99.92, 102.25, 104.42, 106.41, 108.99, 112.  , 114.94,
       116.99, 120.12, 122.14, 124.44, 127.08, 129.77, 132.61, 134.73,
       137.25, 139.7 , 143.13, 143.13, 143.13, 143.13, 143.13, 143.13,
       143.13, 143.13, 143.13, 143.13, 143.13, 143.13, 143.13, 143.13])

In [12]:
pd.read_csv('file28/rs-0.txt', sep=' ', header=None).values

array([[  0.  ,   2.72,   4.71,   7.83,  10.58,  13.74,  15.8 ,  18.14,
         21.16,  23.13,  26.76,  29.71,  32.24,  34.39,  36.48,  38.65,
         41.04,  43.94,  46.19,  49.41,  52.63,  55.31,  57.57,  59.67,
         62.4 ,  75.73,  78.73,  81.26,  83.33,  85.95,  88.  ,  91.93,
         94.36,  99.92, 102.25, 104.42, 106.41, 108.99, 112.  , 114.94,
        117.  , 120.12, 122.14, 124.44, 127.08, 129.77, 132.61, 134.73,
        137.25, 139.7 , 143.13, 143.13, 143.13, 143.13, 143.13, 143.13,
        143.13, 143.13, 143.13, 143.13, 143.13, 143.13, 143.13, 143.13,
           nan]])

### Looking at the tracks

In [13]:
tc = pd.read_csv('file28/tracks-1.txt', sep=', ', names = ['z0','pt'], engine='python')

In [14]:
db.z0_batches[0]

array([-6.21, -6.09, -6.09, -5.8 , -5.39, -4.45, -2.87, -2.29, -1.99,
       -1.46, -1.35, -1.  , -0.76, -0.7 , -0.64, -0.59, -0.47, -0.47,
       -0.18, -0.06,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.18,  0.29,
        0.29,  0.7 ,  0.76,  1.  ,  1.76,  2.87,  2.87,  3.05,  3.52,
        3.57,  3.69,  3.75,  3.98,  4.28,  4.45,  4.45,  4.92,  5.39,
        5.51,  5.68,  5.74,  6.09,  6.15])

In [15]:
tc['z0py'] = db.z0_batches[1]
tc['ptpy'] = db.pt_batches[1]

In [16]:
tc0

NameError: name 'tc0' is not defined

In [None]:
def compare_tracks(t_t, t_c):
    columns = ['z0','pt']

    for col in columns:
        for i in range(t_t.shape[0]):
            a = t_t[col].iloc[i]
            b = t_c[col].iloc[i]
            if not isclose(a, b, rel_tol=1e-5):
                print(f'column {col} at index {i} is different')
                print(f'batched: {t_t[col].iloc[i]}')
                print(f'c++: {t_c[col].iloc[i]}')
                print()    

In [None]:
for i in range(5):
    t_c = pd.read_csv("file28/tracks-{}.txt".format(i), sep=', ', names = ['z0','pt'], engine='python')
    t_t = pd.DataFrame({'z0': db.z0_batches[i], 'pt': db.pt_batches[i]})
    print(f"--------------- batch number {i} ----------------")
    compare_tracks(t_t, t_c)

--------------- batch number 0 ----------------
column pt at index 16 is different
batched: 2.2510147094726562
c++: 2.89567

column pt at index 17 is different
batched: 2.8956716060638428
c++: 2.25101

--------------- batch number 1 ----------------
column pt at index 15 is different
batched: 2.237856149673462
c++: 2.10985

column pt at index 16 is different
batched: 2.1098451614379883
c++: 2.23786

column pt at index 19 is different
batched: 5.363287448883057
c++: 3.56519

column pt at index 20 is different
batched: 3.5651910305023193
c++: 5.36329

column pt at index 24 is different
batched: 12.615914344787598
c++: 4.37776

column pt at index 25 is different
batched: 4.377762317657471
c++: 12.6159

column pt at index 28 is different
batched: 6.120179653167725
c++: 2.61778

column pt at index 29 is different
batched: 2.6177799701690674
c++: 6.12018

column pt at index 33 is different
batched: 2.1391146183013916
c++: 3.57898

column pt at index 34 is different
batched: 3.578982830047607

In [None]:
tc

Unnamed: 0,z0,pt,z0py,ptpy
0,-7.62,2.16,-7.62,2.16
1,-6.27,3.74,-6.27,3.74
2,-5.8,1.97,-5.8,1.97
3,-5.39,4.2,-5.39,4.2
4,-5.27,2.98,-5.27,2.98
5,-5.21,2.45,-5.21,2.45
6,-5.16,4.07,-5.16,4.07
7,-4.86,4.41,-4.86,4.41
8,-4.86,4.41,-4.86,4.41
9,-3.93,1.97,-3.93,1.97


In [18]:
tracks = tc.iloc[0:16]

In [19]:
tracks

Unnamed: 0,z0,pt,z0py,ptpy
0,-7.62,2.16,-7.62,2.16
1,-6.27,3.74,-6.27,3.74
2,-5.8,1.97,-5.8,1.97
3,-5.39,4.2,-5.39,4.2
4,-5.27,2.98,-5.27,2.98
5,-5.21,2.45,-5.21,2.45
6,-5.16,4.07,-5.16,4.07
7,-4.86,4.41,-4.86,4.41
8,-4.86,4.41,-4.86,4.41
9,-3.93,1.97,-3.93,1.97


In [21]:
np.cumsum(tracks['pt'].values)

array([ 2.16,  5.9 ,  7.87, 12.07, 15.05, 17.5 , 21.57, 25.98, 30.38,
       32.35, 35.31, 37.5 , 39.49, 41.54, 43.65, 45.76])

In [22]:
db.prefix_sum(tracks['pt'].values)

array([ 0.  ,  2.16,  5.9 ,  7.87, 12.07, 15.05, 17.5 , 21.57, 25.98,
       30.38, 32.35, 35.31, 37.5 , 39.49, 41.54, 43.65])