In [49]:
import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN
from math import isclose

np.random.seed(42)

pd.set_option("display.precision", 2)


def convert_pt_to_oneOverR(pt):

    return 0.3 * 3.811 / (100 * pt)


def convert_oneOverR_to_pt(oneOverR):

    return 0.3 * 3.811 / (100 * oneOverR)


def run_normal_dbscan(z0, pt, eps):
    dfb = pd.DataFrame({"z0": z0, "pt": pt})
    db = DBSCAN(eps=eps, min_samples=2).fit(dfb["z0"].values.reshape(-1, 1))
    dfb["label"] = db.labels_

    dfb.loc[dfb.label == -1, "pt"] = 0

    clusters = dfb.groupby("label").agg({"z0": [np.median], "pt": [np.sum]})
    clusters.columns = ["z0", "pt_sum"]

    clusters = clusters.sort_values(by="pt_sum", ascending=False)

    z0_pv = clusters.iloc[0]["z0"]
    pt_pv = clusters.iloc[0]["pt_sum"]

    return z0_pv, pt_pv, clusters


if __name__ == "__main__":

    max_number_of_tracks = 232
    max_number_of_tracks_power_2 = 256
    max_number_of_tracks_log_2 = 8
    batch_size = 50

    eps = 0.15
    verbose = False
    save_intermediate = False
    from batched_dbscan import BatchedDBSCAN

    z0_pvs = []
    z0_batched = []
    z0_batched_skl = []
    pt_pvs = []
    pt_batched = []
    pt_batched_skl = []
    file_i = 5

    store = "/home/kirby/data/binaries-trk-100/"
    z0_file = store + f"b-{file_i}-trk-z0.bin"
    pt_file = store + f"b-{file_i}-trk-pt.bin"
    z0 = np.fromfile(z0_file, dtype=np.float32)
    pt = np.fromfile(pt_file, dtype=np.float32)

    z0_pv, pt_pv, clusters = run_normal_dbscan(z0, pt, eps)

    db = BatchedDBSCAN(
        z0, pt, eps, batch_size, max_number_of_tracks, verbose, save_intermediate
    )

    db.fit()

    db_skl = BatchedDBSCAN(
        z0, pt, eps, batch_size, max_number_of_tracks, verbose, save_intermediate
    )

    db_skl.fitsklearn()

    z0_pvs.append(z0_pv)
    z0_batched.append(db.z0_pv)
    z0_batched_skl.append(db_skl.z0_pv_skl)
    pt_pvs.append(pt_pv)
    pt_batched.append(db.max_pt)
    pt_batched_skl.append(db_skl.max_pt)

    r = pd.DataFrame(
        {
            "z0_normal": z0_pvs,
            "z0_batched": z0_batched,
            "z0_batched_skl": z0_batched_skl,
            "pt_normal": pt_pvs,
            "pt_batched": pt_batched,
            "pt_batched_skl": pt_batched_skl,
        }
    )
    print(r)
    d = pd.DataFrame({})


    d['z0_diff'] = 100 * (r['z0_batched'] - r['z0_normal']) / r['z0_normal']
    d['pt_diff'] = 100 * (r['pt_batched'] - r['pt_normal']) / r['pt_normal']
    pd.set_option('display.max_columns', None)
    print(d)
    print(clusters)

    # print(d.describe())
    # print(
    #     f"file {i}: {db.z0_pv} ({db_skl.z0_pv_skl}), {db.max_pt} ({db_skl.max_pt})"
    # )
    # print(db.boundaries_batches[0])
    # print(db.z0_pv, db.max_pt)

    # import json
    # with open('merged_list.json', 'w') as f:
    #     json.dump(db.merged_list, f, indent=4)


   z0_normal  z0_batched  z0_batched_skl  pt_normal  pt_batched  \
0      -0.41       -0.41           -0.41      335.9       335.9   

   pt_batched_skl  
0           335.9  
   z0_diff   pt_diff
0     -0.0  3.41e-06
          z0  pt_sum
label               
 1     -0.41  335.90
 5      6.91  120.78
 0      1.41   63.80
 16    -8.20   40.12
 8     -1.93   32.93
 10     2.29   28.89
 7     -9.90   25.42
 4     -8.79   20.87
 6      3.28   20.66
 13     5.39   18.50
 14    -3.43    9.88
 2      7.79    9.88
 15     0.44    9.32
 18    13.18    9.00
 12     4.95    6.35
 9      8.61    5.42
 11    -1.41    5.24
 3    -10.25    4.46
 17    -4.22    4.06
-1     -3.02    0.00


In [17]:
r

Unnamed: 0,z0_normal,z0_batched,z0_batched_skl,pt_normal,pt_batched,pt_batched_skl
0,-0.41,-0.41,-0.41,335.9,335.9,335.9


In [20]:
db.boundaries_batches[0].shape

(50, 7)

In [22]:
b0 = pd.DataFrame(db.boundaries_batches[0], columns = ['idx', 'pts','nextPts','rSum','z0_low','z0_high','noise'])
b0.drop(columns=['rSum'],inplace=True)

In [30]:
b0['idx'] = b0['idx'].astype(int)

In [32]:
b0

Unnamed: 0,idx,pts,nextPts,z0_low,z0_high,noise
0,0,0.0,2.01,-11.78,-11.78,1.0
1,1,2.01,4.04,-10.25,-10.25,1.0
2,2,4.04,6.19,-9.49,-9.49,1.0
3,3,6.19,9.71,-9.02,-9.02,1.0
4,4,9.71,11.89,-8.73,-8.73,0.0
5,5,11.89,14.33,-8.73,-4.51,0.0
6,6,14.33,21.98,-4.51,-4.51,1.0
7,7,21.98,35.98,-1.93,-1.93,1.0
8,8,35.98,38.15,-1.11,-1.11,1.0
9,9,38.15,47.21,-0.88,-0.76,0.0


In [26]:
bc0 = pd.read_csv('boundaries-0.txt', sep=', ', names = ['idx','pts','nextPts','z0_low','z0_high','noise'])

  bc0 = pd.read_csv('boundaries-0.txt', sep=', ', names = ['idx','pts','nextPts','z0_low','z0_high','noise'])


In [27]:
bc0

Unnamed: 0,idx,pts,nextPts,z0_low,z0_high,noise
0,0,0.0,2.01,-11.78,-11.78,1
1,1,2.01,4.04,-10.25,-10.25,1
2,2,4.04,6.19,-9.49,-9.49,1
3,3,6.19,9.71,-9.02,-9.02,1
4,4,9.71,11.89,-8.73,-8.73,0
5,5,11.89,14.33,-8.73,-4.51,0
6,6,14.33,21.98,-4.51,-4.51,1
7,7,21.98,35.98,-1.93,-1.93,1
8,8,35.98,38.15,-1.11,-1.11,1
9,9,38.15,47.21,-0.88,-0.76,0


In [55]:
def compare_boundaries(b_t, b_c):
    columns = ['idx','pts','nextPts','z0_low','z0_high','noise']

    for col in columns:
        for i in range(b_t.shape[0]):
            a = b_t[col].iloc[i]
            b = b_c[col].iloc[i]
            if not isclose(a, b, rel_tol=1e-5):
                print(f'column {col} at index {i} is different')
                print(f'batched: {b_t[col].iloc[i]}')
                print(f'c++: {b_c[col].iloc[i]}')
                print()

In [56]:
compare_boundaries(b0, bc0)

column pts at index 12 is different
batched: 112.18462491035461
c++: 121.097



In [60]:
for i in range(5):
    print(f"--------------- batch number {i} ----------------")
    b_t = pd.DataFrame(db.boundaries_batches[i], columns = ['idx', 'pts','nextPts','rSum','z0_low','z0_high','noise'])
    b_t.drop(columns=['rSum'],inplace=True)
    b_c = pd.read_csv(f'boundaries-{i}.txt', sep=', ', names = ['idx','pts','nextPts','z0_low','z0_high','noise'], engine ='python')
    compare_boundaries(b_t, b_c)
    

--------------- batch number 0 ----------------
column pts at index 12 is different
batched: 112.18462491035461
c++: 121.097

--------------- batch number 1 ----------------
column pts at index 36 is different
batched: 148.79408431053162
c++: 148.228

--------------- batch number 2 ----------------
column pts at index 15 is different
batched: 157.4594920873642
c++: 157.782

column nextPts at index 12 is different
batched: 76.43546712398529
c++: 74.0892

--------------- batch number 3 ----------------
column nextPts at index 15 is different
batched: 41.96143102645874
c++: 42.7401

--------------- batch number 4 ----------------
column idx at index 10 is different
batched: 12.0
c++: 28

column idx at index 11 is different
batched: 13.0
c++: 29

column idx at index 12 is different
batched: 14.0
c++: 30

column idx at index 13 is different
batched: 15.0
c++: 31

column idx at index 14 is different
batched: 16.0
c++: 32

column idx at index 15 is different
batched: 17.0
c++: 33

column idx 

In [61]:
b_c

Unnamed: 0,idx,pts,nextPts,z0_low,z0_high,noise
0,0,0.0,2.98,-9.96,-9.84,0
1,1,2.98,5.27,-9.84,-1.46,0
2,2,5.27,7.8,-1.46,-1.46,1
3,3,7.8,9.9,-1.0,-1.0,1
4,4,9.9,15.37,-0.7,-0.59,0
5,5,15.37,18.63,-0.59,-0.41,0
6,6,18.63,21.2,-0.41,-0.41,1
7,7,21.2,23.23,-0.23,-0.18,0
8,10,30.02,32.94,-0.18,0.0,0
9,11,32.94,32.94,0.0,0.0,0


In [62]:
b_t

Unnamed: 0,idx,pts,nextPts,z0_low,z0_high,noise
0,0.0,0.0,2.98,-9.96,-9.84,0.0
1,1.0,2.98,5.27,-9.84,-1.46,0.0
2,2.0,5.27,7.8,-1.46,-1.46,1.0
3,3.0,7.8,9.9,-1.0,-1.0,1.0
4,4.0,9.9,15.37,-0.7,-0.59,0.0
5,5.0,15.37,18.63,-0.59,-0.41,0.0
6,6.0,18.63,21.2,-0.41,-0.41,1.0
7,7.0,21.2,23.24,-0.23,-0.18,0.0
8,10.0,30.02,32.94,-0.18,0.41,0.0
9,11.0,32.94,35.33,0.41,0.41,1.0


In [36]:
(b0['z0_high'] == bc0['z0_high']).all()

False

In [48]:
for i in range(50): 
    a = b0.iloc[i]['z0_low']
    b = bc0.iloc[i]['z0_low']

    if not (isclose(a,b, rel_tol=1e-5)):
        print(i, (a-b))

In [31]:
b0.equals(bc0)

False

In [13]:
cpy = pd.DataFrame({})
for i in range(5):
    _df = pd.DataFrame(db.clusters_batches[i], columns =['pt_sum','z0_low','z0_high','noise'])
    cpy = pd.concat([cpy, _df])

In [15]:
cpy.sort_values(by='z0_low', ascending=True)

Unnamed: 0,pt_sum,z0_low,z0_high,noise
0,2.57,-7.44,-7.44,1.0
0,2.15,-7.44,-7.44,1.0
1,3.30,-6.97,-6.97,1.0
0,3.30,-6.56,-6.56,1.0
1,3.31,-6.50,-6.50,1.0
...,...,...,...,...
29,0.00,21.00,21.00,0.0
30,0.00,21.00,21.00,0.0
31,0.00,21.00,21.00,0.0
23,0.00,21.00,21.00,0.0


In [8]:
ccpp =pd.read_csv("clusters-4.txt", sep=',', names=['z0_low','z0_high','pt_sum','noise'])

In [10]:
ccpp.sort_values(by='z0_low', ascending=True)

Unnamed: 0,z0_low,z0_high,pt_sum,noise
0,-11.78,-11.78,2.01,1
100,-10.78,-10.78,2.73,1
1,-10.25,-10.25,2.03,1
150,-10.25,-10.25,2.43,1
50,-9.96,-9.96,5.90,0
...,...,...,...,...
127,21.00,21.00,0.00,0
128,21.00,21.00,0.00,0
129,21.00,21.00,0.00,0
90,21.00,21.00,0.00,0
