In [2]:
### Reloads modules properly
%load_ext autoreload
%autoreload 2

In [3]:
%load_ext lab_black

In [4]:
import numpy as np
import math
import pandas as pd
import itertools
import copy
from sklearn.cluster import DBSCAN

In [5]:
from random import choice

In [6]:
np.random.seed(42)

In [33]:
np.set_printoptions(precision=2)
np.set_printoptions(suppress=True)

In [44]:
pd.set_option("display.max_rows", None)

# Analyzing the cluster merging

In [39]:
# Load the unmerged cluster table from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code — only load files you trust.
cuo = pd.read_pickle("skl_clusters_unmerged.pkl")

In [40]:
cu = cuo.copy()

In [18]:
class BatchedDBSCAN:
    """Settings and bookkeeping for the batched-DBSCAN cluster-merging study.

    The merging methods (``clusters_overlap``, ``record_merging``,
    ``merge_clusters``) are attached to this class in later notebook cells.

    All tuning values are keyword-only and default to the values previously
    hard-coded here, so ``BatchedDBSCAN()`` behaves exactly as before.

    Parameters
    ----------
    eps : float
        Tolerance added to each side of a cluster's z0 range when
        ``clusters_overlap`` tests two clusters for overlap.
    batch_size, pt_boundary, minPts, max_number_of_tracks,
    max_n_tracks_batched, max_n_clusters_batch, max_n_clusters
        Stored as attributes; not read by any code visible in this notebook.
    z0_boundary : float
        Stored as an attribute. NOTE(review): presumably the z0 value that
        marks a cluster as outside the detector / erased — confirm.
    n_batches : int
        ``merge_clusters`` skips merging entirely when this equals 1.
    """

    def __init__(
        self,
        *,
        eps=0.15,
        batch_size=50,
        z0_boundary=21,
        pt_boundary=0,
        minPts=2,
        max_number_of_tracks=232,
        n_batches=5,
        max_n_tracks_batched=250,
        max_n_clusters_batch=25,
        max_n_clusters=125,
    ):
        self.eps = eps
        self.batch_size = batch_size
        self.z0_boundary = z0_boundary
        self.pt_boundary = pt_boundary
        self.minPts = minPts
        self.max_number_of_tracks = max_number_of_tracks
        self.n_batches = n_batches
        self.max_n_tracks_batched = max_n_tracks_batched
        self.max_n_clusters_batch = max_n_clusters_batch
        self.max_n_clusters = max_n_clusters

        # Human-readable log of merges, filled by record_merging.
        self.merged_list = []

        # Column positions inside a cluster row: [pt_sum, z0_low, z0_high, noise].
        self.pt_idx = 0
        self.z0_low_idx = 1
        self.z0_high_idx = 2
        self.noise_idx = 3

## Merge post batch

### clusters overlap

In [24]:
def clusters_overlap(self, ci, cj) -> bool:
    """Tell whether clusters ``ci`` and ``cj`` overlap in z0 within ``self.eps``.

    Each cluster is an indexable row; its z0 range is read from positions
    ``self.z0_low_idx`` and ``self.z0_high_idx``. The range of ``ci`` is
    widened by ``self.eps`` on both sides before comparing with ``cj``.
    """
    tol = self.eps
    lo, hi = self.z0_low_idx, self.z0_high_idx

    # |---- c_i -----|   |---- c_j ----|
    reaches_down = ci[lo] - tol <= cj[hi]  # widened lower edge of c_i is not above c_j
    reaches_up = ci[hi] + tol >= cj[lo]  # widened upper edge of c_i is not below c_j

    return reaches_down and reaches_up


# Attach the function above to BatchedDBSCAN so it can be called as a method.
BatchedDBSCAN.clusters_overlap = clusters_overlap

### record merging

In [25]:
def record_merging(self, ci, cj, cn):
    """Log one merge as text in ``self.merged_list``.

    Each cluster is rendered as ``[z0_low, z0_high, pt]`` with every value
    rounded to two decimals, and the entry reads ``ci + cj -> cn``.

    Parameters
    ----------
    ci, cj : indexable
        The two cluster rows as they were before the merge.
    cn : indexable
        The cluster row that resulted from the merge.
    """

    def describe(cluster):
        # Same field order as the log format: low edge, high edge, then pT.
        low = round(cluster[self.z0_low_idx], 2)
        high = round(cluster[self.z0_high_idx], 2)
        pt = round(cluster[self.pt_idx], 2)
        return f"[{low}, {high}, {pt}]"

    first, second, result = (describe(row) for row in (ci, cj, cn))
    self.merged_list.append(f"{first} + {second} -> {result}")


# Attach the function above to BatchedDBSCAN so it can be called as a method.
BatchedDBSCAN.record_merging = record_merging

## clusters merging

Based on what we found in the *DEBUG* section below, we perform a *forward merge*. 
This means that when clusters i and j (with i < j) are found to overlap, it is cluster i that gets erased (rather than j). 
Its pT and z0 boundaries are propagated to cluster j, which can then merge again with later clusters.

In [207]:
def merge_clusters(self, c: np.ndarray) -> "np.ndarray | pd.DataFrame":
    """Forward-merge overlapping clusters and locate the highest-pT cluster.

    Every pair ``(i, j)`` with ``i < j`` is tested with
    ``self.clusters_overlap``. On overlap, cluster ``i`` is folded into
    cluster ``j``: ``j`` takes the union of both z0 ranges and the sum of both
    pT values and is marked as non-noise, while row ``i`` is erased (pT 0,
    both z0 edges set to ``self.z0_boundary``, noise 0).

    Parameters
    ----------
    c : np.ndarray
        2-D array with one row per cluster; columns are read through
        ``self.pt_idx``, ``self.z0_low_idx``, ``self.z0_high_idx`` and
        ``self.noise_idx``. The input is copied, never modified.

    Returns
    -------
    pd.DataFrame
        Merged clusters with columns ``pt_sum, z0_low, z0_high, noise``.
        When ``self.n_batches == 1`` no merging is done and a plain copy of
        the input array is returned instead.

    Side effects
    ------------
    Sets ``self.max_pt``, ``self.max_pt_i`` and ``self.merge_count``. When
    merging runs it also sets ``self.comb`` (the list of index pairs) and
    ``self.to_merge`` (DataFrame of overlap results: 1/0 for tested pairs,
    9 for entries that were never tested), and logs every merge through
    ``self.record_merging``.
    """
    clusters = c.copy()
    n_clusters = clusters.shape[0]

    if self.n_batches == 1:
        # No merging in this case: just report the row with the largest pT.
        # NOTE(review): unlike the merging path this does not exclude noise
        # rows and returns an ndarray, not a DataFrame; kept for callers.
        self.max_pt_i = np.argmax(clusters[:, self.pt_idx])
        self.max_pt = clusters[self.max_pt_i, self.pt_idx]
        self.merge_count = 0
        return clusters

    pt, low, high, noise = (
        self.pt_idx,
        self.z0_low_idx,
        self.z0_high_idx,
        self.noise_idx,
    )
    boundary = self.z0_boundary
    merge_count = 0

    # Upper-triangular pairs only: N(N-1)/2 comparisons.
    comb = list(itertools.combinations(range(n_clusters), 2))
    self.comb = comb

    # 9 marks pairs that were never tested (skipped or below the diagonal).
    to_merge = np.full((n_clusters, n_clusters), 9.0)

    for i, j in comb:
        # Skip rows already erased or otherwise sitting at the boundary.
        if clusters[i, low] >= boundary or clusters[j, low] >= boundary:
            continue

        # Snapshots of both rows before merging, for the merge log.
        ci = clusters[i, :].copy()
        cj = clusters[j, :].copy()

        overlap = self.clusters_overlap(clusters[i, :], clusters[j, :])
        to_merge[i, j] = overlap
        if not overlap:
            continue

        merge_count += 1

        # A noise point that absorbs another cluster is no longer noise.
        clusters[j, noise] = 0

        # Cluster j takes the union of both z0 ranges and the summed pT.
        clusters[j, low] = min(clusters[j, low], clusters[i, low])
        clusters[j, high] = max(clusters[j, high], clusters[i, high])
        clusters[j, pt] += clusters[i, pt]

        # Erase cluster i so later pairs skip it.
        clusters[i, pt] = 0
        clusters[i, low] = boundary
        clusters[i, high] = boundary
        clusters[i, noise] = 0

        self.record_merging(ci, cj, clusters[j, :])

    # Find the highest-pT cluster from the final table. Doing this inside the
    # pair loop would only ever look at index j (never cluster 0) and would
    # miss clusters whose pairs were all skipped.
    max_pt = 0
    max_pt_i = 0
    for k in range(n_clusters):
        if clusters[k, low] >= boundary:
            continue  # erased / outside the detector
        if clusters[k, noise] == 1:
            continue  # never pick a noise point as the primary vertex
        if max_pt < clusters[k, pt]:
            max_pt = clusters[k, pt]
            max_pt_i = k

    self.to_merge = pd.DataFrame(to_merge)
    self.max_pt = max_pt
    self.max_pt_i = max_pt_i
    self.merge_count = merge_count
    return pd.DataFrame(clusters, columns=["pt_sum", "z0_low", "z0_high", "noise"])


# Attach the function above to BatchedDBSCAN so it can be called as a method.
BatchedDBSCAN.merge_clusters = merge_clusters

## Testing Merge

In [208]:
db = BatchedDBSCAN()

In [209]:
cu = cuo.copy()

In [210]:
cu.iloc[73] = [250, -15, -15, 1]  # inject high pt noise point

In [211]:
cu.iloc[73]

pt_sum     250.0
z0_low     -15.0
z0_high    -15.0
ntracks      1.0
Name: 73, dtype: float64

In [212]:
cu["noise"] = 0

In [213]:
cu.loc[cu["ntracks"] == 1, "noise"] = 1

In [214]:
cu.drop(columns=["ntracks"], inplace=True)

In [215]:
a = db.merge_clusters(cu.values)

In [216]:
db.max_pt

142.0233987569809

In [217]:
db.max_pt_i

68

In [218]:
a.iloc[db.max_pt_i]

pt_sum     142.023399
z0_low      -3.164062
z0_high     -1.406250
noise        0.000000
Name: 68, dtype: float64

In [219]:
a[a.pt_sum > 0]

Unnamed: 0,pt_sum,z0_low,z0_high,noise
6,7.957812,-5.625,-5.507812,0.0
8,4.852124,-4.921875,-4.921875,0.0
14,2.698591,-5.097656,-5.097656,1.0
15,2.54225,5.097656,5.097656,1.0
20,1.958026,-4.6875,-4.6875,1.0
21,2.089986,-0.761719,-0.761719,1.0
22,2.060123,7.03125,7.03125,1.0
31,4.706585,0.585938,0.644531,0.0
38,2.199723,2.285156,2.285156,1.0
39,6.150181,6.503906,6.621094,0.0


After some perseverance we've got three things done here. 
1) The merging after batching works correctly using what I like to call the *forward merge*.
2) The merging procedure will not select a noise point as the PV, even when its pT is the highest.
3) The merging can be done with "combinations" rather than "permutations", so only the upper triangle of the pair matrix has to be evaluated: $N(N-1)/2$ comparisons instead of $N(N-1)$.

The next idea to work on is to merge batch by batch, using a sorting mechanism. 
This should reduce the number of comparisons that need to be made.

**WILL PROCEED TO TEST THIS IMPLEMENTATION WITH OTHER DATASETS**

# Merging in batches 

For this I will need the raw per-batch cluster data.

**PROJECT PARKED**

In [246]:
cb = {}

In [248]:
for i in range(5):
    cb[i] = pd.read_pickle(f"cb_{i}.pkl")

In [251]:
c = np.zeros((250, 4))

In [265]:
def clusters_overlap_test(ci, cj, eps=0.5) -> bool:
    """Standalone overlap check between two cluster rows.

    Position 1 of a row is its lower z0 edge and position 2 its upper z0
    edge. The range of ``ci`` is widened by ``eps`` on both sides before it
    is compared with the range of ``cj``.
    """
    LOW, HIGH = 1, 2

    # |---- c_i -----|   |---- c_j ----|
    starts_before_cj_ends = ci[LOW] - eps <= cj[HIGH]
    ends_after_cj_starts = ci[HIGH] + eps >= cj[LOW]

    return starts_before_cj_ends and ends_after_cj_starts

In [None]:
def merge_batch(ci, cj, eps=0.15, z0_boundary=21):
    """Find which clusters in ``ci`` overlap which clusters in ``cj``.

    Every row of ``ci`` is compared with every row of ``cj`` (a full
    Cartesian product, since the two tables are different batches). Pairs in
    which either row has a lower z0 edge at or beyond ``z0_boundary`` are
    skipped.

    Parameters
    ----------
    ci, cj : array-like, shape (n, >=3)
        Cluster tables; column 1 is the lower z0 edge and column 2 the upper
        z0 edge. Anything ``np.asarray`` accepts (e.g. a DataFrame) works.
    eps : float
        Tolerance handed to ``clusters_overlap_test``.
    z0_boundary : float
        Rows whose lower z0 edge is ``>=`` this value are ignored.

    Returns
    -------
    list[tuple[int, int]]
        ``(i, j)`` row-index pairs where ``ci[i]`` overlaps ``cj[j]``.

    TODO: the actual merging of the overlapping pairs is not implemented yet.
    """
    ci = np.asarray(ci)
    cj = np.asarray(cj)

    z0_low_idx = 1

    overlapping = []
    for i, j in itertools.product(range(ci.shape[0]), range(cj.shape[0])):
        if ci[i, z0_low_idx] >= z0_boundary or cj[j, z0_low_idx] >= z0_boundary:
            continue

        if clusters_overlap_test(ci[i, :], cj[j, :], eps=eps):
            overlapping.append((i, j))

    return overlapping
        
    
    
    
    

In [253]:
# Work in progress: feed the five per-batch cluster tables through merge_batch.
batch_size = 50
for bi in range(5):
    clusters_batch = cb[bi].copy()
    
    if bi==0:
        # First batch: copy it into the first batch_size rows of the accumulator c.
        c[bi:(bi+1)*batch_size] = clusters_batch
    else:
        # NOTE(review): this slice ends at (bi+1)*batch_size, so it also covers
        # rows for the current batch, which nothing in this cell has filled yet
        # (c starts as zeros). Confirm whether c[0:bi*batch_size] was intended.
        ci = c[0:(bi+1)*batch_size]
        cj = clusters_batch
        
        # NOTE(review): the result of merge_batch is not used and c is not
        # written to for bi > 0, so later batches never reach the accumulator.
        merge_batch(ci, cj)
        
    
    

SyntaxError: unexpected EOF while parsing (628612812.py, line 14)

ERROR:root:Cannot parse: 15:0: EOF in multi-line statement
Traceback (most recent call last):
  File "/home/lucas/miniconda3/envs/db-clustering/lib/python3.8/site-packages/lab_black.py", line 218, in format_cell
    formatted_code = _format_code(cell)
  File "/home/lucas/miniconda3/envs/db-clustering/lib/python3.8/site-packages/lab_black.py", line 29, in _format_code
    return format_str(src_contents=code, mode=FileMode())
  File "src/black/__init__.py", line 1154, in format_str
  File "src/black/__init__.py", line 1164, in _format_str_once
  File "src/black/parsing.py", line 128, in lib2to3_parse
black.parsing.InvalidInput: Cannot parse: 15:0: EOF in multi-line statement


In [261]:
ai = [1, 2, 3]
bi = [1, 2]

In [262]:
list(itertools.product(ai, bi))

[(1, 1), (1, 2), (2, 1), (2, 2), (3, 1), (3, 2)]

In [264]:
list(itertools.combinations(ai, 2))

[(1, 2), (1, 3), (2, 3)]

In [260]:
list(itertools.combinations([[1,2,3],[1,2]]))

TypeError: combinations() missing required argument 'r' (pos 2)

In [258]:
list(itertools.product([1, 2, 3], [1, 2]))

[(1, 1), (1, 2), (2, 1), (2, 2), (3, 1), (3, 2)]

# DEBUG 

In [None]:
# 3, 4, 7, 10, 13, 18, 24,  28, 31,  37

In [121]:
cs = [3, 4, 7, 10, 13, 18, 24, 28, 31, 37]
cu.iloc[cs].to_pickle("test_vector.pkl")

In [122]:
cu.iloc[cs]

Unnamed: 0,pt_sum,z0_low,z0_high,ntracks
3,42.534632,-2.402344,-1.875,9.0
4,25.086727,-3.691406,-3.515625,5.0
7,6.899173,-1.464844,-1.40625,2.0
10,2.085667,-2.578125,-2.578125,1.0
13,2.7131,-1.699219,-1.699219,1.0
18,2.792957,1.347656,1.347656,1.0
24,1.974743,0.058594,0.058594,1.0
28,13.517229,0.292969,0.351562,4.0
31,4.706585,0.585938,0.644531,2.0
37,2.204966,1.054688,1.054688,1.0


In [123]:
cut = cu.iloc[cs].copy()

In [115]:
cu[cu.pt_sum > 0]
# 3, 4, 7, 10, 13, 18, 24,  28, 31,  37

Unnamed: 0,pt_sum,z0_low,z0_high,ntracks
0,4.330626,-4.042969,-4.042969,2.0
1,6.248494,-0.410156,-0.292969,2.0
2,10.609376,0.234375,0.410156,4.0
3,42.534632,-2.402344,-1.875,9.0
4,25.086727,-3.691406,-3.515625,5.0
5,6.515635,-3.046875,-2.929688,3.0
6,7.957812,-5.625,-5.507812,2.0
7,6.899173,-1.464844,-1.40625,2.0
8,4.852124,-4.921875,-4.921875,2.0
9,4.153777,6.503906,6.621094,2.0


## Forward merging

In [124]:
cut

Unnamed: 0,pt_sum,z0_low,z0_high,ntracks
3,42.534632,-2.402344,-1.875,9.0
4,25.086727,-3.691406,-3.515625,5.0
7,6.899173,-1.464844,-1.40625,2.0
10,2.085667,-2.578125,-2.578125,1.0
13,2.7131,-1.699219,-1.699219,1.0
18,2.792957,1.347656,1.347656,1.0
24,1.974743,0.058594,0.058594,1.0
28,13.517229,0.292969,0.351562,4.0
31,4.706585,0.585938,0.644531,2.0
37,2.204966,1.054688,1.054688,1.0


So when two clusters (i) and (j) are merged, we erase the current cluster (i) rather than (j), and propagate its information (pT and z0 boundaries) to (j).

Let's start with a small test function.

In [125]:
def clusters_overlap_test(ci, cj, eps=0.5) -> bool:
    """Return whether two cluster rows overlap in z0 once ``ci`` is padded by ``eps``.

    Rows are indexable; element 1 is the lower z0 edge and element 2 the
    upper z0 edge.
    """
    ci_low, ci_high = ci[1], ci[2]
    cj_low, cj_high = cj[1], cj[2]

    # |---- c_i -----|   |---- c_j ----|
    # Overlap needs the padded c_i to start no later than c_j ends
    # and to end no earlier than c_j starts.
    return (ci_low - eps <= cj_high) and (ci_high + eps >= cj_low)

In [142]:
def merge_clusters_test(c: np.array, eps=0.5) -> np.array:
    """Debug version of the forward merge, run over all ordered pairs.

    Works on a copy of ``c`` (one row per cluster, columns pt / z0_low /
    z0_high / noise). For every ordered pair ``(src, dst)`` of distinct rows
    that ``clusters_overlap_test`` reports as overlapping, ``src`` is folded
    into ``dst`` and then erased (pT 0, both z0 edges 21, noise 0).

    Prints the running maximum pT, its row index and the number of merges,
    then returns the merged table as a DataFrame with columns
    ``pt_sum, z0_low, z0_high, noise``. No noise check is applied when
    picking the maximum.
    """
    PT, LOW, HIGH, NOISE = 0, 1, 2, 3
    OUTSIDE = 21  # z0 value used to mark an erased / out-of-detector row

    work = c.copy()
    best_pt, best_idx, n_merges = 0, 0, 0

    for src, dst in itertools.permutations(range(work.shape[0]), 2):

        # skip if either cluster is outside the detector (or already erased)
        if work[src, LOW] >= OUTSIDE or work[dst, LOW] >= OUTSIDE:
            continue

        if clusters_overlap_test(work[src, :], work[dst, :], eps=eps):
            n_merges += 1

            # dst takes the union of both z0 ranges ...
            work[dst, LOW] = min(work[dst, LOW], work[src, LOW])
            work[dst, HIGH] = max(work[dst, HIGH], work[src, HIGH])

            # ... and the summed pT.
            work[dst, PT] += work[src, PT]

            # Erase the cluster that was merged away.
            work[src, PT] = 0
            work[src, LOW] = OUTSIDE
            work[src, HIGH] = OUTSIDE
            work[src, NOISE] = 0

        # Track the highest pT seen on the receiving side of any tested pair.
        if best_pt < work[dst, PT]:
            best_pt = work[dst, PT]
            best_idx = dst

    print(best_pt, best_idx, n_merges)
    return pd.DataFrame(work, columns=["pt_sum", "z0_low", "z0_high", "noise"])

In [143]:
merge_clusters_test(cut.values)

54.23257303237915 4 7


Unnamed: 0,pt_sum,z0_low,z0_high,noise
0,0.0,21.0,21.0,0.0
1,25.086727,-3.691406,-3.515625,5.0
2,0.0,21.0,21.0,0.0
3,0.0,21.0,21.0,0.0
4,54.232573,-2.578125,-1.40625,1.0
5,0.0,21.0,21.0,0.0
6,0.0,21.0,21.0,0.0
7,0.0,21.0,21.0,0.0
8,0.0,21.0,21.0,0.0
9,25.196481,0.058594,1.347656,1.0


In [144]:
test_full = merge_clusters_test(cu.values, eps=0.15)

142.0233987569809 68 42


In [141]:
test_full.iloc[68]

pt_sum     142.023399
z0_low      -3.164062
z0_high     -1.406250
noise        1.000000
Name: 68, dtype: float64

In [136]:
test_full

Unnamed: 0,pt_sum,z0_low,z0_high,noise
0,0.0,21.0,21.0,0.0
1,0.0,21.0,21.0,0.0
2,0.0,21.0,21.0,0.0
3,0.0,21.0,21.0,0.0
4,0.0,21.0,21.0,0.0
5,0.0,21.0,21.0,0.0
6,7.957812,-5.625,-5.507812,2.0
7,0.0,21.0,21.0,0.0
8,4.852124,-4.921875,-4.921875,2.0
9,0.0,21.0,21.0,0.0
