In [1]:
#0: imports

import uproot 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch

from imports.data import CCV1
from torch_geometric.data import DataLoader 
from imports.models import Net_SEC, Net_GAT, Net_Trans
from torch_geometric.nn import knn_graph

import numpy as np
import awkward as ak
import time
from imports.Agglomerative import Aggloremative

import warnings
warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning)

In [2]:
# Directory holding the test sample; at most 500 events are read from it.
testpath = '/vols/cms/mm1221/Data/mix/test/'
data_test = CCV1(testpath, max_events=500)

# One event per batch, in dataset order (shuffle=False), so the i-th batch
# produced by the loader lines up with data_test[i].
# follow_batch=['x'] requests the per-node batch vector later read as `x_batch`.
test_loader = DataLoader(
    data_test,
    batch_size=1,
    shuffle=False,
    follow_batch=['x'],
)


### Loading tracksters data


  0%|                                                     | 0/1 [00:00<?, ?it/s]

/vols/cms/mm1221/Data/mix/test/raw/test.root


  0%|                                                     | 0/1 [00:45<?, ?it/s]

Reached 500 events!





In [7]:
# Instantiate the network and restore the epoch-100 weights on the CPU.
# NOTE(review): the run directory name "hd256nl3cd512k64" presumably encodes
# the constructor arguments used below (256, 3, contrastive_dim=512) -- confirm.
model = Net_SEC(256, 3, dropout=0.3, contrastive_dim=512)

# NOTE(review): torch.load unpickles the file; only load checkpoints you trust.
# Commented-out alternative checkpoint, kept for reference:
#   /vols/cms/er421/hgcal/code/code/Mixed/LC/Full/results/hd128nl3cd16k64/epoch-100.pt
checkpoint = torch.load(
    '/vols/cms/mm1221/hgcal/Mixed/Track/NegativeMining/runs/SECNEW/hd256nl3cd512k64/epoch-100.pt',
    map_location=torch.device('cpu'),
)

# The weights are stored under the 'model' key of the checkpoint.
model.load_state_dict(checkpoint['model'])

# Switch to evaluation mode; as the last expression this also displays the
# module summary in the cell output.
model.eval()

Net_SEC(
  (lc_encode): Sequential(
    (0): Linear(in_features=16, out_features=256, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=256, out_features=256, bias=True)
    (3): ELU(alpha=1.0)
  )
  (convs): ModuleList(
    (0-2): 3 x CustomStaticEdgeConv(
      (nn_module): Sequential(
        (0): Linear(in_features=512, out_features=256, bias=True)
        (1): ELU(alpha=1.0)
        (2): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (3): Dropout(p=0.3, inplace=False)
      )
    )
  )
  (output): Sequential(
    (0): Linear(in_features=256, out_features=64, bias=True)
    (1): ELU(alpha=1.0)
    (2): Dropout(p=0.3, inplace=False)
    (3): Linear(in_features=64, out_features=32, bias=True)
    (4): ELU(alpha=1.0)
    (5): Dropout(p=0.3, inplace=False)
    (6): Linear(in_features=32, out_features=512, bias=True)
  )
)

In [8]:
import time
import numpy as np
from sklearn.metrics.pairwise import cosine_distances  # for cosine distance calculation

# Embed the test events, cluster the embeddings and report the mean wall-clock
# time per event. The timed region covers graph building, the forward pass
# and the clustering call.
event_embeddings = []
start_time = time.time()

# Gradients are never used here, so disable autograd for the whole loop to
# avoid recording a graph on every forward pass.
with torch.no_grad():
    for i, data in enumerate(test_loader):
        # Stops after events 0..300 inclusive, i.e. 301 events are processed.
        if i > 300:
            break

        # k-nearest-neighbour graph (k=64) built on the first three feature columns.
        edge_index = knn_graph(data.x[:, :3], k=64, batch=data.x_batch)

        predictions = model(data.x, edge_index, data.x_batch)
        # Element 0 of the model output is kept as this event's embeddings.
        event_embeddings.append(predictions[0].detach().cpu().numpy())

# Events can contain different numbers of nodes, so the per-event arrays are
# ragged. np.array(list_of_arrays) only warned about this on older NumPy (the
# warning is silenced in the import cell) and raises ValueError on NumPy >= 1.24.
# Fill a 1-D object array element by element instead; all_predictions[i] is
# still the embedding matrix of event i.
all_predictions = np.empty(len(event_embeddings), dtype=object)
for event_idx, event_embedding in enumerate(event_embeddings):
    all_predictions[event_idx] = event_embedding

# 3.2: Cluster using threshold found in Script A
# (The commented-out affinity-propagation and mean-shift variants were removed:
# neither helper is imported in the visible cells.)
all_cluster_labels = Aggloremative(all_predictions, threshold=0.165)

end_time = time.time()

# 3.3: Calculate average inference time
time_diff = end_time - start_time
inference_time = time_diff / len(all_cluster_labels)
print("average inference time:", inference_time)


average inference time: 0.015218088397155964


In [9]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import torch
import torch.nn.functional as F

# ---------------------------------------------
# Helper: compute cosine similarities for a list of edges
def compute_cosine_similarities(embeddings, edge_indices, skip_self=True, chunk_size=65536):
    """
    Cosine similarity between the two endpoint embeddings of every edge.

    embeddings: array-like of shape [N, D]
    edge_indices: sequence of [src, tgt] integer pairs (may be empty)
    skip_self: whether to skip edges where src == tgt
    chunk_size: number of edges gathered per step; bounds temporary memory

    Returns a 1-D NumPy array with one similarity per kept edge, in the same
    order as `edge_indices`. A row of all zeros has similarity 0 with
    every other row.
    """
    # Normalise the edge list to shape [E, 2]; reshape also copes with an empty list.
    edges = np.asarray(edge_indices, dtype=np.int64).reshape(-1, 2)
    if skip_self:
        edges = edges[edges[:, 0] != edges[:, 1]]
    if len(edges) == 0:
        return np.array([])

    emb = np.asarray(embeddings)
    if not np.issubdtype(emb.dtype, np.floating):
        emb = emb.astype(np.float64)

    # Scale every row to unit length once, so each edge only needs a dot
    # product instead of a separate similarity call per pair.
    norms = np.linalg.norm(emb, axis=1, keepdims=True)
    norms[norms == 0] = 1.0  # keep all-zero rows at zero rather than dividing by 0
    unit = emb / norms

    sims = np.empty(len(edges), dtype=unit.dtype)
    # Gather endpoints in chunks so the temporary [chunk, D] arrays stay small
    # even when an event has a very large number of pairs.
    for start in range(0, len(edges), chunk_size):
        block = edges[start:start + chunk_size]
        sims[start:start + chunk_size] = np.einsum(
            "ij,ij->i", unit[block[:, 0]], unit[block[:, 1]]
        )
    return sims

# ---------------------------------------------
# Build pos & neg edges from data_test[i].assoc
def build_edges_from_assoc(data_item):
    """
    Split every unordered node pair of one event into positive and negative edges.

    data_item.assoc is indexed per node. Two nodes form a positive edge when
    their assoc entries compare equal, and a negative edge otherwise.

    Returns (pos_edges, neg_edges); each is a list of [i, j] with i < j,
    ordered by i and then by j.
    """
    labels = data_item.assoc
    n_nodes = len(labels)

    pos_edges, neg_edges = [], []

    # Visit each pair exactly once (first < second).
    for first in range(n_nodes):
        for second in range(first + 1, n_nodes):
            same_group = labels[first] == labels[second]
            bucket = pos_edges if same_group else neg_edges
            bucket.append([first, second])

    return pos_edges, neg_edges

# ---------------------------------------------
# Compare cosine similarities of same-group (positive) and different-group
# (negative) node pairs over the first events of the test set.
# Up to 100 events are used, but never more than were embedded above, so a
# shorter inference run does not end in an IndexError.
num_events = min(100, len(all_predictions))
all_pos_sims = []
all_neg_sims = []

for i in tqdm(range(num_events), desc="Processing events"):
    # all_predictions[i] has shape [N, embedding_dim] and is paired with
    # data_test[i].assoc; this relies on the loader not shuffling events.
    pred_tensor = torch.tensor(all_predictions[i], dtype=torch.float32)
    # L2-normalise each embedding row before comparing.
    pred_norm = F.normalize(pred_tensor, p=2, dim=1)
    embeddings = pred_norm.cpu().numpy()

    # Build positive & negative edges based on group IDs
    pos_edge_indices, neg_edge_indices = build_edges_from_assoc(data_test[i])

    # Compute similarities
    pos_sims = compute_cosine_similarities(embeddings, pos_edge_indices, skip_self=True)
    neg_sims = compute_cosine_similarities(embeddings, neg_edge_indices, skip_self=True)

    # Accumulate for global stats
    all_pos_sims.extend(pos_sims.tolist())
    all_neg_sims.extend(neg_sims.tolist())

# Array copies under the "_no" suffix; the histogram cell reads these names.
# NOTE(review): the checkpoint loaded earlier lives under ".../NegativeMining/...";
# confirm that "_no" (plotted as plain NTXENT) is the right label for this run.
all_pos_sims_no = np.array(all_pos_sims)
all_neg_sims_no = np.array(all_neg_sims)

mean_pos_sim = np.mean(all_pos_sims)
mean_neg_sim = np.mean(all_neg_sims)

print(f"Mean Positive Edge Cosine Similarity: {mean_pos_sim:.4f}")
print(f"Mean Negative Edge Cosine Similarity: {mean_neg_sim:.4f}")



Processing events: 100%|██████████████████████| 100/100 [00:03<00:00, 26.08it/s]

Mean Positive Edge Cosine Similarity: 0.8901
Mean Negative Edge Cosine Similarity: 0.4884





In [None]:
import matplotlib.pyplot as plt

# Side-by-side histograms of positive / negative pair cosine similarities
# for two trainings.
#
# Inputs that must already exist in the kernel:
#   all_pos_sims_no,   all_neg_sims_no    -> left panel  (NTXENT)
#   all_pos_sims_hard, all_neg_sims_hard  -> right panel (HN-NTXENT)
# NOTE(review): the "_hard" arrays are not assigned anywhere in the visible
# cells; presumably they come from repeating the similarity analysis with the
# hard-negative checkpoint -- confirm and make that step explicit.

def _plot_similarity_panel(ax, pos_sims, neg_sims, tag):
    """Draw step histograms of positive and negative similarities on `ax`.

    `tag` is the loss name used in the legend labels and the panel title.
    """
    ax.hist(pos_sims, bins=50, histtype="step", linewidth=2, label=f"{tag} Positive")
    ax.hist(neg_sims, bins=50, histtype="step", linewidth=2, label=f"{tag} Negative")

    ax.set_xlim(-1, 1)  # range of cosine similarity
    ax.set_xlabel("Cosine Similarity", fontsize=14)
    ax.set_ylabel("Frequency", fontsize=14)
    ax.set_title(f"{tag} Distribution", fontsize=16)
    ax.legend()


# Fail early with an explicit message instead of a bare NameError halfway
# through drawing the figure.
_required_inputs = (
    "all_pos_sims_no",
    "all_neg_sims_no",
    "all_pos_sims_hard",
    "all_neg_sims_hard",
)
_missing_inputs = [name for name in _required_inputs if name not in globals()]
if _missing_inputs:
    raise NameError(
        "Missing similarity arrays: " + ", ".join(_missing_inputs)
        + ". Run the cells that compute them before plotting."
    )

fig, axs = plt.subplots(1, 2, figsize=(12, 6))

# Left subplot: NTXENT
_plot_similarity_panel(axs[0], all_pos_sims_no, all_neg_sims_no, "NTXENT")

# Right subplot: HN-NTXENT
_plot_similarity_panel(axs[1], all_pos_sims_hard, all_neg_sims_hard, "HN-NTXENT")

plt.tight_layout()
plt.savefig("CosineTL.pdf")
plt.show()
