In [1]:
import h5py
import numpy as np  
from tqdm import tqdm   

In [5]:
# data = h5py.File('/global/cfs/cdirs/m4392/ACAT_Backup/Data/QG/Quark_Gluon.h5', 'r')  
data = h5py.File('/global/cfs/cdirs/m4392/ACAT_Backup/Data/Top/Boosted_Top.h5', 'r')  
dataset = h5py.Dataset(data['train_jet'])

ValueError: <HDF5 dataset "train_jet": shape (3164800, 125, 125, 8), type "<f4"> is not a DatasetID

In [3]:
data.chunks

AttributeError: 'File' object has no attribute 'chunks'

In [15]:
data.keys()

<KeysViewHDF5 ['test_jet', 'test_meta', 'train_jet', 'train_meta', 'validation_jet', 'validation_meta']>

In [16]:
# QG dataset count:793900
# Boosted Top dataset count: 3164800
data['train_meta'].shape[0]

3164800

In [8]:
data['train_meta'][10]

array([101.89284 ,  16.671894,   0.      ], dtype=float32)

In [4]:
sum_vector = np.zeros((125,125,8),dtype=np.float64) 
count = 0
for string in ['train','validation','test']:
    for sample in tqdm(data[f'{string}_jet']):
        sample = np.array(sample,dtype=np.float64)
        sample = (sample - np.min(sample))/(np.max(sample) - np.min(sample))
        if np.isnan(sample).any():
            continue
        sum_vector += sample
        count += 1

sum_vector = sum_vector.mean(axis=(0,1))/count
sum_vector

  sample = (sample - np.min(sample))/(np.max(sample) - np.min(sample))
100%|██████████| 3164800/3164800 [46:41<00:00, 1129.75it/s] 
100%|██████████| 393600/393600 [05:20<00:00, 1226.62it/s]
100%|██████████| 400000/400000 [05:45<00:00, 1158.83it/s]


array([0.00030618, 0.00032988, 0.00022402, 0.00050185, 0.00590864,
       0.00434622, 0.00457014, 0.00442105])

In [6]:
#Quark Gluon mean: [0.00036276, 0.00050321, 0.00560932]
#Boosted Top mean: [0.00030618, 0.00032988, 0.00022402, 0.00050185, 0.00590864,
    #    0.00434622, 0.00457014, 0.00442105]

In [5]:
#Quark Gluon std: [0.00023162, 0.00031747, 0.00257909]
#Boosted Top std: [1.57844912e-04, 1.27444286e-04, 9.95345503e-05, 3.33565491e-04,
    #    3.09065050e-03, 1.89672124e-03, 2.02093365e-03, 2.01831124e-03]
std_vector = np.zeros(8,dtype=np.float64)
count = 0
mean = sum_vector
for string in ['train','validation','test']:
    for sample in tqdm(data[f'{string}_jet']):
        sample = np.array(sample,dtype=np.float64)
        sample = (sample - np.min(sample))/(np.max(sample) - np.min(sample))
        sample = (sample.mean(axis=(0,1)) - mean)**2
        if np.isnan(sample).any():
            continue
        std_vector += sample
        count += 1

std_vector = np.sqrt(std_vector/count)
std_vector

  sample = (sample - np.min(sample))/(np.max(sample) - np.min(sample))
100%|██████████| 3164800/3164800 [33:21<00:00, 1581.58it/s] 
100%|██████████| 393600/393600 [04:18<00:00, 1522.75it/s]
100%|██████████| 400000/400000 [04:04<00:00, 1637.66it/s]


array([1.57844912e-04, 1.27444286e-04, 9.95345503e-05, 3.33565491e-04,
       3.09065050e-03, 1.89672124e-03, 2.02093365e-03, 2.01831124e-03])

In [9]:
std_vector.shape

(125, 125, 3)

In [1]:
import configs.model_cfg as cfg

In [3]:
cfg.base['patch_size']

5

In [6]:
from torch.utils.data import Dataset
import torch
from tqdm import tqdm
import h5py
import numpy as np  

class H5Dataset(Dataset):
    '''
    Loads a dataset from h5 file
    args:
        file_path: str, path to the h5 file
        partition: str, one of 'train', 'validation', or 'test'
    '''
    def __init__(self, 
                 file_path: str, 
                 partition: str
    )-> None:
        assert partition in ['train', 'validation', 'test'],\
              "Partition must be one of 'train', 'validation', or 'test'"
        self.file_path = file_path
        self.data = h5py.File(file_path, 'r')[f'{partition}_jet']  
        self.labels = h5py.File(file_path, 'r')[f'{partition}_meta']
        self.neglect = []
        for i,d in tqdm(enumerate(self.data)):
            if np.min(d) == np.max(d):
                self.neglect.append(i)

    def __len__(self):
        return self.labels.shape[0]

    def __getitem__(self, idx):
        data = self.data[idx]
        data = (data - np.min(data)) / (np.max(data) - np.min(data))    
        return torch.Tensor(data), torch.Tensor(np.array(self.labels[idx][-1]))

In [9]:
data = H5Dataset('/global/cfs/cdirs/m4392/ACAT_Backup/Data/Top/Boosted_Top.h5', 'train')

227it [00:00, 1056.88it/s]

3164800it [05:59, 8806.78it/s]


In [8]:
data.neglect

[]

In [6]:
import timm
import torch.nn as nn
from functools import partial
class VisionTransformer(timm.models.vision_transformer.VisionTransformer):
    """ Vision Transformer with support for global average pooling
    """
    def __init__(self, global_pool=False, **kwargs):
        super(VisionTransformer, self).__init__(**kwargs)

        self.global_pool = global_pool
        if self.global_pool:
            # self.fc_norm = nn.BatchNorm2d(kwargs['embed_dim'], affine=False)
            norm_layer = partial(nn.LayerNorm, eps=1e-6)
            embed_dim = kwargs['embed_dim']
            self.fc_norm = norm_layer(embed_dim)

            del self.norm  # remove the original norm

    def forward_features(self, x):
        B = x.shape[0]
        x = self.patch_embed(x)

        cls_tokens = self.cls_token.expand(B, -1, -1)  
        x = torch.cat((cls_tokens, x), dim=1)
        x = x + self.pos_embed
        x = self.pos_drop(x)

        for blk in self.blocks:
            x = blk(x)

        if self.global_pool:
            x = x[:, 1:, :].mean(dim=1)  # global pool without cls token
            x = self.fc_norm(x)
        else:
            x = self.norm(x)
            x = x[:, 0]

        return x


    def forward(self, x):
        x = self.forward_features(x)
        x = self.head(x)
        return x

In [7]:
model = VisionTransformer(global_pool=True)

KeyError: 'embed_dim'

In [2]:
import random
n_total_train = 1000
CHUNK_SIZE = 32

# Helper function to chunk the indices
def chunk_indices(indices, chunk_size):
    for i in range(0, len(indices), chunk_size):
        yield indices[i:i + chunk_size]

# Create indices for the entire dataset
indices = list(range(n_total_train))

# Break the indices into chunks of size 32
chunked_indices = list(chunk_indices(indices, CHUNK_SIZE))

# Shuffle the chunks
random.shuffle(chunked_indices)

# Flatten the shuffled chunks back into a single list of indices
shuffled_indices = [index for chunk in chunked_indices for index in chunk]

# Determine the sizes for pretrain, train, and validation sets
pretrain_size = int(0.7 * n_total_train)
train_size = int(0.2 * n_total_train)

# Split the indices
pretrain_indices = shuffled_indices[:pretrain_size]

In [3]:
pretrain_indices    

[320,
 321,
 322,
 323,
 324,
 325,
 326,
 327,
 328,
 329,
 330,
 331,
 332,
 333,
 334,
 335,
 336,
 337,
 338,
 339,
 340,
 341,
 342,
 343,
 344,
 345,
 346,
 347,
 348,
 349,
 350,
 351,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 384,
 385,
 386,
 387,
 388,
 389,
 390,
 391,
 392,
 393,
 394,
 395,
 396,
 397,
 398,
 399,
 400,
 401,
 402,
 403,
 404,
 405,
 406,
 407,
 408,
 409,
 410,
 411,
 412,
 413,
 414,
 415,
 800,
 801,
 802,
 803,
 804,
 805,
 806,
 807,
 808,
 809,
 810,
 811,
 812,
 813,
 814,
 815,
 816,
 817,
 818,
 819,
 820,
 821,
 822,
 823,
 824,
 825,
 826,
 827,
 828,
 829,
 830,
 831,
 416,
 417,
 418,
 419,
 420,
 421,
 422,
 423,
 424,
 425,
 426,
 427,
 428,
 429,
 430,
 431,
 432,
 433,
 434,
 435,
 436,
 437,
 438,
 439,
 440,
 441,
 442,
 443,
 444,
 445,
 446,
 447,
 672,
 673,
 674,
 675,
 676,
 677,
 678,
 679,
 680,
 681,
 682,
 683,
