# Retrieving all MC-Data and storing it in matrices

To reduce computation time, the data is preprocessed so that it can be fed directly into the CNN. First, the photon arrival times are loaded from the server and reshaped into a flat 2D matrix.

Execute this script only once.

In [None]:
from multiprocessing import Pool
import numpy as np
import random
import pickle
import gzip
import json
import os

In [None]:
# Important variables
# Root directory of the simulated (Monte Carlo) photon-stream files
mc_data_path = "/net/big-tank/POOL/projects/fact/simulation/photon_stream/fact_tools/v.0.18.0/"
# Pickled lookup from pixel id to its (x, y) position in the image matrix
id_position_path = "/home/jbehnken/06_FACT_Pipeline/01_hexagonal_position_dict.p"
# Intermediate batches (reshaped, not yet shuffled or standardized)
temporary_path = "/fhgfs/users/jbehnken/01_Data/99_Temporary"
# Final batches (shuffled and standardized)
processed_data_path = "/fhgfs/users/jbehnken/01_Data/01_MC_Data"

In [None]:
def getMetadata(root=None):
    '''
    Gathers the file paths of the training data

    Walks the directory tree below `root` and collects every file whose name
    contains '.json'. This is a substring test, not a strict extension
    check, so names such as 'x.json.gz' are matched as well.

    Parameters:
        root: Directory to search; a leading '~' is expanded. When omitted,
              the module-level `mc_data_path` is used.

    Returns:
        A sorted list of the matching file paths. A missing or empty
        directory yields an empty list.
    '''
    if root is None:
        root = mc_data_path

    # Iterate over every file in the subdirs and keep the matching names
    file_paths = [
        os.path.join(dir_path, file_name)
        for dir_path, _dir_names, file_names in os.walk(os.path.expanduser(root))
        for file_name in file_names
        if '.json' in file_name
    ]
    # os.walk lists directory entries in arbitrary order; sorting makes the
    # numbering of the generated batch files reproducible between runs.
    file_paths.sort()
    return file_paths


def reformat(dataset, labels):
    '''
    Brings images and labels into the layout used for the CNN

    The images are reshaped to (n, 46, 45, 1) and the labels are expanded
    into two columns: column k is 1.0 where the label equals k, else 0.0.
    Both results are float32 arrays and are returned as (dataset, labels).
    '''
    images = dataset.reshape(-1, 46, 45, 1)
    images = images.astype(np.float32)

    # Compare every label against the class indices 0 and 1 (broadcasting)
    class_ids = np.arange(2)
    one_hot = labels[:, np.newaxis] == class_ids
    return images, one_hot.astype(np.float32)

Every gzip file is opened, and the information it contains is reshaped using the mapping from the hexagonal-position dictionary. Afterwards the images are ready to be fed into the CNN. Batches of 1000 events are stored in separate files.

In [None]:
# Convert every simulated event into a 46x45 photon-count image and write the
# images, together with their labels, to temporary files of 1000 events each.
file_paths = getMetadata()

# Use a context manager so the dictionary file is closed right after loading
with open(id_position_path, "rb") as position_file:
    id_position = pickle.load(position_file)

# The matrix cell of each of the 1440 pixels is the same for every event, so
# resolve and convert the coordinates once instead of once per event.
pixel_cells = [(int(x), int(y)) for x, y in (id_position[i] for i in range(1440))]

data = []
num = 0
for path in file_paths:
    # Gamma=True, Proton=False
    label = 'gamma' in path

    with gzip.open(path) as file:
        # One JSON document (one event) per line
        for line in file:
            event_photons = json.loads(line.decode('utf-8'))['PhotonArrivals_500ps']

            # Cell value = number of photon arrivals recorded for that pixel
            input_matrix = np.zeros([46, 45])
            for i, (row, col) in enumerate(pixel_cells):
                input_matrix[row, col] = len(event_photons[i])

            data.append([input_matrix, label])

            # Flush a full batch of 1000 events to its own numbered file
            if len(data) % 1000 == 0:
                pic, lab = zip(*data)
                pic, lab = reformat(np.array(pic), np.array(lab))
                data_dict = {'Image': pic, 'Label': lab}

                with gzip.open(temporary_path + "/PhotonArrivals_500ps_{}.p".format(num), "wb") as data_file:
                    pickle.dump(data_dict, data_file)
                data = []
                num += 1

# NOTE(review): events still in `data` here (fewer than 1000 after the last
# full batch) are not written to disk - confirm that dropping them is intended.

# Randomizing and standardizing the formatted data

All files are loaded into memory, shuffled, standardized, and then stored again in the same data structure.

In [None]:
# Load pickled data and split it into pictures and labels
def load_data(file):
    '''
    Reads one gzip-compressed pickle from the temporary directory

    `file` is the file name inside `temporary_path`. Returns the tuple
    (pictures, labels) stored under the keys 'Image' and 'Label'.
    '''
    archive_path = temporary_path+'/'+file
    with gzip.open(archive_path, 'rb') as handle:
        batch = pickle.load(handle)
    return (batch['Image'], batch['Label'])

# Pool-load pickled data and split it into pictures and labels (list)
# The context manager shuts the worker processes down once map() returned;
# a Pool that is merely dropped is not guaranteed to be cleaned up.
with Pool() as p:
    data = p.map(load_data, os.listdir(temporary_path))
pics, labs = zip(*data)
del data, p

# Concatenate the data to a single np.array
pic = np.concatenate(pics)
lab = np.concatenate(labs)
# Drop the per-file arrays so only the concatenated copy stays in memory
del pics, labs


# Values to standardize the data (computed over all pixels of all events)
mean = np.mean(pic)
std = np.std(pic)
print(mean, std)


# Randomize the data: apply the same permutation to pictures and labels so
# that every picture keeps its label
p = np.random.permutation(len(pic))
all_pics = pic[p]
all_labels = lab[p]
del p, pic, lab

def save_data(i):
    '''
    Stores the i-th block of 1000 shuffled events (i starts at 1)

    The pictures are standardized with the global `mean` and `std` before
    they are pickled, together with their labels, into a gzip file in
    `processed_data_path`.
    '''
    window = slice((i-1)*1000, i*1000)
    standardized = (all_pics[window]-mean)/std

    payload = {'Image': standardized, 'Label': all_labels[window]}
    target = processed_data_path + '/PhotonArrivals_500ps_{}.p'.format(i)
    with gzip.open(target, 'wb') as f:
        pickle.dump(payload, f)
        
# Write one processed file per temporary file; save_data counts from 1
num_files = len(os.listdir(temporary_path))
# The context manager shuts the worker processes down once map() returned
with Pool() as p:
    data = p.map(save_data, range(1, num_files+1))

mean: 1.24904

std: 2.36506

# Result

The 2,422,000 events are standardized, randomized, and saved to disk in files containing 1000 events each.