# Retrieving all MC-Data and storing it in matrices

To reduce computation time, the data is preprocessed to fit the input of the CNN. First, the photon arrival times are loaded from the server and reshaped to fit into a flat 2D matrix.

Execute this script only once.

In [None]:
def getMetadata(load_metadata=True,
                file_path='01_File_event_count.csv',
                main_path='/net/big-tank/POOL/projects/fact/simulation/photon_stream/fact_tools/v.0.18.0/'):
    '''
    Loads/gathers the file paths and the number of contained events.

    Parameters
    ----------
    load_metadata : bool
        If True, read the previously written CSV at `file_path`.
        If False, walk `main_path`, count the lines of every matching gzip
        file and (re)write the CSV at `file_path`.
    file_path : str
        Location of the metadata CSV.
    main_path : str
        Root directory that is searched recursively for data files.

    Returns
    -------
    pandas.DataFrame
        Columns 'File_name', 'Event_count' and 'Particle', in that order.
        'Particle' is False when the file path contains 'proton' and True
        otherwise.
    '''
    # Imported locally so the function does not depend on which notebook
    # cell has been executed before it is called.
    import gzip
    import os

    import pandas as pd

    columns = ['File_name', 'Event_count', 'Particle']

    if load_metadata:
        # Select the columns by name: a CSV that was saved together with its
        # index carries an extra leading 'Unnamed: 0' column, which would
        # otherwise shift any positional access done by the caller.
        return pd.read_csv(file_path)[columns]

    # Collect every file below main_path whose name contains '.json'
    file_paths = [
        os.path.join(dir_path, name)
        for dir_path, _, names in os.walk(os.path.expanduser(main_path))
        for name in names
        if '.json' in name
    ]

    # Group the files by the keyword found in their full path. The first
    # matching keyword wins; files matching no keyword are ignored.
    keywords = ('proton', 'gustav', 'werner')
    groups = {keyword: [] for keyword in keywords}
    for path in file_paths:
        for keyword in keywords:
            if keyword in path:
                groups[keyword].append(path)
                break

    # Count the lines of every file (the loader treats one line as one event)
    data = []
    for keyword in keywords:
        for path in groups[keyword]:
            with gzip.open(path) as event_data:
                data.append([path, sum(1 for _ in event_data)])

    # Save metadata to a df
    df = pd.DataFrame(data, columns=columns[:2])
    df['Particle'] = df['File_name'].apply(lambda x: 'proton' not in x)
    df.to_csv(file_path, encoding='utf-8', index=False)

    return df

Every gzip file will be opened and the contained information will be reshaped using the mapping of the hexagonal-position dictionary. Batches of 1000 events will be stored in separate files.

In [None]:
import pandas as pd
import numpy as np
import pickle
import gzip
import json

BATCH_SIZE = 1000  # number of events stored per output file
N_PIXELS = 1440    # pixel ids 0..1439 are looked up in id_position
OUTPUT_PATTERN = "/fhgfs/users/jbehnken/01_Data/99_Temporary/PhotonArrivals_500ps_{}.p"

df = getMetadata(load_metadata=True)

# NOTE(review): unpickling executes arbitrary code - only load a trusted file.
with open("01_hexagonal_position_dict.p", "rb") as position_file:
    id_position = pickle.load(position_file)

# Matrix coordinates of every pixel, converted to int once instead of per event
pixel_coords = [(int(x), int(y)) for x, y in (id_position[i] for i in range(N_PIXELS))]

data = []
num = 0
# Address the columns by name so an additional index column in the CSV
# cannot shift the file name / label positions.
for file_name, particle in zip(df['File_name'], df['Particle']):
    with gzip.open(file_name) as file:
        for line in file:
            # One JSON document per line; keep only the photon arrival lists
            event_photons = json.loads(line.decode('utf-8'))['PhotonArrivals_500ps']

            # Each cell holds the number of photon arrivals of the pixel mapped to it
            input_matrix = np.zeros([46, 45])
            for i, (row, col) in enumerate(pixel_coords):
                input_matrix[row, col] = len(event_photons[i])

            data.append([input_matrix, particle])

            # Write a file as soon as a full batch has been collected
            if len(data) == BATCH_SIZE:
                with gzip.open(OUTPUT_PATTERN.format(num), "wb") as data_file:
                    pickle.dump(data, data_file)
                data = []
                num += 1

# NOTE: events left in `data` after the loop (fewer than BATCH_SIZE) are not
# written, so every output file contains exactly BATCH_SIZE events.

# Formatting the preprocessed data

Every file will be opened, the data will be converted to np.arrays and pictures and labels will be stored together in a dictionary. The resulting files will still contain 1000 events.

In [None]:
import numpy as np
import pickle
import gzip
import os

# Source and destination directory of the batch files. Both names refer to the
# same location, so each file is overwritten in place by its reformatted version.
path = path_new = '/fhgfs/users/jbehnken/01_Data/99_Temporary'

def reformat(dataset, labels):
    '''
    Bring pictures and labels into the layout stored in the batch files.

    dataset is reshaped to (-1, 46, 45, 1) and cast to float32.
    labels is turned into a float32 array with two columns: column k is 1.0
    where the label equals k and 0.0 elsewhere.

    Returns the tuple (dataset, labels).
    '''
    images = dataset.reshape((-1, 46, 45, 1))
    images = images.astype(np.float32)

    # Broadcast every label against the class indices [0, 1]
    class_ids = np.arange(2)
    one_hot = np.equal(class_ids, labels[:, None]).astype(np.float32)

    return images, one_hot

def rewrite(file):
    '''
    Convert one batch file from a list of [picture, label] pairs into a
    dictionary of arrays.

    The file named `file` is read from `path`, its pairs are split into
    pictures and labels, both are passed through reformat(), and the result
    is written to `path_new` under the same file name as
    {'Image': pictures, 'Label': labels}.
    '''
    source = '{}/{}'.format(path, file)
    target = '{}/{}'.format(path_new, file)

    with gzip.open(source, 'rb') as src:
        events = pickle.load(src)

    # Transpose the list of pairs into one tuple of pictures and one of labels
    images, labels = zip(*events)
    images, labels = reformat(np.array(images), np.array(labels))

    with gzip.open(target, 'wb') as dst:
        pickle.dump({'Image': images, 'Label': labels}, dst)

from multiprocessing import Pool

# Reformat every batch file in parallel. map() blocks until all files are
# processed; leaving the with-block then shuts the worker processes down
# instead of keeping them alive for the rest of the session.
# rewrite() returns nothing, so `data` is a list of None.
with Pool() as p:
    data = p.map(rewrite, os.listdir(path))

# Randomizing and standardizing the formatted data

All files will be loaded into memory, then they will be shuffled, standardized and stored again into the same data structure.

In [1]:
import pickle
import gzip
import numpy as np
import os
import random
from multiprocessing import Pool

# Path to the directory holding the preprocessed batch files. It is read by
# load_data() and listed to determine which files to load and how many
# output files to write.
path = '/fhgfs/users/jbehnken/01_Data/99_Temporary'


# Load pickled data and split it into pictures and labels
def load_data(file):
    '''
    Read one gzip-compressed pickle from `path` and return its contents as
    the tuple (pictures, labels), taken from the keys 'Image' and 'Label'.
    '''
    with gzip.open(path + '/' + file, 'rb') as handle:
        batch = pickle.load(handle)
    return (batch['Image'], batch['Label'])

# Pool-load pickled data and split it into pictures and labels (list).
# The with-block shuts the worker processes down once map() has returned.
with Pool() as p:
    data = p.map(load_data, os.listdir(path))
pics, labs = zip(*data)
del data, p

# Concatenate the data to a single np.array
pic = np.concatenate(pics)
lab = np.concatenate(labs)
del pics, labs


# Values to standardize the data (computed over all pictures at once)
mean = np.mean(pic)
std = np.std(pic)
print(mean, std)


# Shuffle pictures and labels with the same random permutation so that every
# picture keeps its label. No train/validation/test split is done here.
p = np.random.permutation(len(pic))
all_pics = pic[p]
all_labels = lab[p]
del p, pic, lab

def save_data(i):
    '''
    Write the i-th block of 1000 shuffled events to its own file.

    i is 1-based: block i covers the indices (i-1)*1000 up to (not
    including) i*1000 of all_pics / all_labels. The pictures are
    standardized with the global mean and std before being stored; the
    labels are stored unchanged.
    '''
    block = slice((i - 1) * 1000, i * 1000)
    standardized = (all_pics[block] - mean) / std
    labels_batch = all_labels[block]

    target = '/fhgfs/users/jbehnken/01_Data/01_MC_Data/PhotonArrivals_500ps_{}.p'.format(i)
    with gzip.open(target, 'wb') as out:
        pickle.dump({'Image': standardized, 'Label': labels_batch}, out)
        
# Write as many output files as there are input files; save_data() takes a
# 1-based file index. The with-block shuts the workers down after map() returns.
# save_data() returns nothing, so `data` is a list of None.
num_files = len(os.listdir(path))
with Pool() as p:
    data = p.map(save_data, range(1, num_files + 1))

1.24904 2.36506


mean: 1.24904

std: 2.36506

# Result

The 2,422,000 events are standardized, randomized and saved to disk in files containing 1000 events each.

In [2]:
import pandas as pd
import os

# Metadata table written during preprocessing
file_path = '01_File_event_count.csv'
df = pd.read_csv(file_path)

# Recursively list every file below main_path whose name contains '.json'
main_path = '/net/big-tank/POOL/projects/fact/simulation/photon_stream/fact_tools/v.0.18.0/'
file_paths = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk(os.path.expanduser(main_path))
    for name in names
    if '.json' in name
]
        

In [8]:
# Show the first row of the metadata table
df.head(1)

Unnamed: 0,File_name,Event_count,Particle
0,/net/big-tank/POOL/projects/fact/simulation/ph...,1322,False


In [9]:
# Show the first path found by the directory walk
file_paths[0]

'/net/big-tank/POOL/projects/fact/simulation/photon_stream/fact_tools/v.0.18.0/gamma_gustav_12/output_spe_extractor_mc_39.json'