In [24]:
import gc
from pathlib import Path

import pandas as pd
import uproot
from sklearn.preprocessing import MinMaxScaler

In [25]:
ROOT_FILE_NAME = '341294_afp_hits'
# Built from ROOT_FILE_NAME so the input path and the output pickle name cannot drift apart.
ROOT_INPUT_PATH = 'input_root/' + ROOT_FILE_NAME + '.root'
ENTRY_LIMIT = 100

# First array index (station) used for each side. Side A uses stations 0-1 and
# side C uses stations 2-3. "Hit order" 1 is the first station crossed, which
# for side A is station 1 followed by station 0.
SIDE_STATIONS = {'a': '01', 'c': '23'}
STATION_BY_HIT_ORDER = {
    1: {'a': 1, 'c': 2},
    2: {'a': 0, 'c': 3},
}


def station_columns(frame, branch, stations):
    """Select the flattened columns of ``branch`` belonging to the given stations.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame whose columns are named like ``branch[i][j]...``.
    branch : str
        Branch name prefix, e.g. ``'hits_row'``.
    stations : str
        Digits of the accepted first index, e.g. ``'01'`` or ``'3'``.

    Returns
    -------
    pandas.DataFrame
        The matching columns, in their original order.
    """
    # Raw string: '\[' in a normal string literal is an invalid escape sequence.
    return frame.filter(regex=rf'^{branch}\[[{stations}]\]', axis=1)


def positive_charges(frame, station):
    """Return the ``hits_q`` columns of one station with non-positive entries set to 0."""
    charges = station_columns(frame, 'hits_q', str(station))
    return charges.where(charges > 0.0, 0)


def charge_weighted_mean(values, weights):
    """Row-wise weighted mean of ``values`` using ``weights``.

    The two frames are combined positionally (column k with column k), so they
    must have the same number of columns in the same order. Rows whose weights
    sum to zero yield NaN (0 / 0).
    """
    weighted_sum = (values * weights.values).sum(axis=1)
    return weighted_sum / weights.sum(axis=1)


file = uproot.open(ROOT_INPUT_PATH)
tree = file['TreeHits']
tree.show()
dataset = tree.arrays(
    ['hits', 'hits_row', 'hits_col', 'hits_q', 'timestamp', 'lmiBl', 'mu'],
    library='pd',
    entry_stop=ENTRY_LIMIT,
)
# Copy to get a consolidated (de-fragmented) frame before adding columns.
dataset = dataset.copy()

# Number of hits per side, then drop the raw per-plane counters.
for side, stations in SIDE_STATIONS.items():
    dataset[f'{side}_hits_n'] = station_columns(dataset, 'hits', stations).sum(axis=1)
dataset = dataset.drop(columns=dataset.filter(regex=r'^hits\[', axis=1).columns)

# Charge weights per (side, hit order); the raw charges are not needed afterwards.
weights = {
    (side, order): positive_charges(dataset, station)
    for order, stations in STATION_BY_HIT_ORDER.items()
    for side, station in stations.items()
}
dataset = dataset.drop(columns=dataset.filter(regex=r'^hits_q', axis=1).columns)

# Charge-weighted average coordinates. The loop order reproduces the column
# order a_*_1, c_*_1, a_*_2, c_*_2 for rows first, then columns.
for branch, label in (('hits_row', 'row'), ('hits_col', 'column')):
    for order, stations in STATION_BY_HIT_ORDER.items():
        for side, station in stations.items():
            coordinates = station_columns(dataset, branch, str(station))
            dataset[f'{side}_hit_{label}_{order}'] = charge_weighted_mean(
                coordinates, weights[(side, order)]
            )

del weights
gc.collect()

# STANDARD DEVIATION
# NOTE(review): this is taken over every slot of the side, including the -1
# values visible in the printed head for empty slots. Those look like padding;
# confirm whether they should be masked out before computing the spread.
for side, stations in SIDE_STATIONS.items():
    for label, branch in (('col', 'hits_col'), ('row', 'hits_row')):
        dataset[f'{side}_std_{label}'] = station_columns(dataset, branch, stations).std(axis=1)

print("- - - Extracted data - - \n-", dataset.shape)
print(dataset.head())
print(dataset.tail())

name                 | typename                 | interpretation                
---------------------+--------------------------+-------------------------------
evN                  | int32_t                  | AsDtype('>i4')
lmiBl                | int32_t                  | AsDtype('>i4')
mu                   | float                    | AsDtype('>f4')
timestamp            | uint32_t                 | AsDtype('>u4')
bcid                 | uint32_t                 | AsDtype('>u4')
hits                 | int32_t[4][4]            | AsDtype("('>i4', (4, 4))")
hits_row             | int32_t[4][4][100]       | AsDtype("('>i4', (4, 4, 100...
hits_col             | int32_t[4][4][100]       | AsDtype("('>i4', (4, 4, 100...
hits_q               | float[4][4][100]         | AsDtype("('>f4', (4, 4, 100...


  out[name] = series[name]


- - - Extracted data - - 
- (100, 3217)
   hits_row[0][0][0]  hits_row[0][0][1]  hits_row[0][0][2]  hits_row[0][0][3]  \
0                 -1                 -1                 -1                 -1   
1                 -1                 -1                 -1                 -1   
2                197                 -1                 -1                 -1   
3                 -1                 -1                 -1                 -1   
4                 -1                 -1                 -1                 -1   

   hits_row[0][0][4]  hits_row[0][0][5]  hits_row[0][0][6]  hits_row[0][0][7]  \
0                 -1                 -1                 -1                 -1   
1                 -1                 -1                 -1                 -1   
2                 -1                 -1                 -1                 -1   
3                 -1                 -1                 -1                 -1   
4                 -1                 -1                 -1          

In [26]:
from math import sqrt

# TODO add pipeline
MINIMUM_HIT_NUMBER = 1
MAXIMUM_HIT_NUMBER = 100

# Per-side feature suffix -> side-agnostic column name. The std columns get a
# leading underscore because they are temporary and dropped by the '^_' filter
# once std_distance has been computed.
SIDE_FEATURE_NAMES = {
    'hits_n': 'hits_n',
    'hit_row_1': 'hit_row_1',
    'hit_row_2': 'hit_row_2',
    'hit_column_1': 'hit_column_1',
    'hit_column_2': 'hit_column_2',
    'std_col': '_std_col',
    'std_row': '_std_row',
}


def single_side_view(frame, side, other_side):
    """Return ``frame`` reduced to one side, with that side's prefix removed.

    Every column whose name starts with ``other_side`` is dropped, and the
    ``<side>_<feature>`` columns are renamed according to SIDE_FEATURE_NAMES.
    The input frame is not modified.
    """
    other_columns = frame.filter(regex=f'^{other_side}', axis=1).columns
    renames = {f'{side}_{suffix}': name for suffix, name in SIDE_FEATURE_NAMES.items()}
    return frame.drop(columns=other_columns).rename(columns=renames)


# Each event contributes two rows: side C first, then side A. The original
# index labels are kept, so every label appears twice after stacking.
# Memory-wise this is not optimal: both views live alongside the source frame.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
dataset = pd.concat([
    single_side_view(dataset, 'c', 'a'),
    single_side_view(dataset, 'a', 'c'),
])

# Combine the two spreads in quadrature. Plain arrays are used so the
# duplicated index labels can never trigger label alignment.
std_col = dataset['_std_col'].to_numpy()
std_row = dataset['_std_row'].to_numpy()
dataset['std_distance'] = (std_col * std_col + std_row * std_row) ** 0.5

dataset = dataset.drop(columns=dataset.filter(regex='^_', axis=1).columns)

# Keep rows within the accepted hit multiplicity (both bounds inclusive).
# .copy() makes the result an independent frame rather than a slice.
dataset = dataset[dataset['hits_n'].between(MINIMUM_HIT_NUMBER, MAXIMUM_HIT_NUMBER)].copy()

print("- - - After appending c and a sides - - \n-", dataset.shape)
print(dataset.head())

- - - After appending c and a sides - - 
- (104, 3209)
   hits_row[0][0][0]  hits_row[0][0][1]  hits_row[0][0][2]  hits_row[0][0][3]  \
0                 -1                 -1                 -1                 -1   
1                 -1                 -1                 -1                 -1   
2                197                 -1                 -1                 -1   
3                 -1                 -1                 -1                 -1   
4                 -1                 -1                 -1                 -1   

   hits_row[0][0][4]  hits_row[0][0][5]  hits_row[0][0][6]  hits_row[0][0][7]  \
0                 -1                 -1                 -1                 -1   
1                 -1                 -1                 -1                 -1   
2                 -1                 -1                 -1                 -1   
3                 -1                 -1                 -1                 -1   
4                 -1                 -1              

In [27]:
# NORMALIZATION
# Every column is rescaled independently to [0, 1] with min-max scaling.

scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(dataset)
# Build a new float frame instead of writing through .loc[:, :]: assigning
# floats into the existing integer columns in place is deprecated in recent
# pandas. Index and column labels are carried over unchanged.
dataset = pd.DataFrame(scaled_values, index=dataset.index, columns=dataset.columns)


print("- - - After normalization - - \n-", dataset.shape)
print(dataset.head())
print(dataset.tail())

path = 'preprocessed_data/' + ROOT_FILE_NAME + '.pkl'
# to_pickle does not create missing directories, so make sure the target exists.
Path(path).parent.mkdir(parents=True, exist_ok=True)
dataset.to_pickle(path)

- - - After normalization - - 
- (104, 3209)
   hits_row[0][0][0]  hits_row[0][0][1]  hits_row[0][0][2]  hits_row[0][0][3]  \
0           0.000000                0.0                0.0                0.0   
1           0.000000                0.0                0.0                0.0   
2           0.594595                0.0                0.0                0.0   
3           0.000000                0.0                0.0                0.0   
4           0.000000                0.0                0.0                0.0   

   hits_row[0][0][4]  hits_row[0][0][5]  hits_row[0][0][6]  hits_row[0][0][7]  \
0                0.0                0.0                0.0                0.0   
1                0.0                0.0                0.0                0.0   
2                0.0                0.0                0.0                0.0   
3                0.0                0.0                0.0                0.0   
4                0.0                0.0                0.0     