In [43]:
import uproot
import pandas as pd
import gc
from sklearn.preprocessing import MinMaxScaler


In [57]:
# Path of the ROOT file that is read below.
ROOT_INPUT_PATH = 'input_root/341294_afp_hits.root'

# `file` and `tree` stay bound after this cell; `tree` is queried again later
# in the notebook for a raw preview of the charge branch.
file = uproot.open(ROOT_INPUT_PATH)
tree = file['TreeHits']
tree.show()  # prints the branch listing of the tree

# Read the selected branches into a pandas frame and keep a fresh copy of it.
# The frame returned by uproot is only a temporary here, so it is released as
# soon as the copy exists; gc.collect() then reclaims whatever is left over.
dataset = tree.arrays(
    [
        'hits', 'hits_row', 'hits_col', 'hits_q',
        'timestamp', 'bcid', 'lmiBl', 'mu',
    ],
    library='pd',
    entry_stop=None,
).copy()
gc.collect()


# number of hits
# Hit counts per side: the `hits[...]` columns whose first index is 0 or 1 are
# summed into `left_hits_n`, those whose first index is 2 or 3 into
# `right_hits_n`.
left_counts = dataset.filter(regex='^hits\\[[01]', axis=1)
right_counts = dataset.filter(regex='^hits\\[[23]', axis=1)
dataset['left_hits_n'] = left_counts.sum(axis=1)
dataset['right_hits_n'] = right_counts.sum(axis=1)

# The individual `hits[...]` counters are not needed once the totals exist.
raw_count_columns = dataset.filter(regex='^hits\\[', axis=1).columns
dataset.drop(columns=raw_count_columns, inplace=True)
del left_counts, right_counts, raw_count_columns

#average coordinates
def _positive_charges(frame, pattern):
    """Return the charge columns of `frame` matching `pattern`, usable as weights.

    Entries that are not strictly positive are replaced by 0 so that they
    contribute nothing to a weighted mean.
    """
    charges = frame.filter(regex=pattern, axis=1)
    return charges.where(charges > 0.0, 0)


def _charge_weighted_mean(coords, weights):
    """Return the row-wise mean of `coords` weighted by `weights`.

    The two frames are combined positionally (`weights.values`), so their
    columns must be in the same slot order. Rows whose weights sum to zero
    yield NaN (0 / 0).
    """
    return (coords * weights.values).sum(axis=1) / weights.sum(axis=1)


# Raw-string patterns: '\[' inside a normal string literal is an invalid escape
# sequence that newer Python versions warn about.
weights_left = _positive_charges(dataset, r'^hits_q\[[01]')
weights_right = _positive_charges(dataset, r'^hits_q\[[23]')
dataset.drop(columns=dataset.filter(regex=r'^hits_q', axis=1).columns, inplace=True)

# Wide source columns are dropped from `dataset` as soon as they have been
# extracted, to keep peak memory down.
rows_left = dataset.filter(regex=r'^hits_row\[[01]', axis=1)
rows_right = dataset.filter(regex=r'^hits_row\[[23]', axis=1)
dataset.drop(columns=dataset.filter(regex=r'^hits_row', axis=1).columns, inplace=True)
# NOTE(review): the recorded output of this cell shows averages of order 1e8,
# including for rows whose hit count is 0. That suggests some slots carry a
# positive charge together with a meaningless coordinate -- TODO confirm and,
# if so, mask slots by the per-plane hit count before averaging.
dataset['left_hit_row'] = _charge_weighted_mean(rows_left, weights_left)
dataset['right_hit_row'] = _charge_weighted_mean(rows_right, weights_right)
del rows_left, rows_right
gc.collect()

columns_left = dataset.filter(regex=r'^hits_col\[[01]', axis=1)
columns_right = dataset.filter(regex=r'^hits_col\[[23]', axis=1)
dataset.drop(columns=dataset.filter(regex=r'^hits_col', axis=1).columns, inplace=True)
dataset['left_hit_column'] = _charge_weighted_mean(columns_left, weights_left)
dataset['right_hit_column'] = _charge_weighted_mean(columns_right, weights_right)
del columns_left, columns_right
del weights_left, weights_right
gc.collect()

# Summarise the frame after feature extraction: shape, then first and last rows.
print("- - - Raw extracted data - - \n-", dataset.shape)
print(dataset.head(), dataset.tail(), sep='\n')

name                 | typename                 | interpretation                
---------------------+--------------------------+-------------------------------
evN                  | int32_t                  | AsDtype('>i4')
lmiBl                | int32_t                  | AsDtype('>i4')
mu                   | float                    | AsDtype('>f4')
timestamp            | uint32_t                 | AsDtype('>u4')
bcid                 | uint32_t                 | AsDtype('>u4')
hits                 | int32_t[4][4]            | AsDtype("('>i4', (4, 4))")
hits_row             | int32_t[4][4][100]       | AsDtype("('>i4', (4, 4, 100...
hits_col             | int32_t[4][4][100]       | AsDtype("('>i4', (4, 4, 100...
hits_q               | float[4][4][100]         | AsDtype("('>f4', (4, 4, 100...


  out[name] = series[name]


- - - Raw extracted data - - 
- (18697, 10)
    timestamp  bcid  lmiBl        mu  left_hits_n  right_hits_n  left_hit_row  \
0  1511286284  2801    157  2.327614            0            13 -5.383514e+08   
1  1511286290  2180    157  1.908377            0            11 -5.383514e+08   
2  1511286291  2357    157  1.598215           16             1 -5.352135e+08   
3  1511286301  3161    157  1.757314            0            10 -5.383514e+08   
4  1511286304  2215    157  2.355958            0            11 -5.383514e+08   

   right_hit_row  left_hit_column  right_hit_column  
0   6.127666e+07    -3.751467e+08      7.680988e+08  
1   6.062978e+07    -3.746810e+08      7.680988e+08  
2   6.061985e+07    -3.766279e+08      7.680988e+08  
3   6.126405e+07    -3.751468e+08      7.680988e+08  
4   6.061997e+07    -3.752630e+08      7.680988e+08  
        timestamp  bcid  lmiBl        mu  left_hits_n  right_hits_n  \
18692  1511290367   389    226  1.953009            0           128   
186

In [58]:
# TODO add pipeline

# - - - temporary: kept as a separate cell because this is not the most
# memory-efficient solution - - -
# Note: this cell consumes the left_*/right_* columns of `dataset` and rebinds
# `dataset` to the stacked frame, so it cannot be re-run on its own; re-run the
# extraction cell first.

# Left-side measurements, renamed to the side-agnostic column names.
buffor = (
    dataset
    .drop(columns=['right_hits_n', 'right_hit_row', 'right_hit_column'])
    .rename(columns={'left_hits_n': 'hits_n', 'left_hit_row': 'hit_row', 'left_hit_column': 'hit_column'})
)

# Right-side measurements, renamed the same way.
right_side = (
    dataset
    .drop(columns=['left_hits_n', 'left_hit_row', 'left_hit_column'])
    .rename(columns={'right_hits_n': 'hits_n', 'right_hit_row': 'hit_row', 'right_hit_column': 'hit_column'})
)

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames the
# same way: right-side rows first, then left-side rows. The original index
# labels are kept, so each label can appear twice.
dataset = pd.concat([right_side, buffor])
del right_side

# Keep only rows with at least one hit and fewer than 100 hits.
dataset = dataset[dataset['hits_n'] != 0]
dataset = dataset[dataset['hits_n'] < 100]

print("- - - After appending right and left sides - - \n-", dataset.shape)
print(dataset.head())
print(dataset.tail())

- - - After appending right and left sides - - 
- (19275, 7)
    timestamp  bcid  lmiBl        mu  hits_n       hit_row    hit_column
0  1511286284  2801    157  2.327614      13  6.127666e+07  7.680988e+08
1  1511286290  2180    157  1.908377      11  6.062978e+07  7.680988e+08
2  1511286291  2357    157  1.598215       1  6.061985e+07  7.680988e+08
3  1511286301  3161    157  1.757314      10  6.126405e+07  7.680988e+08
4  1511286304  2215    157  2.355958      11  6.061997e+07  7.680988e+08
        timestamp  bcid  lmiBl        mu  hits_n   hit_row    hit_column
18683  1511290266  1894    225  2.594671       9 -1.000000 -9.151355e+08
18688  1511290276  1323    225  2.235327      11  8.761513 -6.242398e+08
18689  1511290277  1399    225  2.009219      13  3.601856 -7.693728e+08
18690  1511290325   451    226  1.833004      34  3.525091 -6.400684e+08
18695  1511291383  1356    243  2.145624      10 -1.000000 -9.151355e+08


In [59]:
# NORMALIZATION

# Fit a min-max scaler on every column of `dataset` and transform it.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(dataset)

# Build a new frame from the scaled values instead of assigning them through
# `dataset.loc[:, :]`. `dataset` is a filtered selection with integer columns,
# so the in-place assignment depended on pandas silently upcasting those
# columns to float, which newer pandas versions deprecate. Index labels and
# column names are carried over unchanged.
dataset = pd.DataFrame(scaled_values, index=dataset.index, columns=dataset.columns)


print("- - - After normalization - - \n-", dataset.shape)
print(dataset.head())
print(dataset.tail())

- - - After normalization - - 
- (19275, 7)
   timestamp      bcid  lmiBl        mu    hits_n   hit_row  hit_column
0   0.000000  0.853943    0.0  0.248407  0.122449  0.523637    0.936787
1   0.001145  0.658044    0.0  0.171115  0.102041  0.523161    0.936787
2   0.001336  0.713880    0.0  0.113933  0.000000  0.523154    0.936787
3   0.003246  0.967508    0.0  0.143265  0.091837  0.523628    0.936787
4   0.003818  0.669085    0.0  0.253632  0.102041  0.523154    0.936787
       timestamp      bcid     lmiBl        mu    hits_n   hit_row  \
18683   0.760214  0.567823  0.772727  0.297642  0.081633  0.478548   
18688   0.762123  0.387697  0.772727  0.231392  0.102041  0.478548   
18689   0.762314  0.411672  0.772727  0.189707  0.122449  0.478548   
18690   0.771478  0.112618  0.784091  0.157219  0.336735  0.478548   
18695   0.973463  0.398107  0.977273  0.214854  0.091837  0.478548   

         hit_column  
18683  3.375078e-14  
18688  1.618951e-01  
18689  8.112280e-02  
18690  1.530858

In [60]:
# Raw look at the `hits_q` branch, read with entry_stop=1000.
# The display limits are widened first; these options stay set for later cells.
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 100)
preview = tree.arrays(['hits_q'], library='pd', entry_stop = 1000)
print(preview)

  out[name] = series[name]


     hits_q[0][0][0]  hits_q[0][0][1]  hits_q[0][0][2]  hits_q[0][0][3]  \
0         -1000001.0       -1000001.0       -1000001.0       -1000001.0   
1         -1000001.0       -1000001.0       -1000001.0       -1000001.0   
2            11359.0       -1000001.0       -1000001.0       -1000001.0   
3         -1000001.0       -1000001.0       -1000001.0       -1000001.0   
4         -1000001.0       -1000001.0       -1000001.0       -1000001.0   
5            11359.0           7249.0       -1000001.0       -1000001.0   
6         -1000001.0       -1000001.0       -1000001.0       -1000001.0   
7         -1000001.0       -1000001.0       -1000001.0       -1000001.0   
8             5617.0           4267.0           4267.0           4267.0   
9            19639.0           5617.0           3199.0           5617.0   
10        -1000001.0       -1000001.0       -1000001.0       -1000001.0   
11            9163.0       -1000001.0       -1000001.0       -1000001.0   
12            5617.0     