# Data extraction notebook

This notebook will be used to extract the data from the Ohio runs and, potentially, the data generated by Geant.
***

# Ohio Processing

In [1]:
import numpy as np, os
import matplotlib.pyplot as plt
import pandas as pd
from my_classes import load_data_ohio
from datetime import datetime

### Creates a list of file paths of the original ohio data
- Files with `help` or `22` in the name are excluded: `help.txt` is not a data file, and run 22 is garbage

In [22]:
# Directory holding the original Ohio run files.
run_path = 'ohio_data/runs/'

# Full paths of the .txt files there, leaving out any whose name contains "help" or "22".
run_paths = [
    os.path.join(run_path, entry)
    for entry in os.listdir(run_path)
    if entry.endswith('.txt') and 'help' not in entry and '22' not in entry
]

### Creates a dictionary holding each file and the energy deposition of each channel

- Some values are negative, which doesn't make sense in this context since they are measuring light exposure in coulombs. Because of the random negative values, I translate everything up by the magnitude of the most negative number (i.e. subtract the most negative number from everything)

In [23]:
# Maps each run file name to {channel column name: summed, offset-corrected edep}.
file_edeps = {}
for file_path in run_paths:
    file_name = os.path.basename(file_path)
    # The first 19 lines of each file are skipped before the tab-separated table is parsed.
    df = pd.read_csv(file_path, delimiter='\t', skiprows=19)

    # The columns that are summed: everything between the first five and the last two.
    channel_cols = df.columns[5:-2]

    # Minimum over the channel columns only. Note that `df[5:-2]` slices ROWS (it drops the
    # first five and last two rows and keeps every column), so the columns are selected
    # explicitly here. The first .min() gives one minimum per column; the second collapses
    # those into a single scalar.
    overall_min = df[channel_cols].min().min()

    channel_edep = {}
    for col in channel_cols:
        # Shift every reading up by the magnitude of the overall minimum, then total it.
        channel_edep[col] = sum(df[col] + abs(overall_min))

    file_edeps[file_name] = channel_edep

### Saves the channel and the corresponding energy deposition in a file with the same name as the original data

In [30]:
# Write one extracted file per run, named after the original data file.
for run_name, channel_totals in file_edeps.items():
    out_path = f'ohio_data/extracted_data/{run_name}'
    with open(out_path, 'w') as fhandle:
        # Provenance note, then the column header.
        print(f'''This file was gnenerated on {str(datetime.today())} by Grant Finneman. This contains the data colleted by\nAlex in Ohio. The only processing that has been done so far is summing the edep of each channel.''', file=fhandle)
        print('Channel edep', file=fhandle)

        # One row per channel: the name with spaces removed (padded to 5 characters),
        # followed by the total to five decimal places.
        for channel in channel_totals:
            label = channel.replace(' ', '')
            print(f"{label:5s} {channel_totals[channel]:.5f}", file=fhandle)

***

# Geant Processing

- Generating list of cube and bar files

In [53]:
data_dir = 'geant_data/dosimeterDataMMRotated_Completed_1_19_2020/'

# Full paths of the .txt files in data_dir whose names contain neither "Training" nor "All",
# in sorted order.
filenames = [
    os.path.join(data_dir, entry)
    for entry in os.listdir(data_dir)
    if entry.endswith('.txt') and 'Training' not in entry and 'All' not in entry
]
filenames.sort()

# Split the list by the "Cube" / "Bars" marker appearing in each path.
cube_filenames = list(filter(lambda candidate: 'Cube' in candidate, filenames))
bar_filenames = list(filter(lambda candidate: 'Bars' in candidate, filenames))

- Generates two dictionaries, one for the cube files and one for the bar files, each mapping a file name to its list of edep values

In [54]:
def _load_edeps(paths):
    """Read each file in `paths` and return {file name: list of its edep values}.

    Every file is parsed as headerless, whitespace-separated columns named
    x_dim, y_dim, z_dim and edep. Only the edep column is kept, in file order,
    keyed by the file's base name.
    """
    edeps = {}
    for path in paths:
        # sep=r'\s+' is the documented replacement for the deprecated delim_whitespace=True.
        df = pd.read_csv(path, sep=r'\s+', names=['x_dim', 'y_dim', 'z_dim', 'edep'])
        edeps[os.path.basename(path)] = list(df['edep'])
    return edeps


# The cube and bar files share the same layout, so both go through the same loader.
cube_edeps = _load_edeps(cube_filenames)
bar_edeps = _load_edeps(bar_filenames)

- Writes the bar edep and cube edep to their extraction files
- Not much processing is required here since there are no weird values

In [55]:
def _write_edeps(edeps_by_file, out_dir, id_header):
    """Write one '<stem>_extracted.txt' file into `out_dir` per entry of `edeps_by_file`.

    edeps_by_file: dict mapping a source file name to its sequence of edep values.
    out_dir:       directory the extracted files are written to.
    id_header:     label for the first column of the header line (e.g. 'cubeID').

    Each output file holds the header line '<id_header> edep' followed by one
    '<index> <edep>' row per value, with edep written to five decimal places.
    """
    for filename, data in edeps_by_file.items():
        # splitext removes only the extension. str.rstrip(".txt") instead strips any trailing
        # run of the characters '.', 't' and 'x', which can eat the end of the stem
        # (e.g. "Cube_test.txt" would become "Cube_tes").
        stem = os.path.splitext(filename)[0]
        new_name = f'{stem}_extracted.txt'
        with open(os.path.join(out_dir, new_name), 'w') as file:
            print(f'{id_header} edep', file=file)
            for ID, edep in enumerate(data):
                print(f'{ID} {edep:.5f}', file=file)


extracted_cubes = 'geant_data/extracted_data/cubes/'
_write_edeps(cube_edeps, extracted_cubes, 'cubeID')

extracted_bars = 'geant_data/extracted_data/bars/'
_write_edeps(bar_edeps, extracted_bars, 'barID')

***

# Gate Simulation

In [36]:
hit_dir = 'gate_simulations/bar_simulation/8_8detector/output/'

# Full paths of every entry in hit_dir whose name contains "Hits". os.listdir returns entries
# in arbitrary order, so the list is sorted to make paths[0] the same file on every run.
paths = sorted(os.path.join(hit_dir, file) for file in os.listdir(hit_dir) if 'Hits' in file)

# Placeholder names col1..col23 for the columns, with the 8th and 9th renamed to the
# two columns that are actually used.
names = [f'col{num}' for num in range(1, 24)]
names[7] = 'id'
names[8] = 'edep'

In [37]:
# Load the first hits file as headerless, whitespace-separated columns labelled with `names`.
# sep=r'\s+' is the documented replacement for the deprecated delim_whitespace=True.
df = pd.read_csv(paths[0], sep=r'\s+', names=names)
df.head()

Unnamed: 0,col1,col2,col3,col4,col5,col6,col7,id,edep,col10,...,col14,col15,col16,col17,col18,col19,col20,col21,col22,col23
0,0,0,1,0,0,0,0,38,0.0001,0.0,...,0.3833,22,1,0,0,0,0,conv,,
1,0,0,1,0,0,0,0,38,0.0001,1.538,...,0.4721,-11,3,1,0,0,0,msc,,
2,0,0,1,0,0,0,0,38,0.0001,0.2712,...,0.4893,-11,3,1,0,0,0,Transportation,,
3,0,0,1,0,0,0,0,37,0.0001,0.0,...,0.4893,-11,3,1,0,0,0,msc,,
4,0,0,1,0,0,0,0,37,0.0001,0.1767,...,0.4891,-11,3,1,0,0,0,eBrem,,


In [38]:
# Running edep total for every id from 0 through 63, each starting at zero.
edep_dict = {key: 0 for key in range(64)}

# Walk the rows in order and add each row's edep to the total for that row's id.
hit_ids = df['id'].tolist()
hit_edeps = df['edep'].tolist()
for position, hit_id in enumerate(hit_ids):
    edep_dict[hit_id] = edep_dict[hit_id] + hit_edeps[position]

In [39]:
# Display the accumulated edep total for each id.
edep_dict

{0: 177.82450024705764,
 1: 123.4150001908587,
 2: 131.36410018513484,
 3: 107.04360018939694,
 4: 116.57480014725076,
 5: 121.68190013340048,
 6: 52.14180007258344,
 7: 42.898000061607306,
 8: 409.9827006069259,
 9: 477.49470065120624,
 10: 464.43870059740965,
 11: 317.8618003953642,
 12: 190.34680029003133,
 13: 168.13830022622773,
 14: 117.53570016084574,
 15: 116.38490016932643,
 16: 3069.9968038706106,
 17: 2951.3719037798114,
 18: 2601.8993031975547,
 19: 1584.5090019682711,
 20: 845.8039011354276,
 21: 535.2236007402117,
 22: 349.2636004719447,
 23: 217.94870029303266,
 24: 45457.80835803768,
 25: 51582.513063359474,
 26: 46586.59995633389,
 27: 41411.584848726,
 28: 34039.88493927323,
 29: 24813.76102757328,
 30: 14300.318715318337,
 31: 5647.283805953375,
 32: 46328.12645898382,
 33: 50190.627262285634,
 34: 49015.71475907381,
 35: 42652.24595016432,
 36: 35456.99054074965,
 37: 25107.606628496855,
 38: 14778.572016093794,
 39: 5474.148805896132,
 40: 3013.5338039329044,
 41: 