# Data extraction notebook

This notebook is used to extract the data from the Ohio runs, as well as the data generated by the Geant and Gate simulations.
***

# Ohio Processing

In [2]:
import numpy as np, os
import matplotlib.pyplot as plt
import pandas as pd
from my_classes import load_data_ohio
from datetime import datetime
from tabulate import tabulate

### Creates a list of file paths of the original Ohio data
- Only `.txt` files are kept. A file is not added to the list if `help` or `22` appears in its name, since help.txt is not a data file and run 22 is garbage.

In [2]:
# Directory holding the raw Ohio run files
run_path = 'ohio_data/runs/'

# Keep every .txt entry whose name contains neither 'help' nor '22'
run_paths = [
    os.path.join(run_path, entry)
    for entry in os.listdir(run_path)
    if entry.endswith('.txt') and 'help' not in entry and '22' not in entry
]

### Creates a dictionary holding each file and the energy deposition of each channel

- Some values are negative, which doesn't make sense in this context since they are measuring light exposure through coulombs. Because of these random negative values, I translate everything up by the magnitude of the most negative number (i.e. subtract the most negative number from everything) before summing each channel.

In [3]:
# Maps each run file name to a {channel column name: summed, shifted edep} dict
file_edeps = {}
for file_path in run_paths:
    file_name = os.path.basename(file_path)

    # The table is tab separated and starts after the first 19 lines of the file
    df = pd.read_csv(file_path, delimiter='\t', skiprows=19)

    # Select the channel COLUMNS: everything except the first 5 and last 2
    # columns (presumably non-channel fields -- TODO confirm against a run file).
    # A plain df[5:-2] slices ROWS instead, which made the minimum below cover
    # different data than the columns that are actually summed.
    channels = df.iloc[:, 5:-2]

    # .min() gives one minimum per column; the second .min() reduces those to
    # the single smallest reading over all channels
    overall_min = channels.min().min()

    # Shift every reading up by |overall_min|, then total each channel.
    # NOTE(review): if no reading is negative this still shifts by the smallest
    # positive value -- confirm that is acceptable.
    file_edeps[file_name] = (channels + abs(overall_min)).sum().to_dict()

### Saves the channel and the corresponding energy deposition in a file with the same name as the original data

In [30]:
# One output file per run, named after the original data file
for run_name, channel_totals in file_edeps.items():
    out_path = f'ohio_data/extracted_data/{run_name}'
    with open(out_path, 'w') as fhandle:
        # Header: provenance note followed by the column titles
        print(f'''This file was gnenerated on {str(datetime.today())} by Grant Finneman. This contains the data colleted by\nAlex in Ohio. The only processing that has been done so far is summing the edep of each channel.''', file=fhandle)
        print('Channel edep', file=fhandle)

        # One row per channel: name with spaces removed (padded to 5 chars),
        # then the summed edep with 5 decimals
        for channel, edep in channel_totals.items():
            label = channel.replace(' ', '')
            print(f"{label:5s} {edep:.5f}", file=fhandle)

***

# Geant Processing

- Generating list of cube and bar files

In [53]:
# Directory holding the Geant output files
data_dir = 'geant_data/dosimeterDataMMRotated_Completed_1_19_2020/'

# Sorted paths of the .txt files whose names contain neither 'Training' nor 'All'
filenames = sorted(
    os.path.join(data_dir, entry)
    for entry in os.listdir(data_dir)
    if entry.endswith('.txt') and 'Training' not in entry and 'All' not in entry
)

# Split by geometry keyword found in the path
cube_filenames = list(filter(lambda p: 'Cube' in p, filenames))
bar_filenames = list(filter(lambda p: 'Bars' in p, filenames))

- Generates two dictionaries, one for the cube files and one for the bar files, each mapping a file name to its list of edep values

In [54]:
def _load_edeps(paths):
    """Read each whitespace-separated file in `paths` and collect its edep column.

    paths: iterable of file paths; each file is parsed as four columns
           named x_dim, y_dim, z_dim and edep (no header row).

    Returns a dict mapping the file's base name to the list of its edep values.
    """
    edeps = {}
    for path in paths:
        # sep=r'\s+' is the documented replacement for the deprecated
        # delim_whitespace=True
        df = pd.read_csv(path, sep=r'\s+', names=['x_dim', 'y_dim', 'z_dim', 'edep'])
        edeps[os.path.basename(path)] = df['edep'].tolist()
    return edeps


# The same loader serves both geometries; only the list of input files differs
cube_edeps = _load_edeps(cube_filenames)
bar_edeps = _load_edeps(bar_filenames)

- Writes the bar edep and cube edep to their extraction files
- Not much processing is required here since there are no weird values

In [55]:
def _write_edeps(edeps, out_dir, id_label):
    """Write one '<stem>_extracted.txt' file per entry of `edeps` into `out_dir`.

    edeps:    dict mapping a source file name to its sequence of edep values
    out_dir:  directory the extracted files are written to (open() does not
              create it, so it must already exist)
    id_label: title of the first column, e.g. 'cubeID' or 'barID'

    Each file holds a '<id_label> edep' header line followed by one
    '<index> <edep>' row per value, with edep printed to 5 decimals.
    """
    for filename, data in edeps.items():
        # splitext removes only the extension. str.rstrip(".txt") strips any
        # trailing run of the characters '.', 't' and 'x', so it could also
        # eat the end of the actual name (e.g. 'test.txt' -> 'tes').
        stem = os.path.splitext(filename)[0]
        new_name = f'{stem}_extracted.txt'
        with open(os.path.join(out_dir, new_name), 'w') as file:
            print(f'{id_label} edep', file=file)
            for ID, edep in enumerate(data):
                print(f'{ID} {edep:.5f}', file=file)


extracted_cubes = 'geant_data/extracted_data/cubes/'
_write_edeps(cube_edeps, extracted_cubes, 'cubeID')

extracted_bars = 'geant_data/extracted_data/bars/'
_write_edeps(bar_edeps, extracted_bars, 'barID')

***

# Gate Simulation Data Extraction

## Bar Extraction

In [30]:
# Directory holding the Gate bar-simulation output
hit_dir = 'gate_simulations/bar_simulation/output/'

# Sorted paths of every entry in hit_dir whose name contains 'Hits'
paths = sorted(
    os.path.join(hit_dir, entry)
    for entry in os.listdir(hit_dir)
    if 'Hits' in entry
)

# Column labels col1..col23, with the 8th and 9th columns renamed to the
# two columns of interest
names = [f'col{num}' for num in range(1, 24)]
names[7:9] = ['id', 'edep']

In [None]:
# For every Hits file: total the edep per id and write the totals to a text file
for path in paths:
    # Remove the literal 'Hits.dat' suffix. str.rstrip('Hits.dat') strips any
    # trailing run of the characters H, i, t, s, '.', d, a, which only gives
    # the right result while the character before the suffix is not one of them.
    suffix = 'Hits.dat'
    base_name = os.path.basename(path)
    f_name = base_name[:-len(suffix)] if base_name.endswith(suffix) else base_name

    # Initializes the dictionary with keys from 0-63 and values of 0
    file_edep = dict.fromkeys(list(range(64)), 0)

    # Loads the data from the current Hits file; sep=r'\s+' is the documented
    # replacement for the deprecated delim_whitespace=True
    df = pd.read_csv(path, sep=r'\s+', names=names)

    # Running total of edep per id; an id outside 0-63 raises KeyError
    for id_num, edep in zip(df['id'], df['edep']):
        file_edep[id_num] += edep

    with open(f'geant_data/extracted_data/grant_gate/bars/{f_name}extracted.txt', 'w') as fhandle:
        print(f'This file was gnenerated on {str(datetime.today())} by Grant Finneman. This contains the extracted data from the Gate simulations\nThe edep for each bar was summed up and printed to this file. The only math operation done has been addition.\n', file=fhandle)
        # Two-column plain-text table with 'id' and 'edep' headers
        print(tabulate(file_edep.items(), headers=['id', 'edep'], showindex=False, tablefmt='plain'), file=fhandle)
        

gate_simulations/bar_simulation/output/W05__H05__A000_bars_Hits.dat
gate_simulations/bar_simulation/output/W05__H05__A020_bars_Hits.dat
gate_simulations/bar_simulation/output/W05__H05__A040_bars_Hits.dat
gate_simulations/bar_simulation/output/W05__H05__A060_bars_Hits.dat
gate_simulations/bar_simulation/output/W05__H05__A080_bars_Hits.dat
gate_simulations/bar_simulation/output/W05__H05__A100_bars_Hits.dat
gate_simulations/bar_simulation/output/W05__H05__A120_bars_Hits.dat
gate_simulations/bar_simulation/output/W05__H05__A140_bars_Hits.dat
gate_simulations/bar_simulation/output/W05__H05__A160_bars_Hits.dat
gate_simulations/bar_simulation/output/W05__H05__A180_bars_Hits.dat
gate_simulations/bar_simulation/output/W05__H05__A200_bars_Hits.dat
gate_simulations/bar_simulation/output/W05__H05__A220_bars_Hits.dat
gate_simulations/bar_simulation/output/W05__H05__A240_bars_Hits.dat
gate_simulations/bar_simulation/output/W05__H05__A260_bars_Hits.dat
gate_simulations/bar_simulation/output/W05__H05_

- Iterates over all of the paths in the paths list and loads the data from each into a pandas dataframe, using the list of column names made above. Using the ability to access columns in pandas, I iterate over the id and edep columns and keep a running total of the edep for each bar with a dictionary. Still within the paths loop, I open an output file whose name is derived from the original file name; f-strings make this easy. I write a short description, then the `id` and `edep` column headers followed by the individual rows, formatted with `tabulate`.

***