# Data extraction notebook

This notebook extracts the data from the Ohio runs and, potentially, the data generated by Geant.
***

In [1]:
import numpy as np, os
import matplotlib.pyplot as plt
import pandas as pd
from my_classes import load_data_ohio
from datetime import datetime
from tabulate import tabulate
from my_classes import *
from glob import glob
from collections import defaultdict

***

# Ohio Processing

### Creates a list of file paths of the original ohio data
- A file is skipped if its name contains `help` or `22`: `help.txt` is not a data file and run 22 is garbage. Note that this is a substring check, so any other file whose name contains `22` is skipped as well.

In [2]:
run_path = 'ohio_data/runs/'

# Paths of the .txt run files, skipping any name that contains 'help' or '22'.
# os.listdir returns entries in arbitrary order, so the result is sorted to give
# a reproducible processing order (the Gate cells below sort their globs too).
run_paths = sorted(
    os.path.join(run_path, file)
    for file in os.listdir(run_path)
    if file.endswith('.txt') and 'help' not in file and '22' not in file
)

### Creates a dictionary holding each file and the energy deposition of each channel

- Some values are negative, which doesn't make sense in this context since they are measuring light exposure through coulombs. Because of these random negative values, I translate everything up by the magnitude of the most negative number (i.e. subtract the most negative number from every value).

In [3]:
# Maps each run file name to a {channel column: summed reading} dictionary.
file_edeps = {}
for file_path in run_paths:
    file_name = os.path.basename(file_path)
    df = pd.read_csv(file_path, delimiter='\t', skiprows=19)

    # Channel readings are the columns between the first five and the last two.
    # NOTE: df[5:-2] would slice ROWS, not columns, so select by position with iloc.
    channels = df.iloc[:, 5:-2]

    # .min() gives one minimum per column; the second .min() reduces those to a scalar
    overall_min = channels.min().min()

    # Shift up only when there really is a negative reading; a file whose minimum
    # is already non-negative is left untouched instead of being raised by abs(min).
    offset = -overall_min if overall_min < 0 else 0

    # skipna=False keeps a missing reading visible as NaN in the total,
    # like a plain Python sum over the column would.
    channel_edep = {
        col: (channels[col] + offset).sum(skipna=False)
        for col in channels.columns
    }

    file_edeps[file_name] = channel_edep

### Saves the channel and the corresponding energy deposition in a file with the same name as the original data

In [30]:
extracted_dir = 'ohio_data/extracted_data'

# Create the output directory up front so the cell also works on a fresh checkout.
os.makedirs(extracted_dir, exist_ok=True)

for file, edep_by_channel in file_edeps.items():
    # One output file per run, named exactly like the original data file
    with open(f'{extracted_dir}/{file}', 'w') as fhandle:
        # Two header lines of provenance followed by the column titles
        print(f'''This file was gnenerated on {str(datetime.today())} by Grant Finneman. This contains the data colleted by\nAlex in Ohio. The only processing that has been done so far is summing the edep of each channel.''', file=fhandle)
        print('Channel edep', file=fhandle)

        for channel, edep in edep_by_channel.items():
            # Spaces are removed from the channel label so each row has exactly two fields
            print(f"{channel.replace(' ',''):5s} {edep:.5f}", file=fhandle)

***

# Gate Simulation Data Extraction

I generalized the simulation extraction to an arbitrary number of volumes of the simulation so I could use a single function on both cube and bar simulations. It takes in a list of data files ready for extraction. It will iterate through them and use pandas to parse the text files. Since they both have volumes starting at number $0$ I don't need to pre-define the size of the dictionary. I use a dictionary since it's pretty easy to use them for keeping running totals.

In [2]:
def processGateVolumes(hit_paths, output_dir):
    '''Sum the energy deposition per volume for each Gate hits file.

    Every file in ``hit_paths`` is read as a whitespace-separated table,
    the ``edep`` values (9th column) are totalled per volume ``id``
    (8th column), and the totals are written, ordered by id, to
    ``<output_dir>/<name>extracted.txt`` where ``<name>`` is the input
    file name with a trailing ``Hits.dat`` removed.

    Params
    ------
    hit_paths : [list] A list of paths to the Hits.dat files to be processed

    output_dir : [str] Path pointing to the directory where the extracted
    files will be placed. It is created if it does not exist, and it may
    or may not end with a '/'.

    Returns
    -------
    None
    '''
    suffix = 'Hits.dat'

    # Only the 8th and 9th columns are used; the others get placeholder names
    names = [f'col{num}' for num in range(1, 24)]
    names[7] = 'id'
    names[8] = 'edep'

    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    for path in hit_paths:
        # Drop the literal suffix 'Hits.dat' from the file name.
        # str.rstrip('Hits.dat') would instead strip ANY trailing run of the
        # characters H, i, t, s, ., d, a (e.g. 'outputHits.dat' -> 'outpu').
        f_name = os.path.basename(path)
        if f_name.endswith(suffix):
            f_name = f_name[:-len(suffix)]

        # sep=r'\s+' is the supported spelling of the deprecated delim_whitespace=True
        df = pd.read_csv(path, sep=r'\s+', names=names)

        # Total edep per volume id; groupby returns the groups sorted by id
        volume_edep = df.groupby('id', sort=True)['edep'].sum()

        out_path = os.path.join(output_dir, f'{f_name}extracted.txt')
        with open(out_path, 'w') as fhandle:
            print(f'This file was gnenerated on {str(datetime.today())} by Grant Finneman. This contains the extracted data from the Gate simulations\nThe edep for each bar was summed up and printed to this file. The only math operation done has been addition.\n', file=fhandle)
            print(tabulate(list(volume_edep.items()), headers=['id', 'edep'], showindex=False, tablefmt='plain'), file=fhandle)

## Bar Extraction

In [4]:
hit_dir = 'gate_simulations/bar_simulation/output/'

# Every bar-simulation output file whose name ends in Hits.dat, in sorted order
bar_paths = sorted(glob(f'{hit_dir}*Hits.dat'))

# processGateVolumes documents that output_dir should not end with a '/',
# so the directory is passed without the trailing slash.
processGateVolumes(hit_paths=bar_paths, output_dir='gate_data/bars')

## Cube Extraction

In [5]:
hit_dir = 'gate_simulations/cube_simulation/output/'

# Every cube-simulation output file whose name ends in Hits.dat, in sorted order
cube_paths = sorted(glob(f'{hit_dir}*Hits.dat'))

# processGateVolumes documents that output_dir should not end with a '/',
# so the directory is passed without the trailing slash.
processGateVolumes(hit_paths=cube_paths, output_dir='gate_data/cubes')

***

# Geant Processing
**This section is now obsolete and not needed but here for illustration purposes**

- Generating list of cube and bar files

```python
data_dir = 'geant_data/dosimeterDataMMRotated_Completed_1_19_2020/'

# Sorted paths of the .txt files, leaving out names containing 'Training' or 'All'
filenames = sorted(
    os.path.join(data_dir, entry)
    for entry in os.listdir(data_dir)
    if entry.endswith('.txt') and 'Training' not in entry and 'All' not in entry
)

# Split the paths by the geometry marker that appears in each one
cube_filenames = [p for p in filenames if 'Cube' in p]
bar_filenames = [p for p in filenames if 'Bars' in p]
```

- Generates dictionaries of bar files with edep array, cube files and edep array

```python
def _read_edeps(paths):
    """Map each file's base name to the list of values in its 'edep' column."""
    columns = ['x_dim', 'y_dim', 'z_dim', 'edep']
    edeps = {}
    for path in paths:
        # sep=r'\s+' is the supported spelling of the deprecated delim_whitespace=True
        df = pd.read_csv(path, sep=r'\s+', names=columns)
        edeps[os.path.basename(path)] = list(df['edep'])
    return edeps

# Cubes and bars share the same four-column layout, so one reader serves both
cube_edeps = _read_edeps(cube_filenames)
bar_edeps = _read_edeps(bar_filenames)
```

- Writes the bar edep and cube edep to their extraction files
- Not much processing is required here since there are no weird values

```python
def _write_edeps(edeps, out_dir, header):
    """Write one '<stem>_extracted.txt' per entry: a header line, then 'index edep' rows."""
    for filename, data in edeps.items():
        # splitext removes only the extension; str.rstrip(".txt") strips any trailing
        # run of '.', 't' and 'x' characters and can eat into the name itself
        # (e.g. 'Cube_test.txt' -> 'Cube_tes').
        stem = os.path.splitext(filename)[0]
        new_name = f'{stem}_extracted.txt'
        with open(os.path.join(out_dir, new_name), 'w') as file:
            print(header, file=file)
            for ID, edep in enumerate(data):
                print(f'{ID} {edep:.5f}', file=file)

extracted_cubes = 'geant_data/extracted_data/cubes/'
_write_edeps(cube_edeps, extracted_cubes, 'cubeID edep')

extracted_bars = 'geant_data/extracted_data/bars/'
_write_edeps(bar_edeps, extracted_bars, 'barID edep')
```

***