# Text Functions


The Sectionary package was originally designed to deal with text data files containing different sections of data in different formats.

In [1]:
#%% Imports
from itertools import chain
import pandas as pd
#import read_dvh_file
import text_reader as tp
from sections import Rule, RuleSet, ProcessingMethods


#### Standard Python Modules

In [2]:
from typing import List
from pathlib import Path
from pprint import pprint
import re
import sys

#### Useful Third Party Packages

In [3]:
import pandas as pd
import xlwings as xw

#### Sectionary Imports

In [4]:
#sys.path.append(r'../src/sectionary') 

import text_reader as tp
from sections import Rule, RuleSet, SectionBreak, ProcessingMethods, Section

In [5]:
# simple_cascading_iterators
def ml(x):
    '''Return the supplied value multiplied by ten.'''
    scaled = x * 10
    return scaled
def dv(x):
    '''Return the supplied value divided by five (true division).'''
    quotient = x / 5
    return quotient

def skip_odd(num_list):
    '''Generate only the even members of num_list, in their original order.'''
    for number in num_list:
        if number % 2:
            # A non-zero remainder means the value is odd: drop it.
            continue
        yield number

# Combine the three functions defined above into a single ProcessingMethods
# object and run it over the integers 0..4.  The recorded output shows the odd
# values being dropped and each remaining value multiplied by 10, then divided
# by 5.
source = range(5)
method_set = ProcessingMethods([skip_odd, ml, dv])
# The second argument to read() is an empty dict -- presumably the processing
# context; confirm against ProcessingMethods.read.
test_output = method_set.read(source, {})

test_output


[0.0, 4.0, 8.0]

In [6]:
# csv_parser
# Parse one comma-separated string in which the second field is quoted and
# itself contains a comma.
test_text = 'Part 1,"Part 2a, Part 2b"'
# expected_output is shown for reference only; this cell does not compare it
# with test_output.
expected_output = [['Part 1', 'Part 2a, Part 2b']]
test_parser = tp.define_csv_parser(name='Default csv')
# The parser is iterated to collect its rows into a list.
test_output = [row for row in test_parser(test_text)]
test_output


[['Part 1', 'Part 2a, Part 2b']]

In [7]:
# default_csv_parser
# Apply the default (comma delimited) csv parser to a list of text lines,
# one line at a time.
test_text = [
    'Export Version:,1',
    '================',
    '',
    'IMSure Version:,3.7.2',
    'Exported Date:,03.09.2020  14:20',
    'User:,Superuser',
    'Patient:,"____, ----"',
    'Patient ID:,0123456',
    ]
# expected_output is shown for reference only; it is never compared with
# test_output in this cell.
# NOTE(review): the output recorded for this cell has no ['================']
# row although expected_output lists one -- confirm which is correct.
expected_output = [
    ['Export Version:', '1'],
    ['================'],
    [],
    ['IMSure Version:', '3.7.2'],
    ['Exported Date:', '03.09.2020  14:20'],
    ['User:', 'Superuser'],
    ['Patient:', '____, ----'],
    ['Patient ID:', '0123456']
    ]
test_parser = tp.define_csv_parser(name='Default')
# Each call to the parser produces an iterable of rows for that line;
# chain.from_iterable flattens them into one sequence of rows.
test_iter = (test_parser(line) for line in test_text)
test_output = [row for row in chain.from_iterable(test_iter)]
test_output


[['Export Version:', '1'],
 [],
 ['IMSure Version:', '3.7.2'],
 ['Exported Date:', '03.09.2020  14:20'],
 ['User:', 'Superuser'],
 ['Patient:', '____, ----'],
 ['Patient ID:', '0123456']]

In [8]:
# list_csv_parser
# Parse DVH header lines with a csv parser that uses ':' as the delimiter.
test_text = [
    'Patient Name:     ____, ____',
    'Patient ID:   1234567',
    'Comment: DVHs for multiple plans and plan sums',
    'Date: Friday, January 17, 2020 09:45:07',
    'Exported by:    gsal',
    'Type: Cumulative Dose Volume Histogram',
    'Description:The cumulative DVH displays the percentage',
    'or volume (absolute) of structures that receive a dose',
    'equal to or greater than a given dose.',
    'Plan sum: Plan Sum',
    'Course: PLAN SUM',
    'Prescribed dose [cGy]: not defined',
    '% for dose (%): not defined'
    ]
# Because every ':' is a delimiter, the time portion of the 'Date' line is
# split into separate fields ('09', '45', '07').
# expected_output is shown for reference only; it is not compared with
# test_output in this cell.
expected_output = [
    ['Patient Name', '____, ____'],
    ['Patient ID', '1234567'],
    ['Comment', 'DVHs for multiple plans and plan sums'],
    ['Date', 'Friday, January 17, 2020 09', '45', '07'],
    ['Exported by', 'gsal'],
    ['Type', 'Cumulative Dose Volume Histogram'],
    ['Description', 'The cumulative DVH displays the percentage'],
    ['or volume (absolute) of structures that receive a dose'],
    ['equal to or greater than a given dose.'],
    ['Plan sum', 'Plan Sum'],
    ['Course', 'PLAN SUM'],
    ['Prescribed dose [cGy]', 'not defined'],
    ['% for dose (%)', 'not defined'],
    ]
# skipinitialspace=True removes the spaces that follow each ':' delimiter
# (see the leading-space-free values in the recorded output).
test_parser = tp.define_csv_parser('dvh_info', delimiter=':',
                                    skipinitialspace=True)
# Parse line by line, then flatten the per-line results into one list of rows.
test_iter = (test_parser(line) for line in test_text)
test_output = [row for row in chain.from_iterable(test_iter)]
test_output


[['Patient Name', '____, ____'],
 ['Patient ID', '1234567'],
 ['Comment', 'DVHs for multiple plans and plan sums'],
 ['Date', 'Friday, January 17, 2020 09', '45', '07'],
 ['Exported by', 'gsal'],
 ['Type', 'Cumulative Dose Volume Histogram'],
 ['Description', 'The cumulative DVH displays the percentage'],
 ['or volume (absolute) of structures that receive a dose'],
 ['equal to or greater than a given dose.'],
 ['Plan sum', 'Plan Sum'],
 ['Course', 'PLAN SUM'],
 ['Prescribed dose [cGy]', 'not defined'],
 ['% for dose (%)', 'not defined']]

In [9]:
# Prescribed Dose Rule
def make_prescribed_dose_rule() -> Rule:
    '''Build the Rule that parses "Prescribed dose [unit]: dose" lines.

    The Rule uses a compiled regular expression as its sentinel and
    parse_prescribed_dose as its pass_method.
    '''
    def parse_prescribed_dose(line, event) -> tp.ProcessedList:
        '''Split "Prescribed dose [cGy]" into 2 lines.

        Return two rows for a line containing:
            Prescribed dose [unit]: dose
        Gives:
            [['Prescribed dose', 'dose'],
             ['Prescribed dose unit', 'unit']]
        For example, the line:
            Prescribed dose [cGy]: 5000.0
        Results in:
            [['Prescribed dose', '5000.0'],
             ['Prescribed dose unit', 'cGy']]
        The line:
            Prescribed dose [unit]: not defined
        Results in:
            [['Prescribed dose', ''],
             ['Prescribed dose unit', '']]
        '''
        # event.test_value is expected to provide groupdict() -- presumably
        # the re.Match produced by the sentinel pattern; confirm against Rule.
        match_results = event.test_value.groupdict()
        # An undefined dose blanks both the dose and its unit.
        if match_results['dose'] == 'not defined':
            match_results['dose'] = ''
            match_results['unit'] = ''

        parsed_lines = [
            ['Prescribed dose', match_results['dose']],
            ['Prescribed dose unit', match_results['unit']]
            ]
        for line in parsed_lines:
            yield line

    prescribed_dose_pattern = (
        r'^Prescribed dose\s*'  # Begins with Prescribed dose
        r'\['                   # Unit start delimiter
        r'(?P<unit>[A-Za-z]+)'  # unit group: text surrounded by []
        r'\]'                   # Unit end delimiter
        r'\s*:\s*'              # Dose delimiter with possible whitespace
        r'(?P<dose>[0-9.]+'     # dose group Number
        r'|not defined)'        #"not defined" alternative
        r'[\s\r\n]*'            # drop trailing whitespace
        r'$'                    # end of string
        )
    re_pattern = re.compile(prescribed_dose_pattern)
    # fail_method is the string 'None', not the None object -- presumably a
    # keyword understood by Rule; confirm.
    dose_rule = Rule(sentinel=re_pattern, name='prescribed_dose_rule',
                        pass_method= parse_prescribed_dose, fail_method='None')
    return dose_rule




In [10]:
# parse_prescribed_dose_rule
# Apply the prescribed dose rule to four lines; only the two
# 'Prescribed dose' lines are expected to contribute output rows.
test_text = [
    'Prescribed dose [cGy]: not defined',
    '% for dose (%): not defined',
    'Prescribed dose [cGy]: 5000.0',
    '% for dose (%): 100.0'
    ]
# expected_output is shown for reference only; it is not compared with
# test_output in this cell.
expected_output = [
    ['Prescribed dose', ''],
    ['Prescribed dose unit', ''],
    ['Prescribed dose', '5000.0'],
    ['Prescribed dose unit', 'cGy'],
    ]

dose_rule = make_prescribed_dose_rule()
test_output = list()
for line in test_text:
    # The rule is invoked by calling it directly.
    result = dose_rule(line)
    # The result is consumed before event.test_passed is checked.
    line_output = [p_line for p_line in result]
    # Keep only the rows from lines where the rule's test passed.
    if dose_rule.event.test_passed:
        test_output.extend(line_output)
test_output


[['Prescribed dose', ''],
 ['Prescribed dose unit', ''],
 ['Prescribed dose', '5000.0'],
 ['Prescribed dose unit', 'cGy']]

In [11]:
def make_date_parse_rule() -> Rule:
    '''Build the Rule that parses lines containing a date and time.

    The Rule is created with the sentinel 'Date' and location='START', and
    uses date_parse as its pass_method.
    '''
    def date_parse(line: str) -> tp.ProcessedList:
        '''Split a Date line on the first ":" only.

        The time portion also contains ":" characters, so maxsplit=1 keeps
        the whole date and time text together as the second item.  The space
        that follows the first ":" is not removed.
        '''
        parsed_line = line.split(':', maxsplit=1)
        return parsed_line

    # fail_method is the string 'None', not the None object -- presumably a
    # keyword understood by Rule; confirm.
    date_rule = Rule('Date', location='START', name='date_rule',
                        pass_method=date_parse, fail_method='None')
    return date_rule



In [12]:
# date_parse_rule(self):
# Apply the date rule to one 'Date' line and one unrelated line.
test_text = [
    'Date: Friday, January 17, 2020 09:45:07',
    'Exported by: gsal'
    ]
# The date text keeps its leading space because the rule splits on ':' only.
# expected_output is shown for reference only; it is not compared with
# test_output in this cell.
expected_output = [
    ['Date', ' Friday, January 17, 2020 09:45:07']
    ]
date_rule = make_date_parse_rule()
test_output = list()
for line in test_text:
    result = date_rule.apply(line)
    # Keep the result only when the rule's test passed for this line.
    if date_rule.event.test_passed:
        test_output.append(result)
test_output


[['Date', ' Friday, January 17, 2020 09:45:07']]

In [13]:
# Approved Status
def make_approved_status_rule() -> Rule:
    '''Build the Rule that splits a "Treatment Approved" Plan Status line.

    If Treatment Approved, Split "Plan Status" into 3 lines:
        Plan Status
        Approved on
        Approved by
    '''
    def approved_status_parse(line, event) -> tp.ProcessedList:
        '''If Treatment Approved, Split "Plan Status" into 3 lines:

        Return three rows for a line containing "Treatment Approved", e.g.:
            Plan Status: Treatment Approved <date> by <person>
        Gives:
            [['Plan Status', 'Treatment Approved'],
             ['Approved on', date],
             ['Approved by', person]]
        '''
        # event.test_value is used as the matched text: it is searched for in
        # the line and its length locates the end of the status text.
        idx1 = line.find(event.test_value)
        idx2 = idx1 + len(event.test_value)
        # The first occurrence of ' by' marks the end of the date text.
        # str.find returns -1 when ' by' is absent; that case is not handled.
        idx3 = line.find(' by')
        # Skip the four characters of ' by ' to reach the approver's name.
        idx4 = idx3 + 4
        parsed_lines = [
            ['Plan Status', line[idx1:idx2]],
            # idx2+1 skips the single space that follows the status text.
            ['Approved on', line[idx2+1:idx3]],
            ['Approved by', line[idx4:]]
            ]
        for line in parsed_lines:
            yield line

    # fail_method is the string 'None', not the None object -- presumably a
    # keyword understood by Rule; confirm.
    approved_status_rule = Rule('Treatment Approved', location='IN',
                                   pass_method=approved_status_parse,
                                   fail_method='None',
                                   name='approved_status_rule')
    return approved_status_rule



In [14]:
# approved_status_rule
# Apply the approved status rule to four lines; only the line containing
# 'Treatment Approved' is expected to contribute output rows.
test_text = [
    'Plan: PARR',
    ('Plan Status: Treatment Approved Thursday, '
        'January 02, 2020 12:55:56 by gsal'),
    'Plan: PARR2-50Gy',
    'Plan Status: Unapproved'
    ]
# expected_output is shown for reference only; it is not compared with
# test_output in this cell.
expected_output = [
    ['Plan Status', 'Treatment Approved'],
    ['Approved on', 'Thursday, January 02, 2020 12:55:56'],
    ['Approved by', 'gsal']
    ]

approved_status_rule = make_approved_status_rule()
test_output = list()
for line in test_text:
    result = approved_status_rule.apply(line)
    # The result is only iterated when the rule's test passed for this line.
    if approved_status_rule.event.test_passed:
        line_output = [p_line for p_line in result]
        test_output.extend(line_output)
test_output


[['Plan Status', 'Treatment Approved'],
 ['Approved on', 'Thursday, January 02, 2020 12:55:56'],
 ['Approved by', 'gsal']]

In [15]:
# Prescribed Dose Rule
# NOTE(review): this cell repeats the make_prescribed_dose_rule definition
# given in an earlier cell of this notebook; the later definition replaces
# the earlier one when the notebook is run top to bottom.
def make_prescribed_dose_rule() -> Rule:
    '''Build the Rule that parses "Prescribed dose [unit]: dose" lines.

    The Rule uses a compiled regular expression as its sentinel and
    parse_prescribed_dose as its pass_method.
    '''
    def parse_prescribed_dose(line, event) -> tp.ProcessedList:
        '''Split "Prescribed dose [cGy]" into 2 lines.

        Return two rows for a line containing:
            Prescribed dose [unit]: dose
        Gives:
            [['Prescribed dose', 'dose'],
             ['Prescribed dose unit', 'unit']]
        For example, the line:
            Prescribed dose [cGy]: 5000.0
        Results in:
            [['Prescribed dose', '5000.0'],
             ['Prescribed dose unit', 'cGy']]
        The line:
            Prescribed dose [unit]: not defined
        Results in:
            [['Prescribed dose', ''],
             ['Prescribed dose unit', '']]
        '''
        # event.test_value is expected to provide groupdict() -- presumably
        # the re.Match produced by the sentinel pattern; confirm against Rule.
        match_results = event.test_value.groupdict()
        # An undefined dose blanks both the dose and its unit.
        if match_results['dose'] == 'not defined':
            match_results['dose'] = ''
            match_results['unit'] = ''

        parsed_lines = [
            ['Prescribed dose', match_results['dose']],
            ['Prescribed dose unit', match_results['unit']]
            ]
        for line in parsed_lines:
            yield line

    prescribed_dose_pattern = (
        r'^Prescribed dose\s*'  # Begins with Prescribed dose
        r'\['                   # Unit start delimiter
        r'(?P<unit>[A-Za-z]+)'  # unit group: text surrounded by []
        r'\]'                   # Unit end delimiter
        r'\s*:\s*'              # Dose delimiter with possible whitespace
        r'(?P<dose>[0-9.]+'     # dose group Number
        r'|not defined)'        #"not defined" alternative
        r'[\s\r\n]*'            # drop trailing whitespace
        r'$'                    # end of string
        )
    re_pattern = re.compile(prescribed_dose_pattern)
    # fail_method is the string 'None', not the None object -- presumably a
    # keyword understood by Rule; confirm.
    dose_rule = Rule(sentinel=re_pattern, name='prescribed_dose_rule',
                        pass_method= parse_prescribed_dose, fail_method='None')
    return dose_rule


In [16]:
# dvh_line_parser
# Combine the three special-case rules with a default ':' delimited csv
# parser in a RuleSet and apply it to a block of DVH header lines.
test_text = [
    'Patient Name: ____, ____',
    'Patient ID:   1234567',
    'Comment:      DVHs for multiple plans and plan sums',
    'Date:Friday, January 17, 2020 09:45:07',
    'Exported by:  gsal',
    'Type:         Cumulative Dose Volume Histogram',
    'Description:  The cumulative DVH displays the percentage',
    'or volume (absolute) of structures that receive a dose',
    '        equal to or greater than a given dose.',
    '',
    'Plan sum: Plan Sum',
    'Course: PLAN SUM',
    'Prescribed dose [cGy]: not defined',
    '% for dose (%): not defined',
    '',
    'Plan: PARR',
    'Course: C1',
    ('Plan Status: Treatment Approved Thursday, '
    'January 02, 2020 12:55:56 by gsal'),
    'Prescribed dose [cGy]: 5000.0',
    '% for dose (%): 100.0'
    ]
# expected_output is shown for reference only; it is not compared with
# test_output in this cell.
expected_output = [
    ['Patient Name', '____, ____'],
    ['Patient ID', '1234567'],
    ['Comment', 'DVHs for multiple plans and plan sums'],
    ['Date', 'Friday, January 17, 2020 09:45:07'],
    ['Exported by', 'gsal'],
    ['Type', 'Cumulative Dose Volume Histogram'],
    ['Description', 'The cumulative DVH displays the percentage'],
    ['or volume (absolute) of structures that receive a dose'],
    ['equal to or greater than a given dose.'],
    [],
    ['Plan sum', 'Plan Sum'],
    ['Course', 'PLAN SUM'],
    ['Prescribed dose', ''],
    ['Prescribed dose unit', ''],
    ['% for dose (%)', 'not defined'],
    [],
    ['Plan', 'PARR'],
    ['Course', 'C1'],
    ['Plan Status', 'Treatment Approved'],
    ['Approved on', 'Thursday, January 02, 2020 12:55:56'],
    ['Approved by', 'gsal'],
    ['Prescribed dose', '5000.0'],
    ['Prescribed dose unit', 'cGy'],
    ['% for dose (%)', '100.0']
    ]

# Lines that trigger none of the rules are handled by this default parser.
default_parser = tp.define_csv_parser('dvh_info', delimiter=':',
                                        skipinitialspace=True)
parsing_rules = [
    make_prescribed_dose_rule(),
    make_date_parse_rule(),
    make_approved_status_rule()
    ]

test_parser = RuleSet(parsing_rules, default=default_parser)
test_output = list()
for line in test_text:
    # The RuleSet is called with the line and an empty dict -- presumably
    # the context; confirm against RuleSet.  Its result is iterated by extend().
    test_output.extend(test_parser(line, {}))
test_output


[['Patient Name', '____, ____'],
 ['Patient ID', '1234567'],
 ['Comment', 'DVHs for multiple plans and plan sums'],
 ['Date', 'Friday, January 17, 2020 09:45:07'],
 ['Exported by', 'gsal'],
 ['Type', 'Cumulative Dose Volume Histogram'],
 ['Description', 'The cumulative DVH displays the percentage'],
 ['or volume (absolute) of structures that receive a dose'],
 ['equal to or greater than a given dose.'],
 [],
 ['Plan sum', 'Plan Sum'],
 ['Course', 'PLAN SUM'],
 ['Prescribed dose', ''],
 ['Prescribed dose unit', ''],
 ['% for dose (%)', 'not defined'],
 [],
 ['Plan', 'PARR'],
 ['Course', 'C1'],
 ['Plan Status', 'Treatment Approved'],
 ['Approved on', 'Thursday, January 02, 2020 12:55:56'],
 ['Approved by', 'gsal'],
 ['Prescribed dose', '5000.0'],
 ['Prescribed dose unit', 'cGy'],
 ['% for dose (%)', '100.0']]

#### Fixed Width Parser


In [17]:
# uniform_width_parser
# A single width plus number=3: the recorded output shows the line divided
# into three 6-character parts.
parser_constructor = tp.FixedWidthParser(widths=6,number=3)
parser = parser_constructor.parse
line = 'Part 1Part 2Part 2'
test_output = parser(line)

test_output

['Part 1', 'Part 2', 'Part 2']

In [18]:
#single_break_parser
# A single width with no number given: the recorded output shows one break
# after 6 characters, with the rest of the line returned as the second part.
parser_constructor = tp.FixedWidthParser(widths=6)
parser = parser_constructor.parse
line = 'Part 1Part 2Part 2'
test_output = parser(line)

test_output


['Part 1', 'Part 2Part 2']

In [19]:
#varied_width_parser
# A list of widths: the recorded output shows consecutive parts of 6, 7 and
# 8 characters.
parser_constructor = tp.FixedWidthParser(widths=[6,7,8])
parser = parser_constructor.parse
line = 'Part 1Part 2aPart 3ab'
test_output = parser(line)

test_output

['Part 1', 'Part 2a', 'Part 3ab']

In [20]:
# position_parser
# Break positions are given as absolute character locations instead of
# widths; the text after the last location is returned as a final part.
# expected_output is shown for reference only; it is not compared with
# test_output in this cell.
expected_output = ['Part 1', 'Part 2a', 'Part 3ab', 'Remainder']
parser_constructor = tp.FixedWidthParser(locations=[6,13,21])
parser = parser_constructor.parse
line = 'Part 1Part 2aPart 3abRemainder'
test_output = parser(line)

test_output


['Part 1', 'Part 2a', 'Part 3ab', 'Remainder']

In [21]:
# empty_parser
# With no widths or locations the recorded output shows the whole line
# returned as a single part.
parser_constructor = tp.FixedWidthParser()
parser = parser_constructor.parse
line = 'Part 1Part 2aPart 3ab'
test_output = parser(line)

test_output


['Part 1Part 2aPart 3ab']

In [22]:
# Data Frame Output

# single_header_dataframe
# Convert a list of rows into a DataFrame; header=True is passed so that,
# per the recorded output, the first row supplies the column names.
test_text = [
    ['A', 'B', 'C'],
    [1, 2, 3],
    [4, 5, 6]
    ]
# expected_output is shown for reference only; it is not compared with
# output in this cell.
expected_output = pd.DataFrame({'A': [1,4],'B': [2,5],'C':[3,6]})
output = tp.to_dataframe(test_text, header=True)

output

Unnamed: 0,A,B,C
0,1,2,3
1,4,5,6


## Text Processing Functions

### Split a text string into parts.
- The `delimiter=';'` argument tells it to split the string on ";"s.
- The `skipinitialspace=True` argument tells it to strip leading spaces from the
 text.

 For example:
 |Text|Becomes|
 |----|-------|
 |`'       Course;C1'`|`['Course', 'C1']`|
 |`'Intent;1_PRIMARY'`|`['Intent', '1_PRIMARY']`|
 |`'Plan Id;PELB FB'`|`['Plan Id', 'PELB FB']`|
 |`'Technique;'`|`['Technique']`|

In [23]:
# csv parser that splits each line on ';' and, through skipinitialspace=True,
# removes spaces that follow a delimiter.
dict_parse = tp.define_csv_parser(
    delimiter=';',
    skipinitialspace=True)

### Convert a list of two-item lists to a dictionary.
- First item in the sub-list is the key.  The second item is the value.
- `default_value=None` will cause one-item sub-lists to be dropped.

 For example the text:
 ```[
    ['Course', 'C1'],
    ['Intent', '1_PRIMARY'],
    ['Plan Id', 'PELB FB'],
    ['Technique']
    ]```

becomes:
```{
    'Course': 'C1',
    'Intent': '1_PRIMARY',
    'Plan Id': 'PELB FB'
    }```

*Note:* `['Technique']` is dropped because it is a single-item list.

In [24]:
from functools import partial
# tp.to_dict with default_value fixed to None, so the caller only supplies
# the parsed rows.
trim_dict = partial(tp.to_dict, default_value=None)

In [25]:
from typing import List, Tuple, Dict, Any
def make_section_dict(parsed_text: List[Tuple[str]], context: Dict[str, Any]):
    '''Wrap a section's parsed rows in a dictionary keyed by the section name.

    Arguments:
        parsed_text: The parsed rows of the section; passed to tp.to_dict
            with default_value=None.
        context: The processing context.  Must contain the key
            'Current Section', whose value is used as the section name.

    Returns:
        A one-item dictionary: {section name: dictionary of section data}.

    Raises:
        KeyError: If context has no 'Current Section' entry.
    '''
    # The text_reader module is imported as `tp` in this notebook; the
    # previous `tr` alias was never defined and raised a NameError.
    section_data = tp.to_dict(parsed_text, default_value=None)
    name = context['Current Section']
    section_dict = {name: section_data}
    return section_dict

### Identify strings containing "Warning" text.
- a regular expression is used to identify the "Warning" text:
    - `'(?P<Num>[0-9]+)[. ]+'` Looks for one or more digits followed by a "." 
    and/or spaces.  This is assigned as the "Num" group.
    - `'WARNING[: ]*'`  The word "WARNING", followed by optional ":" 
    and/or spaces 
    - `'(?P<Warning>.*$)'`  The warning text is then the remainder of the 
    string. This is assigned as the "Warning" group.

If a match is found, returns a *two*-item list: 
`['Warning' + "Num" group, "Warning" group]`.<br>
If a match is **not** found returns a *one*-item list: `[Original Text]`.

For example:
> `'1. WARNING: Plan target volume is different than plan primary reference
 point volume.'`

 Returns:
> `['Warning1', 'Plan target volume is different than plan primary reference
 point volume.']`

And
> `'PhotonAlg; AAA_15606_Golden_Beam'`

Returns:
> `['PhotonAlg; AAA_15606_Golden_Beam']`


In [26]:
def get_warning(text_line):
    '''Extract a numbered warning from a line of text.

    Arguments:
        text_line: The text to search.

    Returns:
        ['Warning<Num>', warning text] when the line contains a number
        followed by "WARNING"; otherwise a one-item list holding the
        original text.
    '''
    found = re.search(
        '(?P<Num>[0-9]+)'   # Warning index as Num group
        '[. ]+'             # delimiter and space
        'WARNING'           # warning text
        '[: ]*'             # delimiter and space
        '(?P<Warning>.*$)', # remaining text in line as Warning group
        text_line
        )
    if not found:
        # Not a warning line: hand the text back unchanged, wrapped in a list.
        return [text_line]
    number, message = found.group('Num', 'Warning')
    return [f'Warning{number}', message]

### Parse the User origin text line.
- The text is expected to have the form of three numbers with 'cm' units 
contained in brackets.  For example:
> `(-1.26cm, 9.95cm, -4.70cm)`
- A regular expression is used to parse the "User Origin" text:
    - `'[^(]+.'` Everything up to and including the first bracket.
    - `'(?P<X>[0-9.-]+)'`  The X number group. 
    - `'[ cm,]*'`  'cm' units, spaces and comma 
    - `'(?P<Y>[0-9.-]+)'`  The Y number group.
    - `'[ cm,]*'`  'cm' units, spaces and comma 
    - `'(?P<Z>[0-9.-]+)'`  The Z number group.
    - `'[ cm,)]*'`  'cm' units, spaces, comma  and end bracket

If a match is found, returns four output items, each containing a 
two-item list:
> `[`<br>
    `['User Origin', `*The original text line after the '='*`],`<br>
    `['Origin X', `*The matched 'X' group*`],`<br>
    `['Origin Y', `*The matched 'Y' group*`],`<br>
    `['Origin Z', `*The matched 'Z' group*`]`<br>
    `]`

If a match is **not** found, returns a list containing the original text string 
split on ";"s.

For example:
> `'User Origin;User origin DICOM offset = (-1.26cm, 9.95cm, -4.70cm)'`

 Returns:
> `[`<br>
    `['User Origin', '(-1.26cm, 9.95cm, -4.70cm)'],`<br>
    `['Origin X', '-1.26'],`<br>
    `['Origin Y', '9.95'],`<br>
    `['Origin Z', '-4.70']`<br>
    `]`

And
> `'PhotonAlg; AAA_15606_Golden_Beam'`

Returns:
> `['PhotonAlg', 'AAA_15606_Golden_Beam']`




In [27]:
def get_origin(text_line):
    '''Parse a "User Origin" line into its text and X, Y, Z components.

    Arguments:
        text_line: The text to parse, e.g.
            'User Origin;User origin DICOM offset = (-1.26cm, 9.95cm, -4.70cm)'

    Yields:
        When the line contains an '=' and matches the origin pattern, four
        two-item rows (all values are strings):
            ['User Origin', text following the first '=' (stripped)]
            ['Origin X', X group]
            ['Origin Y', Y group]
            ['Origin Z', Z group]
        Otherwise a single row containing the line split on ';'.
    '''
    origin_pattern = re.compile(
        '[^(]+.'           # Everything up to and including the first bracket
        '(?P<X>[0-9.-]+)'  # X number group
        '[ cm,]*'          # Unit, space and comma
        '(?P<Y>[0-9.-]+)'  # Y number group
        '[ cm,]*'          # Unit, space and comma
        '(?P<Z>[0-9.-]+)'  # Z number group
        '[ cm,)]*'         # Unit, space, comma and end bracket
        )
    origin_match = origin_pattern.search(text_line)
    # The pattern is loose enough to match any text containing a run of three
    # number characters (e.g. 'PhotonAlg; AAA_15606_Golden_Beam').  Such lines
    # have no '=' and previously raised an IndexError; they are now treated
    # as "no match" and fall through to the ';' split.
    equals_parts = text_line.split('=')
    if origin_match and len(equals_parts) > 1:
        origin_str = equals_parts[1].strip()
        origin = [
            ['User Origin', origin_str],
            ['Origin X', origin_match.group('X')],
            ['Origin Y', origin_match.group('Y')],
            ['Origin Z', origin_match.group('Z')]
            ]
    else:
        origin = [text_line.split(';')]
    yield from origin

### Parse the gantry text line.
- The text is expected to have the form of a start angle and an end angle, 
each followed by 'deg' units. <br>
For example:
    - `Gantry;0.0 deg to - deg` 
    <br> or <br>
    - `Gantry;181.0 degCW to 179.0 deg`

- A regular expression is used to parse the gantry text:
    - `'(?P<GantryStart>[0-9.-]+)'` gantry start angle, assigned to 
    "GantryStart".
    - `'[ degtoCCW]*'`  Unit, space direction and "to" (not captured). 
    - `'(?P<GantryEnd>[0-9.-]+)'`  Gantry end angle, assigned to 
    "GantryEnd".
    - `'[ deg]*'`  'deg' units and space  (not captured).
    - `'[ cm,]*'`  'cm' units, spaces and comma 

If a match is found, returns either one or three output items, each containing a 
two-item list.<br>
> If *GantryEnd* contains `'-'`  (meaning gantry doesn't move), returns:<br>
    >> `[['Gantry', `*GantryStart*`]]`

> Otherwise (moving gantry), returns:<br>
    >> `[`<br>
    >> `['Gantry', `*GantryStart*`],`<br>
    >> `['GantryStart', `*GantryStart*`],`<br>
    >> `['GantryEnd', `*GantryEnd*`],`<br>
    >> `]`
                
If a match is **not** found returns a list containing the original text string 
split on ";"s.

For example:
> `Gantry;0.0 deg to - deg`

 Returns:
> `[['Gantry', '0.0']]`

Or
> `Gantry;181.0 degCW to 179.0 deg`

 Returns:
> `[`<br>
> `['Gantry', '181.0'],`<br>
> `['GantryStart', '181.0'],`<br>
> `['GantryEnd', '179.0'],`<br>
> `]`

And
> `'PhotonAlg; AAA_15606_Golden_Beam'`

Returns:
> `['PhotonAlg', 'AAA_15606_Golden_Beam']`

In [28]:
def get_gantry(text_line):
    '''Parse a gantry line into start and end angle rows.

    Arguments:
        text_line: The text to parse, e.g. 'Gantry;181.0 degCW to 179.0 deg'.

    Yields:
        ['Gantry', start] when the end angle contains '-' (static gantry);
        ['Gantry', start], ['GantryStart', start], ['GantryEnd', end] for a
        moving gantry; or, when the pattern is not found, a single row
        containing the line split on ';'.  Angles are yielded as strings.
    '''
    angle_pattern = re.compile(
        '(?P<GantryStart>[0-9.-]+)'  # gantry start group
        '[ degtoCCW]*'               # Unit, space direction and "to"
        '(?P<GantryEnd>[0-9.-]+)'    # gantry end group
        '[ deg]*'                    # Unit and space
        )
    found = angle_pattern.search(text_line)
    if not found:
        yield text_line.split(';')
        return
    start_angle, end_angle = found.group('GantryStart', 'GantryEnd')
    yield ['Gantry', start_angle]
    if '-' not in end_angle:
        # Moving gantry: report both ends of the arc as well.
        yield ['GantryStart', start_angle]
        yield ['GantryEnd', end_angle]

### Relabel *"No Field Normalization"*
Replace lines with text: 
> *'NO_ISQLAW_NORM'*

with processed output:
> `norm_line = ['Norm Method', 'No Field Normalization']`

In [29]:
def clean_norm(text_line):
    '''Relabel the "NO_ISQLAW_NORM" normalization line.

    Arguments:
        text_line: The text to check.

    Returns:
        ['Norm Method', 'No Field Normalization'] when text_line contains
        'NO_ISQLAW_NORM'; otherwise text_line is returned unchanged.
    '''
    if 'NO_ISQLAW_NORM' in text_line:
        return ['Norm Method', 'No Field Normalization']
    # Previously this path referenced an unassigned variable and raised an
    # UnboundLocalError; lines without the marker now pass through as-is.
    return text_line

### Drop units from numeric data

- A regular expression is used to extract the value portion of a string that contains a number with units.
    - `'^\s*'` beginning of string and leading whitespace.
    - `'(?P<value>[-+]?\d+[.]?\d*)'`  The value group containing optional initial
        sign and decimal place with numbers before and/or after.
    - `'[ cm,]*'`  'cm' units, spaces and comma 
    - `'(?P<Y>[0-9.-]+)'`  The Y number group.
    - `'\s*'`  Optional whitespace between value and units.
    - `'(?P<unit>[^\s]*)'`  The units group, which does not contain spaces.
    - `'\s*$'`  Trailing whitespace and end of string



If a match is found, returns the value group as a float otherwise return the original text.



In [30]:
from typing import Union
def drop_units(text: str) -> Union[float, str]:
    '''Convert text holding a number with optional units into a float.

    Arguments:
        text: The text to convert, e.g. '5.0 cm'.

    Returns:
        The numeric portion as a float when the whole string is a number
        optionally followed by a unit; otherwise the original text.
    '''
    number_with_unit = re.compile(
        r'^\s*'                           # start of string, leading whitespace
        r'(?P<value>[-+]?\d+[.]?\d*)'     # optional sign, digits, optional decimals
        r'\s*'                            # optional gap between value and unit
        r'(?P<unit>[^\s]*)'               # unit: a run of non-space characters
        r'\s*'                            # trailing whitespace
        r'$'                              # end of string
        )
    matched = number_with_unit.search(text)
    if matched is None:
        return text
    # Only the value is kept; the unit group is discarded.
    return float(matched.group('value'))


def numeric_values(text_row: Tuple[str]) -> Tuple[str, float]:
    '''Convert the value of a (label, value) row to a number where possible.

    Rows that do not unpack into exactly two items are returned unchanged.
    Otherwise returns (label, drop_units(value)).
    '''
    try:
        label, text_value = text_row
    except ValueError:
        converted_row = text_row
    else:
        converted_row = (label, drop_units(text_value))
    return converted_row


def numeric_values_list(text_list: List[str]) -> List[Union[str, float]]:
    '''Apply drop_units to every item of text_list and return the new list.'''
    return list(map(drop_units, text_list))

In [31]:
# Load the sample directory listing as a list of lines.  The path is built
# from the current working directory, so the notebook must be run from the
# folder that contains 'examples'.
test_file = Path.cwd() / 'examples' / 'test_DIR_Data.txt'
dir_text = test_file.read_text().splitlines()

In [32]:
# Regular expression for a file line in the directory listing:
# a date, spaces, an integer size, one space, then the file name.
# compile_re=False is passed so that date_pattern can be embedded in the
# larger pattern below -- presumably it is returned as a string; confirm
# against tp.build_date_re.
date_pattern = tp.build_date_re(compile_re=False)
file_listing_pt = re.compile(
    f'{date_pattern}'  # Insert date pattern
    '[ ]+'             # Arbitrary number of spaces
    '(?P<size>'        # beginning of size string group
    '[0-9]+'           # Integer size of file
    ')'                # end of size string group
    ' '                # Single space
    '(?P<filename>'    # beginning of filename string group
    '.*'               # Remaining text on the line
    ')'                # end of filename string group
    '$'                # end of string
    )


## Fixed Width Parser

The main part of a directory listing is formatted into columns

In [33]:
# Print a two-row column ruler (tens digit above units digit) covering
# columns 0-69, followed by one folder line and one file line from the
# listing, so that column positions can be read off.
print('column index')
print(''.join(str(i)*10 for i in range(7)))
print(''.join(str(i) for i in range(10))*7)
#print(''.join(divider))
print(dir_text[7])
print(dir_text[12])

column index
0000000000111111111122222222223333333333444444444455555555556666666666
0123456789012345678901234567890123456789012345678901234567890123456789
2021-12-27  04:03 PM    <DIR>          Dir1
2016-04-21  01:06 PM              3491 xcopy.txt


In [34]:
# Build a 70-character ruler of '.' with a '|' at each column break index.
column_breaks=[11, 20, 29, 38]

divider_list = ['.']*70
for brk in column_breaks:
    divider_list[brk] = '|'
divider = ''.join(divider_list)

In [35]:
# Show lines 6-12 of the listing between two copies of the divider ruler so
# the break positions can be compared with the text columns.
print('column breaks')
#print(''.join(str(i)*10 for i in range(7)))
#print(''.join(str(i) for i in range(10))*7)
print(divider)

for line in dir_text[6:13]:
    print(line)
    
print(divider)

column breaks
...........|........|........|........|...............................
2021-12-27  03:33 PM    <DIR>          ..
2021-12-27  04:03 PM    <DIR>          Dir1
2021-12-27  05:27 PM    <DIR>          Dir2
2016-02-25  09:59 PM                 3 TestFile1.txt
2016-02-15  06:46 PM                 7 TestFile2.rtf
2016-02-15  06:47 PM                 0 TestFile3.docx
2016-04-21  01:06 PM              3491 xcopy.txt
...........|........|........|........|...............................


In [36]:
# Apply a fixed width parser to a list of strings through its parser()
# method; the recorded output shows one list of parts per input string.
a = ['Part 1', 'Part 2a', 'Part 2b']
b = tp.FixedWidthParser([4,3])
[item for item in b.parser(a)]


[['Part', ' 1'], ['Part', ' 2a'], ['Part', ' 2b']]

In [37]:
# parse() handles a single string and, per the recorded output, returns the
# list of parts directly.
b.parse(a[1])

['Part', ' 2a']

In [38]:
# Split a file line of the directory listing at columns 20, 30 and 39.
# Note that `a` is re-bound here from the list used above to a parser.
a = tp.FixedWidthParser(locations=[20,30,39])
a.parse(dir_text[12])

['2016-04-21  01:06 PM', '          ', '    3491 ', 'xcopy.txt']

In [39]:
# define_fixed_width_parser returns a callable; per the recorded output,
# calling it produces a generator rather than a list.
b = tp.define_fixed_width_parser(locations=[20,30,39])
b(dir_text[8])

<generator object FixedWidthParser.parser at 0x0000025D080A47B0>

In [40]:
# Wrap the generator in list() to see the parsed row.
list(b(dir_text[8]))

[['2021-12-27  05:27 PM', '    <DIR> ', '         ', 'Dir2']]

In [41]:
# Define Functions
def dir_name_split(dir_line):
    '''Return {'Folder Name': text after the last backslash in dir_line}.

    Raises IndexError when dir_line contains no backslash.
    '''
    path_parts = dir_line.rsplit('\\', 1)
    return {'Folder Name': path_parts[1]}
def file_count_split(dir_line):
    '''Return {'Number of Files': first space-delimited word of dir_line}.

    Leading and trailing whitespace is removed before the first word is
    taken; the count is returned as text.
    '''
    count_text, _, _ = dir_line.strip().partition(' ')
    return {'Number of Files': count_text}
def get_subfolder_name(dir_line):
    '''Return {'Subdirectory': folder name} for a "<DIR>" listing line.

    The name is taken from a fixed column of the directory listing.
    '''
    # Names begin at column 39 of the listing (see the column ruler and the
    # FixedWidthParser(locations=[20,30,39]) examples in this notebook).
    # The former start of 36 left three leading spaces on the name.
    name_column = 39
    output_dict = {'Subdirectory': dir_line[name_column:]}
    return output_dict
def get_file_name(dir_line):
    '''Return {'File': file name} for a file line of the directory listing.

    The name is taken from a fixed column of the directory listing.
    '''
    # Names begin at column 39 of the listing (see the column ruler and the
    # FixedWidthParser(locations=[20,30,39]) examples in this notebook).
    # The former start of 36 included the last digits of the file size,
    # e.g. '91 xcopy.txt' for a 3491 byte file.
    name_column = 39
    output_dict = {'File': dir_line[name_column:]}
    return output_dict

# Define Rules
# Each rule pairs a sentinel text with the function that should process
# lines containing it.
dir_name_rule = Rule('Directory of', pass_method=dir_name_split)
subfolder_rule = Rule('<DIR>', pass_method=get_subfolder_name)
file_count_rule = Rule('File(s)', pass_method=file_count_split)

#Define Rule Set
# Lines that trigger none of the rules are passed to get_file_name.
dir_process = RuleSet([dir_name_rule, subfolder_rule, file_count_rule], 
                      default=get_file_name)


In [42]:
# Display the first 20 lines of the directory listing, each prefixed with a
# tab (print() also inserts a space between the tab and the line).
for line in dir_text[0:20]:
    print('\t', line)

	  Volume in drive C is Windows
	  Volume Serial Number is DAE7-D5BA
	 
	  Directory of c:\users\...\Test Dir Structure
	 
	 2021-12-27  03:33 PM    <DIR>          .
	 2021-12-27  03:33 PM    <DIR>          ..
	 2021-12-27  04:03 PM    <DIR>          Dir1
	 2021-12-27  05:27 PM    <DIR>          Dir2
	 2016-02-25  09:59 PM                 3 TestFile1.txt
	 2016-02-15  06:46 PM                 7 TestFile2.rtf
	 2016-02-15  06:47 PM                 0 TestFile3.docx
	 2016-04-21  01:06 PM              3491 xcopy.txt
	                4 File(s)           3501 bytes
	 
	  Directory of c:\users\...\Test Dir Structure\Dir1
	 
	 2021-12-27  04:03 PM    <DIR>          .
	 2021-12-27  04:03 PM    <DIR>          ..
	 2016-02-15  06:48 PM                 0 File in Dir One.txt
