# Fix End Section Issue
The presence of end_section is preventing a section group from working as expected.

## Imports

In [78]:
from typing import Callable, List, Any, Dict, List, Tuple
from types import GeneratorType

from pathlib import Path
import re
from functools import partial
from pprint import pprint

import numpy as np
import pandas as pd

import text_reader as tp
from buffered_iterator import BufferedIterator
from sections import Rule, RuleSet, SectionBreak, Section, ProcessingMethods


In [79]:
info_split = partial(str.split, sep=':', maxsplit=1)

In [80]:
def plan_split(line: str)->List[str]:
    '''Spilt a text line into two parts on ':'.

    Spilt a text line into two parts on the first occurrence of ':'.
    Remove leading and trailing spaces from each part.
    Force the returned list to have length of two even if the text does not 
    contain a ':'.

    Args:
        line (str): The test to spilt

    Returns:
        List[str]: A length-2 list of strings
    '''
    parts = line.split(sep=':', maxsplit=1)
    # Remove leading and trailing spaces from each part
    clean_parts = [s.strip() for s in parts]
    # If the line is blank return an empty list
    if max(len(part) for part in clean_parts) == 0:
        clean_parts = []
    # Force clean_parts to be a length of 2
    elif len(clean_parts) == 1:
        clean_parts.append('')
    return clean_parts

In [81]:
def plan_lookup(plan_sections: List[Dict[str, Any]], 
                context: Dict[str, Any])->Dict[str, Dict[str, Any]]:
    '''Build a dictionary of plan information and add it to context.
    '''
    all_plans = pd.DataFrame(plan for plan in plan_sections if plan)
    all_plans.set_index(['Course', 'Plan'], inplace=True)
    context['PlanLookup'] = all_plans
    return all_plans

In [82]:
# Prescribed Dose Rule
def make_dose_data_rule() -> Rule:
    '''return a Rule to Parse all Structure Dose lines.

    Split dose parameter into label, value and unit if they exists, otherwise 
    split on the first ':'.
    
    The line:
        Volume [cm³]: 38.3
    Results in:
        ['Volume', 38.3],
        ['Volume unit', 'cm³']
        
    The line:
        Approval Status: Approved
    Results in:
        ['Approval Status', 'Approved']
        
    The line:
        Paddick CI: 
    Results in:
        ['Paddick CI', '']

    Returns (Rule): A sectionary Rule that will parse all Structure Dose lines.
    '''
    def parse_dose_data(line, event) -> tp.ProcessedList:
        match_results = event.test_value.groupdict()
        # Convert numerical value to float 
        match_results['value'] = float(match_results['value'])
        value_label = match_results['label'].strip()
        unit_label = value_label + ' unit'
        parsed_lines = [
            [value_label, match_results['value']],
            [unit_label, match_results['unit']]
            ]
        for line in parsed_lines:
            yield line

    structure_dose_pattern = (
        r'^(?P<label>[^[]+)'   # Initial parameter label
        r'\['                  # Unit start delimiter '['
        r'(?P<unit>[^\]]+)'    # unit group: text surrounded by []
        r'\]'                  # Unit end delimiter ']'
        r'\s*:\s*'             # Value delimiter with possible whitespace
        r'(?P<value>[0-9.]+)'  # value group Number
        r'\s*'                 # drop trailing whitespace
        r'$'                   # end of string
        )
    re_pattern = re.compile(structure_dose_pattern)
    dose_rule = Rule(name='make_dose_data_rule', 
                     sentinel=re_pattern, 
                     pass_method=parse_dose_data, 
                     fail_method=plan_split)
    return dose_rule

In [83]:
def header_parse(line: str) -> List[Tuple[str]]:
    '''Split each column header into label and unit.

    Accepts a string containing column labels and units.
    Returns a list of two-item tuples. The first item is the label
    and the second is the units.
    A supplied line like:
    `Dose [cGy]   Relative dose [%] Ratio of Total Structure Volume [%]`,
    Gives:
        [('Dose', 'cGy'), 
         ('Relative dose', '%'),
         ('Ratio of Total Structure Volume', '%')
         ]

    Args:
        line (str): Header line for DVH Curve

    Returns:
        List[Tuple[str]]: A list of two-item tuples. The first item is 
        the label and the second is the units. 
    '''
    header_pattern = (
        r'\s*'               # Initial spaces
        r'(?P<Label>'        # Beginning of label capture group
        r'[A-Za-z /]*'       # Label text (can include spaced and '/') 
        r')'                 # End of label capture group        
        r'\s*'               # Possible whitespace
        r'\['                # Units start delimiter
        r'(?P<Units>[^]]*)'  # Text containing units (all text until ']'
        r'\]'                # Units end delimiter
        )
    re_pattern = re.compile(header_pattern)
    label_list = []
    for match in re_pattern.finditer(line):
        match_results = match.groupdict()
        header = (match_results['Label'], match_results['Units'])
        label_list.append(header)
    return label_list


In [84]:
def is_blank(line: str):
    return len(line) == 0

In [85]:
def split_data_points(line: str)->List[float]:
    return [float(num) for num in line.split()]

In [86]:
dose_info_section = Section(
    name='Structure',
    start_section=('Structure:', 'START', 'Before'),
    end_section=(is_blank, None, 'Before'),
    processor=[make_dose_data_rule()],
    assemble=tp.to_dict
    )

In [87]:
dose_header_section = Section(
    name='Header',
    start_section=('Dose [', 'IN', 'Before'),
    end_section=True,
    processor=header_parse
    )

In [88]:
dose_curve_section = Section(
    name='DVH Curve',
    start_search=False,
    end_section=('Structure:', 'START', 'Before'),
    processor=split_data_points
    )

![Error](Error.png) Section group not working

- With end_section present, there is no output.

> ```Python
> >>> dvh_dose = Section(
>         name='DVH Dose',
>         start_search=('Structure:', 'START', 'Before'),
>         end_section=('Structure:', 'START', 'Before'),
>         processor=[(dose_info_section, 
>                     dose_header_section, 
>                     dose_curve_section)]
>         )
> >>> demo_dvh_text = demo_dvh_1.read_text(encoding='utf_8_sig').splitlines()
> >>> dvh_dose.read(demo_dvh_text)
>
> []
> ```

- Without end_section present, the section group returns correctly.
> ```Python
> >>> dvh_dose = Section(
>         name='DVH Dose',
>         start_search=('Structure:', 'START', 'Before'),
>         #end_section=('Structure:', 'START', 'Before'),
>         processor=[(dose_info_section, 
>                     dose_header_section, 
>                     dose_curve_section)]
>         )
> >>> demo_dvh_text = demo_dvh_1.read_text(encoding='utf_8_sig').splitlines()
> >>> dvh_dose.read(demo_dvh_text)
>
> [{'Structure': {'Structure': 'BODY',
>    'Approval Status': 'Approved',
>    'Plan': 'CHWR',
>    'Course': 'C1',
>    'Volume': 20449.1,
>    'Volume unit': 'cm³',
>    'Dose Cover.': 100.0,
>    ...
>    [100.0, 4250.0, 0.0],
>    [101.0, 4292.5, 0.0],
>    [102.0, 4335.0, 0.0],
>    [103.0, 4377.5, 0.0],
>    []]}]
> ```


In [89]:
test_text = '''
Structure: BODY
Approval Status: Approved
Min Dose [%]: 0.0
: 

Relative dose [%]          Dose [cGy] Ratio of Total Structure Volume [%]
                0                   0                       100
                1                42.5                   29.5637
                2                  85                    20.601
              102                4335                 0.0411928
              103              4377.5                0.00206949

Structure: Cricoid
'''
test_lines = test_text.splitlines()

In [90]:
dvh_dose = Section(
    name='DVH Dose',
    start_section=('Structure:', 'START', 'Before'),
    end_section=('Structure:', 'START', 'Before'),
    end_on_first_item=False,
    processor=[(dose_info_section, 
                dose_header_section, 
                dose_curve_section)]
    )
dvh_dose.read(test_lines)

[{'Structure': {'Structure': 'BODY',
   'Approval Status': 'Approved',
   'Min Dose': 0.0,
   'Min Dose unit': '%'},
  'Header': [[('Relative dose ', '%'),
    ('Dose ', 'cGy'),
    ('Ratio of Total Structure Volume ', '%')]],
  'DVH Curve': [[0.0, 0.0, 100.0],
   [1.0, 42.5, 29.5637],
   [2.0, 85.0, 20.601],
   [102.0, 4335.0, 0.0411928],
   [103.0, 4377.5, 0.00206949],
   []]}]

In [91]:
test_text = '''Structure: BODY
Approval Status: Approved
Min Dose [%]: 0.0
: 

Relative dose [%]          Dose [cGy] Ratio of Total Structure Volume [%]
                0                   0                       100
                1                42.5                   29.5637
                2                  85                    20.601
              102                4335                 0.0411928
              103              4377.5                0.00206949

Structure: Cricoid
'''
test_lines = test_text.splitlines()

In [92]:
dvh_dose = Section(
    name='DVH Dose',
    start_section=('Structure:', 'START', 'Before'),
    end_section=('Structure:', 'START', 'Before'),
    end_on_first_item=False,
    processor=[(dose_info_section, 
                dose_header_section, 
                dose_curve_section)]
    )
dvh_dose.read(test_lines)

[{'Structure': {'Structure': 'BODY',
   'Approval Status': 'Approved',
   'Min Dose': 0.0,
   'Min Dose unit': '%'},
  'Header': [[('Relative dose ', '%'),
    ('Dose ', 'cGy'),
    ('Ratio of Total Structure Volume ', '%')]],
  'DVH Curve': [[0.0, 0.0, 100.0],
   [1.0, 42.5, 29.5637],
   [2.0, 85.0, 20.601],
   [102.0, 4335.0, 0.0411928],
   [103.0, 4377.5, 0.00206949],
   []]}]

------

In [16]:
test_text = '''
Structure: BODY
Approval Status: Approved
Min Dose [%]: 0.0
: 

Relative dose [%]          Dose [cGy] Ratio of Total Structure Volume [%]
                0                   0                       100
                1                42.5                   29.5637
                2                  85                    20.601
              102                4335                 0.0411928
              103              4377.5                0.00206949

Structure: Cricoid
'''
test_lines = test_text.splitlines()

In [17]:
dvh_dose = Section(
    name='DVH Dose',
    start_search=('Structure:', 'START', 'Before'),
    end_section=('Structure:', 'START', 'Before'),
    end_on_first_item=False,
    )
dvh_dose.read(test_lines)

['']

In [18]:
test_text = '''Structure: BODY
Approval Status: Approved
Min Dose [%]: 0.0
: 

Relative dose [%]          Dose [cGy] Ratio of Total Structure Volume [%]
                0                   0                       100
                1                42.5                   29.5637
                2                  85                    20.601
              102                4335                 0.0411928
              103              4377.5                0.00206949

Structure: Cricoid
'''
test_lines = test_text.splitlines()

In [19]:
dvh_dose = Section(
    name='DVH Dose',
    start_search=('Structure:', 'START', 'Before'),
    end_section=('Structure:', 'START', 'Before'),
    end_on_first_item=False,
    )
dvh_dose.read(test_lines)

['Structure: BODY',
 'Approval Status: Approved',
 'Min Dose [%]: 0.0',
 ': ',
 '',
 'Relative dose [%]          Dose [cGy] Ratio of Total Structure Volume [%]',
 '                0                   0                       100',
 '                1                42.5                   29.5637',
 '                2                  85                    20.601',
 '              102                4335                 0.0411928',
 '              103              4377.5                0.00206949',
 '']

In [60]:
test_text = '''
Structure: BODY
: 

Relative dose [%]          Dose [cGy] Ratio of Total Structure Volume [%]
              103              4377.5                0.00206949

Structure: Cricoid
'''
test_lines = test_text.splitlines()

In [61]:
dvh_dose = Section(
    name='DVH Dose',
    start_search=('Structure:', 'START', 'Before'),
    end_section=('Structure:', 'START', 'Before'),
    end_on_first_item=False,
    )
dvh_dose.read(test_lines)

['']

In [62]:
test_text = '''
Structure: BODY
Relative dose [%]          Dose [cGy] Ratio of Total Structure Volume [%]
              103              4377.5                0.00206949
Structure: Cricoid
'''
test_lines = test_text.splitlines()

In [63]:
dvh_dose = Section(
    name='DVH Dose',
    start_search=('Structure:', 'START', 'Before'),
    end_section=('Structure:', 'START', 'Before'),
    end_on_first_item=False,
    )
dvh_dose.read(test_lines)

['']

------

In [68]:
test_text = '''
Structure: BODY
EndSection A

StartSection B
Structure: Cricoid
EndSection B
'''
test_lines = test_text.splitlines()

In [69]:
dvh_dose = Section(
    name='DVH Dose',
    start_search=('Structure:', 'START', 'Before'),
    end_section=('Structure:', 'START', 'Before'),
    end_on_first_item=False,
    )
dvh_dose.read(test_lines)

['']

In [70]:
test_text = '''
StartSection A
EndSection A

StartSection B
EndSection B
'''
test_lines = test_text.splitlines()

In [71]:
dvh_dose = Section(
    name='DVH Dose',
    start_search=('Start', 'START', 'Before'),
    end_section=('Start', 'START', 'Before'),
    end_on_first_item=False,
    )
dvh_dose.read(test_lines)

['']

In [72]:
test_text = '''StartSection A
EndSection A

StartSection B
EndSection B
'''
test_lines = test_text.splitlines()

In [73]:
dvh_dose = Section(
    name='DVH Dose',
    start_search=('Start', 'START', 'Before'),
    end_section=('Start', 'START', 'Before'),
    end_on_first_item=False,
    )
dvh_dose.read(test_lines)

['StartSection A', 'EndSection A', '']

------

In [24]:
test_text = '''
StartSection A
EndSection A

StartSection B
EndSection B
'''
test_lines = test_text.splitlines()

In [25]:
full_section = Section(
    name='Full',
    start_section=('Start', 'START', 'Before'),
    end_section=('Start', 'START', 'Before'),
    end_on_first_item=False
    )

In [26]:
full_section.read(test_lines)

['StartSection A', 'EndSection A', '']

In [27]:
test_text = '''StartSection A
EndSection A

StartSection B
EndSection B
'''
test_lines = test_text.splitlines()

In [28]:
full_section.read(test_lines)

['StartSection A', 'EndSection A', '']

In [29]:
test_text = '''StartSection A
EndSection A

StartSection B
EndSection B'''
test_lines = test_text.splitlines()

In [30]:
full_section.read(test_lines)

['StartSection A', 'EndSection A', '']

In [31]:
test_text = '''StartSection A
EndSection A

StartSection B
EndSection B
'''
test_lines = test_text.splitlines()

In [32]:
full_section.read(test_lines)

['StartSection A', 'EndSection A', '']

------

In [33]:
dvh_dose = Section(
    name='DVH Dose',
    start_search=('Structure:', 'START', 'Before'),
    end_section=('Structure:', 'START', 'Before'),
    end_on_first_item=False
    )


In [34]:
dvh_dose.read(test_lines)

['StartSection A', 'EndSection A', '', 'StartSection B', 'EndSection B']

In [35]:
test_text = '''
Structure: BODY
Approval Status: Approved
: 

Structure: Cricoid
'''
test_lines = test_text.splitlines()

In [36]:
dvh_dose.read(test_lines)

['']

-----------

In [37]:
full_section = Section(
    name='Full',
    start_section=('Start', 'START', 'Before'),
    end_section=('Start', 'START', 'Before'),
    end_on_first_item=False
    )

In [38]:
test_text = [
    '',
    'StartSection A',
    'EndSection A',
    '',
    'StartSection B',
    'EndSection B'
    ]

In [39]:
full_section.read(test_text)

['StartSection A', 'EndSection A', '']

In [40]:
test_text = '''StartSection A
EndSection A

StartSection B
EndSection B'''
test_lines = test_text.splitlines()

In [41]:
full_section.read(test_lines)

['StartSection A', 'EndSection A', '']

In [42]:
test_text = '''
StartSection A
EndSection A

StartSection B
EndSection B'''
test_lines = test_text.splitlines()

In [43]:
full_section.read(test_lines)

['StartSection A', 'EndSection A', '']

In [44]:
test_text = '''Structure: BODY
Approval Status: Approved
: 

Structure: Cricoid
'''
test_lines = test_text.splitlines()

In [45]:
full_section = Section(
    name='Full',
    start_section=('Structure', 'START', 'Before'),
    end_section=('Structure', 'START', 'Before'),
    end_on_first_item=False
    )

In [46]:
full_section.read(test_lines)

['Structure: BODY', 'Approval Status: Approved', ': ', '']

In [47]:
test_text = '''
Structure: BODY
Approval Status: Approved
: 

Structure: Cricoid
'''
test_lines = test_text.splitlines()

In [48]:
full_section.read(test_lines)

['Structure: BODY', 'Approval Status: Approved', ': ', '']

------

In [49]:
test_text = [
            'Text to be ignored',
            'StartSection A',
            'MiddleSection A',
            'EndSection A',            
            'Unwanted text between sections',            
            'StartSection B',
            'MiddleSection B',
            'EndSection B',
            'StartSection C',
            'MiddleSection C',
            'EndSection C',
            'Even more text to be ignored',
            ]

In [50]:
sub_section = Section(
    name='SubSection',
    start_section=('Start', 'START', 'Before'),
    end_section=('Unwanted', 'IN', 'Before'),
    processor=str.split,
    assemble=tp.to_dict
    )

In [51]:
sub_section.read(test_text)

{'StartSection': 'A', 'MiddleSection': 'A', 'EndSection': 'A'}

In [52]:
unwanted_section = Section(
    name='Unwanted',
    start_section=('Unwanted', 'IN', 'Before'),
    end_section=True
    )

In [53]:
full_section = Section(
    name='Full',
    start_section=('Start', 'START', 'Before'),
    end_section=('Start', 'START', 'Before'),
    processor=(sub_section, unwanted_section)
    )

In [54]:
full_section.read(test_text)

[{'SubSection': {'StartSection': 'A', 'MiddleSection': 'A', 'EndSection': 'A'},
  'Unwanted': ['Unwanted text between sections']}]

------

In [55]:
start_section = Section(
    name='StartSection',
    start_section=('Start', 'START', 'Before'),
    end_section=('Middle', 'IN', 'Before'),
    processor=str.split,
    assemble=tp.to_dict
    )

In [56]:
mid_section = Section(
    name='MiddleSection',
    start_section=('Middle', 'START', 'Before'),
    end_section=True,
    processor=str.split,
    assemble=tp.to_dict
    )

In [57]:
end_section = Section(
    name='RemainderSection',
    start_section=('End', 'START', 'Before'),
    end_section=('Start', 'START', 'Before')
    )

In [58]:
full_section = Section(
    name='Full',
    start_section=('Start', 'START', 'Before'),
    end_section=('Start', 'START', 'Before'),
    processor=[(start_section, mid_section, end_section)]
    )

In [59]:
full_section.read(test_text)

[{'StartSection': {'StartSection': 'A'},
  'MiddleSection': {'MiddleSection': 'A'},
  'RemainderSection': ['EndSection A', 'Unwanted text between sections']}]

------