# Fix End Section Issue
The presence of end_section is preventing a section group from working as expected.

## Imports

In [1]:
from typing import Callable, List, Any, Dict, List, Tuple
from types import GeneratorType

from pathlib import Path
import re
from functools import partial
from pprint import pprint

import numpy as np
import pandas as pd

import text_reader as tp
from buffered_iterator import BufferedIterator
from sections import Rule, RuleSet, SectionBreak, Section, ProcessingMethods


## Demo File Path

In [2]:
# Folder holding the demo DVH text files, resolved relative to the current
# working directory (the notebook must be run from the project root for
# these paths to exist).
demo_dvh_folder = Path.cwd() / r'./References/Text Files/DVH files'
# DVH export that is read by the dvh_dose Section experiments in this notebook.
demo_dvh_1 = demo_dvh_folder / 'Breast CHWR Relative Dose Relative Volume 1 cGy Step Size.dvh'
# Uncomment to check that the demo paths resolve:
#demo_dvh_1.exists()
#demo_dvh_folder.exists()
# DVH export that, judging by its file name, compares three plans
# (Replan1, Replan2, Replan3).
multi_dvh = demo_dvh_folder / 'Replan1, Replan2, Replan3 Comparison DVH Absolute Dose Relative Volume 1 cGy Step Size.dvh'

In [3]:
# Callable equivalent to `line.split(sep=':', maxsplit=1)`: splits a text
# line at the first ':' only, giving at most two parts.  Used as the first
# processor step of dvh_info_section.
info_split = partial(str.split, sep=':', maxsplit=1)

In [4]:
def plan_split(line: str)->List[str]:
    '''Split a text line into a label and a value at the first ':'.

    Both parts have leading and trailing whitespace removed.  A line without
    a ':' is padded with an empty second part so that non-blank lines always
    produce exactly two items.  A line with no text on either side of the
    ':' (or a completely blank line) produces an empty list instead.

    Args:
        line (str): The text to split.

    Returns:
        List[str]: A length-2 list of strings, or an empty list when the
        line contains no text.
    '''
    pieces = [piece.strip() for piece in line.split(sep=':', maxsplit=1)]
    # Nothing but whitespace (and possibly a ':') -> treat as a blank line.
    if not any(pieces):
        return []
    # No ':' in the line -> pad so the result always has two items.
    if len(pieces) == 1:
        pieces.append('')
    return pieces

In [5]:
def make_approved_status_rule() -> Rule:
    '''Build the Rule that expands a "Treatment Approved" plan status line.

    The rule's sentinel is a regular expression that matches lines
    containing the text 'Treatment Approved' followed later by 'by'.
    From a matching line it captures:
        approval: the literal text 'Treatment Approved',
        date:     the text between 'Treatment Approved' and 'by',
        user:     the text after 'by'.
    The pass method yields three two-item lists built from those groups.

    A supplied line like:
    `Plan Status: Treatment Approved Thursday, January 02, 2020 12:55:56 by gsal`,
    Gives:
        ['Plan Status', 'Treatment Approved'],
        ['Approved on', 'Thursday, January 02, 2020 12:55:56'],
        ['Approved by', 'gsal']

    Returns (Rule): The approved status Rule.  Its fail_method is given as
        the string 'None'.
    '''
    def approved_status_parse(line, event) -> tp.ProcessedList:
        # event.test_value is used as the regular expression match object
        # for the line; the named groups hold the pieces of interest.
        found = event.test_value.groupdict()
        labelled_groups = (
            ('Plan Status', 'approval'),
            ('Approved on', 'date'),
            ('Approved by', 'user'),
            )
        rows = [[label, found[group]] for label, group in labelled_groups]
        yield from rows

    approval_regex = re.compile(
        r'.*'                                # Initial text
        r'(?P<approval>Treatment Approved)'  # approval group: literal text
        r'\s*(?P<date>.*?)'                  # date group (non-greedy)
        r'\s*by\s*'                          # Literal 'by' with whitespace
        r'(?P<user>.*?)'                     # user group (non-greedy)
        r'\s*$'                              # Trailing whitespace to the end
        )
    return Rule(name='approved_status_rule',
                sentinel=approval_regex,
                pass_method=approved_status_parse,
                fail_method='None')

In [6]:
# Prescribed Dose Rule
def make_prescribed_dose_rule() -> Rule:
    '''Build the Rule that splits a dose line into dose value and dose unit.

    The rule's sentinel matches lines of the form:
        Total dose [unit]: dose  OR
        Prescribed dose [unit]: dose
    where dose is either a number or the text 'not defined'.
    Both forms are reported under the 'Prescribed dose' label.

    The line:
        Prescribed dose [cGy]: 5000.0
    Results in:
        ['Prescribed dose', 5000.0],
        ['Prescribed dose unit', 'cGy']
    The line:
        Total dose [cGy]: not defined
    Results in:
        ['Prescribed dose', nan],
        ['Prescribed dose unit', '']

    Returns (Rule): The prescribed dose Rule.  Its fail_method is given as
        the string 'None'.
    '''
    def parse_prescribed_dose(line, event) -> tp.ProcessedList:
        groups = event.test_value.groupdict()
        if groups['dose'] == 'not defined':
            # An undefined dose is reported as NaN with a blank unit.
            dose_value, dose_unit = np.nan, ''
        else:
            dose_value, dose_unit = float(groups['dose']), groups['unit']
        yield from [
            ['Prescribed dose', dose_value],
            ['Prescribed dose unit', dose_unit],
            ]

    dose_regex = re.compile(
        r'^(Total|Prescribed)'           # Begins with 'Total' OR 'Prescribed'
        r'\s*dose\s*'                    # Literal 'dose' with whitespace
        r'\[(?P<unit>[A-Za-z]+)\]'       # unit group: text surrounded by []
        r'\s*:\s*'                       # Dose delimiter with whitespace
        r'(?P<dose>[0-9.]+|not defined)' # dose group: number or 'not defined'
        r'[\s\r\n]*$'                    # Trailing whitespace to the end
        )
    return Rule(sentinel=dose_regex,
                name='prescribed_dose_rule',
                pass_method=parse_prescribed_dose,
                fail_method='None')

In [7]:
# Prescribed Isodose Line Rule
def make_prescribed_isodose_rule() -> Rule:
    '''Build the Rule that converts a prescribed isodose line to a number.

    The rule is created with the sentinel text '% for dose (%)' and
    location 'IN'.  The pass method takes the text after the first ':' in
    the line (up to any later ':'), strips it and converts it to a float.

    For a line containing '% for dose (%): 100.0':
    Return:
        ['Prescription Isodose', 100.0]

    Returns (Rule): The prescribed isodose Rule.  Its fail_method is given
        as the string 'None'.
    '''
    def parse_isodose(line, event) -> tp.ProcessedList:
        # The isodose percentage is the second ':'-delimited field.
        isodose_text = line.split(':')[1].strip()
        return ['Prescription Isodose', float(isodose_text)]

    return Rule(r'% for dose (%)',
                location='IN',
                name='make_prescribed_isodose_rule',
                pass_method=parse_isodose,
                fail_method='None')

In [8]:
# Plan Sum Rule
def make_plan_sum_rule() -> Rule:
    '''Build the Rule that relabels a 'Plan sum' line as a 'Plan' entry.

    The rule is created with the sentinel text 'Plan sum' and location
    'START'.  The pass method returns a two-item list: the literal label
    'Plan' followed by the stripped text after the first ':' in the line.

    Returns (Rule): The plan sum Rule.  Its fail_method is given as the
        string 'None'.
    '''
    def parse_plan_sum(line, event) -> tp.ProcessedList:
        # Everything after the first ':' is the plan sum identifier.
        plan_sum_id = line.split(':', maxsplit=1)[1].strip()
        return ['Plan', plan_sum_id]

    return Rule('Plan sum',
                location='START',
                name='make_plan_sum_rule',
                pass_method=parse_plan_sum,
                fail_method='None')

In [9]:
def plan_lookup(plan_sections: List[Dict[str, Any]],
                context: Dict[str, Any]) -> pd.DataFrame:
    '''Build a table of plan information and add it to context.

    Each non-empty item in plan_sections becomes one row of a DataFrame,
    which is then indexed by its 'Course' and 'Plan' columns.  The table is
    stored in context under the key 'PlanLookup' and is also returned.

    Args:
        plan_sections: The plan information dictionaries, one per plan.
            Empty items are skipped.
        context: A dictionary that receives the table as its 'PlanLookup'
            item (replacing any existing item with that key).

    Returns:
        pd.DataFrame: The plan table, indexed by ('Course', 'Plan').

    Raises:
        KeyError: From DataFrame.set_index when the table has no 'Course'
            or 'Plan' column, which includes the case where plan_sections
            contains no non-empty items.
    '''
    plan_rows = [plan for plan in plan_sections if plan]
    # set_index returns a new frame; no in-place mutation is needed.
    plan_table = pd.DataFrame(plan_rows).set_index(['Course', 'Plan'])
    context['PlanLookup'] = plan_table
    return plan_table

In [10]:
# Structure Dose Data Rule
def make_dose_data_rule() -> Rule:
    '''Return a Rule to parse all Structure Dose lines.

    Lines of the form `label [unit]: number` are split into a label/value
    pair and a label/unit pair, with the value converted to a float.  All
    other lines are passed to plan_split, which splits on the first ':'.

    The line:
        Volume [cm³]: 38.3
    Results in:
        ['Volume', 38.3],
        ['Volume unit', 'cm³']

    The line:
        Approval Status: Approved
    Results in:
        ['Approval Status', 'Approved']

    The line:
        Paddick CI: 
    Results in:
        ['Paddick CI', '']

    Returns (Rule): A Rule that will parse all Structure Dose lines.
    '''
    def parse_dose_data(line, event) -> tp.ProcessedList:
        groups = event.test_value.groupdict()
        # Convert the numerical value to float before building the output.
        number = float(groups['value'])
        label = groups['label'].strip()
        yield from [
            [label, number],
            [label + ' unit', groups['unit']],
            ]

    dose_data_regex = re.compile(
        r'^(?P<label>[^[]+)'       # label group: text before the first '['
        r'\[(?P<unit>[^\]]+)\]'    # unit group: text surrounded by []
        r'\s*:\s*'                 # Value delimiter with possible whitespace
        r'(?P<value>[0-9.]+)'      # value group: number
        r'\s*$'                    # Trailing whitespace to the end
        )
    return Rule(name='make_dose_data_rule',
                sentinel=dose_data_regex,
                pass_method=parse_dose_data,
                fail_method=plan_split)

In [11]:
def header_parse(line: str) -> List[Tuple[str, str]]:
    '''Split each column header into label and unit.

    Accepts a string containing column labels and units.
    Returns a list of two-item tuples. The first item is the label
    and the second is the units.
    A supplied line like:
    `Dose [cGy]   Relative dose [%] Ratio of Total Structure Volume [%]`,
    Gives:
        [('Dose', 'cGy'), 
         ('Relative dose', '%'),
         ('Ratio of Total Structure Volume', '%')
         ]

    Args:
        line (str): Header line for DVH Curve

    Returns:
        List[Tuple[str, str]]: A list of two-item tuples. The first item is 
        the label (with surrounding whitespace removed) and the second is 
        the units.  The list is empty if the line contains no `[units]` 
        text.
    '''
    header_pattern = (
        r'\s*'               # Initial spaces
        r'(?P<Label>'        # Beginning of label capture group
        r'[A-Za-z /]*'       # Label text (can include spaces and '/') 
        r')'                 # End of label capture group        
        r'\s*'               # Possible whitespace
        r'\['                # Units start delimiter
        r'(?P<Units>[^]]*)'  # Text containing units (all text until ']')
        r'\]'                # Units end delimiter
        )
    re_pattern = re.compile(header_pattern)
    # The label character class includes ' ', so the greedy label group also
    # captures the space(s) that separate the label from '['.  Strip the
    # label so that 'Dose [cGy]' gives 'Dose' rather than 'Dose '.
    return [
        (match.group('Label').strip(), match.group('Units'))
        for match in re_pattern.finditer(line)
        ]


In [12]:
def is_blank(line: str) -> bool:
    '''Return True when line has zero length.

    A line containing only whitespace has a non-zero length and is therefore
    not considered blank.
    '''
    return not len(line)

In [13]:
def split_data_points(line: str)->List[float]:
    '''Convert a whitespace-delimited line of numbers into a list of floats.

    A line with no text gives an empty list.
    '''
    number_strings = line.split()
    return list(map(float, number_strings))

In [14]:
# Rules applied to the lines of a plan section (this RuleSet is the
# processor of plan_info_section).
# plan_split is supplied as the RuleSet default; presumably it handles the
# lines that none of the listed rules match -- confirm against RuleSet.
plan_rule_set = RuleSet([make_approved_status_rule(),
                         make_prescribed_dose_rule(),
                         make_prescribed_isodose_rule(),
                         make_plan_sum_rule()],
                        default=plan_split)

In [15]:
# Section for the general information lines of the DVH file.
# No start boundary is given (start_section=None); the end boundary is
# defined by the text 'Description'.
# The break tuples look like (sentinel, location, offset) -- TODO confirm
# against the Section documentation.
# Each line is split at the first ':' (info_split), then passed through
# tp.trim_items and tp.drop_blanks, and the results are assembled with
# tp.to_dict.
dvh_info_section = Section(
    name='Information',
    start_section=None,
    end_section=('Description', 'START', 'Before'),
    processor=[info_split, 
               tp.trim_items, 
               tp.drop_blanks],
    assemble=tp.to_dict
    )

In [16]:
# Section for the information on a single plan (or plan sum).
# The start boundary is defined by the text 'Plan:' or 'Plan sum:' and the
# end boundary by the text '% for dose (%)'.
# Lines are processed with plan_rule_set and assembled with tp.to_dict.
plan_info_section = Section(
    name='Plan',
    start_section=(['Plan:', 'Plan sum:'], 'START', 'Before'),
    end_section=('% for dose (%)', 'START', 'After'),
    processor=[plan_rule_set],
    assemble=tp.to_dict
    )

In [17]:
# Section covering all of the plan information in the file.
# It uses the same start boundary as plan_info_section ('Plan:' or
# 'Plan sum:') and its end boundary is defined by the text 'Structure'.
# plan_info_section is used as the processor, and the results are assembled
# with plan_lookup, which also stores the plan table in context.
all_plans = Section(
    name='All Plans',
    start_section=(['Plan:', 'Plan sum:'], 'START', 'Before'),
    end_section=('Structure', 'START', 'Before'),
    processor=[plan_info_section],
    assemble=plan_lookup
    )

In [18]:
# Section for the dose information of a single structure.
# The start boundary is defined by the text 'Structure:'; the end boundary
# uses the is_blank function as its sentinel (a zero-length line).
# Lines are processed with the dose data rule and assembled with tp.to_dict.
dose_info_section = Section(
    name='Structure',
    start_section=('Structure:', 'START', 'Before'),
    end_section=(is_blank, None, 'Before'),
    processor=[make_dose_data_rule()],
    assemble=tp.to_dict
    )

In [19]:
# Section for the DVH curve column header line.
# The start boundary is defined by the text 'Dose [' appearing in a line.
# NOTE(review): end_section=True presumably ends the section after a single
# line -- confirm against the Section documentation.
# The line is converted to (label, unit) tuples by header_parse.
dose_header_section = Section(
    name='Header',
    start_section=('Dose [', 'IN', 'Before'),
    end_section=True,
    processor=header_parse
    )

In [20]:
# Section for the DVH curve data points.
# No start boundary is given and start_search is False; presumably the
# section then begins with the first line it is given -- confirm against
# the Section documentation.
# The end boundary is defined by the text 'Structure:'.
# Each line is converted to a list of floats by split_data_points.
dose_curve_section = Section(
    name='DVH Curve',
    start_search=False,
    end_section=('Structure:', 'START', 'Before'),
    processor=split_data_points
    )

![Error](Error.png) Section group not working

- With end_section present, there is no output.

> ```Python
> >>> dvh_dose = Section(
>         name='DVH Dose',
>         start_search=('Structure:', 'START', 'Before'),
>         end_section=('Structure:', 'START', 'Before'),
>         processor=[(dose_info_section, 
>                     dose_header_section, 
>                     dose_curve_section)]
>         )
> >>> demo_dvh_text = demo_dvh_1.read_text(encoding='utf_8_sig').splitlines()
> >>> dvh_dose.read(demo_dvh_text)
>
> []
> ```

- Without end_section present, the section group returns correctly.
> ```Python
> >>> dvh_dose = Section(
>         name='DVH Dose',
>         start_search=('Structure:', 'START', 'Before'),
>         #end_section=('Structure:', 'START', 'Before'),
>         processor=[(dose_info_section, 
>                     dose_header_section, 
>                     dose_curve_section)]
>         )
> >>> demo_dvh_text = demo_dvh_1.read_text(encoding='utf_8_sig').splitlines()
> >>> dvh_dose.read(demo_dvh_text)
>
> [{'Structure': {'Structure': 'BODY',
>    'Approval Status': 'Approved',
>    'Plan': 'CHWR',
>    'Course': 'C1',
>    'Volume': 20449.1,
>    'Volume unit': 'cm³',
>    'Dose Cover.': 100.0,
>    ...
>    [100.0, 4250.0, 0.0],
>    [101.0, 4292.5, 0.0],
>    [102.0, 4335.0, 0.0],
>    [103.0, 4377.5, 0.0],
>    []]}]
> ```


In [21]:
# Section group: the structure information, curve header and curve data for
# one structure, read with the three subsections in turn.
#
# The start boundary is passed as `start_section`.  The original cell passed
# the tuple as `start_search`, but every other Section in this notebook
# gives its boundary tuple to `start_section`, and `start_search` is given a
# bool in dose_curve_section.  With no start boundary defined, the recorded
# outputs are consistent with the section beginning at the first line of the
# file and ending at the first 'Structure:' line, before any structure data
# has been read.
# NOTE(review): assumes Section treats `start_search` as a flag and takes the
# boundary definition from `start_section` -- confirm against the sections
# module.  The output recorded below this cell was produced with the
# original `start_search=` keyword and has not been regenerated.
#
# end_on_first_item=False is kept because the start and end boundaries are
# the same text; presumably it stops the section from ending on the
# 'Structure:' line that starts it.
dvh_dose = Section(
    name='DVH Dose',
    start_section=('Structure:', 'START', 'Before'),
    end_section=('Structure:', 'START', 'Before'),
    end_on_first_item=False,
    processor=[(dose_info_section, 
                dose_header_section, 
                dose_curve_section)]
    )
demo_dvh_text = demo_dvh_1.read_text(encoding='utf_8_sig').splitlines()
dvh_dose.read(demo_dvh_text)

[]

In [22]:
# Experiment: same section group, but with the end boundary offset set to
# 'After' instead of 'Before'.
# NOTE(review): the boundary tuple is passed as `start_search`, whereas every
# other Section in this notebook passes its boundary tuple as
# `start_section` and dose_curve_section gives `start_search` a bool.  This
# is probably meant to be `start_section` -- confirm against the Section
# signature.  The recorded output (only the first 'Structure:' line) is
# consistent with the section starting at the first line of the file.
dvh_dose = Section(
    name='DVH Dose',
    start_search=('Structure:', 'START', 'Before'),
    end_section=('Structure:', 'START', 'After'),
    end_on_first_item=False,
    processor=[(dose_info_section, 
                dose_header_section, 
                dose_curve_section)]
    )
demo_dvh_text = demo_dvh_1.read_text(encoding='utf_8_sig').splitlines()
dvh_dose.read(demo_dvh_text)

[{'Structure': {'Structure': 'BODY'}}]

In [23]:
# Baseline: the section group with no end boundary, which returns the
# structure data as expected.
# NOTE(review): the boundary tuple is passed as `start_search`, whereas every
# other Section in this notebook passes its boundary tuple as
# `start_section` and dose_curve_section gives `start_search` a bool.  This
# is probably meant to be `start_section` -- confirm against the Section
# signature.
dvh_dose = Section(
    name='DVH Dose',
    start_search=('Structure:', 'START', 'Before'),
    #end_section=('Structure:', 'START', 'Before'),
    processor=[(dose_info_section, 
                dose_header_section, 
                dose_curve_section)]
    )
demo_dvh_text = demo_dvh_1.read_text(encoding='utf_8_sig').splitlines()
dvh_dose.read(demo_dvh_text)

[{'Structure': {'Structure': 'BODY',
   'Approval Status': 'Approved',
   'Plan': 'CHWR',
   'Course': 'C1',
   'Volume': 20449.1,
   'Volume unit': 'cm³',
   'Dose Cover.': 100.0,
   'Dose Cover. unit': '%',
   'Sampling Cover.': 100.0,
   'Sampling Cover. unit': '%',
   'Min Dose': 0.0,
   'Min Dose unit': '%',
   'Max Dose': 103.4,
   'Max Dose unit': '%',
   'Mean Dose': 6.2,
   'Mean Dose unit': '%',
   'Modal Dose': 0.0,
   'Modal Dose unit': '%',
   'Median Dose': 0.3,
   'Median Dose unit': '%',
   'STD': 20.3,
   'STD unit': '%',
   'Equiv. Sphere Diam.': 33.9,
   'Equiv. Sphere Diam. unit': 'cm',
   'Conformity Index': 'N/A',
   'Gradient Measure [cm]': 'N/A',
   'Dose Level [cGy]': '',
   'RTOG CI': '',
   'Paddick CI': '',
   'GI': '',
   'ICRU83 HI': '',
   'D95.0%': 0.1,
   'D95.0% unit': 'cGy',
   'D98.0%': 0.0,
   'D98.0% unit': '%',
   'D99.0%': 0.0,
   'D99.0% unit': 'cGy',
   'V95.0%': 400.9586,
   'V95.0% unit': 'cm³',
   'V100.0%': 93.2574,
   'V100.0% unit': 'cm³'