### Imports

In [6]:
from functools import partial
from pprint import pprint
from typing import Tuple
import re

import pandas as pd

import sections as sec
import text_reader as tr
from buffered_iterator import BufferedIterator


### Sub-Section Section
> **Section contents:**
>```
>Multi Combined Group Section
>Section Name:D3
>D3 Content1:a
>D3 Content2:b
>D3 Content3:c
>Section Name:D4
>D4 Content1:d
>...
>D6 Content3:l
>End Section
>```
- Section Starts with *Multi Combined Group Section*
- Sub-section Starts **and Ends** with *Section Name*
- Section Ends with *End Section*
- Second column starts after ***:***

In [7]:
# Example source: a start marker, four 'Section Name' sub-sections (D3-D6)
# and an explicit 'End Section' marker.
multi_group_source = [
    'Multi Combined Group Section',
    # Sub-section D3
    'Section Name:D3', 'D3 Content1:a', 'D3 Content2:b', 'D3 Content3:c',
    # Sub-section D4
    'Section Name:D4', 'D4 Content1:d', 'D4 Content2:e', 'D4 Content3:f',
    # Sub-section D5
    'Section Name:D5', 'D5 Content1:g', 'D5 Content2:h', 'D5 Content3:i',
    # Sub-section D6
    'Section Name:D6', 'D6 Content1:j', 'D6 Content2:k', 'D6 Content3:l',
    'End Section',
]

In [8]:
# Redefines multi_group_source without the closing 'End Section' line, so
# the source simply stops after the last sub-section (D6).
multi_group_source = [
    'Multi Combined Group Section',
    # Sub-section D3
    'Section Name:D3', 'D3 Content1:a', 'D3 Content2:b', 'D3 Content3:c',
    # Sub-section D4
    'Section Name:D4', 'D4 Content1:d', 'D4 Content2:e', 'D4 Content3:f',
    # Sub-section D5
    'Section Name:D5', 'D5 Content1:g', 'D5 Content2:h', 'D5 Content3:i',
    # Sub-section D6
    'Section Name:D6', 'D6 Content1:j', 'D6 Content2:k', 'D6 Content3:l',
]

### Define a subsection

In [9]:
# Parser that splits each line on ':' (label:value).
delimiter_parser = tr.define_csv_parser(
    'delimiter_parser',
    delimiter=':',
    skipinitialspace=True
    )

# Processing chain applied to every line of the sub-section: parse, then
# tr.trim_items, then tr.drop_blanks.
delimiter_section_reader = [
    delimiter_parser,
    tr.trim_items,
    tr.drop_blanks
    ]

# A sub-section both starts and ends on a 'Section Name' line; both breaks use
# the 'Before' offset.  The parsed rows are assembled with tr.to_dict.
sub_delimiter_section = sec.Section(
    name='Sub Section',
    start_section=sec.SectionBreak('Section Name',
                                   break_offset='Before'),
    end_section=sec.SectionBreak('Section Name',
                                   break_offset='Before'),
    processor=delimiter_section_reader,
    assemble=partial(tr.to_dict, default_value=None)
    )

In [10]:
# Reading with the sub-section on its own returns only the first sub-section
# (D3) as a dictionary.
sub_delimiter_section.read(multi_group_source)

{'Section Name': 'D3',
 'D3 Content1': 'a',
 'D3 Content2': 'b',
 'D3 Content3': 'c'}

### Assemble definitions

In [11]:
def print_list(parsed_lines):
    '''Pretty-print each parsed item and collect the items in a list.

    Items are printed one at a time, in the order they are drawn from
    parsed_lines, and returned unchanged in that same order.
    '''
    def show(item):
        pprint(item)
        return item

    return [show(item) for item in parsed_lines]


def make_list(parsed_lines):
    '''Collect the parsed items in a list, leaving out any falsy item.

    Empty containers, empty strings, None and zero are all dropped; the
    remaining items keep their original order.
    '''
    return [item for item in parsed_lines if item]

### Define Section with sub-sections

In [12]:
# Outer section: runs from 'Multi Combined Group Section' to 'End Section'.
# Its processor is the sub-section defined above, and the non-empty
# sub-section results are gathered into a list by make_list.
# NOTE(review): the effect of end_on_first_item is defined by sec.Section and
# is not visible in this notebook -- confirm against the library.
repeat_group_section = sec.Section(
    name='Repeated Sub-Section Groups',
    start_section='Multi Combined Group Section',
    end_section='End Section',
    processor=[sub_delimiter_section],
    end_on_first_item=True,
    assemble=make_list
    )

In [13]:
# Reading the full section returns a list with one dictionary per
# sub-section (D3 to D6).
repeat_group_section.read(multi_group_source)

[{'Section Name': 'D3',
  'D3 Content1': 'a',
  'D3 Content2': 'b',
  'D3 Content3': 'c'},
 {'Section Name': 'D4',
  'D4 Content1': 'd',
  'D4 Content2': 'e',
  'D4 Content3': 'f'},
 {'Section Name': 'D5',
  'D5 Content1': 'g',
  'D5 Content2': 'h',
  'D5 Content3': 'i'},
 {'Section Name': 'D6',
  'D6 Content1': 'j',
  'D6 Content2': 'k',
  'D6 Content3': 'l'}]

### Fixed Width Section
> **Section contents:**
>```
>012345678901234567890123456789
>
>Single Fixed Width Section
>Section Name    F1
>F1 Content1     d
>F1 Content2     e
>F1 Content3     f
>End Section
>
>012345678901234567890123456789
>```
- Starts with *Single Fixed Width Section*
- Second column starts at ***16***
- Ends with *End Section*

In [14]:
# Example source for a single fixed-width section; the value column begins
# at character 16 of each content line.
fixed_width_source = [
    'Single Fixed Width Section',   # start marker
    'Section Name    F1',
    'F1 Content1     d',
    'F1 Content2     e',
    'F1 Content3     f',
    'End Section',                  # end marker
]

In [15]:
# Processing chain for fixed-width lines: split with a fixed-width parser
# defined with widths=16, then trim the items, drop blanks and convert numbers.
fixed_width_reader = [
    tr.define_fixed_width_parser(widths=16),
    tr.trim_items,
    tr.drop_blanks,
    tr.convert_numbers
    ]

# Section that starts after the 'Single Fixed Width Section' line and ends at
# 'End Section'; the parsed rows are assembled with tr.to_dict.
fixed_width_section = sec.Section(
    name='Fixed Width Section',
    start_section=sec.SectionBreak('Single Fixed Width Section', 
                                   break_offset='After'),
    end_section='End Section',
    processor=fixed_width_reader,
    assemble=partial(tr.to_dict, default_value=None)
    )

In [16]:
# The result is a dictionary of the F1 section's label/value pairs; the
# start marker line is not part of it.
fixed_width_section.read(fixed_width_source)

{'Section Name': 'F1',
 'F1 Content1': 'd',
 'F1 Content2': 'e',
 'F1 Content3': 'f'}

### Delimiter Section
> **Section contents:**
>```
>Single Delimiter Section
>Section Name:D2
>D2 Content1:m
>D2 Content2:n
>D2 Content3:o
>End Section
>```
- Starts with *Single Delimiter Section*
- Second column starts after ***:***
- Ends with *End Section*

In [17]:
# Example source for a single ':'-delimited section.
delimiter_source = [
    'Single Delimiter Section',     # start marker
    'Section Name:D2',
    'D2 Content1:m',
    'D2 Content2:n',
    'D2 Content3:o',
    'End Section',                  # end marker
]

In [18]:
# NOTE(review): delimiter_parser and delimiter_section_reader repeat the
# definitions from the earlier sub-section cell with the same arguments.
# Parser that splits each line on ':' (label:value).
delimiter_parser = tr.define_csv_parser(
    'delimiter_parser',
    delimiter=':',
    skipinitialspace=True
    )

# Processing chain: parse, then tr.trim_items, then tr.drop_blanks.
delimiter_section_reader = [
    delimiter_parser,
    tr.trim_items,
    tr.drop_blanks
    ]

# Section that starts after the 'Single Delimiter Section' line and ends at
# 'End Section'.  Unlike the other sections in this notebook, no name is given.
delimiter_section = sec.Section(
    start_section=sec.SectionBreak('Single Delimiter Section', 
                                   break_offset='After'),
    end_section='End Section',
    processor=delimiter_section_reader,
    assemble=partial(tr.to_dict, default_value=None)
    )

In [19]:
# The result is a dictionary of the D2 section's label/value pairs.
delimiter_section.read(delimiter_source)

{'Section Name': 'D2',
 'D2 Content1': 'm',
 'D2 Content2': 'n',
 'D2 Content3': 'o'}

# Done To Here

# TODO Try to reproduce the duplicate row bug found in *read_printout_sections*

### Split a text string into parts.
- The `delimiter=';'` argument tells it to split the string on ";"s.
- The `skipinitialspace=True` argument tells it to strip leading spaces from the
 text.

 For example:
 |Text|Becomes|
 |----|-------|
 |`'       Course;C1'`|`['Course', 'C1']`|
 |`'Intent;1_PRIMARY'`|`['Intent', '1_PRIMARY']`|
 |`'Plan Id;PELB FB'`|`['Plan Id', 'PELB FB']`|
 |`'Technique;'`|`['Technique']`|

In [20]:
# Parser that splits a line on ';'; skipinitialspace=True strips the leading
# spaces, as described in the text above.
dict_parse = tr.define_csv_parser(
    delimiter=';',
    skipinitialspace=True)

In [21]:
# Compiled once when the cell runs rather than on every call.
_NUMBER_WITH_UNIT_PATTERN = re.compile(
    # beginning of string and leading whitespace
    r'^\s*'
    # value group: optional sign followed by either digits with an optional
    # decimal part ("12", "12.", "12.5") or a leading decimal point (".5").
    r'(?P<value>[-+]?(?:\d+[.]?\d*|[.]\d+))'
    r'\s*'              # Optional whitespace between value and units
    r'(?P<unit>[^\s]*)' # units do not contain spaces
    r'\s*'              # drop trailing whitespace
    r'$'                # end of string
    )


def drop_units(text: str) -> str:
    '''Strip a trailing unit from a number string, e.g. '4500.0 cGy'.

    Args:
        text: The text to examine.

    Returns:
        The number part of the text, still as a string (it is not converted
        to float), when the text is a number optionally followed by a single
        unit with no internal spaces.  Any other text is returned unchanged.

    Note:
        Any run of non-space characters directly after the number counts as
        the unit, so '1_PRIMARY' yields '1'.
    '''
    find_num = _NUMBER_WITH_UNIT_PATTERN.search(text)
    if find_num:
        return find_num.group('value')
    return text


def numeric_values(text_row: Tuple[str]) -> Tuple[str, float]:
    '''Strip the unit from the value of a two-item (label, value) row.

    A row that does not unpack into exactly two items is returned unchanged.
    Otherwise a (label, value) tuple is returned, where the value is whatever
    drop_units returns for the second item.
    '''
    try:
        label, text_value = text_row
    except ValueError:
        # Not a (label, value) pair; pass the row through untouched.
        return text_row
    else:
        return (label, drop_units(text_value))

In [22]:
# Quick check: the unit is stripped and the value comes back as a string.
numeric_values(['Dose', '4500.0 cGy'])

('Dose', '4500.0')

In [23]:
# Sample 'FIELD POINTS' table: a title line, a ';'-separated header row and
# three data rows.
test_text = [
    'FIELD POINTS',                                 # section title
    'Field;Point;Dose;SSD;Depth;Effective Depth',   # column headers
    'Plan;PELB;4500.0 cGy;;;',
    '  CW  ;PELB;89.0 cGy;-;-;-',
    '  CCW  ;PELB;91.0 cGy;-;-;-',
]


In [24]:
# Section that starts after the 'FIELD POINTS' line and would end before a
# 'PlanCheck' line; test_text contains no 'PlanCheck' line.  Each line is
# passed through tr.clean_ascii_text, dict_parse, tr.trim_items and
# numeric_values, and the rows are assembled with tr.to_dataframe.
# NOTE(review): break_offset is lowercase here ('after'/'before') but
# capitalised ('After'/'Before') in the other cells -- confirm that
# sec.SectionBreak treats them the same.
full_section = sec.Section(
    start_section=sec.SectionBreak('FIELD POINTS', break_offset='after'),
    end_section=sec.SectionBreak('PlanCheck', break_offset='before'),
    processor=[tr.clean_ascii_text, dict_parse, tr.trim_items,
               numeric_values],
    assemble=tr.to_dataframe,
    name='Point Dose')

In [25]:
# The Dose column keeps its units here: numeric_values only converts rows
# that unpack into exactly two items, and these rows have six.
output = full_section.read(test_text)
output

Unnamed: 0,Field,Point,Dose,SSD,Depth,Effective Depth
0,Plan,PELB,4500.0 cGy,,,
1,CW,PELB,89.0 cGy,-,-,-
2,CCW,PELB,91.0 cGy,-,-,-


In [26]:
output.to_dict()

{'Field': {0: 'Plan', 1: 'CW', 2: 'CCW'},
 'Point': {0: 'PELB', 1: 'PELB', 2: 'PELB'},
 'Dose': {0: '4500.0 cGy', 1: '89.0 cGy', 2: '91.0 cGy'},
 'SSD': {0: '', 1: '-', 2: '-'},
 'Depth': {0: '', 1: '-', 2: '-'},
 'Effective Depth': {0: '', 1: '-', 2: '-'}}

In [27]:
# Same source lines as the earlier test_text cell, repeated here next to the
# hand-built frame below for easy comparison.
test_text = [
    'FIELD POINTS',
    'Field;Point;Dose;SSD;Depth;Effective Depth',
    'Plan;PELB;4500.0 cGy;;;',
    '  CW  ;PELB;89.0 cGy;-;-;-',
    '  CCW  ;PELB;91.0 cGy;-;-;-'
    ]

# Hand-built frame for the same table with the units removed from the Dose
# column.  pandas is imported with the other imports at the top of the
# notebook instead of in the middle of this cell.
pd.DataFrame({
    'Field': ['Plan', 'CW', 'CCW'],
    'Point': ['PELB', 'PELB', 'PELB'],
    'Dose': ['4500.0', '89.0', '91.0'],
    'SSD': ['',  '-', '-'],
    'Depth': ['',  '-', '-'],
    'Effective Depth': ['', '-', '-']
    })

Unnamed: 0,Field,Point,Dose,SSD,Depth,Effective Depth
0,Plan,PELB,4500.0,,,
1,CW,PELB,89.0,-,-,-
2,CCW,PELB,91.0,-,-,-


In [28]:
# Section with no start or end break and no assembler; the only processing
# step is the ';' parser.
point_dose_section = sec.Section(
    start_section=None,
    end_section=None,
    processor=[dict_parse],
    assemble=None)

In [29]:
# Every source line is returned as a parsed list, including the
# 'FIELD POINTS' title line.
output = point_dose_section.read(test_text)
output

[['FIELD POINTS'],
 ['Field', 'Point', 'Dose', 'SSD', 'Depth', 'Effective Depth'],
 ['Plan', 'PELB', '4500.0 cGy', '', '', ''],
 ['CW  ', 'PELB', '89.0 cGy', '-', '-', '-'],
 ['CCW  ', 'PELB', '91.0 cGy', '-', '-', '-']]

In [30]:
# Simplest possible section: no breaks, no processor and no assembler.
simple_section = sec.Section(
    start_section=None,
    end_section=None,
    processor=None,
    assemble=None)

In [32]:
# The source lines come back unchanged, each appearing once.
output = simple_section.read(test_text)
output

['FIELD POINTS',
 'Field;Point;Dose;SSD;Depth;Effective Depth',
 'Plan;PELB;4500.0 cGy;;;',
 '  CW  ;PELB;89.0 cGy;-;-;-',
 '  CCW  ;PELB;91.0 cGy;-;-;-']

### Debugging the duplicate row bug found in *read_printout_sections*

```
source: <buffered_iterator.BufferedIterator object at 0x000001799
•	special variables
•	function variables buffer_size: 5
•	future_items: deque(['FIELD POINTS'])
•	previous_items: deque([])
•	source_gen: <list_iterator object at 0x0000017991337640> step_back: 0
_step_back: 0 start search: True
```


#### Input Text
        [
            'FIELD POINTS',
            'Field;Point;Dose;SSD;Depth;Effective Depth',
            'Plan;PELB;4500.0 cGy;;;',
            '  CW  ;PELB;89.0 cGy;-;-;-',
            '  CCW  ;PELB;91.0 cGy;-;-;-'
        ]

#### Debug Log
1. Start section trigger on first line using *AlwaysBreak*
```
Text Processing      - DEBUG: Resetting source for: Section.
Text Processing      - DEBUG: Advancing to start of Section.
Buffered Iterator    - DEBUG: Getting item: FIELD POINTS         from source
Text Processing      - DEBUG: In:       Section Got item:       FIELD POINTS
Text Processing      - DEBUG: Break Status:     Scan In Progress
Text Processing      - DEBUG: Checking Trigger: AlwaysBreak
Text Processing      - DEBUG: in section_break.check
Text Processing      - DEBUG: Break triggered by True
Text Processing      - DEBUG: Stepping back 1 lines
Buffered Iterator    - DEBUG: Have 1 Previous Items
Buffered Iterator    - DEBUG: Need 1 Steps back
Text Processing      - DEBUG: Section Break Detected
Text Processing      - DEBUG: Skipped 0 lines.
```

2. Section processing of first line
```
Text Processing      - DEBUG: Starting New Section: Section.
Text Processing      - DEBUG: Entered sub-section processor for: Section
Text Processing      - DEBUG: No sub-sections in: Section
Buffered Iterator    - DEBUG: Getting item: FIELD POINTS         from future_items
Text Processing      - DEBUG: In:       Section Got item:       FIELD POINTS
Text Processing      - DEBUG: Break Status:     Scan In Progress
Text Processing      - DEBUG: This is item number: 1 of Section
Text Processing      - DEBUG: This is the first item in Section
```

3. Section processing of second line
> The line is not read twice here. The duplicated first line may have been retained from the initial setup.
```
Buffered Iterator    - DEBUG: Getting item: Field;Point;Dose;SSD;Depth;Effective Depth   from source
Text Processing      - DEBUG: In:       Section Got item:       Field;Point;Dose;SSD;Depth;Effective Depth
Text Processing      - DEBUG: Break Status:     Scan In Progress
Text Processing      - DEBUG: This is item number: 2 of Section
```

4. Section processing of third line
```
Text Processing      - DEBUG: Checking Trigger: NeverBreak
Text Processing      - DEBUG: in section_break.check
Buffered Iterator    - DEBUG: Getting item: Plan;PELB;4500.0 cGy;;;      from source
Text Processing      - DEBUG: In:       Section Got item:       Plan;PELB;4500.0 cGy;;;
Text Processing      - DEBUG: Break Status:     Scan In Progress
Text Processing      - DEBUG: This is item number: 3 of Section
```

5. Section processing of fourth line
```
Text Processing      - DEBUG: Checking Trigger: NeverBreak
Text Processing      - DEBUG: in section_break.check
Buffered Iterator    - DEBUG: Getting item:   CW  ;PELB;89.0 cGy;-;-;-   from source
Text Processing      - DEBUG: In:       Section Got item:         CW  ;PELB;89.0 cGy;-;-;-
Text Processing      - DEBUG: Break Status:     Scan In Progress
Text Processing      - DEBUG: This is item number: 4 of Section
```

6. Section processing of fifth line
```
Text Processing      - DEBUG: Checking Trigger: NeverBreak
Text Processing      - DEBUG: in section_break.check
Buffered Iterator    - DEBUG: Getting item:   CCW  ;PELB;91.0 cGy;-;-;-  from source
Text Processing      - DEBUG: In:       Section Got item:         CCW  ;PELB;91.0 cGy;-;-;-
Text Processing      - DEBUG: Break Status:     Scan In Progress
Text Processing      - DEBUG: This is item number: 5 of Section
```

7. End of section
```
Text Processing      - DEBUG: Checking Trigger: NeverBreak
Text Processing      - DEBUG: in section_break.check
Text Processing      - DEBUG: Break Status:     End of Source
```

#### Output
```
[
    'FIELD POINTS',
    'FIELD POINTS', 
    'Field;Point;Dose;SSD;Depth;Effective Depth', 
    'Plan;PELB;4500.0 cGy;;;',
    '  CW  ;PELB;89.0 cGy;-;-;-',
    '  CCW  ;PELB;91.0 cGy;-;-;-'
    ]
```

#### Expected Output
```
[
    'FIELD POINTS',
    'Field;Point;Dose;SSD;Depth;Effective Depth',
    'Plan;PELB;4500.0 cGy;;;',
    '  CW  ;PELB;89.0 cGy;-;-;-',
    '  CCW  ;PELB;91.0 cGy;-;-;-'
    ]
```

In [None]:
# Repeats the point_dose_section read from the earlier cell so the result can
# be compared with the expected output listed above.
output = point_dose_section.read(test_text)
output

### Source Definition
A list of strings

In [None]:
# Combined test source: two fixed-width sections (F1, F2), one delimiter
# section (D2) and one multi-group section (D3-D6), separated by filler lines.
test_source = [
    # Fixed-width section F1
    'Single Fixed Width Section',
    'Section Name    F1',
    'F1 Content1     d',
    'F1 Content2     e',
    'F1 Content3     f',
    'End Section',
    # Filler
    '', 'Text to be ignored', '', 'More Text to be ignored', '',
    # Delimiter section D2
    'Single Delimiter Section',
    'Section Name:D2', 'D2 Content1:m', 'D2 Content2:n', 'D2 Content3:o',
    'End Section',
    # Filler
    '', 'Even More Text to be ignored', '',
    # Fixed-width section F2
    'Single Fixed Width Section',
    'Section Name    F2',
    'F2 Content1     p',
    'F2 Content2     q',
    'F2 Content3     r',
    'End Section',
    # Filler
    '', 'Final Text to be ignored', '',
    # Multi-group section; each sub-section is introduced by 'Single Section'
    'Multi Combined Group Section',
    'Single Section',
    'Section Name:D3', 'D3 Content1:a', 'D3 Content2:b', 'D3 Content3:c',
    'Single Section',
    'Section Name:D4', 'D4 Content1:d', 'D4 Content2:e', 'D4 Content3:f',
    'Single Section',
    'Section Name:D5', 'D5 Content1:g', 'D5 Content2:h', 'D5 Content3:i',
    'Single Section',
    'Section Name:D6', 'D6 Content1:j', 'D6 Content2:k', 'D6 Content3:l',
    'End Section',
]

### Expected Results

In [None]:
# Expected results, keyed by the name each test looks up.
# NOTE(review): 'Section D1' describes a section named D1 with values a/b/c,
# while the delimiter section in the test_source cell is named D2 with values
# m/n/o -- confirm which is intended.
test_result = {
    'Section D1': {
        'Section Name': 'D1',
        'D1 Content1': 'a', 'D1 Content2': 'b', 'D1 Content3': 'c',
    },
    'Section F1': {
        'Section Name': 'F1',
        'F1 Content1': 'd', 'F1 Content2': 'e', 'F1 Content3': 'f',
    },
    'Test Multi Group Section': [
        {'Section Name': 'D3',
         'D3 Content1': 'a', 'D3 Content2': 'b', 'D3 Content3': 'c'},
        {'Section Name': 'D4',
         'D4 Content1': 'd', 'D4 Content2': 'e', 'D4 Content3': 'f'},
        {'Section Name': 'D5',
         'D5 Content1': 'g', 'D5 Content2': 'h', 'D5 Content3': 'i'},
        {'Section Name': 'D6',
         'D6 Content1': 'j', 'D6 Content2': 'k', 'D6 Content3': 'l'},
    ],
}

In [None]:
# Group section running from 'Multi Combined Group Section' to 'End Section',
# with sub_delimiter_section as its processor and make_list as its assembler.
# Unlike repeat_group_section, end_on_first_item is not set here.
multi_group_section = sec.Section(
    name='Group Section',
    start_section='Multi Combined Group Section',
    end_section='End Section',
    processor=[sub_delimiter_section],
    assemble=make_list
    )


### Reader definitions

### SectionBreak definitions


In [None]:

# Named section breaks.  The sections module is imported as `sec` in this
# notebook, so that alias is used here; the bare name `sections` is not
# defined and raised NameError on a fresh kernel.

# End of any single section.
section_end = sec.SectionBreak(
    name='Single Section',
    sentinel='End Section'
    )
# Start of a combined group section (break offset 'After').
group_section_start = sec.SectionBreak(
    name='Combined Group Section',
    sentinel='Combined Group Section',
    break_offset='After'
    )
# Start of a multi combined group section (break offset 'After').
multi_group_section_start = sec.SectionBreak(
    name='Multi Combined Group Section',
    sentinel='Multi Combined Group Section',
    break_offset='After'
    )
# End of a combined group section (break offset 'Before').
group_section_end = sec.SectionBreak(
    name='End Group Section',
    sentinel='Done Combined Group Section',
    break_offset='Before'
    )


### Section definitions


In [None]:

def test_delimiter_sub_section_read(self):
    '''Read the delimiter section and compare it with 'Section D1'.

    The source is wrapped in a BufferedIterator and read with
    start_search=True.
    '''
    section = self.delimiter_section
    buffered_source = BufferedIterator(self.test_source)
    actual = section.read(buffered_source, start_search=True,
                          context=self.context)
    expected = self.test_result['Section D1']
    self.assertDictEqual(actual, expected)


In [None]:

def test_fixed_width_sub_section_read(self):
    '''Read the fixed-width section and compare it with 'Section F1'.

    The source is wrapped in a BufferedIterator and read with
    start_search=True.
    '''
    section = self.fixed_width_section
    buffered_source = BufferedIterator(self.test_source)
    actual = section.read(buffered_source, start_search=True,
                          context=self.context)
    expected = self.test_result['Section F1']
    self.assertDictEqual(actual, expected)


In [None]:

def test_group_section_read(self):
    '''Compare each item of the first group read with 'Test Group Section'.

    Only the first element of the read result is compared; every pairing
    is reported as its own sub-test.
    '''
    # NOTE(review): the test_result cell in this notebook has no
    # 'Test Group Section' key -- confirm where self.test_result comes from.
    section = self.group_section
    buffered_source = BufferedIterator(self.test_source)
    actual = section.read(buffered_source, start_search=True,
                          context=self.context)
    expected = self.test_result['Test Group Section']
    pairs = zip(actual[0], expected)
    for index, (actual_item, expected_item) in enumerate(pairs):
        with self.subTest(section=index):
            self.assertDictEqual(actual_item, expected_item)


In [None]:

def test_multi_group_section_read(self):
    '''Compare the multi-group read with 'Test Multi Group Section'.

    Each group of the read result is paired with the expected group, and
    the items inside each pair are compared in sub-tests labelled
    '<group index>.<item index>'.
    '''
    section = self.multi_group_section
    buffered_source = BufferedIterator(self.test_source)
    actual = section.read(buffered_source, start_search=True,
                          context=self.context)
    expected = self.test_result['Test Multi Group Section']
    groups = zip(actual, expected)
    for section_count, (actual_group, expected_group) in enumerate(groups):
        items = zip(actual_group, expected_group)
        for count, (s_output, e_output) in enumerate(items):
            label = f'{section_count}.{count}'
            with self.subTest(subsection=label):
                self.assertDictEqual(s_output, e_output)



