# Subsections Issue

## Setup

### Imports

In [1]:
from typing import List
from pathlib import Path
from pprint import pprint
import re
import sys

import pandas as pd
import xlwings as xw

from buffered_iterator import BufferedIterator
import text_reader as tp
from sections import Rule, RuleSet, SectionBreak, ProcessingMethods, Section

### Function to compare context for two sections.

In [2]:
def compare_context(section1, section2):
    """Print a side-by-side comparison of two sections' context mappings.

    One row is printed for every key found in either context, formatted as
    ``key: value1 value2`` (tab separated, each field padded to at least 16
    characters).  A key that is missing from one context is shown as an
    empty string in that column.  Rows are ordered by the string form of
    the key so the listing is identical from run to run.

    Args:
        section1: Object exposing a dict-like ``context`` attribute.
        section2: Object exposing a dict-like ``context`` attribute.
    """
    ctx_template = '{key:16s}:\t{item1:16s}\t{item2:16s}'
    context_1 = section1.context
    context_2 = section2.context
    # Union of both key sets so one-sided entries are still reported.
    all_keys = set(context_1.keys()) | set(context_2.keys())
    # A set has no stable iteration order; sort on str(key) so the output is
    # deterministic and keys of different types can still be compared.
    for key in sorted(all_keys, key=str):
        item1 = context_1.get(key, '')
        item2 = context_2.get(key, '')
        # Values are converted with str() because the ':16s' format spec
        # only accepts strings.
        print(ctx_template.format(key=str(key), item1=str(item1),
                                  item2=str(item2)))

In [3]:
# Sample input with two two-line sections (A and B).  Each marker line
# carries a 'Name: <letter>' label, and unrelated text lines appear before,
# between and after the sections.
GENERIC_TEST_TEXT = [
    'Text to be ignored',
    'StartSection Name: A',
    'EndSection Name: A',
    'Text between sections',
    'StartSection Name: B',
    'EndSection Name: B',
    'More text to be ignored'
    ]

In [4]:
# Sample input with three back-to-back two-line sections (A, B, C) and no
# text between them.  A stray 'ignored' line sits inside section C, between
# its start and end markers.
GENERIC_TEST_TEXT1 = [
    'Text to be ignored',
    'StartSection A',
    'EndSection A',
    'StartSection B',
    'EndSection B',
    'StartSection C',
    'More text to be ignored',   # 'ignored' triggers end of top section
    'EndSection C',
    'Even more text to be ignored', 
    ]

In [5]:
# Variant of GENERIC_TEST_TEXT1 in which section B has no end marker, so
# 'StartSection B' is immediately followed by 'StartSection C'.
GENERIC_TEST_TEXT1a = [
    'Text to be ignored',
    'StartSection A',
    'EndSection A',
    'StartSection B',  # Missing 'EndSection B',
    
    'StartSection C',
    'More text to be ignored',   # 'ignored' triggers end of top section
    'EndSection C',
    'Even more text to be ignored', 
    ]

In [6]:
# Sample input with three three-line sections (Start / Middle / End) named
# A, B and C.  Unwanted text separates A from B; B and C are adjacent.
GENERIC_TEST_TEXT2 = [
    'Text to be ignored',
    'StartSection A',
    'MiddleSection A',
    'EndSection A',
    'Unwanted text between sections',
    'StartSection B',
    'MiddleSection B',
    'EndSection B',
    'StartSection C',
    'MiddleSection C',
    'EndSection C',
    'Even more text to be ignored', 
    ]

## Two line sections

**********

# DONE TO HERE

### Run with two-line sections:
> `[`<br>
> `'Text to be ignored',`<br>
> `'StartSection Name: A',`<br>
> `'EndSection Name: A',`<br>
> `'Text between sections',`<br>
> `'StartSection Name: B',`<br>
> `'EndSection Name: B',`<br>
> `'More text to be ignored'`<br>
> `]`<br>

- Section start **Before** *StartSection*
- Section end **After** *EndSection*
- SubSection start **After** *StartSection*

```python
sub_section = Section(section_name='SubSection',
    start_section=SectionBreak('StartSection', break_offset='After')
    )

full_section = Section(section_name='Full',
    start_section=SectionBreak('StartSection', break_offset='Before'),
    end_section=SectionBreak('EndSection', break_offset='After'),
    subsections=[sub_section] 
    )
```

In [7]:
# SubSection: start break on 'StartSection' with offset 'After'; no end
# break is given.
sub_section = Section(
    section_name='SubSection',
    start_section=SectionBreak('StartSection', break_offset='After')
    )
# Full: start break 'Before' 'StartSection', end break 'After' 'EndSection',
# with sub_section as its (single, unwrapped) subsection.
full_section = Section(
    section_name='Full',
    start_section=SectionBreak('StartSection', break_offset='Before'),
    end_section=SectionBreak('EndSection', break_offset='After'),
    subsections=sub_section
    )
# Multi: defines no breaks of its own and wraps full_section.
# NOTE(review): presumably this makes read() collect every 'Full' section in
# the input rather than only the first -- confirm against the sections
# library.
multi_section = Section(section_name='Multi',
    subsections=full_section
    )

In [8]:
# Run the Multi > Full > SubSection definition over the two-line sample text
# and pretty-print the nested result.
pprint(multi_section.read(GENERIC_TEST_TEXT))

[[['EndSection Name: A']], [['EndSection Name: B']]]


- ![Bad](../examples/error.png) Results in blank sub-list.
<table><thead><th>Expected</th><th>Actual</th></thead>
<tr>
  <td><code>
    [<br>
      [['EndSection Name: A']],<br>
      [['EndSection Name: B']]<br>
    ]</code></td>
  <td><code>[]</code></td></tr>
</table>

In [9]:
# SubSection: start break 'Before' 'EndSection'; the alternative ending
# options are left commented out for experimentation.
sub_section = Section(
    section_name='SubSection',
    start_section=SectionBreak('EndSection', break_offset='Before')
    #end_on_first_item=True,
    #keep_partial=True,
    #end_section=SectionBreak(True)
    )

# Full: start break 'Before' 'StartSection', end break 'After' 'EndSection'.
full_section = Section(
    section_name='Full',
    start_section=SectionBreak('StartSection', break_offset='Before'),
    end_section=SectionBreak('EndSection', break_offset='After'),
    subsections=sub_section 
    )

# Multi: no breaks of its own; wraps full_section.
multi_section = Section(section_name='Multi',
    subsections=full_section,
    #end_on_first_item=True
    )

# This cell reads the three-line sample text (GENERIC_TEST_TEXT2).
pprint(multi_section.read(GENERIC_TEST_TEXT2))

[[['EndSection A']], [['EndSection B']], [['EndSection C']]]


In [10]:
# Read the two-line sample text with full_section directly (no Multi
# wrapper).  NOTE(review): which full_section is bound here depends on the
# order the notebook cells were executed.
pprint(full_section.read(GENERIC_TEST_TEXT))

[['EndSection Name: A']]


- ![Bad](../examples/error.png) Results in blank sub-list.
<table><thead><th>Expected</th><th>Actual</th></thead>
<tr><td><code></code></td>
<td><code>[[]]</code></td></tr>
</table>

In [15]:
# SubSection: start break 'Before' 'EndSection', end break SectionBreak(True)
# (described in this notebook as "Always Break").
sub_section = Section(
    section_name='SubSection',
    start_section=SectionBreak('EndSection', break_offset='Before'),  # Added to use alone
    #end_on_first_item=True,
    #keep_partial=True,
    end_section=SectionBreak(True)
    )

# Full: start break 'After' 'StartSection'; its end break is deliberately
# commented out in this experiment.
full_section = Section(
    section_name='Full',
    start_section=SectionBreak('StartSection', break_offset='After'),
    #end_section=SectionBreak('EndSection', break_offset='After'),
    subsections=[sub_section]  
    )
pprint(full_section.read(GENERIC_TEST_TEXT))

[['EndSection Name: A'], ['EndSection Name: B']]


- ![Good](../examples/Valid.png) Results in one line section
- ![Bad](../examples/Error.png) - ??? Is this incorrect or is this expected???
  
<table><thead><th>Expected</th><th>Actual</th></thead>
<tr><td><code></code></td>
<td><code>
[['EndSection Name: A'], ['EndSection Name: B'], []]
  </code></td></tr>
</table>

### Add *Start After StartSection* and *End After EndSection* to Section Definition, and for SubSection Definition, set *Start* to *Before EndSection* and *End* to *`True` (Always Break)*
> **Section Definition**<br> 
> `start_section=SectionBreak('StartSection', break_offset='After'),`
> `end_section=SectionBreak('EndSection', break_offset='After'),`
    
> 
> **SubSection Definition**<br>
> `start_section=SectionBreak('EndSection', break_offset='Before'),`
> `end_section=SectionBreak(True),`
    

In [16]:
# SubSection: start break 'Before' 'EndSection', end break SectionBreak(True)
# (described in this notebook as "Always Break").
sub_section = Section(
    section_name='SubSection',
    start_section=SectionBreak('EndSection', break_offset='Before'),  # Added to use alone
    #end_on_first_item=True,
    #keep_partial=True,
    end_section=SectionBreak(True)
    )

# Full: start break 'After' 'StartSection' and end break 'After'
# 'EndSection' (the end break is enabled in this experiment).
full_section = Section(
    section_name='Full',
    start_section=SectionBreak('StartSection', break_offset='After'),
    end_section=SectionBreak('EndSection', break_offset='After'),
    subsections=[sub_section] 
    )
pprint(full_section.read(GENERIC_TEST_TEXT))

[['EndSection Name: A'], ['EndSection Name: B']]


- ![Good](../examples/Valid.png) Results in one line section
- ![Bad](../examples/Error.png) - ??? Is this incorrect or is this expected???
  
<table><thead><th>Expected</th><th>Actual</th></thead>
<tr><td><code></code></td>
<td><code>
[['EndSection Name: A']]
  </code></td></tr>
</table>

### Add *Start __Before__ StartSection* and *End After EndSection* to Section Definition, and for SubSection Definition, set *Start* to *Before EndSection* and *End* to *`True` (Always Break)*
> **Section Definition**<br> 
> `start_section=SectionBreak('StartSection', break_offset='Before'),`
> `end_section=SectionBreak('EndSection', break_offset='After'),`
    
> 
> **SubSection Definition**<br>
> `start_section=SectionBreak('EndSection', break_offset='Before'),`
> `end_section=SectionBreak(True),`
    

In [17]:
# SubSection: start break 'Before' 'EndSection', end break SectionBreak(True)
# (described in this notebook as "Always Break").
sub_section = Section(
    section_name='SubSection',
    start_section=SectionBreak('EndSection', break_offset='Before'),  # Added to use alone
    #end_on_first_item=True,
    #keep_partial=True,
    end_section=SectionBreak(True)
    )

# Full: start break 'Before' 'StartSection' (the variation under test) and
# end break 'After' 'EndSection'.
full_section = Section(
    section_name='Full',
    start_section=SectionBreak('StartSection', break_offset='Before'),
    end_section=SectionBreak('EndSection', break_offset='After'),
    subsections=[sub_section]  
    )
pprint(full_section.read(GENERIC_TEST_TEXT))

[['EndSection Name: A']]


- ![Bad](../examples/Error.png) Results in empty list of lists
- ??? Is this incorrect or is this expected???
  
<table><thead><th>Expected</th><th>Actual</th></thead>
<tr><td><code></code></td>
<td><code>[[]]</code></td></tr>
</table>

### Add *Start __Before__ StartSection* and *End After EndSection* to Section Definition, and don't set any SectionBreaks for SubSection Definition
> **Section Definition**<br> 
> `start_section=SectionBreak('StartSection', break_offset='Before'),`
> `end_section=SectionBreak('EndSection', break_offset='After'),`    
    

In [19]:
# SubSection: only a name is supplied; every break option is commented out,
# so the subsection relies on the Section defaults.
sub_section = Section(
    section_name='SubSection',
    #start_section=SectionBreak('EndSection', break_offset='Before'),  # Added to use alone
    #end_on_first_item=True,
    #keep_partial=True,
    #end_section=SectionBreak(True)
    )

# Full: start break 'Before' 'StartSection', end break 'After' 'EndSection'.
full_section = Section(
    section_name='Full',
    start_section=SectionBreak('StartSection', break_offset='Before'),
    end_section=SectionBreak('EndSection', break_offset='After'),
    subsections=[sub_section]  
    )
pprint(full_section.read(GENERIC_TEST_TEXT))

[['StartSection Name: A', 'EndSection Name: A']]


- ![Good](../examples/Valid.png) ![Bad](../examples/Error.png) ??? Is this incorrect or is this expected???
  
<table><thead><th>Expected</th><th>Actual</th></thead>
<tr><td><code>[['StartSection Name: A', 'EndSection Name: A']]</code></td>
<td><code>[['StartSection Name: A', 'EndSection Name: A']]</code></td></tr>
</table>

## Three line sections

In [21]:
# Same layout as GENERIC_TEST_TEXT2 (three Start / Middle / End sections A,
# B, C) with one extra unrelated line inserted inside section B, between its
# start and middle markers.
GENERIC_TEST_TEXT3 = [
    'Text to be ignored',
    'StartSection A',
    'MiddleSection A',
    'EndSection A',
    'Unwanted text between sections',
    'StartSection B',
    'Random text in the middle of a section',
    'MiddleSection B',
    'EndSection B',
    'StartSection C',
    'MiddleSection C',
    'EndSection C',
    'Even more text to be ignored', 
    ]

#### Single line subsection definitions 

In [22]:
# Three subsection definitions, one per marker line.  Each starts 'Before'
# its own marker text and ends with SectionBreak(True, break_offset='Before');
# the notebook heading calls these "single line subsection definitions".

# Subsection keyed on the 'StartSection' marker.
start_sub_section = Section(
    section_name='StartSubSection',
    start_section=SectionBreak('StartSection', break_offset='Before'),
    end_section=SectionBreak(True, break_offset='Before')
    )

# Subsection keyed on the 'MiddleSection' marker.
middle_sub_section = Section(
    section_name='MiddleSubSection',
    start_section=SectionBreak('MiddleSection', break_offset='Before'),
    end_section=SectionBreak(True, break_offset='Before')
    )

# Subsection keyed on the 'EndSection' marker.
end_sub_section = Section(
    section_name='EndSubSection',
    start_section=SectionBreak('EndSection', break_offset='Before'),
    end_section=SectionBreak(True, break_offset='Before')
    )

#### Defining ***three_part_section*** 
- Contains an ending break:
    > `end_section=SectionBreak('ignored', break_offset='Before')`.

- Contains 3 subsections:
    > `[start_sub_section, middle_sub_section, end_sub_section]`

In [23]:
# Top Section: no start break, an end break 'Before' the text 'ignored', and
# the start / middle / end subsections in that order.
three_part_section = Section(
    section_name='Top Section',
    end_section=SectionBreak('ignored', break_offset='Before'),
    subsections=[start_sub_section, middle_sub_section, end_sub_section]
    )
pprint(three_part_section.read(GENERIC_TEST_TEXT3))

[[['StartSection A'], ['MiddleSection A'], ['EndSection A']],
 [['StartSection B'], ['MiddleSection B'], ['EndSection B']],
 [['StartSection C'], ['MiddleSection C'], ['EndSection C']]]


![Good](../examples/Valid.png) All 3 sections and subsections are completed. 

<table>
    <thead><th>Expected</th><th>Actual</th></thead>
    <tr>
        <td><code>
          [<br>
            [
              ['StartSection A'],<br> 
              ['MiddleSection A'],<br> 
              ['EndSection A']
            ],<br>
            [
              ['StartSection B'],<br> 
              ['MiddleSection B'],<br> 
              ['EndSection B']
            ],<br>
            [
              ['StartSection C'],<br> 
              ['MiddleSection C'],<br> 
              ['EndSection C']
            ]<br>
          ]
        </code></td>
        <td><code>
          [<br>
            [
              ['StartSection A'],<br> 
              ['MiddleSection A'],<br> 
              ['EndSection A']
            ],<br>
            [
              ['StartSection B'],<br> 
              ['MiddleSection B'],<br> 
              ['EndSection B']
            ],<br>
            [
              ['StartSection C'],<br> 
              ['MiddleSection C'],<br> 
              ['EndSection C']
            ]<br>
          ]
        </code></td></tr>
</table>

- Section start **Before** *StartSection*
- Section end **After** *EndSection*
- SubSection **End On First**

```python
sub_section = Section(section_name='SubSection',
    end_on_first_item=True,
    )

full_section = Section(section_name='Full',
    start_section=SectionBreak('StartSection', break_offset='Before'),
    end_section=SectionBreak('EndSection', break_offset='After'),
    subsections=[sub_section] 
    )
```

In [24]:
# SubSection: no breaks; only end_on_first_item=True is set.
# NOTE(review): the exact effect of end_on_first_item is defined by the
# sections library and is not visible here -- confirm before relying on it.
sub_section = Section(
    section_name='SubSection',
    #start_section=SectionBreak('StartSection', break_offset='After')
    end_on_first_item=True,
    #keep_partial=True,
    #end_section=SectionBreak(True)
    )

# Full: start break 'Before' 'StartSection', end break 'After' 'EndSection'.
full_section = Section(
    section_name='Full',
    start_section=SectionBreak('StartSection', break_offset='Before'),
    end_section=SectionBreak('EndSection', break_offset='After'),
    subsections=[sub_section] 
    )

# This cell reads the three-line sample text with the stray line in B.
pprint(full_section.read(GENERIC_TEST_TEXT3))

[['StartSection A', 'MiddleSection A', 'EndSection A']]


In [25]:
# Top Section variant that lists only the start and end subsections; the
# middle subsection is intentionally left out of the list.
top_section = Section(
    section_name='Top Section',
    end_section=SectionBreak('ignored', break_offset='Before'),
    subsections=[start_sub_section, end_sub_section]
    )
pprint(top_section.read(GENERIC_TEST_TEXT3))

[[['StartSection A'], ['EndSection A']],
 [['StartSection B'], ['EndSection B']],
 [['StartSection C'], ['EndSection C']]]
