# Subsections Issue

## Setup

### Imports

In [1]:
from typing import List
from pathlib import Path
from pprint import pprint
import re
import sys

import pandas as pd
import xlwings as xw

import text_reader as tp
from sections import Rule, RuleSet, SectionBreak, ProcessingMethods, Section

### Logging

## DIR Listing Example

### Source Data

In [2]:
# Sample output of the Windows `DIR` command, used as the parser's input.
# Every path separator is written as an escaped backslash ('\\'); a lone
# backslash before a letter such as 'D' is an invalid escape sequence and
# makes Python emit a warning, even though the resulting text is the same.
raw_dir_text = '''
 Volume in drive C is Windows
 Volume Serial Number is DAE7-D5BA

 Directory of c:\\users\\...\\Test Dir Structure

2021-12-27  03:33 PM    <DIR>          .
2021-12-27  03:33 PM    <DIR>          ..
2021-12-27  04:03 PM    <DIR>          Dir1
2021-12-27  05:27 PM    <DIR>          Dir2
2016-02-25  09:59 PM                 3 TestFile1.txt
2016-02-15  06:46 PM                 7 TestFile2.rtf
2016-02-15  06:47 PM                 0 TestFile3.docx
2016-04-21  01:06 PM              3491 xcopy.txt
               4 File(s)           3501 bytes

 Directory of c:\\users\\...\\Test Dir Structure\\Dir1

2021-12-27  04:03 PM    <DIR>          .
2021-12-27  04:03 PM    <DIR>          ..
2016-02-15  06:48 PM                 0 File in Dir One.txt
2021-12-27  03:45 PM    <DIR>          SubFolder1
2021-12-27  03:45 PM    <DIR>          SubFolder2
               1 File(s)              0 bytes
'''
# One list entry per text line; the newline right after the opening quotes
# produces an empty first entry.
dir_text = raw_dir_text.splitlines()

### DIR line processing functions

In [3]:
def dir_name_split(dir_line: str) -> str:
    '''Build the 'Folder Name:' output line for a DIR directory header.

    The folder name is everything that follows the last backslash in the
    supplied line.

    Args:
        dir_line (str): The directory path line from a DIR folder listing.

    Returns (str): 'Folder Name:' and the folder name, separated by a tab.

    Raises:
        IndexError: If dir_line does not contain a backslash.
    '''
    # Split once from the right so only the final path component is taken.
    path_parts = dir_line.rsplit('\\', maxsplit=1)
    folder_name = path_parts[1]
    return '\t'.join(['Folder Name:', folder_name])


def file_count_split(dir_line: str) -> str:
    '''Build the 'Number of Files:' output line from a "File(s)" DIR line.

    The count is the text before the first space, once surrounding
    whitespace has been removed from the line.

    Args:
        dir_line (str): The "File(s)" line from a DIR folder listing.

    Returns (str): 'Number of Files:' and the extracted count, separated
        by a tab.
    '''
    # partition() yields the whole string as the head when no space exists.
    count_text, _, _ = dir_line.strip().partition(' ')
    return 'Number of Files:\t' + count_text


def get_file_name(dir_line: str) -> str:
    '''Label a DIR listing line as a file or a subdirectory.

    The name is read from character position 39 onwards.

    Args:
        dir_line (str): A main listing line from a DIR folder listing.

    Returns (str): An empty string for lines shorter than 39 characters
        (such as blank lines). Otherwise a tab delimited line holding
        'Subdirectory:' (when the line contains '<DIR>') or 'File:',
        followed by the extracted name.
    '''
    name_column = 39  # Index at which the name starts in a listing line.
    if len(dir_line) < name_column:
        # Too short to hold a name; this deals with blank lines.
        return ''
    if '<DIR>' in dir_line:
        label = '\tSubdirectory:\t'
    else:
        label = '\tFile:\t\t'
    return label + dir_line[name_column:]


### Sub-Section Definitions

In [4]:
# Sub-section for the 'Directory of ...' line; each line is converted with
# dir_name_split.
# NOTE(review): SectionBreak(True) presumably ends the sub-section after a
# single line (the captured output shows exactly one line here) -- confirm
# against the `sections` library.
dir_section = Section(
    section_name='DirectoryName',
    end_section=SectionBreak(True),
    processor=[dir_name_split]
    )

# Sub-section for the file / sub-directory listing lines; each line is
# converted with get_file_name. Presumably stops just before the line
# containing 'File(s)' (break_offset='Before').
filename_section = Section(
    section_name='FileNames',
    end_section=SectionBreak('File(s)', break_offset='Before'),
    processor=[get_file_name]
    )

# Sub-section for the 'File(s)' summary line; each line is converted with
# file_count_split.
files_section = Section(
    section_name='NumberOfFiles',
    end_section=SectionBreak(True),
    processor=[file_count_split]
    )

### Combined Section Definition

In [5]:
# Combined section: starts at a 'Directory of' line, ends with the
# 'File(s)' line (break_offset='After'), and lists the three sub-sections
# as its processors.
# NOTE: this rebinds `dir_section`. The right-hand side is evaluated before
# the assignment, so the processor list holds the 'DirectoryName'
# sub-section defined earlier. Re-running this cell on its own would
# instead put the previously built combined section into the list.
dir_section = Section(
    section_name='Full Directory',
    start_section='Directory of',
    end_section=SectionBreak('File(s)', break_offset='After'),
    processor=[dir_section, filename_section, files_section]
    )

### **FIXME** Files Count sub-section returning empty list

In [6]:
# Read the DIR listing lines with the combined section and pretty-print
# whatever it returns.
pprint(dir_section.read(dir_text))

[[['Folder Name:\tTest Dir Structure'],
  ['',
   '\tSubdirectory:\t.',
   '\tSubdirectory:\t..',
   '\tSubdirectory:\tDir1',
   '\tSubdirectory:\tDir2',
   '\tFile:\t\tTestFile1.txt',
   '\tFile:\t\tTestFile2.rtf',
   '\tFile:\t\tTestFile3.docx',
   '\tFile:\t\txcopy.txt'],
  ['Number of Files:\t4']]]


## Very Simple Source for Testing

### Initial Test Data

In [7]:
# Minimal fixture for a single section: one header line, three content
# lines and one footer line, written as three pieces to make that
# structure visible.
GENERIC_TEST_TEXT = (
    ['StartSection Name:         A']
    + ['A Content1:a', 'B Content1:b', 'C Content1:c']
    + ['EndSection Name:A']
)

### Sub-section Definitions

In [8]:
# Sub-section for the header line.
# NOTE(review): SectionBreak(True) presumably ends the sub-section after a
# single line -- confirm against the `sections` library.
name_section = Section(
    section_name='Name',
    end_section=SectionBreak(True)
    )
# Sub-section for the content lines; presumably stops just before the line
# containing 'EndSection'.
content_section = Section(
    section_name='Content',
    end_section=SectionBreak('EndSection', break_offset='Before')
    )
# Sub-section for the footer line.
end_section = Section(
    section_name='End',
    end_section=SectionBreak(True)
    )
# Outer section spanning from the 'StartSection' line through the
# 'EndSection' line, with the three sub-sections above as its processors.
full_section = Section(
    section_name='Full',
    start_section=SectionBreak('StartSection', break_offset='Before'),
    end_section=SectionBreak('EndSection', break_offset='After'),
    processor=[name_section, content_section, end_section]
    )

### **FIXME** Bug Occurring with Last Section

In [9]:
# Read the test lines with the outer section and pretty-print the result.
pprint(full_section.read(GENERIC_TEST_TEXT))

[[['StartSection Name:         A'],
  ['A Content1:a', 'B Content1:b', 'C Content1:c'],
  ['EndSection Name:A']]]


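- Originally observed: the `end_section` sub-section returns an empty list. (The output captured above shows `['EndSection Name:A']` as the last sub-section.)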

## Trying Different Scenarios

### Base reference
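- Originally observed: the `end_section` sub-section returns an empty list. (The output captured below shows `['EndSection Name:A']` as the last sub-section.)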

In [10]:
# Base reference: the outer section with all three sub-sections, rebuilt
# here so the later scenarios can be compared against it.
full_section = Section(
    section_name='Full',
    start_section=SectionBreak('StartSection', break_offset='Before'),
    end_section=SectionBreak('EndSection', break_offset='After'),
    processor=[name_section, content_section, end_section]
    )
pprint(full_section.read(GENERIC_TEST_TEXT))


[[['StartSection Name:         A'],
  ['A Content1:a', 'B Content1:b', 'C Content1:c'],
  ['EndSection Name:A']]]


### Removing SubSections.

In [11]:
# Scenario: same start and end breaks, but no `processor` argument, i.e.
# no sub-sections at all.
full_section = Section(
    section_name='Full',
    start_section=SectionBreak('StartSection', break_offset='Before'),
    end_section=SectionBreak('EndSection', break_offset='After')
    )

pprint(full_section.read(GENERIC_TEST_TEXT))

['StartSection Name:         A',
 'A Content1:a',
 'B Content1:b',
 'C Content1:c',
 'EndSection Name:A']


- Result is correct section list

### Removing Only Last (`end_section`) Section.

In [12]:
# Scenario: `end_section` is left out of the processor list, so only the
# name and content sub-sections remain.
full_section = Section(
    section_name='Full',
    start_section=SectionBreak('StartSection', break_offset='Before'),
    end_section=SectionBreak('EndSection', break_offset='After'),
    processor=[name_section, content_section]
    )
pprint(full_section.read(GENERIC_TEST_TEXT))


[[['StartSection Name:         A'],
  ['A Content1:a', 'B Content1:b', 'C Content1:c']],
 [['EndSection Name:A'], []]]


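- Originally observed: `None` where *'EndSection Name:A'* should be. (In the output captured above, *'EndSection Name:A'* appears in a second top-level item, `[['EndSection Name:A'], []]`.)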

### Removing Only First (`name_section`) Section.

In [13]:
# Scenario: `name_section` is left out of the processor list, so only the
# content and end sub-sections remain.
full_section = Section(
    section_name='Full',
    start_section=SectionBreak('StartSection', break_offset='Before'),
    end_section=SectionBreak('EndSection', break_offset='After'),
    processor=[content_section, end_section]
    )
pprint(full_section.read(GENERIC_TEST_TEXT))


[[['StartSection Name:         A',
   'A Content1:a',
   'B Content1:b',
   'C Content1:c'],
  ['EndSection Name:A']]]


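- Originally observed: an empty list where `['EndSection Name:A']` should be. (The output captured above shows `['EndSection Name:A']` as the last sub-section.)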

### Set `name_section` to stop *'Before'* *'Content'* line.

In [14]:
# Scenario: rebind `name_section` so it ends 'Before' a line containing
# 'Content' instead of using SectionBreak(True).
name_section = Section(
    section_name='Name',
    end_section=SectionBreak('Content', break_offset='Before')
    )
# Rebuild the outer section so it picks up the new `name_section`.
full_section = Section(
    section_name='Full',
    start_section=SectionBreak('StartSection', break_offset='Before'),
    end_section=SectionBreak('EndSection', break_offset='After'),
    processor=[name_section, content_section, end_section]
    )
pprint(full_section.read(GENERIC_TEST_TEXT))

[[['StartSection Name:         A'],
  ['A Content1:a', 'B Content1:b', 'C Content1:c'],
  ['EndSection Name:A']]]


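- Originally observed: an empty list where `['EndSection Name:A']` should be. (The output captured above shows `['EndSection Name:A']` as the last sub-section.)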

### Set `name_section` to stop *'After'* *'StartSection'* line.

In [15]:
# Scenario: rebind `name_section` so it ends with
# SectionBreak('StartSection', break_offset='After').
name_section = Section(
    section_name='Name',
    end_section=SectionBreak('StartSection', break_offset='After')
    )
# Rebuild the outer section so it picks up the new `name_section`.
full_section = Section(
    section_name='Full',
    start_section=SectionBreak('StartSection', break_offset='Before'),
    end_section=SectionBreak('EndSection', break_offset='After'),
    processor=[name_section, content_section, end_section]
    )
pprint(full_section.read(GENERIC_TEST_TEXT))

[[['StartSection Name:         A',
   'A Content1:a',
   'B Content1:b',
   'C Content1:c',
   'EndSection Name:A'],
  [],
  []]]


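- Originally observed: result is `[None]`. (The output captured above shows all five lines in the first sub-section, followed by two empty lists.)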

### Set `name_section` back to original: `end_section=SectionBreak(True)`<br>Set `end_section` to stop *'After'* *'EndSection'* line.

In [16]:
# Scenario: put `name_section` back to its original SectionBreak(True) ...
name_section = Section(
    section_name='Name',
    end_section=SectionBreak(True)
    )
# ... and rebind `end_section` so it ends with
# SectionBreak('EndSection', break_offset='After').
# NOTE: none of the later cells shown here rebinds `end_section`, so when
# the notebook is run top to bottom the following scenarios use this
# version.
end_section = Section(
    section_name='End',
    end_section=SectionBreak('EndSection', break_offset='After')
    )
# Rebuild the outer section so it picks up both rebound sub-sections.
full_section = Section(
    section_name='Full',
    start_section=SectionBreak('StartSection', break_offset='Before'),
    end_section=SectionBreak('EndSection', break_offset='After'),
    processor=[name_section, content_section, end_section]
    )
pprint(full_section.read(GENERIC_TEST_TEXT))

[[['StartSection Name:         A'],
  ['A Content1:a', 'B Content1:b', 'C Content1:c'],
  ['EndSection Name:A']]]


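- Originally observed: an empty list where `['EndSection Name:A']` should be. (The output captured above shows `['EndSection Name:A']` as the last sub-section.)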

## Trying Different Scenarios With New Section *B*.

In [17]:
# Fixture with two back-to-back sections, 'A' and 'B', whose content lines
# are identical; each section is written as header + content + footer.
GENERIC_TEST_TEXT = (
    # Section A
    ['StartSection Name:         A']
    + ['A Content1:a', 'B Content1:b', 'C Content1:c']
    + ['EndSection Name:A']
    # Section B
    + ['StartSection Name:         B']
    + ['A Content1:a', 'B Content1:b', 'C Content1:c']
    + ['EndSection Name:B']
)

### Set `full_section` to stop *'Before'* *'StartSection'* line.

In [18]:
# Scenario: the outer section both starts and ends 'Before' a line
# containing 'StartSection'.
# NOTE: `name_section`, `content_section` and `end_section` are whatever
# was bound last; `end_section` is not redefined in this cell.
full_section = Section(
    section_name='Full',
    start_section=SectionBreak('StartSection', break_offset='Before'),
    end_section=SectionBreak('StartSection', break_offset='Before'),
    processor=[name_section, content_section, end_section]
    )

pprint(full_section.read(GENERIC_TEST_TEXT))

[[['StartSection Name:         A'],
  ['A Content1:a', 'B Content1:b', 'C Content1:c'],
  ['EndSection Name:A']]]


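- Originally observed: result is `[None]`. (The output captured above shows section *A* split into three sub-sections; section *B* does not appear.)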

### Creating Super-Section with `full_section` as Sub-Section.

In [19]:
# Same outer-section definition as in the preceding scenario (start and end
# both 'Before' a 'StartSection' line). The super-section that wraps it is
# only shown in the non-executed snippet further down.
full_section = Section(
    section_name='Full',
    start_section=SectionBreak('StartSection', break_offset='Before'),
    end_section=SectionBreak('StartSection', break_offset='Before'),
    processor=[name_section, content_section, end_section]
    )
pprint(full_section.read(GENERIC_TEST_TEXT))


[[['StartSection Name:         A'],
  ['A Content1:a', 'B Content1:b', 'C Content1:c'],
  ['EndSection Name:A']]]


**This Hangs if Run**

```
    all_sections = Section(processor=[full_section])

    pprint(all_sections.read(GENERIC_TEST_TEXT))
```

## Adding *'End Of Section'* line to Test Data

In [20]:
# Single-section fixture followed by two extra lines: an explicit
# 'End Of Section' marker and one line of trailing text.
GENERIC_TEST_TEXT = (
    ['StartSection Name:         A']
    + ['A Content1:a', 'B Content1:b', 'C Content1:c']
    + ['EndSection Name:A']
    # Lines that come after the section itself.
    + ['End Of Section', 'More text']
)

### Set `full_section` to stop *'Before'* *'End Of Section'* line.

In [21]:

# Scenario: the outer section ends 'Before' the explicit 'End Of Section'
# marker line, so the 'EndSection' line is no longer the outer section's
# own end trigger.
full_section = Section(
    section_name='Full',
    start_section=SectionBreak('StartSection', break_offset='Before'),
    end_section=SectionBreak('End Of Section', break_offset='Before'),
    processor=[name_section, content_section, end_section]
    )

pprint(full_section.read(GENERIC_TEST_TEXT))

[[['StartSection Name:         A'],
  ['A Content1:a', 'B Content1:b', 'C Content1:c'],
  ['EndSection Name:A']]]


- `end_section` sub-section now returning correctly


### Setting `end_section` to *'StartSection'* with `break_offset` set to *'Before'*

In [22]:
# Scenario: the outer section both starts and ends 'Before' a line
# containing 'StartSection'; the current test data holds only one such
# line.
full_section = Section(
    section_name='Full',
    start_section=SectionBreak('StartSection', break_offset='Before'),
    end_section=SectionBreak('StartSection', break_offset='Before'),
    processor=[name_section, content_section, end_section]
    )

pprint(full_section.read(GENERIC_TEST_TEXT))

[[['StartSection Name:         A'],
  ['A Content1:a', 'B Content1:b', 'C Content1:c'],
  ['EndSection Name:A', 'End Of Section', 'More text']]]


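- Originally observed: result is `[None]`. (The output captured above shows three sub-sections, with *'End Of Section'* and *'More text'* included in the last one.)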