In [1]:
from pathlib import Path
from functools import partial
from typing import  List, Union

from true_iterable import true_iterable
from sectionary.text_reader import FixedWidthParser, define_fixed_width_parser
from sectionary.text_reader import trim_items, to_dataframe
from sectionary.sections import Section, SectionBreak, ProcessingMethods
# Rule, RuleSet, ProcessingMethods

In [2]:
# Folder holding the reference documents: '../Documentation' relative to the
# current working directory (the '..' segment is kept as-is, not resolved).
TEXT_DIR = Path.cwd().joinpath('..', 'Documentation')

In [3]:
# Source document: a text file located in TEXT_DIR.
text_file_name = 'Varian System Database Reference Guide V13.6.txt'
text_file_path = TEXT_DIR.joinpath(text_file_name)
# errors='ignore' drops any bytes that are not valid UTF-8 instead of raising.
raw_text = text_file_path.read_text(errors='ignore', encoding='utf8')
# The same text split into a list of lines.
raw_line = raw_text.splitlines()

In [None]:
# Literal text used to locate the table: the table caption and the first
# phrase that follows the table.
start_marker, end_marker = (
    'Table 3  VDT Name to Microsoft SQL Datatype',
    'Entity',
)

In [None]:
# Character offsets of the start marker and of the first end marker after it.
# str.index is used rather than str.find so that a missing marker raises
# ValueError here; find() would return -1 and the slice below would then
# silently select the wrong text.
# start_marker / end_marker must be plain strings at this point (a later cell
# rebinds those names to SectionBreak objects).
start_text = raw_text.index(start_marker)
end_text = raw_text.index(end_marker, start_text)
# The slice begins at the start marker, so the marker's own line is lines[0].
lines = raw_text[start_text:end_text].splitlines()

In [4]:
def drop_lines(lines: List[str], text_list: Union[List[str], str]) -> List[str]:
    """Return the lines that contain none of the given text fragments.

    Args:
        lines: The text lines to filter.  The input is not modified.
        text_list: A single string, or an iterable of strings.  A line is
            dropped when any one of these appears anywhere in it (plain,
            case-sensitive substring match).  Note that an empty string is
            a substring of every line, so it drops everything.

    Returns:
        A new list holding the remaining lines in their original order.
    """
    # A lone string is one fragment, not a sequence of characters.
    if isinstance(text_list, str):
        drop_texts = (text_list,)
    else:
        # Materialize once so a one-shot iterable survives the per-line scan.
        drop_texts = tuple(text_list)

    # Single pass: keep a line only if no fragment occurs in it.
    return [line for line in lines
            if not any(drop_text in line for drop_text in drop_texts)]

In [8]:
# Text fragments identifying the lines to remove before parsing.
brk_text = ['Abbreviations for Table and Column Names',
            'variansystem Database Overview']
# drop_lines with the fragments pre-bound, so it only needs the lines.
drop_page_brk = partial(drop_lines, text_list=brk_text)
# Fixed-width parser built with a single location, 37.
tbl_parse = define_fixed_width_parser(locations=[37])
# Processing chain holding the two steps (presumably applied in list order).
abr_proc = ProcessingMethods([drop_page_brk, tbl_parse])

In [11]:
# NOTE: this rebinds start_marker / end_marker, which an earlier cell set to
# plain strings; after this cell runs they are SectionBreak objects.
start_marker = SectionBreak('Table 3  VDT Name to Microsoft SQL Datatype',
                            break_offset='After')
end_marker = SectionBreak('Entity', break_offset='Before')
# Section bounded by the two breaks defined above.
dir_section = Section(start_section=start_marker, end_section=end_marker)
# Read that section from the document's lines; the result is kept in `a`.
a = dir_section.read(raw_text.splitlines())

In [13]:
# Pass the section content in `a` through the abr_proc processing chain and
# display the result.
# NOTE(review): the output recorded below shows each parsed column as a list
# of single characters rather than one string - confirm that is intended.
abr_proc.read(a)

[[[' ',
   ' ',
   ' ',
   ' ',
   ' ',
   ' ',
   ' ',
   ' ',
   ' ',
   ' ',
   ' ',
   ' ',
   ' ',
   'V',
   'D',
   'T',
   ' ',
   'N',
   'a',
   'm',
   'e',
   ' ',
   ' ',
   ' ',
   ' ',
   ' ',
   ' ',
   ' ',
   ' ',
   ' ',
   ' ',
   ' ',
   ' ',
   ' ',
   ' ',
   ' ',
   ' '],
  ['M',
   'i',
   'c',
   'r',
   'o',
   's',
   'o',
   'f',
   't',
   ' ',
   'S',
   'Q',
   'L',
   ' ',
   'D',
   'a',
   't',
   'a',
   't',
   'y',
   'p',
   'e']],
 [[' ',
   ' ',
   ' ',
   ' ',
   ' ',
   ' ',
   ' ',
   ' ',
   ' ',
   ' ',
   ' ',
   ' ',
   ' ',
   'V',
   'D',
   'T',
   '_',
   'A',
   'C',
   'Q',
   'A',
   'D',
   'J',
   'U',
   'S',
   'T',
   ' ',
   ' ',
   ' ',
   ' ',
   ' ',
   ' ',
   ' ',
   ' ',
   ' ',
   ' ',
   ' '],
  ['f', 'l', 'o', 'a', 't']],
 [[' ',
   ' ',
   ' ',
   ' ',
   ' ',
   ' ',
   'v',
   'a',
   'r',
   'i',
   'a',
   'n',
   's',
   'y',
   's',
   't',
   'e',
   'm',
   ' ',
   'D',
   'a',
   't',
   'a',
   'b',
   'a'

In [None]:
# Fragments marking lines to discard: the table caption and the
# 'variansystem Database Overview' lines.
text_list = ['Table 3  VDT Name to Microsoft SQL Datatype',
             'variansystem Database Overview']
clean_lines = drop_lines(lines, text_list)
# Parse the remaining lines with a fixed-width parser (single location, 37).
dt_parse = FixedWidthParser(locations=[37])
processed_lines = dt_parse.parser(clean_lines)
# Trim every parsed row, build the table, then index it by 'VDT Name'.
data_type_lookup = to_dataframe([trim_items(row) for row in processed_lines])
data_type_lookup = data_type_lookup.set_index('VDT Name')

In [None]:
# Two phrases kept as bare string expressions; only the last one is shown as
# the cell output.
# NOTE(review): presumably candidate start/end markers for another section -
# nothing in the visible part of this notebook uses them; confirm or remove.
'Zones are only used to show a logical organization of the database structure.'
'The poster is a simplified entity-relationship (ER) diagram'

In [None]:
# Marker text for the abbreviations section, kept as bare string expressions
# (only the last one is shown as the cell output).  A later cell passes the
# first phrase and 'Chapter 3' to SectionBreak as the start and end breaks.
'Abbreviations for Table and Column Names'
'Chapter 3                    Building Queries and Reports'

In [None]:
# Text fragments identifying the lines to remove before parsing.
brk_text = ['Abbreviations for Table and Column Names',
            'variansystem Database Overview']
# drop_lines with the fragments pre-bound, so it only needs the lines.
drop_page_brk = partial(drop_lines, text_list=brk_text)
# Fixed-width parser built with a single location, 37.
tbl_parse = define_fixed_width_parser(locations=[37])


In [None]:
# Section bounded by the 'Abbreviations for Table and Column Names' break
# (break_offset='After') and the 'Chapter 3' break (break_offset='Before').
dir_section = Section(
    start_section=SectionBreak('Abbreviations for Table and Column Names', 
                               break_offset='After'), 
    end_section=SectionBreak('Chapter 3', break_offset='Before')
    )

# Rebinds `a`, replacing the Table 3 content read by an earlier cell.
a = dir_section.read(raw_text.splitlines())

In [None]:
# Apply the three processing steps one at a time so that each intermediate
# result (b, c, d) can be inspected on its own.
b = drop_page_brk(a)
c = tbl_parse(b)
d = trim_items(c)

In [None]:
# The first two steps as one nested call; the result is only displayed.
tbl_parse(drop_page_brk(a))


In [None]:
# Rebuild the processing chain with trim_items added as a third step; this
# rebinds abr_proc, replacing the two-step version from an earlier cell.
abr_proc = ProcessingMethods([drop_page_brk, tbl_parse, trim_items])

In [None]:
from sectionary.sections import *


In [None]:
# Wrap drop_page_brk with sig_match, tag the wrapper by hand, then read the
# tag back (presumably to check that the wrapper accepts new attributes).
use_function = sig_match(drop_page_brk, sig_type='Process')
use_function.is_gen = True
use_function.is_gen

In [None]:
given_method = drop_page_brk
method_type = 'Process'
if isinstance(given_method, str):
    # A string is handed to standard_action; is_gen is recorded as False.
    use_function = standard_action(given_method, method_type)
    use_function.is_gen = False
else:
    use_function = sig_match(given_method, sig_type=method_type)
    # sig_match hides whether given_method is a generator function, so that
    # fact is recorded on the wrapper as is_gen.  Look through any
    # functools.partial layers first and always assign the flag: previously a
    # partial around a non-generator function left is_gen unset, so reading
    # it later raised AttributeError.
    target_method = given_method
    while isinstance(target_method, partial):
        target_method = target_method.func
    use_function.is_gen = isgeneratorfunction(target_method)
# Display the flag that was recorded.
use_function.is_gen

In [None]:
# Pass the content in `a` through the current abr_proc chain and display the
# result.
abr_proc.read(a)

In [None]:
# Same section bounds as the earlier abbreviations cell, now with abr_proc
# attached as the section's processor.  This rebinds dir_section.
dir_section = Section(
    start_section=SectionBreak('Abbreviations for Table and Column Names', 
                               break_offset='After'), 
    end_section=SectionBreak('Chapter 3', break_offset='Before'),
    processor=abr_proc)

# Read the section from the document's lines and display the result.
dir_section.read(raw_text.splitlines())

In [None]:
# Fragments marking lines to discard: the table caption and the
# 'variansystem Database Overview' lines.
text_list = ['Table 3  VDT Name to Microsoft SQL Datatype',
             'variansystem Database Overview']
clean_lines = drop_lines(lines, text_list)
# Parse the remaining lines with a fixed-width parser (single location, 37).
dt_parse = FixedWidthParser(locations=[37])
processed_lines = dt_parse.parser(clean_lines)
# Trim every parsed row, build the table, then index it by 'VDT Name'.
data_type_lookup = to_dataframe([trim_items(row) for row in processed_lines])
data_type_lookup = data_type_lookup.set_index('VDT Name')

In [None]:
# Show the cleaned lines, one per output line.
print(*clean_lines, sep='\n')