In [1]:
intention = '''Draft or create a class system to represent the pular entries. 
Ideally this will contain a way to nest entry objects under a root
'''
#%pip install docx
#%pip install python-docx #this mutates docx? 
#%pip install pydantic
#%pip install mypy
# %pip install numpy
from typing import Optional, Dict, List, Any, Union, Tuple
from pydantic import BaseModel, ValidationError, validator, root_validator, Field, constr
import json
import docx
from docx import Document
from datetime import datetime
import logging
from itertools import compress, tee, chain
import re
from collections import Counter
import string
import numpy as np
from verbalexpressions import VerEx

In [2]:
#get current datetime
now = datetime.now()
current_time = now.strftime("%Y-%m-%d_-_%H-%M-%S")

# #create file to save prints (use with jupyter magic enabled at the top of this cell: %%capture cap --no-stderr)
# output_name = f"{current_time}_result.txt"
# experiment = input("Enter emperiment description:")
# print(f"Experiment time: {current_time}\nExperiment note: {experiment}\n\n")

logger_filename = f"logs_and_outputs/initialization_placeholder.log"

# Creating an object
logger = logging.getLogger()

# Setting the threshold of logger to DEBUG
logger.setLevel(logging.ERROR)

#add encoding
handler = logging.FileHandler(logger_filename, 'w', 'utf-8') 
handler.setFormatter(logging.Formatter('%(asctime)s %(message)s'))
logger.addHandler(handler) 

# # Test messages
logger.debug("current_time")
# logger.info("Just an information")
# logger.warning("Its a Warning")
# logger.error("Did you try to divide by zero")
# logger.critical("Internet is down")

In [3]:
def pairwise(iterable):
    # pairwise('ABCDEFG') --> AB BC CD DE EF FG
    a, b = tee(iterable)
    next(b, None)
    return zip(a, b)
    
def logger_root_validation_error_messages(e, logger_details, suppress = [], run_enumeration: Optional[int] = None) -> Union[RuntimeError, TypeError]:      
   #TODO add ability to handle assertion errors
   if run_enumeration is not None:
      run_num = f"|run#{run_enumeration}|" #type: ignore 
   else:
      run_num = ""
   try:
      for err in e.errors():
         if err['type'] in suppress['type'] or err['msg'] in suppress['msg']:
            logger.info(f"|SUPRESSED|{logger_details['function']}|{type(e)}|para#{logger_details['paragraph_enumeration']}{run_num}, with validation? error: {err}")
            return TypeError("suppressed Validation Error")
         else:
            logger.error(f"|unsuppressed|{logger_details['function']}|{type(e)}|para#{logger_details['paragraph_enumeration']}{run_num}, with validation? error: {err}")
            return TypeError("un-suppressed Validation Error")
   except:
      logger.error(f"|unsuppressed|{logger_details['function']}|{type(e)}|para#{logger_details['paragraph_enumeration']}{run_num}, with error: {e}")
      return RuntimeError("non-validation error")
   return RuntimeError("non-validation error")

def pular_str_strip_check(s:str) ->bool:
   in_len = len(s)
   new_s = s.strip()
   out_len = len(new_s)
   purported_whitespace: bool = in_len != out_len
   return purported_whitespace

def closest(ranger, target): #any target indeces occuring before the first ranger index will be ignored
   if not isinstance(target,np.ndarray):
      target = np.array(target)
   for a,b in ranger:
      begin = np.searchsorted(target,a)
      end = np.searchsorted(target,b)
      _, out, target = np.split(target, [begin,end])
      yield list(out)
   yield list(target)

In [4]:
class Docx_Paragraph (BaseModel):
   """input:   paragraph = your_paragraph_here
   
   when given a docx document's paragraph object, will parse it to a specified schema
   """
   # docx_document_paragraph: Optional[Any] #This should be validated below. Left optional because its inclusion causes problems with default repr and serialization
   para_text: str = Field(..., min_length = 1) ##required, must be string, must be 1 long or more
   para_first_line_indent: Optional[int] = Field(...) #Required, but must be int OR none. https://pydantic-docs.helpmanual.io/usage/models/#required-optional-fields 
   para_left_indent: Optional[int] = Field(...) #Required, but must be int OR none. https://pydantic-docs.helpmanual.io/usage/models/#required-optional-fields 

   @root_validator(pre=True) #TODO Try have post validator for runs only?
   def _docx_structure_check(cls, values: Dict[str, Any]) -> Dict[str, Any]:
      para = values.get("paragraph",False)
      assert isinstance(para, eval('docx.text.paragraph.Paragraph')), 'please enter a docx paragraph assigned to the variable "paragraph", in the form of     paragraph = your_paragraph_here'

      new_values: Dict[str, Any] = {}
      #extract para features, 
      new_values['para_text'] = para.text #type: ignore
      new_values['para_first_line_indent'] = para.paragraph_format.first_line_indent #type: ignore
      new_values['para_left_indent'] = para.paragraph_format.left_indent #type: ignore

      return new_values


class Docx_Run (BaseModel):
   """input:   run = your_run_here
   
   when given a docx document paragraphs run object, will parse it to a specified schema
   """
   run_text : str = Field(..., min_length = 1) #required, must be string, must be 1 long or more
   run_font_name : Optional[str] = Field(...) #required, must be string or None value
   run_font_size_pt : Optional[float] = Field(...)#Required, but must be float OR none value
   run_bold : Optional[bool] = Field(...) #Required, but must be bool OR none value
   run_italic : Optional[bool] = Field(...) #Required, but must be bool OR none value

   @root_validator(pre=True) #TODO Try have post validator for runs only?
   def _docx_structure_check(cls, values: Dict[str, Any]) -> Dict[str, Any]:
      run = values.get("run",False)
      assert isinstance(run, eval('docx.text.run.Run')), 'please enter a docx run assigned to the variable "run", in the form of     run = your_run_here'
      
      new_values : Dict[str, Any] = {}
      #loop through the runs in the paragraph and select the desired features
      new_values['run_text'] = run.text #type: ignore
      new_values['run_font_name'] = run.font.name #type: ignore
      if run.font.size is not None: #type: ignore
         new_values['run_font_size_pt'] = run.font.size.pt #type: ignore
      else: new_values['run_font_size_pt'] = None
      new_values['run_bold'] = run.bold #type: ignore
      new_values['run_italic'] = run.italic #type: ignore

      return new_values


class Docx_Run_List (BaseModel):
   """input:   run_list = your_runs_in_a_list
   
   when given a list of docx document paragraphs run object, will parse it to a specified schema
   """
   #because the internals are validated, don't need to validate these other than that they were made into lists
   run_text : List[Any] = Field(...) #Required, must be list
   run_font_name : List[Any] = Field(...) #Required, must be list
   run_font_size_pt : List[Any] = Field(...) #Required, must be list
   run_bold : List[Any] = Field(...) #Required, must be list
   run_italic : List[Any] = Field(...) #Required, must be list

   @root_validator(pre=True) #TODO Try have post validator for runs only?
   def _docx_structure_check(cls, values: Dict[str, List[Any]]) -> Dict[str, Any]:
      from collections import defaultdict
      paragraph_enumeration = values.get('paragraph_enumeration',"<<FAILURE_paragraph_enumeration>>")
      runs = values.get("run_list",False)
      if not runs:
         raise ValueError('please enter a docx run list assigned to the variable "run_list", in the form of     run_list = your_run_list_here')
      new_values = defaultdict(list)
      suppress = {'type': ['value_error.any_str.min_length' #ignore zero length run_text, per run validator
                           ],
                  'msg': ['suppressed Validation Error'] #ignore suppressed errors earlier/lower in the stack      
      }
      logger_details = {'function':'parsed_run', 'paragraph_enumeration':paragraph_enumeration }
      
      for run_enumumeration, run in enumerate(runs): #type: ignore
         try:
            parsed_run = Docx_Run(**{'run':run}) #this manner of root unpacking seems to give warnings since linter can't assess ahead of time
            assert isinstance(parsed_run, Docx_Run), 'RUNTIME_ERR - the docx run object did not return the type expected'
            for k,v in parsed_run.dict().items():
               new_values[k].append(v) 

         except BaseException as e:
            new_e = logger_root_validation_error_messages(e, logger_details, suppress,run_enumeration=run_enumumeration)
            raise new_e
             
      return new_values


In [9]:
# Docx_Run.schema()
# Docx_Run_List.schema()

In [5]:
class Docx_Paragraph_and_Runs (BaseModel):
   """input:   paragraph = your_paragraph_here
   
   when given a docx document's paragraph object, will parse it to a specified schema
   """

   class Config:
      extra = 'allow'
      # arbitrary_types_allowed = True

   @root_validator(pre=True) #TODO Try have post validator for runs only?
   def _docx_structure_check(cls, values: Dict[str, Any]) -> Dict[str, Any]:
      new_values: Dict[str, Any] = {}
      para = values.get("paragraph",False)
      assert isinstance(para, eval('docx.text.paragraph.Paragraph')), 'please enter a docx paragraph assigned to the variable "paragraph", in the form of     paragraph = your_paragraph_here'
      
      paragraph_enumeration: int = values.get('paragraph_enumeration',None)
      assert isinstance(paragraph_enumeration, int), "assertion error, bad paragraph count/paragraph_enumeration value passed. Please pass an integer"
      new_values['paragraph_enumeration'] = paragraph_enumeration

      
      #setting up error and logger handling
      #suppress these errors
      suppress = {'type': ['value_error.any_str.min_length' #ignore zero length run_text, per run validator
                           ],
                  'msg': ['suppressed Validation Error'] #ignore suppressed errors earlier/lower in the stack      
      }
      #try to extract para features, 
      logger_details = {'function':'Docx_Paragraph', 'paragraph_enumeration':paragraph_enumeration }
      try: 
         parsed_paras = Docx_Paragraph(**{'paragraph':para}) #type: ignore
         for k,v in parsed_paras.dict().items():
            new_values[k] = v
      # except ValidationError as e:
      #    logger_root_validation_error_messages(e, logger_details, suppress)
      except BaseException as e:
         new_e = logger_root_validation_error_messages(e, logger_details, suppress)
         raise new_e

      #try to extract runs features
      logger_details = {'function':'Docx_Run_List', 'paragraph_enumeration':paragraph_enumeration }    
      try:
         parsed_runs = Docx_Run_List(**{'run_list':para.runs, 'paragraph_enumeration':paragraph_enumeration}) #type: ignore
         for k,v in parsed_runs.dict().items():
            new_values[k] = v
      except BaseException as e:
         new_e = logger_root_validation_error_messages(e, logger_details, suppress)
         raise new_e
         
      return new_values
      
   def interogate__para_text(self) -> str:
      t = getattr(self, 'para_text', "")
      # 
      if len(t) == 0:
         logger.warning('interogator did not find para_text')
      #    print("no para_text with:\n\t", self.dict())
      return t

   def paragraph_logger(self,level:int,msg:str,print_bool:bool):
      if print_bool:
         print(msg)
      else:
         logger.log(level,msg)

   def single_run_feature_identify(self,params:Dict[str,Any]) -> Tuple[bool,Tuple[List[bool],List[Any]],Tuple[List[bool],List[Optional[str]]]]: 
      """if regex provided, must be in param dict with name 'text_regex_at_feature', and must be passed as a r'pattern' raw string
      return tuple of ('feature boolean', feature_Tuple[boolean mask, feature list], regex_tuple[boolean mask, regex match list])
      """
      enumeration : Optional[int] = getattr(self,"paragraph_enumeration",None)
      assert isinstance(enumeration, int),f"bad value for 'paragraph_enumeration' {enumeration}"
      run_texts : Optional[List[str]] = getattr(self,'run_text',None)
      assert run_texts is not None, f"bad value for 'run_text' {self.__repr__()}"
      feature = params['docxFeature']
      assert isinstance(feature,str),f"bad value for parameter 'docxFeature'. Check params: {params}"
      text_regex_at_feature = params.get('text_regex_at_feature',False)
      regex_mask: List[bool] = []
      regex_matches: List[Optional[str]] = []

      values_from_runs: List[Optional[Union[float,bool]]] = getattr(self,feature,[None]) 
      value_mask: List[bool] = [True if x == params['value'] else False for x in values_from_runs]
      
      if any(value_mask):
         # print('text and value mask: ',run_texts,value_mask)
         # if text_regex_at_feature:
            # pattern = text_regex_at_feature
            # for text in run_texts:
            #    match = re.search(pattern, text) #type: ignore
            #    if match is not None:
            #       regex_mask.append(True)
            #       regex_matches.append(match.group(0))
            #       # print(repr(self))
            #    else:
            #       regex_mask.append(False)
            #       regex_matches.append(None)
            # print('regex and match: ',regex_mask,regex_matches)
            # # print(f'inside regex bool for para#{enumeration}\tregex_mask_is: {regex_mask}\t\tvalue_mask is: {value_mask}')
            # if not any(compress(value_mask,regex_mask)):
            #    return False, (value_mask, values_from_runs), (regex_mask, regex_matches) #does not have feature
         return True, (value_mask, run_texts), (regex_mask, regex_matches)  #has Feature
      else:
         return False, (value_mask, run_texts), (regex_mask, regex_matches) #does not have feature

   def modify_run_lists(self, drop_runs: Optional[List[int]] = None, add_runs: Optional[Tuple[int, List[List[Any]]]] = None, merge_runs : bool = False): #-> Optional[Dict[str, List[List[Any]]]]
      """given a list of indexes as 'drop' will drop those indexes from runlists, and return those dropped
      given a tuple with an integer index and list of lists (run aligned), will add those to entries to the runlists at that index
      given bool merge, will greedy merge all runs with the same run features EXCEPT run_text. Run_texts will be concatenated
      """
      run_list_req_features: List[str] = Docx_Run_List.schema()['required']
      assert run_list_req_features[0] == 'run_text', "first feature in the schema should be run_text"
      para_enumeration = getattr(self, 'paragraph_enumeration',None)
      assert para_enumeration is not None, 'paragraph did not have an enumeration value'

      feature_run_lists : List[List[Any]] = []
      for f in run_list_req_features:
         feature_run_lists.append(getattr(self,f,[]))
      pivoted_run_lists = list(map(list, zip(*feature_run_lists)))
      number_of_runs : int = len(pivoted_run_lists)
      if number_of_runs < 1:
         raise ValueError('this paragraph does not have values in the run lists')

      merge_occured = False
      beginning_repr = self.__repr__()
      if drop_runs is not None:
         dropped_runs = {}
         logger.info(f'tried to drop a run from para#{para_enumeration}')
         for ind in drop_runs:
            dropped_runs[ind] = pivoted_run_lists.pop(ind) #mutates pivoted_run_lists
         if number_of_runs == len(pivoted_run_lists):
            raise RuntimeError('the runs_lists were not shortened as expected')
         number_of_runs : int = len(pivoted_run_lists)
         feature_run_lists = list(map(list, zip(*pivoted_run_lists)))

      if add_runs is not None:
         insert_ind = add_runs[0]
         add_lists = add_runs[1]
         assert len(add_lists[0]) == number_of_runs, "the added list of lists must have runs of the same length (feature space) as run_lists features in the schema: Docx_Run_List.schema()['required']"
         if insert_ind == -1:
            insert_ind = number_of_runs
         for lst in add_lists:
            pivoted_run_lists.insert(insert_ind,lst)
         number_of_runs : int = len(pivoted_run_lists)
         feature_run_lists = list(map(list, zip(*pivoted_run_lists)))
      
      if merge_runs is not False:
         i = 0
         still_merging = True
         beginning_repr = self.__repr__()
         while still_merging:
            pairs = list(pairwise(list(range(len(pivoted_run_lists))))) #index pairs
            if len(pairs) < 1: #onely 1 run, which causes pairwise to yield empty lists since nothing to pair with
               break
            num_merged = 0
            for a,b in pairs: #where a,b are indexes in the pivoted run list (each index is one run)
               a -= num_merged #mutate pivot indexes after the pivot array has been mutated
               b -= num_merged
               if pivoted_run_lists[a][1:] == pivoted_run_lists[b][1:]: #if all features EXCEPT run_text are the same #TODO add ability to config which features to merge on
                  pivoted_run_lists[b][0] = pivoted_run_lists[a][0] + pivoted_run_lists[b][0]
                  pivoted_run_lists.pop(a)
                  num_merged +=1
                  merge_occured = True #flag for end of function, to determine if any changes need to be set to 'self'
               else: pass 
            if num_merged < 1: #if no merges where made in this iteration, merging is done. Else keep while loop since new merges may occur with new neighbors
               still_merging = False
         number_of_runs : int = len(pivoted_run_lists)
         feature_run_lists = list(map(list, zip(*pivoted_run_lists)))

      if any([drop_runs is not None, add_runs is not None, merge_occured]):
         for i, f in enumerate(run_list_req_features):
            self.__setattr__(f,feature_run_lists[i])
         print(self.__repr__())
   def cleaner(self, execute_defaults: bool = True) -> bool : #params:Optional[Dict[str,Any]],
      """defaults to running "remove_para_leading_whitespace". This removes leading runs that are blank, and strips the first text run of any LEADING whitespace, if any is present.
      the params dict is not implemented currently
      returns bool value. True means cleaner would yield a valid para. False currently indicates all runs in para are whitespace.
      """
      #TODO aggregate these getattrs so that every function doesn't need to get it themselves. Or simplify this with a function that has an assert bool to require it or not
      para_enumeration = getattr(self, 'paragraph_enumeration',None)
      assert para_enumeration is not None, 'paragraph did not have an enumeration value'

      def remove_para_leading_whitespace(start_ind : int = 0): #run 
         # try: #expect to fail when reaches the end of the list
         para_text : Optional[str] = getattr(self, 'para_text',None)
         if isinstance(para_text,str):
            if len(para_text.strip()) == 0: #if para's text is ONLY whitespace
               return False
         run_text_list : List[str] = getattr(self, 'run_text',[''])
         num_runs = len(run_text_list)

         ind = start_ind
         droppable_runs : List[int] = [] #TODO this dropable section doesnt seem to be working correctly.
         while ind < num_runs:
            this_run_text = run_text_list[ind]
            stripped_run = this_run_text.lstrip() #TODO pass config to this to allow control of what can and can't be dropped.
            if len(stripped_run) == 0: #found ALL whitespace run. Need to iterate to see if next run is blank or has any leading whitespace
               droppable_runs.append(ind) #TODO convert this change to a an equivalent para_indent, since this paragraph likely has incorrect indents
               logger.info(f'paragraph#{para_enumeration} with text ""{para_text}"" had a run with ONLY whitespace')
            elif len(stripped_run) < len(this_run_text): #found run that is NOT ALL whitespace, but had SOME. Will only happen once. Can stop now since this is the true beginning of this paragraph
               run_text_list[ind] = stripped_run
               self.__setattr__("run_text", run_text_list) #TODO convert this change to a an equivalent para_indent, since this paragraph likely has incorrect indents
               logger.info(f'paragraph#{para_enumeration} with text ""{para_text}"" had leading whitespace removed')
               break
            else: #Can stop now since this is the true beginning of this paragraph
               break
            ind +=1
            
         if len(droppable_runs) > 0: #if a whole run_text was whitespace only
            if len(droppable_runs) == num_runs: #if the whole paragraph was whitespace only
               raise RuntimeError(f'for paragraph#{para_enumeration}, all runs purported droppable whitespace, but para_text purported not')
            self.modify_run_lists(drop_runs = droppable_runs) #this removes whole runs, not just modifying the run_text.
            logger.info(f'paragraph#{para_enumeration} with text ""{para_text}"" tried to drop a run whitespace')

      if execute_defaults:
         remove_para_leading_whitespace()
         self.modify_run_lists(merge_runs = True)

      return True

In [None]:
class Fula_Entry (BaseModel): 
   entity_word: List[str] #root, subroot, lemma
   features: Optional[Dict[str,str]] = {} #contains features for this entity, ie: txt file features like location, POS, etc. Only applicable directly. Lemmas have POS, roots do not, etc
   paragraphs_list: Dict[int,Any] #para enumeration, docx para obj
   paragraphs_extr : List[Docx_Paragraph_and_Runs] #class defined above
   sub_roots : List['Fula_Entry'] = [] #self reference
   lemmas : List['Fula_Entry'] = [] #self reference

   

## from __future__ import annotations
# from typing import ForwardRef
# Fula_Entry = ForwardRef('Fula_Entry')
# root = root_ind_list[0]
# lemma = lemma_ind_list[0]
# test_entry = Fula_Entry()
# print(len(root_ind_list),'\t',root_ind_list)
# print(lemma_ind_list)
# print(lemma_ind_list[8:])
# print(len(list(pairwise(root_ind_list))),'\t',list(pairwise(root_ind_list)))





In [None]:
# def paragraph_splitter(self): #almost certainly only going to be only the lemmas from roots
      #single_run_feature_identify(condition) -> mask
      #find index in mask where to split
      #create a clone of the object (default para_enumeration)
         #need to change class to allow all para enumerations to be float
         #default para_enumeration float split size (float p_e.##?)
      #modify_run_lists to drop the last runs from para_A, and first runs from para_B
      #run cleaner again to remove any leading whitespace in new para?


In [6]:
# %%capture cap --no-stderr

#get current datetime
now = datetime.now()
current_time = now.strftime("%Y-%m-%d_-_%H-%M-%S")

#create file to save prints (use with jupyter magic enabled at the top of this cell: %%capture cap --no-stderr)
# output_name = f"logs_and_outputs/{current_time}_docxFileParseResult.txt"
# experiment = input("Enter emperiment description:")
# print(f"Experiment time: {current_time}\nExperiment note: {experiment}\n\n")

logger_filename = f"logs_and_outputs/{current_time}docxFileParse.log"

# Creating an object
logger = logging.getLogger()

# Setting the threshold of logger to DEBUG
logger.setLevel(logging.INFO)

#add encoding
handler = logging.FileHandler(logger_filename, 'w', 'utf-8') 
handler.setFormatter(logging.Formatter('%(asctime)s %(message)s'))
logger.addHandler(handler) 

#Run docx module to parse the docx file
# docx_filename = "Fula_Dictionary-repaired.docx"
docx_filename = "pasted_docx page 1.docx"
document = Document(docx_filename)



char_counts = Counter()

docx_object_list = []
parsed_object_list = []
failed_paras_ind = []
handled_errors = []

for i, para in enumerate(document.paragraphs):

   docx_object_list.append((i,para))
   try:
      entryObj = Docx_Paragraph_and_Runs(**{'paragraph': para, 'paragraph_enumeration': i})
      char_counts.update(entryObj.interogate__para_text())
      parsed_object_list.append((i,entryObj))
   except ValidationError as e:
      suppress = {
            # 'type': ['value_error.any_str.min_length' #ignore zero length run_text, per run validator
            #          ],
            'msg': ['suppressed Validation Error'] #ignore suppressed errors earlier/lower in the stack      
      }
      for err in e.errors():
         if err['msg'] in suppress['msg']:
            handled_errors.append((i,para))
            pass
   except BaseException as e:
      print(e)
      failed_paras_ind.append((i,para))
      
print('total paras: ',len(docx_object_list))
print('parsed paras: ',len(parsed_object_list))
print('handled errors: ',len(handled_errors))
print('failed paras: ',len(failed_paras_ind))

assert len(docx_object_list) == len(parsed_object_list) + len(handled_errors) + len(failed_paras_ind)


total paras:  32507
parsed paras:  32040
handled errors:  467
failed paras:  0


In [7]:
# with open(output_name, 'w', encoding="utf-8") as f:
#     f.write(cap.stdout)

In [8]:
#get current datetime
now = datetime.now()
current_time = now.strftime("%Y-%m-%d_-_%H-%M-%S")

#create file to save prints (use with jupyter magic enabled at the top of this cell: %%capture cap --no-stderr)
output_name = f"logs_and_outputs/{current_time}_objList_processing_Output.txt"
experiment = input("Enter emperiment description:")
print(f"Experiment time: {current_time}\nExperiment note: {experiment}\n\n")

logger_filename = f"logs_and_outputs/{current_time}objList_processing.log"

# Creating an object
logger = logging.getLogger()

# Setting the threshold of logger to DEBUG, etc
logger.setLevel(logging.INFO)

#add encoding
handler = logging.FileHandler(logger_filename, 'w', 'utf-8') 
handler.setFormatter(logging.Formatter('%(asctime)s %(message)s'))
logger.addHandler(handler) 

Experiment time: 2022-07-22_-_09-57-58
Experiment note: full run




In [9]:
# # 2022-07-22 07:20:18,751 paragraph#2731 with text ""   -IR-"" had leading whitespace removed
# # parsed_object_list = parsed_object_list[2731:3000]
# # def any(self, value):
# #    return self.add("([%s])" % value)
# # def add(self, value):
# #    if isinstance(value, list):
# #       self.s.extend(value)
# #    else:
# #       self.s.append(value)
# #    return self
# low_alph_chars = ''.join([x.lower() for x in char_counts.keys() if x.upper() != x.lower()]) #only uppercase alphabetical chars
# up_alph_chars = ''.join([x.upper() for x in char_counts.keys() if x.upper() != x.lower()]) #only uppercase alphabetical chars
# root_note_chars = '-+()? ' #characters that encode the author's notes
# sub_root_beginnings = '-+('
# permissive_root_contents = ''.join(list(chain(up_alph_chars,root_note_chars,string.digits)))
# pattern = '^['+re.escape(sub_root_beginnings)+'][^'+low_alph_chars+']+'
# print(pattern)
# print(re.search(pattern = pattern, string = '-KLHFGSLK JEs'))
# pattern = '^['+re.escape(sub_root_beginnings)+']['+re.escape(permissive_root_contents)+']+'
# print(pattern)
# print(re.search(pattern = pattern, string = 'KLHFGSLK JEs'))

In [10]:
%%capture cap 
#--no-stderr
now = datetime.now()
current_time = now.strftime("%Y-%m-%d_-_%H-%M-%S")
print(f"Experiment time: {current_time}\nExperiment note: {experiment}\n\n")
root_ind_list = []
subroot_ind_list = []
lemma_ind_list = []
some_error_ind_list = []
reject_ind_list = []
root_and_lemma_one_line = []

up_alph_chars = [x.upper() for x in char_counts.keys() if x.upper() != x.lower()] #only uppercase alphabetical chars

for i, entryObj in parsed_object_list:

   try:
      successful_cleaner_output:bool = entryObj.cleaner() #by default cleaner removes leading whitespace and merges adjacent runs with identical format features
      # print('sucessful cleaner')
      if not successful_cleaner_output:
         reject_ind_list.append(i)
         print(f'para# {i} IS ONLY whitespace. Need to drop it. #TODO')
   except:
      some_error_ind_list.append(f"cleaner error on p: {i}. Text is {entryObj.interogate__para_text()}")
      print('error on cleaner')
   try:
      root_note_chars = '-+()? ' #characters that encode the author's notes
      sub_root_beginnings = '-+('
      permissive_root_contents = ''.join(list(chain(up_alph_chars,root_note_chars,string.digits)))
      # root_expression = (VerEx().
      #                start_of_line().
      #                any(up_alph_chars).
      #                any(permissive_root_contents).add('*')
      #                )
      # subroot_expression = (VerEx().
      #                   start_of_line().
      #                   any(sub_root_beginnings).
      #                   any(permissive_root_contents).add('+')
      #                   )
      featureConfig = {
      'root': {'docxFeature': 'run_font_size_pt',
               'strSummary':'fontSize_12.0', 
               'value':12.0,
               # 'text_regex_at_feature': root_expression.compile()
               },
      'subroot': {'docxFeature': 'run_font_size_pt',
               'strSummary':'fontSize_12.0', 
               'value':12.0,
               # 'text_regex_at_feature': subroot_expression.compile()
               },
      'lemma': {'docxFeature': 'run_bold',
               'strSummary':'fontBold', 
               'value':True},
      }
      is_subroot = False
      # return True, (value_mask, values_from_runs), (regex_mask, regex_matches)
      is_root, (mask,run_text), _ = entryObj.single_run_feature_identify(featureConfig['root'])
      if is_root:
         for j, r in enumerate(compress(run_text,mask)):
            if j==0:
               # low_alph_chars = ''.join([x.lower() for x in char_counts.keys() if x.upper() != x.lower()]) #only uppercase alphabetical chars
               up_alph_chars = ''.join([x.upper() for x in char_counts.keys() if x.upper() != x.lower()]) #only uppercase alphabetical chars
               root_note_chars = '-+()? ' #characters that encode the author's notes
               sub_root_beginnings = '-+('
               permissive_root_contents = ''.join(list(chain(up_alph_chars,root_note_chars,string.digits)))
               pattern = '^['+re.escape(sub_root_beginnings)+']['+re.escape(permissive_root_contents)+']+'
               m = re.search(pattern = pattern, string = r)
               if m is not None:
                  is_subroot = True
         if is_subroot:
            print('\n\nsubroot at para number: ',i)
            paraText = entryObj.interogate__para_text()
            print('\t',paraText)
            subroot_ind_list.append(i)
         else:
            print('\n\nroot at para number: ',i)
            paraText = entryObj.interogate__para_text()
            print('\t',paraText)
            root_ind_list.append(i)

      # # return True, (value_mask, values_from_runs), (regex_mask, regex_matches)
      # is_subroot, _, _ = entryObj.single_run_feature_identify(featureConfig['subroot'])
      # if is_subroot:
      #    print('\n\nsubroot at para number: ',i)
      #    paraText = entryObj.interogate__para_text()
      #    print('\t',paraText)
      #    subroot_ind_list.append(i)

      # return True, (value_mask, values_from_runs), (regex_mask, regex_matches)
      is_lemma, _, _ = entryObj.single_run_feature_identify(featureConfig['lemma'])
      if is_lemma:
         # entryObj.interogate__para_text()
         paraText = entryObj.interogate__para_text()
         print('\t\tp#',i,'\t\t',paraText)
         lemma_ind_list.append(i)
      # if is_lemma and (is_root or is_subroot):
      if is_lemma and is_root:
         print(f'this para# {i} has BOTH lemma AND root')
         root_and_lemma_one_line.append(i)

   except BaseException as e:
      
      some_error_ind_list.append(i)
      raise e
      # if not e.args[0][0].exc.args[0] == 'suppressed Validation Error':
         # print('\npara number: ',i)

print('total paras: ',len(docx_object_list))
print('parsed paras: ',len(parsed_object_list))
print('handled errors: ',len(handled_errors))
print('failed paras: ',len(failed_paras_ind))

assert len(docx_object_list) == len(parsed_object_list) + len(handled_errors) + len(failed_paras_ind)

print('roots: ',len(root_ind_list))
print('subroots: ',len(subroot_ind_list))
print('lemmas: ',len(lemma_ind_list))
print('root_and_lemma_one_line: ',len(root_and_lemma_one_line))
print('additional cleaner rejects: ',len(reject_ind_list))
print('additional error rejects: ',len(some_error_ind_list))

print('num entities: ',len(root_ind_list) + len(lemma_ind_list) + len(subroot_ind_list))
num_good_paras_of_other_content= len(root_ind_list) + len(lemma_ind_list) - len(root_and_lemma_one_line) - len(subroot_ind_list)\
                                    + len(reject_ind_list) + len(some_error_ind_list)
print('num_good_paras_of_other_content: ',num_good_paras_of_other_content)
# # Test messages
logger.debug("logger debug test")
logger.info("Just an information")
# logger.warning("Its a Warning")
# logger.error("Did you try to divide by zero")
# logger.critical("Internet is down")

In [25]:
# print(root_ind_list)

In [12]:
with open(output_name, 'w', encoding="utf-8") as f:
    f.write(cap.stdout)

In [14]:

all_paras = list(range(len(parsed_object_list)))
normal_para = [x for x in all_paras if x not in root_ind_list and x not in lemma_ind_list]

root_aligned_lemmas = list(closest(pairwise(root_ind_list), lemma_ind_list))
lemma_aligned_paras = list(closest(pairwise(lemma_ind_list),normal_para))
num_schema = {}
for i, r in enumerate(root_ind_list):
   # print('\n\n',r)
   num_schema[r] = {}
   for j, lemma in enumerate(root_aligned_lemmas[i]):
      num_schema[r][int(lemma)] = []
      # print('\n\t',lemma)
      for k, parag in enumerate(lemma_aligned_paras[j]):
         # print('\t\t',parag)
         num_schema[r][lemma].append(int(parag))
print(json.dumps(num_schema, indent=4))

{
    "4": {
        "5": [
            6,
            7
        ],
        "8": [
            9,
            10
        ],
        "11": [
            12,
            13
        ],
        "14": [
            15,
            16
        ],
        "17": [
            18,
            19
        ],
        "20": [
            21,
            22
        ],
        "23": [
            24,
            25
        ],
        "26": [
            27,
            28
        ]
    },
    "29": {
        "30": [
            6,
            7
        ],
        "33": [
            9,
            10
        ],
        "37": [
            12,
            13
        ],
        "40": [
            15,
            16
        ]
    },
    "43": {
        "44": [
            6,
            7
        ],
        "47": [
            9,
            10
        ]
    },
    "50": {
        "51": [
            6,
            7
        ]
    },
    "54": {
        "55": [
            6,
            7
        ]
   

In [15]:
      # p_text = entryObj.interogate__para_text()
      # if not set(p_text).isdisjoint(low_freq_odd_chars):
      #    msg = 'rare_characters\t\t'+p_text
      #    entryObj.paragraph_logger(level=40,msg = msg, print_bool=False)

#Validate Whitespace behavior

In [16]:
white_space_chars = [k for k in char_counts if len(k.strip()) == 0]
print(white_space_chars)

[' ', '\xa0', '\t']


#Validate Upper/lower
conclusion: using str.upper()/lower() functions is safe. No character in the dataset causes an error when used in those functions, and the only characters that don't cooperate to a new case are non-alphabetical characters such as numbers and punctuation. 
conclusion: using str.upper()==str.lower() is a viable way to check if a character is alphabetical or not.

In [17]:
upperWITHlowerChars = set()
upper_chars = []
lower_chars = []
non_castable = []
error_casting = []
nons = []
found_as_one_case_only = []
for k in char_counts:
   try:
      up = k.upper()
      low = k.lower()
      upperWITHlowerChars.add((up,low))
      if up == low:
         non_castable.append(k)
         upperWITHlowerChars.remove((up,low))
      elif k == up:
         upper_chars.append(up)
      elif k == low:
         lower_chars.append(low)
      else:
         nons.append(k)
   except:
      error_casting.append(k)
print('\nupper_chars:     ',sorted(upper_chars))
print('\nlower_chars:     ',sorted(lower_chars))
print('\nnon_castable:    ',sorted(non_castable))
print('\nerror_casting:   ',sorted(error_casting))
print('\nsilent fails:    ',sorted(nons))

print('\nupperWITHlowerChars:    ',sorted(upperWITHlowerChars))
for u,l in upperWITHlowerChars:
   pair = [u,l]
   unseen_possible_case = False
   if l not in lower_chars and l not in non_castable:
      pair[1] = None
      unseen_possible_case = True
   if u not in upper_chars and u not in non_castable:
      pair[0] = None
      unseen_possible_case = True
   if unseen_possible_case:
      found_as_one_case_only.append(tuple(pair))
      # print("upper possible, but not present:     ",u)
print('\nfound_as_one_case_only:      ', sorted(found_as_one_case_only))


upper_chars:      ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'Ñ', 'Ŋ', 'Ɓ', 'Ɗ', 'Ƴ']

lower_chars:      ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'à', 'â', 'ç', 'è', 'é', 'ê', 'î', 'ï', 'ñ', 'ò', 'ô', 'ù', 'û', 'ŋ', 'ƴ', 'ɓ', 'ɗ']

non_castable:     ['\t', ' ', '!', '"', '&', "'", '(', ')', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '[', ']', '`', '\xa0']

error_casting:    []

silent fails:     []

upperWITHlowerChars:     [('A', 'a'), ('B', 'b'), ('C', 'c'), ('D', 'd'), ('E', 'e'), ('F', 'f'), ('G', 'g'), ('H', 'h'), ('I', 'i'), ('J', 'j'), ('K', 'k'), ('L', 'l'), ('M', 'm'), ('N', 'n'), ('O', 'o'), ('P', 'p'), ('Q', 'q'), ('R', 'r'), ('S', 's'), ('T', 't'), ('U', 'u'), ('V', 'v'), ('W', 'w'), ('X', 'x'), ('Y', 'y'), ('Z', 'z'), ('À', 'à'), ('Â', 

In [18]:
def code_point(c):
   return "U+{:04X}".format(ord(c))
[(c,code_point(c),d,code_point(d)) for c,d in sorted(upperWITHlowerChars)]


[('A', 'U+0041', 'a', 'U+0061'),
 ('B', 'U+0042', 'b', 'U+0062'),
 ('C', 'U+0043', 'c', 'U+0063'),
 ('D', 'U+0044', 'd', 'U+0064'),
 ('E', 'U+0045', 'e', 'U+0065'),
 ('F', 'U+0046', 'f', 'U+0066'),
 ('G', 'U+0047', 'g', 'U+0067'),
 ('H', 'U+0048', 'h', 'U+0068'),
 ('I', 'U+0049', 'i', 'U+0069'),
 ('J', 'U+004A', 'j', 'U+006A'),
 ('K', 'U+004B', 'k', 'U+006B'),
 ('L', 'U+004C', 'l', 'U+006C'),
 ('M', 'U+004D', 'm', 'U+006D'),
 ('N', 'U+004E', 'n', 'U+006E'),
 ('O', 'U+004F', 'o', 'U+006F'),
 ('P', 'U+0050', 'p', 'U+0070'),
 ('Q', 'U+0051', 'q', 'U+0071'),
 ('R', 'U+0052', 'r', 'U+0072'),
 ('S', 'U+0053', 's', 'U+0073'),
 ('T', 'U+0054', 't', 'U+0074'),
 ('U', 'U+0055', 'u', 'U+0075'),
 ('V', 'U+0056', 'v', 'U+0076'),
 ('W', 'U+0057', 'w', 'U+0077'),
 ('X', 'U+0058', 'x', 'U+0078'),
 ('Y', 'U+0059', 'y', 'U+0079'),
 ('Z', 'U+005A', 'z', 'U+007A'),
 ('À', 'U+00C0', 'à', 'U+00E0'),
 ('Â', 'U+00C2', 'â', 'U+00E2'),
 ('Ç', 'U+00C7', 'ç', 'U+00E7'),
 ('È', 'U+00C8', 'è', 'U+00E8'),
 ('É', 'U+

In [19]:
[(c,code_point(c)) for c in sorted(non_castable)]

# ('\t', 'U+0009' -> ('`', 'U+0060')
#  ('\xa0', 'U+00A0'))
# ('A', 'U+0041' -> 'û', 'U+00FB')
# ('Ŋ', 'U+014A' -> 'ƴ', 'U+01B4')


[('\t', 'U+0009'),
 (' ', 'U+0020'),
 ('!', 'U+0021'),
 ('"', 'U+0022'),
 ('&', 'U+0026'),
 ("'", 'U+0027'),
 ('(', 'U+0028'),
 (')', 'U+0029'),
 ('+', 'U+002B'),
 (',', 'U+002C'),
 ('-', 'U+002D'),
 ('.', 'U+002E'),
 ('/', 'U+002F'),
 ('0', 'U+0030'),
 ('1', 'U+0031'),
 ('2', 'U+0032'),
 ('3', 'U+0033'),
 ('4', 'U+0034'),
 ('5', 'U+0035'),
 ('6', 'U+0036'),
 ('7', 'U+0037'),
 ('8', 'U+0038'),
 ('9', 'U+0039'),
 (':', 'U+003A'),
 (';', 'U+003B'),
 ('<', 'U+003C'),
 ('=', 'U+003D'),
 ('>', 'U+003E'),
 ('?', 'U+003F'),
 ('[', 'U+005B'),
 (']', 'U+005D'),
 ('`', 'U+0060'),
 ('\xa0', 'U+00A0')]

#Validate Regex Behavior

In [20]:
impossible_char = '\u0008' #utf backspace (\u0008) is unlikely to appear in a docx, and did not appear in this one.
s = impossible_char.join(char_counts.keys())
re_results = [False]*len(char_counts.keys())
for i, k in enumerate(char_counts):
   pattern = re.escape(k)
   # print(s)
   try:
      m = re.search(pattern,s) #type: ignore
      corrected_ind = m.start()/2
      # print(corrected_ind)
   except: print('exception: ',repr(i))
   # print(corrected_ind)
   if i == corrected_ind:
      re_results[i] = True
   else: print('failure: ',repr(i))
print(all(re_results))

True


In [21]:
alpha_chars = [x for x in char_counts.keys() if x.upper() != x.lower()]
stralpha = [x for x in alpha_chars if x.isalpha()]
assert stralpha == alpha_chars, 'note that str.isalpha does NOT work safely here'

In [22]:

#these frequencies were copied from a previous run, and only from successfully parsed objects
#the lowest frequencies were reviewed and selections pulled from those
   # low_freq_odd_chars = ('\t', 72), ('5', 67), ('`', 64), ('&', 49), ('ù', 30), ('ï', 26), ('X', 25), ('!', 15), ('"', 14), ('ò', 8), ('=', 4), ('Q', 4), ('\xa0', 1)
   # low_freq_odd_chars = [x[0] for x in low_freq_odd_chars]
#numbers do not appear to be used outside of scholarly references and some multiple-root instances
   # nums = list(range(10))
#X for example, is almost only in english or french glosses, or scholarly references)
   #('X', 25),

In [23]:
#cleaning notes
# `new Kunari' - region in western Niger ; `nouveau Kounari' - région dans l'ouest du Niger
   #here the ` seems to be used at the beginning of a quotation, and a normal apostrophe at the end

In [24]:
# # # char_counts
# sorted_char_val = sorted(char_counts.items(), key=lambda item: (-item[1], item[0]))
# print(sorted_char_val)

#Inconsistencies

leading white spaces
entries with root and lemma on one line

"errors"
   whitespace paras