In [1]:
# Free-text statement of the notebook's goal. It is only assigned here; no other
# visible cell reads `intention`.
intention = '''Draft or create a class system to represent the pular entries. 
Ideally this will contain a way to nest entry objects under a root
'''
#%pip install docx
#%pip install python-docx #this mutates docx? 
#%pip install pydantic
#%pip install mypy
# %pip install numpy
import json
import logging
import os
from datetime import datetime
from typing import Optional, Dict, List, Any, Union, Tuple

import docx
from docx import Document
from pydantic import BaseModel, ValidationError, validator, root_validator, Field, constr

In [3]:
#get current datetime
now = datetime.now()
current_time = now.strftime("%Y-%m-%d_-_%H-%M-%S")

# #create file to save prints (use with jupyter magic enabled at the top of this cell: %%capture cap --no-stderr)
# output_name = f"{current_time}_result.txt"
# experiment = input("Enter emperiment description:")
# print(f"Experiment time: {current_time}\nExperiment note: {experiment}\n\n")

# NOTE(review): this placeholder log is written under "../logs_and_outputs/", while the
# later parsing cells write to "logs_and_outputs/" (no "../") -- confirm which is intended.
logger_filename = "../logs_and_outputs/initialization_placeholder.log"

# FileHandler opens the file immediately and raises if the folder does not exist
os.makedirs(os.path.dirname(logger_filename), exist_ok=True)

# The root logger: every cell that calls logging.getLogger() shares this object
logger = logging.getLogger()

# Only ERROR and CRITICAL records are kept until a later cell changes the level
logger.setLevel(logging.ERROR)

# Re-running the cell would otherwise stack one more FileHandler per run on the root
# logger (duplicated records, leaked file handles), so detach and close the old ones first
for stale_handler in [h for h in logger.handlers if isinstance(h, logging.FileHandler)]:
   logger.removeHandler(stale_handler)
   stale_handler.close()

#add encoding
handler = logging.FileHandler(logger_filename, 'w', 'utf-8')
handler.setFormatter(logging.Formatter('%(asctime)s %(message)s'))
logger.addHandler(handler)

# # Test messages
# Logs the timestamp value (not the literal variable name); dropped while the level is ERROR
logger.debug(current_time)
# logger.info("Just an information")
# logger.warning("Its a Warning")
# logger.error("Did you try to divide by zero")
# logger.critical("Internet is down")

In [4]:
# from IPython.display import Javascript

# script = '''
# require(["base/js/namespace"],function(Jupyter) {
#     Jupyter.notebook.save_checkpoint();
# });
# '''
# Javascript(script)


In [7]:
class Docx_Paragraph (BaseModel):
   """input:   paragraph = your_paragraph_here

   Given a python-docx paragraph object under the keyword ``paragraph``, extracts the
   paragraph text and its two indent settings into validated fields.

   Fields:
      para_text: the paragraph text; required, at least one character long.
      para_first_line_indent: ``paragraph_format.first_line_indent``; required key, int or None.
      para_left_indent: ``paragraph_format.left_indent``; required key, int or None.

   Raises:
      pydantic ValidationError when ``paragraph`` is missing or not a docx paragraph, or
      when an extracted value fails the field constraints (e.g. empty text).
   """
   # docx_document_paragraph: Optional[Any] #This should be validated below. Left optional because its inclusion causes problems with default repr and serialization
   para_text: str = Field(..., min_length = 1) ##required, must be string, must be 1 long or more
   para_first_line_indent: Optional[int] = Field(...) #Required, but must be int OR none. https://pydantic-docs.helpmanual.io/usage/models/#required-optional-fields 
   para_left_indent: Optional[int] = Field(...) #Required, but must be int OR none. https://pydantic-docs.helpmanual.io/usage/models/#required-optional-fields 

   @root_validator(pre=True) #TODO Try have post validator for runs only?
   def _docx_structure_check(cls, values: Dict[str, Any]) -> Dict[str, Any]:
      """Replace the raw ``paragraph`` input with the flat field values read from it."""
      para = values.get("paragraph",False)
      # Direct attribute lookup; eval() of a constant dotted name added nothing
      assert isinstance(para, docx.text.paragraph.Paragraph), 'please enter a docx paragraph assigned to the variable "paragraph", in the form of     paragraph = your_paragraph_here'

      paragraph_format = para.paragraph_format
      # Only the extracted features are returned, so the docx object itself is never stored on the model
      return {
         'para_text': para.text,
         'para_first_line_indent': paragraph_format.first_line_indent,
         'para_left_indent': paragraph_format.left_indent,
      }


class Docx_Run (BaseModel):
   """input:   run = your_run_here

   Given a python-docx run object under the keyword ``run``, extracts its text and a
   few font properties into validated fields.

   Fields:
      run_text: the run text; required, at least one character long.
      run_font_name: ``run.font.name``; string or None.
      run_font_size_pt: ``run.font.size.pt`` when a size is set, otherwise None.
      run_bold: ``run.bold``; bool or None.
      run_italic: ``run.italic``; bool or None.

   Raises:
      pydantic ValidationError when ``run`` is missing or not a docx run, or when an
      extracted value fails the field constraints (e.g. empty text).
   """
   run_text : str = Field(..., min_length = 1) #required, must be string, must be 1 long or more
   run_font_name : Optional[str] = Field(...) #required, must be string or None value
   run_font_size_pt : Optional[float] = Field(...)#Required, but must be float OR none value
   run_bold : Optional[bool] = Field(...) #Required, but must be bool OR none value
   run_italic : Optional[bool] = Field(...) #Required, but must be bool OR none value

   @root_validator(pre=True) #TODO Try have post validator for runs only?
   def _docx_structure_check(cls, values: Dict[str, Any]) -> Dict[str, Any]:
      """Replace the raw ``run`` input with the flat field values read from it."""
      run = values.get("run",False)
      # Direct attribute lookup; eval() of a constant dotted name added nothing
      assert isinstance(run, docx.text.run.Run), 'please enter a docx run assigned to the variable "run", in the form of     run = your_run_here'

      font = run.font
      font_size = font.size
      return {
         'run_text': run.text,
         'run_font_name': font.name,
         # a run without an explicit size has font.size None, which has no .pt to read
         'run_font_size_pt': font_size.pt if font_size is not None else None,
         'run_bold': run.bold,
         'run_italic': run.italic,
      }


class Docx_Run_List (BaseModel):
   """input:   run_list = your_runs_in_a_list   (optionally paragraph_enumeration = index)

   Given a list of python-docx run objects, parses each one with Docx_Run and stores
   the results column-wise: each field is a list holding that feature for every run,
   in run order, so index ``i`` of every list describes run ``i``.

   ``paragraph_enumeration`` is used only to label log messages.

   Raises:
      pydantic ValidationError when ``run_list`` is missing or empty, or when a run is
      rejected with a validation error. The wrapped error is the TypeError returned by
      logger_root_validation_error_messages.
      RuntimeError ("non-validation error") when parsing a run fails with anything else.
   """
   #because the internals are validated, don't need to validate these other than that they were made into lists
   run_text : List[Any] = Field(...) #Required, must be list
   run_font_name : List[Any] = Field(...) #Required, must be list
   run_font_size_pt : List[Any] = Field(...) #Required, must be list
   run_bold : List[Any] = Field(...) #Required, must be list
   run_italic : List[Any] = Field(...) #Required, must be list

   @root_validator(pre=True) #TODO Try have post validator for runs only?
   def _docx_structure_check(cls, values: Dict[str, List[Any]]) -> Dict[str, Any]:
      """Parse every run and pivot the per-run results into one list per feature."""
      paragraph_enumeration = values.get('paragraph_enumeration',"<<FAILURE_paragraph_enumeration>>")
      runs = values.get("run_list",False)
      if not runs:
         raise ValueError('please enter a docx run list assigned to the variable "run_list", in the form of     run_list = your_run_list_here')

      suppress = {'type': ['value_error.any_str.min_length' #ignore zero length run_text, per run validator
                           ],
                  'msg': ['suppressed Validation Error'] #ignore suppressed errors earlier/lower in the stack      
      }
      logger_details = {'function':'parsed_run', 'paragraph_enumeration':paragraph_enumeration }

      new_values: Dict[str, List[Any]] = {}
      for run_enumeration, run in enumerate(runs):
         try:
            parsed_run = Docx_Run(**{'run':run}) #this manner of root unpacking seems to give warnings since linter can't assess ahead of time
         except Exception as e:
            # Exception rather than BaseException: KeyboardInterrupt / SystemExit must
            # propagate untouched instead of being relabelled as a parsing failure
            new_e = logger_root_validation_error_messages(e, logger_details, suppress, run_enumeration=run_enumeration)
            raise new_e from e
         for k, v in parsed_run.dict().items():
            new_values.setdefault(k, []).append(v)

      return new_values


Writing _Docx_Paragraph__Docx_Run__Docx_Run_List


In [None]:
from itertools import compress, tee

def pairwise(iterable):
    """Return an iterator over consecutive overlapping pairs of ``iterable``.

    pairwise('ABCDEFG') --> AB BC CD DE EF FG
    Inputs with fewer than two items produce no pairs.
    """
    leading, trailing = tee(iterable, 2)
    # shift the second copy one item ahead; the None default keeps an empty input from raising
    next(trailing, None)
    return zip(leading, trailing)

def logger_root_validation_error_messages(e, logger_details, suppress = None, run_enumeration: Optional[int] = None) -> Union[RuntimeError, TypeError]:
   """Log a parsing failure and return (not raise) the exception the caller should raise.

   Args:
      e: the caught exception. If it exposes a callable ``errors()`` (as pydantic's
         ValidationError does) each returned error dict is inspected individually.
      logger_details: dict with the keys 'function' and 'paragraph_enumeration', used
         to label the log lines.
      suppress: optional dict with the keys 'type' and/or 'msg', each a collection of
         error types / messages regarded as expected. None, or any non-dict value such
         as the former ``[]`` default, means nothing is suppressed.
      run_enumeration: index of the run being parsed, added to the log label when given.

   Returns:
      TypeError("suppressed Validation Error") when every reported error is suppressed
      (each one is logged at INFO level).
      TypeError("un-suppressed Validation Error") when at least one reported error is
      not suppressed (those are logged at ERROR level).
      RuntimeError("non-validation error") when ``e`` reports no error details at all
      (logged at ERROR level).
   """
   #TODO add ability to handle assertion errors
   run_num = f"|run#{run_enumeration}|" if run_enumeration is not None else ""
   context = f"{logger_details['function']}|{type(e)}|para#{logger_details['paragraph_enumeration']}{run_num}"

   # The former list default was indexed like a dict, which raised inside a bare
   # except and mislabelled every validation error as a non-validation error
   suppress_rules = suppress if isinstance(suppress, dict) else {}
   suppressed_types = suppress_rules.get('type', ())
   suppressed_msgs = suppress_rules.get('msg', ())

   error_details = None
   list_errors = getattr(e, 'errors', None)
   if callable(list_errors):
      try:
         error_details = list(list_errors())
      except Exception:
         # an errors() that itself fails is treated like an exception without details
         error_details = None

   if not error_details:
      logger.error(f"|unsuppressed|{context}, with error: {e}")
      return RuntimeError("non-validation error")

   # Every error is examined and logged; one unexpected error makes the whole failure
   # un-suppressed, so the outcome no longer depends on the order of the errors
   all_suppressed = True
   for err in error_details:
      is_suppressed = isinstance(err, dict) and (
         err.get('type') in suppressed_types or err.get('msg') in suppressed_msgs
      )
      if is_suppressed:
         logger.info(f"|SUPRESSED|{context}, with validation? error: {err}")
      else:
         all_suppressed = False
         logger.error(f"|unsuppressed|{context}, with validation? error: {err}")

   if all_suppressed:
      return TypeError("suppressed Validation Error")
   return TypeError("un-suppressed Validation Error")


def pular_str_strip_check(s:str) ->bool:
   """Return True when ``s`` has leading or trailing whitespace, i.e. ``strip()`` shortens it."""
   return len(s.strip()) != len(s)


class Docx_Paragraph_and_Runs (BaseModel):
   """input:   paragraph = your_paragraph_here, paragraph_enumeration = its_integer_index

   Parses a python-docx paragraph together with all of its runs. The model declares no
   fields of its own; ``extra = 'allow'`` lets the root validator attach the flattened
   result as attributes: ``paragraph_enumeration``, the ``para_*`` values produced by
   Docx_Paragraph and the run-aligned ``run_*`` lists produced by Docx_Run_List.
   """

   class Config:
      extra = 'allow'
      # arbitrary_types_allowed = True

   @root_validator(pre=True) #TODO Try have post validator for runs only?
   def _docx_structure_check(cls, values: Dict[str, Any]) -> Dict[str, Any]:
      """Replace the raw inputs with the flattened paragraph and run features.

      Failures of the nested models are logged through
      logger_root_validation_error_messages and the exception it returns is raised.
      """
      para = values.get("paragraph",False)
      # Direct attribute lookup; eval() of a constant dotted name added nothing
      assert isinstance(para, docx.text.paragraph.Paragraph), 'please enter a docx paragraph assigned to the variable "paragraph", in the form of     paragraph = your_paragraph_here'

      paragraph_enumeration = values.get('paragraph_enumeration',None)
      assert isinstance(paragraph_enumeration, int), "assertion error, bad paragraph count/paragraph_enumeration value passed. Please pass an integer"
      new_values: Dict[str, Any] = {'paragraph_enumeration': paragraph_enumeration}

      #setting up error and logger handling
      #suppress these errors
      suppress = {'type': ['value_error.any_str.min_length' #ignore zero length run_text, per run validator
                           ],
                  'msg': ['suppressed Validation Error'] #ignore suppressed errors earlier/lower in the stack      
      }

      #try to extract para features,
      logger_details = {'function':'Docx_Paragraph', 'paragraph_enumeration':paragraph_enumeration }
      try:
         parsed_para = Docx_Paragraph(**{'paragraph':para})
      except Exception as e:
         # Exception rather than BaseException so KeyboardInterrupt / SystemExit are
         # not converted into parsing failures
         new_e = logger_root_validation_error_messages(e, logger_details, suppress)
         raise new_e from e
      new_values.update(parsed_para.dict())

      #try to extract runs features
      logger_details = {'function':'Docx_Run_List', 'paragraph_enumeration':paragraph_enumeration }
      try:
         parsed_runs = Docx_Run_List(**{'run_list':para.runs, 'paragraph_enumeration':paragraph_enumeration})
      except Exception as e:
         new_e = logger_root_validation_error_messages(e, logger_details, suppress)
         raise new_e from e
      new_values.update(parsed_runs.dict())

      return new_values


   def interogate__para_text(self) -> str:
      """Return the stored ``para_text``; an absent or empty value is logged as a warning and returned as ""."""
      t = getattr(self, 'para_text', "")
      if len(t) == 0:
         logger.warning('interogator did not find para_text')
      #    print("no para_text with:\n\t", self.dict())
      return t

   def paragraph_logger(self,level:int,msg:str,print_bool:bool):
      """Print ``msg`` when ``print_bool`` is true, otherwise log it at the numeric ``level``."""
      if print_bool:
         print(msg)
      else:
         logger.log(level,msg)


   def single_run_feature_identify(self,params:Dict[str,Any]) -> Tuple[bool,List[bool],List[Any]]:
      """Report which runs have a given feature value.

      Args:
         params: dict with 'docxFeature' (name of a run-list attribute, e.g.
            'run_bold') and 'value' (the value to look for).

      Returns:
         (found, mask, values): ``values`` is the run list stored under that attribute
         ([None] when the attribute is absent), ``mask`` flags the entries equal to
         the wanted value and ``found`` is True when any entry matched.
         The tuple itself is always truthy, so callers must unpack it rather than
         test the return value directly.

      Raises:
         AssertionError: ``paragraph_enumeration`` is not an int or 'docxFeature' is not a str.
      """
      enumeration : Optional[int] = getattr(self,"paragraph_enumeration",None)
      assert isinstance(enumeration, int),f"bad value for 'paragraph_enumeration' {enumeration}"
      feature = params['docxFeature']
      assert isinstance(feature,str),f"bad value for parameter 'docxFeature'. Check params: {params}"

      wanted = params['value']
      list_from_runs: List[Any] = getattr(self,feature,[None])
      mask: List[bool] = [x == wanted for x in list_from_runs]
      return any(mask), mask, list_from_runs

   def modify_run_lists(self, drop_runs: Optional[List[int]] = None, add_runs: Optional[Tuple[int, List[List[Any]]]] = None, merge_runs : Optional[bool] = None) -> Optional[Dict[int, List[Any]]]:
      """Drop, insert and/or merge runs, keeping every ``run_*`` list aligned.

      A "run" here is one row holding the value of every Docx_Run_List field, in schema
      order, with ``run_text`` first. The steps are applied in the order drop, add, merge.

      Args:
         drop_runs: indexes (negative allowed) of runs to remove. All indexes refer to
            the run positions before anything is removed.
         add_runs: ``(index, rows)``; the rows are inserted before ``index`` in the
            order given, and ``index == -1`` appends them at the end. Every row needs
            one value per Docx_Run_List field.
         merge_runs: when true, neighbouring runs whose features other than
            ``run_text`` are all equal are merged and their texts concatenated.

      Returns:
         When ``drop_runs`` is given: a dict mapping each dropped run's original
         (non-negative) index to the removed row. Otherwise None.

      Raises:
         ValueError: the paragraph has no runs stored.
         RuntimeError: ``drop_runs`` was given but nothing was removed.
         IndexError: a drop index is out of range.
         AssertionError: the schema does not start with run_text, or an added row has
            the wrong number of features.
      """
      run_list_req_features: List[str] = Docx_Run_List.schema()['required']
      # The text column must come first: merging concatenates column 0 and compares the rest
      assert run_list_req_features[0] == 'run_text', "first feature in the schema should be run_text"

      feature_run_lists : List[List[Any]] = [getattr(self,f,[]) for f in run_list_req_features]
      # pivot from one list per feature to one row per run
      pivoted_run_lists: List[List[Any]] = list(map(list, zip(*feature_run_lists)))
      number_of_runs : int = len(pivoted_run_lists)
      if number_of_runs < 1:
         raise ValueError('this paragraph does not have values in the run lists')

      dropped_runs: Optional[Dict[int, List[Any]]] = None
      if drop_runs is not None:
         dropped_runs = {}
         # Normalise negative indexes and pop from the highest index down, so removing
         # one run never shifts the position of another run still to be removed
         targets = sorted({ind + number_of_runs if ind < 0 else ind for ind in drop_runs}, reverse=True)
         for ind in targets:
            dropped_runs[ind] = pivoted_run_lists.pop(ind)
         if number_of_runs == len(pivoted_run_lists):
            raise RuntimeError('the runs_lists were not shortened as expected')

      if add_runs is not None:
         insert_ind, add_lists = add_runs
         number_of_features = len(run_list_req_features)
         assert all(len(lst) == number_of_features for lst in add_lists), "the added list of lists must have runs of the same length (feature space) as run_lists features in the schema: Docx_Run_List.schema()['required']"
         if insert_ind == -1:
            insert_ind = len(pivoted_run_lists)
         # Slice assignment keeps the given order; the rows are copied so a later merge
         # cannot modify the caller's lists
         pivoted_run_lists[insert_ind:insert_ind] = [list(lst) for lst in add_lists]

      if merge_runs:
         merged_run_lists: List[List[Any]] = []
         for run in pivoted_run_lists:
            if merged_run_lists and merged_run_lists[-1][1:] == run[1:]:
               merged_run_lists[-1][0] = merged_run_lists[-1][0] + run[0]
            else:
               merged_run_lists.append(run)
         pivoted_run_lists = merged_run_lists

      if drop_runs is not None or add_runs is not None or merge_runs:
         if pivoted_run_lists:
            feature_run_lists = list(map(list, zip(*pivoted_run_lists)))
         else:
            # every run was dropped: zip() would yield no columns at all
            feature_run_lists = [[] for _ in run_list_req_features]
         for f, column in zip(run_list_req_features, feature_run_lists):
            setattr(self, f, column)

      return dropped_runs


      
   # def cleaner(self,params:Dict[str,Any]):

   #    def remove_para_leading_whitespace(self):
   #       this_run_len = len(self.run_text[0])
   #       stripped_run = self.run_text[0].lstrip()
   #       # if len(stripped_run) == 0:
   #          # self

# run_text=['WOOP-  ', 'woopude  ', 'var.- woofude; V. woof- (1)  ', 'Dcz  C<FJ>,Z<FJ,FT>']
# run_font_name=[None, 'TmsRmn 10pt', 'TmsRmn 10pt', 'Helv 8pt']
# run_font_size_pt=[12.0, None, None, 8.0]
# run_italic=[None, None, True, None]
# run_bold=[None, True, None, None]



# features = [run_text,run_font_name,run_font_size_pt,run_italic,run_bold]
# # print(features)
# rotated_features = list(zip(*features))
# # print(rotated_features)
# rotated_features2 = list(zip(*rotated_features))
# rotated_features3 = list(map(list, zip(*rotated_features)))
# print(rotated_features2)
# print(rotated_features3)
# # print(list(pairwise(rotated_features3)))
# print(features[:0]+features[1:])


In [None]:
class Fula_Entry (BaseModel):
   """Container for one dictionary entry and the parsed paragraphs that belong to it.

   NOTE(review): the fields are not populated anywhere in the visible cells; the
   descriptions below follow the declarations only -- confirm against the intended design.
   """
   entity_word: Optional[str]                         # optional string; presumably the headword -- TODO confirm
   features: Optional[Dict[str,str]] = {}             # optional string-to-string mapping, empty by default
   paragraphs_list: List[Any]                         # required list, items not validated
   paragraphs_extr : List[Docx_Paragraph_and_Runs]    # required list of parsed paragraph models
   # Annotated explicitly: un-annotated attributes depend on pydantic inferring the type
   # from the default value, which does not guarantee field order
   sub_roots: List[Any] = []
   lemmas: List[Any] = []

In [None]:
%%capture cap --no-stderr

#get current datetime
now = datetime.now()
current_time = now.strftime("%Y-%m-%d_-_%H-%M-%S")

#create file to save prints (use with jupyter magic enabled at the top of this cell: %%capture cap --no-stderr)
output_name = f"logs_and_outputs/{current_time}_docxFileParseResult.txt"
experiment = input("Enter emperiment description:")
print(f"Experiment time: {current_time}\nExperiment note: {experiment}\n\n")

logger_filename = f"logs_and_outputs/{current_time}docxFileParse.log"
# FileHandler opens the file immediately and raises if the folder does not exist
os.makedirs(os.path.dirname(logger_filename), exist_ok=True)

# The root logger, shared with every other cell
logger = logging.getLogger()

# Keep WARNING and above
logger.setLevel(logging.WARNING)

# Detach file handlers left by earlier cells or runs; otherwise every record is also
# written to those older log files and their handles stay open
for stale_handler in [h for h in logger.handlers if isinstance(h, logging.FileHandler)]:
   logger.removeHandler(stale_handler)
   stale_handler.close()

#add encoding
handler = logging.FileHandler(logger_filename, 'w', 'utf-8')
handler.setFormatter(logging.Formatter('%(asctime)s %(message)s'))
logger.addHandler(handler)

#Run docx module to parse the docx file
docx_filename = "Fula_Dictionary-repaired.docx"
# docx_filename = "pasted_docx page 1.docx"
document = Document(docx_filename)


from collections import Counter
char_counts = Counter()          # character frequencies over all successfully parsed paragraphs

docx_object_list = []            # (index, docx paragraph) for every paragraph in the file
parsed_object_list = []          # (index, Docx_Paragraph_and_Runs) for paragraphs that parsed
failed_paras_ind = []            # (index, docx paragraph) for paragraphs that failed to parse

for i, para in enumerate(document.paragraphs):
   docx_object_list.append((i,para))
   try:
      entryObj = Docx_Paragraph_and_Runs(**{'paragraph': para, 'paragraph_enumeration': i})
   except Exception as e:
      # Exception rather than BaseException so the loop can still be interrupted
      failed_paras_ind.append((i,para))
      # The wrapped message is only reachable on a pydantic ValidationError; any other
      # exception (e.g. a RuntimeError) has a different args layout and must not crash here
      try:
         suppressed = e.args[0][0].exc.args[0] == 'suppressed Validation Error'
      except (AttributeError, IndexError, KeyError, TypeError):
         suppressed = False
      if not suppressed:
         print('\npara number: ',i)
   else:
      para_text = entryObj.interogate__para_text()
      char_counts.update(para_text)
      parsed_object_list.append((i,entryObj))
      # Report paragraphs containing a line break. The former lookup of repr('\n') in
      # the counter used a four-character key that can never be present
      if '\n' in para_text:
         print(i)

      # p_text = entryObj.interogate__para_text()
      # if not set(p_text).isdisjoint(low_freq_odd_chars):
      #    msg = 'rare_characters\t\t'+p_text
      #    entryObj.paragraph_logger(level=40,msg = msg, print_bool=False)
print(failed_paras_ind)
print('i can print')



In [None]:
# Number of paragraphs that were parsed successfully
print(len(parsed_object_list))

# Save the stdout collected in `cap` under the file name chosen in the parsing cell
with open(output_name, mode='w', encoding="utf-8") as captured_output_file:
    captured_output_file.write(cap.stdout)

32040


In [None]:
%%capture cap --no-stderr

#get current datetime
now = datetime.now()
current_time = now.strftime("%Y-%m-%d_-_%H-%M-%S")

#create file to save prints (use with jupyter magic enabled at the top of this cell: %%capture cap --no-stderr)
output_name = f"logs_and_outputs/{current_time}_docxFileParseResult.txt"
experiment = input("Enter emperiment description:")
print(f"Experiment time: {current_time}\nExperiment note: {experiment}\n\n")

logger_filename = f"logs_and_outputs/{current_time}docxFileParse.log"
# FileHandler opens the file immediately and raises if the folder does not exist
os.makedirs(os.path.dirname(logger_filename), exist_ok=True)

# The root logger, shared with every other cell
logger = logging.getLogger()

# Keep WARNING and above
logger.setLevel(logging.WARNING)

# Detach file handlers left by earlier cells or runs; otherwise every record is also
# written to those older log files and their handles stay open
for stale_handler in [h for h in logger.handlers if isinstance(h, logging.FileHandler)]:
   logger.removeHandler(stale_handler)
   stale_handler.close()

#add encoding
handler = logging.FileHandler(logger_filename, 'w', 'utf-8')
handler.setFormatter(logging.Formatter('%(asctime)s %(message)s'))
logger.addHandler(handler)

#Run docx module to parse the docx file
docx_filename = "Fula_Dictionary-repaired.docx"
# docx_filename = "pasted_docx page 1.docx"
document = Document(docx_filename)

# docx_object_list = []
# parsed_object_list = []
# failed_paras_ind = []
root_ind_list = []        # paragraph indexes with at least one 12.0 pt run
lemma_ind_list = []       # paragraph indexes with at least one bold run
rejected_ind_list = []    # paragraph indexes whose feature check raised

# The same for every paragraph, so it is built once instead of once per iteration
featureConfig = {
'root': {'docxFeature': 'run_font_size_pt',
         'strSummary':'fontSize_12.0', 
         'value':12.0},
'lemma': {'docxFeature': 'run_bold',
         'strSummary':'fontBold', 
         'value':True},
}

for i, entryObj in parsed_object_list:

   try:
      # single_run_feature_identify returns (found, mask, values). Testing the tuple
      # itself is always true, so only its first element may drive the branch
      is_root, _root_mask, _root_values = entryObj.single_run_feature_identify(featureConfig['root'])
      if is_root:
         print('\n\nroot at para number: ',i)
         paraText = entryObj.interogate__para_text()
         print('\t',paraText)
         root_ind_list.append(i)

      is_lemma, _lemma_mask, _lemma_values = entryObj.single_run_feature_identify(featureConfig['lemma'])
      if is_lemma:
         paraText = entryObj.interogate__para_text()
         print('\t\tp#',i,'\t\t',paraText)
         lemma_ind_list.append(i)

   except Exception as e:
      # Exception rather than BaseException so the loop can still be interrupted
      rejected_ind_list.append(i)
      # The wrapped message is only reachable on a pydantic ValidationError; other
      # exceptions (e.g. AssertionError) have a different args layout and must not crash here
      try:
         suppressed = e.args[0][0].exc.args[0] == 'suppressed Validation Error'
      except (AttributeError, IndexError, KeyError, TypeError):
         suppressed = False
      if not suppressed:
         print('\npara number: ',i)

print('total paras: ',len(docx_object_list))
print('parsed paras: ',len(parsed_object_list))
print('failed paras: ',len(failed_paras_ind))
print('roots: ',len(root_ind_list))
print('lemmas: ',len(lemma_ind_list))
print('additional rejects: ',len(rejected_ind_list))

In [None]:
# Save the stdout collected in `cap` under the most recently assigned output_name
with open(output_name, mode='w', encoding="utf-8") as captured_output_file:
    captured_output_file.write(cap.stdout)

In [None]:

#these frequencies were copied from a previous run, and only from successfully parsed objects
#the lowest frequencies were reviewed and selections pulled from those
   # low_freq_odd_chars = ('\t', 72), ('5', 67), ('`', 64), ('&', 49), ('ù', 30), ('ï', 26), ('X', 25), ('!', 15), ('"', 14), ('ò', 8), ('=', 4), ('Q', 4), ('\xa0', 1)
   # low_freq_odd_chars = [x[0] for x in low_freq_odd_chars]
#numbers do not appear to be used outside of scholarly references and some multiple-root instances
   # nums = list(range(10))
#X, for example, appears almost only in English or French glosses, or in scholarly references
   #('X', 25),

In [None]:
#cleaning notes
# `new Kunari' - region in western Niger ; `nouveau Kounari' - région dans l'ouest du Niger
   #here the ` seems to be used at the beginning of a quotation, and a normal apostrophe at the end

In [None]:
# # char_counts
# Order the (character, count) pairs by descending count; equal counts are ordered by character
sorted_char_val = sorted(
    char_counts.items(),
    key=lambda char_and_count: (-char_and_count[1], char_and_count[0]),
)
print(sorted_char_val)

[]
