In [1]:
import pickle
from itertools import tee
from typing import Optional, Dict, List, Any, Union, Tuple
from collections import Counter
import numpy as np
import re
import json
# import docx
# from docx import Document
# from pydantic_docx import * #type:ignore
from datetime import datetime

In [2]:
import logging
now = datetime.now()
current_time = now.strftime("%Y-%m-%d_-_%H-%M-%S")
logger_filename = f"logs_and_outputs/{current_time}docxFileParse.log"

handler = logging.FileHandler(logger_filename, 'w', 'utf-8') 
handler.setFormatter(logging.Formatter('%(asctime)s %(message)s'))

# logging.setLogRecordFactory(factory)
logging.basicConfig(handlers=[handler], level=logging.DEBUG)
logger = logging.getLogger()

### Helper Functions

In [1]:
def pairwise(iterable):
    # pairwise('ABCDEFG') --> AB BC CD DE EF FG
    a, b = tee(iterable)
    next(b, None)
    return zip(a, b)
    
def logger_root_validation_error_messages(e, logger_details, suppress = [], run_enumeration: Optional[int] = None) -> Union[RuntimeError, TypeError]:      
   #TODO add ability to handle assertion errors
   if run_enumeration is not None:
      run_num = f"|run#{run_enumeration}|" #type: ignore 
   else:
      run_num = ""
   try:
      for err in e.errors():
         if err['type'] in suppress['type'] or err['msg'] in suppress['msg']:
            logger.info(f"|SUPRESSED|{logger_details['function']}|{type(e)}|para#{logger_details['paragraph_enumeration']}{run_num}, with validation? error: {err}")
            return TypeError("suppressed Validation Error")
         else:
            logger.error(f"|unsuppressed|{logger_details['function']}|{type(e)}|para#{logger_details['paragraph_enumeration']}{run_num}, with validation? error: {err}")
            return TypeError("un-suppressed Validation Error")
   except:
      logger.error(f"|unsuppressed|{logger_details['function']}|{type(e)}|para#{logger_details['paragraph_enumeration']}{run_num}, with error: {e}")
      return RuntimeError("non-validation error")
   return RuntimeError("non-validation error")

def pular_str_strip_check(s:str) ->bool:
   in_len = len(s)
   new_s = s.strip()
   out_len = len(new_s)
   purported_whitespace: bool = in_len != out_len
   return purported_whitespace

def closest(ranger, target): #any target indeces occuring before the first ranger index will be ignored
   if not isinstance(target,np.ndarray):
      target = np.array(target)
   for a,b in ranger:
      begin = np.searchsorted(target,a)
      end = np.searchsorted(target,b)
      _, out, target = np.split(target, [begin,end])
      yield list(out)
   yield list(target)

NameError: name 'Optional' is not defined

## Post Processing of Extracted Features

In [4]:
with open('parsed_objectClass_outcomes_dict.pkl', 'rb') as file:
    # Call load method to deserialze
    parsed_objectClass_outcomes_dict = pickle.load(file, encoding='utf-8')

parsed_object_list = parsed_objectClass_outcomes_dict['parsed_object_list'] 
para_text_lookup = parsed_objectClass_outcomes_dict['para_text_lookup'] 
root_ind_list = parsed_objectClass_outcomes_dict['root_ind_list'] 
subroot_ind_list = parsed_objectClass_outcomes_dict['subroot_ind_list'] 
lemma_ind_list = parsed_objectClass_outcomes_dict['lemma_ind_list'] 
root_and_lemma_one_line = parsed_objectClass_outcomes_dict['root_and_lemma_one_line'] 
root_lookup = parsed_objectClass_outcomes_dict['root_lookup'] 
lemma_lookup = parsed_objectClass_outcomes_dict['lemma_lookup'] 
char_counts = parsed_objectClass_outcomes_dict['char_counts']
normal_para_ind_list = parsed_objectClass_outcomes_dict['normal_para_ind_list']
# 7/24/2022 5:34pm
# total paras:  32507
# parsed paras:  32040
# handled errors:  467
# failed paras:  0
# roots:  6381
# subroots:  713
# lemmas:  9610
# root_and_lemma_one_line:  1438
# additional cleaner rejects:  0
# additional error rejects:  0
# num entities:  15266
# num_good_paras_of_other_content:  16774

### trying better hierarchy

In [None]:
allroots = root_ind_list.copy()
allroots.extend(subroot_ind_list)
allroots = sorted(allroots)

root_aligned_lemmas = list(closest(pairwise(allroots), lemma_ind_list))
root_aligned_subroots = list(closest(pairwise(root_ind_list), subroot_ind_list))
lemma_aligned_paras = list(closest(pairwise(lemma_ind_list),normal_para_ind_list))


num_schema = {}
lemma_count = 0
last_root = 0
for i, r in enumerate(allroots):
   if r in root_ind_list:
      last_root = r
      num_schema[last_root] = {}
   # r = root_lookup.get(r)
   current_root = {}
   for j, lemma in enumerate(root_aligned_lemmas[i]):
      lemma = int(lemma)
      # lemma = lemma_lookup.get(lemma)
      current_root[lemma] = []
      # print('\n\t',lemma)
      for k, parag in enumerate(lemma_aligned_paras[lemma_count]):
         # print('\t\t',parag)
         current_root[lemma].append(int(parag))
      lemma_count +=1
   if r in subroot_ind_list:
      num_schema[last_root][r] = current_root
   else:
      num_schema[r] = current_root
   raise NotImplementedError()
# outcomes_dict['entity_index_schema'] = num_schema

In [None]:

# is_root, is_lemma, is_subroot, is_para = True, False, False, False
# rootRec = [i, False, False, False]
# subrootRec = ([prev,i,False,False]
# records = [
#    ['root', 'root_sub', 'lemma', 'para']
# ]
# if is_root and not is_subroot:
#    newRec = [False]*4
#    newRec[0] = i
# if is_subroot:
#    newRec = records[-1].copy()
#    newRec[1] = i
#    newRec[2] = False
#    newRec[3] = False
# if is_lemma:
#    if any(i == records[0], i == records[0]):
#       newRec
#    newRec = newRec.copy()
#    newRec[2] = i
#    newRec[3] = False
#    records.append(newRec)



### prev attempts

In [5]:
#Creating lists of indexes for use in remaining processing

# # print(len(parsed_object_list)) #32040
#note this parsed paras are packaged with the original para number in the docx
#therefore 32507 is the index to use, but len of that IS 32040, since ~500 were empty paragraphs that were not parsed
parsed_para_indexes = [i for i,p in parsed_object_list]

lim = 32507
# rootInds = [x for x in root_ind_list if x < lim]
rootInds=root_ind_list
# subRootInds = [x for x in subroot_ind_list if x < lim]
subRootInds=subroot_ind_list
# lemmaInds = [x for x in lemma_ind_list if x < lim]
lemmaInds=lemma_ind_list

allroots = rootInds.copy()
allroots.extend(subRootInds)
allroots = sorted(allroots)
allEntities = allroots.copy()
allEntities.extend(lemmaInds)
allEntities = sorted(set(allEntities))
print('allEntities: ',len(allEntities))


# all_paras = [i for i,obj in parsed_object_list if i < lim]
normal_para = parsed_para_indexes.copy()
[normal_para.remove(i) for i in allEntities]
print('normal_para: ',len(normal_para))

# normal_para = [x for x in parsed_para_indexes if all([x not in rootInds, x not in lemmaInds, x not in subRootInds])]
# print(normal_para)
# print(allroots)

allEntities:  15266
normal_para:  16774


In [6]:
#Creating Sub-Root combined text
root_aligned_subroots = list(closest(pairwise(rootInds), subRootInds))
sub_root_appended_text = []
join_point_indicator = '#'
root_lookup_concat = root_lookup.copy()
for ri,l in enumerate(root_aligned_subroots):
   if len(l) > 0:
      main_root_txt = root_lookup[rootInds[ri]]
      for sri in l:
         combined_text = main_root_txt + join_point_indicator + root_lookup[sri]
         sub_root_appended_text.append(combined_text)
         root_lookup_concat[sri] = combined_text
      # break
# sub_root_appended_text

In [7]:
#Helper functions / creating aligned index lists (nested)
def get_blank_dict_like_article() -> Dict[int,Any]:
   # article_entry_ind = list(range(3,len(lemma_ind_list)+3))
   return {i:'' for i in range(len(lemmaInds))}

# # from operator import itemgetter
# def get_para_text(lst: Union[int,List[int]]) -> Union[str,List[str]]:
#    if isinstance(lst,list):
#       result = [para_text_lookup[i] for i in lst]
#    else:
#       result = [para_text_lookup[i]]
#    return result

root_aligned_lemmas = list(closest(pairwise(allroots), lemmaInds))
lemma_aligned_paras = list(closest(pairwise(lemmaInds),normal_para))

In [8]:
# READ_ME

#    Dict_Index.pdf: Source file

#    Fula_Dictionary: Source file transformed into .txt format

# fula_dictionary_abbreviations.xlsx: Fula dictionary abbreviations, not used in parsing

#    Fula_Dictionary.doc: Source file

# FulaAnnotations.txt: Parsed file
#    Entry ID	Annotations if any, separated by tabs

# FulaDialects.txt: Parsed file
#    Entry ID	Annotation on Dialect, if any, separated by tabs

# FulaEntries.txt: Parsed file (raw; words, POS not cleaned here)
#    Entry ID	Fula word	English translation	French translation

# FulaInParentheses.txt: List of abbreviations found in between parentheses in the Fula word part of FulaEntries.txt; list gathered because we don’t know the meaning of these abbreviations

# FulaLemmas.txt: Parsed file
# Entry ID	Fula lemmas, separated by tabs, at least 1

# FulaPOSTags.txt: Parsed file
# Entry ID	POS Tags, if any, separated by tabs

# FulaRoots.txt: Parsed file
# Entry ID	Fula Root

# FulaSenseAnnotations.txt: Parsed file
# Entry ID	Sense ID	Annotations associated to the sense, if any, separated by tabs

# FulaSenseClassifications.txt: Parsed file
# Entry ID	Sense ID	Classification information (in Latin), if any, separated by tabs

# FulaSenseEnglish.txt: Parsed file
# Entry ID	Sense ID	English cleaned translation

# FulaSenseFrench.txt: Parsed file
# Entry ID	Sense ID	French cleaned translation

# FulaSynonyms.txt: Parsed file
# Entry ID	Synonyms if any, separated by tabs

# POS.txt: List of the POS tags, for recognition and classification

# Root Origins.txt: List of the Fula Root tags, for recognition and classification

# WordnetConnecter1.py: First round of connection to WordNet

# WordnetConnecter2.py: Second round of connection to WordNet

# WordnetEnglish1.txt: Result of WordnetConnecter1.py
# Entry ID	Sense ID	Confidence Score	WordNet Synset names, separated by tabs

# WordnetEnglish2.txt: Result of WordnetConnecter2.py
# Entry ID	Sense ID	Subsense ID	Confidence Score	WordNet Synset names, separated by tabs

# WordnetUnconnected1.txt: Fula Entries not connected to WordNet after WordnetConnecter1.py
# Entry ID	Sense ID	English cleaned translation

# WordnetUnconnected2.txt: Fula Entries not connected to WordNet after WordnetConnecter2.py
# Entry ID	Sense ID	English cleaned translation

In [9]:
# txt_FulaAnnotations = get_blank_dict_like_article() #any extra para after 2 paras of glosses
# txt_FulaEntries = get_blank_dict_like_article() 
# txt_FulaLemmas = get_blank_dict_like_article()

# txt_FulaRoots = get_blank_dict_like_article()

# txt_FulaSenseEnglish = get_blank_dict_like_article()
# txt_FulaSenseFrench = get_blank_dict_like_article()
# txt_FulaSenseClassifications = get_blank_dict_like_article()
# txt_FulaSenseEnglishAnnotations = get_blank_dict_like_article() #TODO composite sense [count] classifications and annotations in gloss line
# txt_FulaSenseFrenchAnnotations = get_blank_dict_like_article() #TODO composite sense [count] classifications and annotations in gloss line?

# # txt_FulaDialects = #TODO will need to process the lemma line
# # txt_FulaPOSTags = #TODO will need to process the lemma line
# # txt_FulaSynonyms = #TODO will need to process the lemma line

# #>>>txt_FulaInParenthesis is different structure, agg of ___
# #>>>txt_POS is different structure, agg of POS tags
# #>>>txt_RootOrigins is different structure, agg of origins (after root)

# # get_para_text(##takes docx index)
# from collections import defaultdict
# irregular_entries = defaultdict(list)
# lemmaCount = 0
# for rootCount,rootDocxInd in enumerate(allroots):
#    # rCt : current root number.ct
#    # root: docx index of root
#    lemmas = root_aligned_lemmas[rootCount] #the lemmas list for this root/subroot
#    for lemmaDocxInd in lemmas:
#       paras = lemma_aligned_paras[lemmaCount]
      
#       entryDocxIndexes = paras.copy() #[lemmaDocxInd,*paras]
#       entryDocxIndexes.insert(0,lemmaDocxInd)
#       entry_text_list = get_para_text(entryDocxIndexes)
#       txt_FulaEntries[lemmaCount] = entry_text_list
#       txt_FulaLemmas[lemmaCount] = lemma_lookup.get(entryDocxIndexes[0])
#       txt_FulaRoots[lemmaCount] = root_lookup_concat[rootDocxInd]
#       if rootDocxInd == lemmaDocxInd:
#          irregular_entries[lemmaCount].append('one-liner: root+lemma')
#       else: pass
#       if len(entryDocxIndexes) > 3:
#          txt_FulaAnnotations[lemmaCount] = get_para_text(entryDocxIndexes[3:])
#       else: pass

#       if len(entryDocxIndexes) == 1:
#          irregular_entries[lemmaCount].append('one-liner: lemma+entry') #will almost certainly need to ignore this since likely will have nothing. MAY be a mistake in parsing, so need to check
#       elif len(entryDocxIndexes) == 2:
#          irregular_entries[lemmaCount].append('#TODO - incomplete entry')
#       else:

#          englishGloss = entry_text_list[1]
#          frenchGloss = entry_text_list[2]
#          senses_English = englishGloss.split(';')
#          senses_French =frenchGloss.split(';')
#          if len(senses_English) != len(senses_French):
#             irregular_entries[lemmaCount].append('inconsistent number of senses')
#          else:
#             txt_FulaSenseClassifications[lemmaCount] = ''
#             txt_FulaSenseEnglish[lemmaCount] = ''
#             txt_FulaSenseEnglishAnnotations[lemmaCount] = ''
#             txt_FulaSenseFrenchAnnotations[lemmaCount] = ''
#             for sense in range(len(senses_English)):
#                txt_FulaSenseClassifications[lemmaCount] = txt_FulaSenseClassifications[lemmaCount] \
#                   + str(lemmaCount) + '\t' + sense + '\n'
#                txt_FulaSenseEnglish[lemmaCount] = txt_FulaSenseEnglish[lemmaCount] \
#                   + str(lemmaCount) + '\t' + sense + '\t' + senses_English[sense] + '\n'
#                txt_FulaSenseFrench[lemmaCount] = txt_FulaSenseFrench[lemmaCount] \
#                   + str(lemmaCount) + '\t' + sense + '\t' + senses_French[sense] + '\n' 
#                parenthesis_pattern = r"\([^\)]+\)"
#                txt_FulaSenseEnglishAnnotations[lemmaCount] = txt_FulaSenseEnglishAnnotations[lemmaCount] \
#                   + str(lemmaCount) + '\t' + sense + '\t' + '\t'.join(re.findall(parenthesis_pattern,senses_English[sense])) + '\n'
#                txt_FulaSenseFrenchAnnotations[lemmaCount] = txt_FulaSenseFrenchAnnotations[lemmaCount] \
#                   + str(lemmaCount) + '\t' + sense + '\t' + '\t'.join(re.findall(parenthesis_pattern,senses_French[sense])) + '\n'
#       #
         
#       lemmaCount +=1
#    if lemmaCount > 25:
#          break
# print(allroots[:25])
# print(list(enumerate(root_aligned_lemmas[:25])))
# # txt_FulaEntries

## Exploring Data Structure

In [10]:
#creating nested structure from indexes assiged different priority levels
# lemma_list_from_ind = list(map(lemma_lookup.get,lemma_ind_list))
# print(lemma_list_from_ind)
# root_list_from_ind = list(map(root_lookup.get,rootInds))
# print(root_list_from_ind)

num_schema = {}
lemma_count = 0
for i, r in enumerate(rootInds):
   # print('\n\n',r)
   # r = root_lookup.get(r)
   num_schema[r] = {}
   for j, lemma in enumerate(root_aligned_lemmas[i]):
      lemma = int(lemma)
      # lemma = lemma_lookup.get(lemma)
      num_schema[r][lemma] = []
      # print('\n\t',lemma)
      for k, parag in enumerate(lemma_aligned_paras[lemma_count]):
         # print('\t\t',parag)
         num_schema[r][lemma].append(int(parag))
      lemma_count +=1

# import pickle
# # Open a file and use dump()
# with open('entity_index_schema.pkl', 'wb') as file:
#     # A new file will be created
#     pickle.dump(num_schema, file)

print(json.dumps(num_schema, indent=4,ensure_ascii=False))

{
    "4": {
        "5": [
            6,
            7
        ],
        "8": [
            9,
            10
        ],
        "11": [
            12,
            13
        ],
        "14": [
            15,
            16
        ],
        "17": [
            18,
            19
        ],
        "20": [
            21,
            22
        ],
        "23": [
            24,
            25
        ],
        "26": [
            27,
            28
        ]
    },
    "29": {
        "30": [
            31,
            32
        ],
        "33": [
            34,
            35
        ]
    },
    "43": {
        "37": [
            38,
            39
        ],
        "40": [
            41,
            42
        ]
    },
    "50": {
        "44": [
            45,
            46
        ],
        "47": [
            48,
            49
        ]
    },
    "54": {
        "51": [
            52,
            53
        ]
    },
    "58": {
        "55": [
            56,


In [11]:
with open('entity_index_schema.pkl', 'rb') as file:
    # Call load method to deserialze
    entity_index_schema = pickle.load(file, encoding='utf-8')

In [12]:
entity_text_schema = {}
for r,ldict in entity_index_schema.items():
   root_text = root_lookup_concat.get(r,'False')
   entity_text_schema[root_text] = {}
   for l, plst in ldict.items():
      lemma_text = lemma_lookup.get(l,'False')
      entity_text_schema[root_text][lemma_text] = [para_text_lookup.get(p,'') for p in plst]
print(json.dumps(entity_text_schema, indent=4,ensure_ascii=False))

{
    "A": {
        "a": [
            "you (sg.)",
            "tu"
        ],
        "-a": [
            "your (sg.) (only with certain nouns such as those which refer to close family members)",
            "ton, ta, tes (seulement pour certains substantifs, tels ceux qui se rapportent aux membres de la proche famille)"
        ],
        "aan": [
            "you (sg.) (emphatic)",
            "toi (emphatique)"
        ],
        "aɗa": [
            "you (sg.)",
            "tu"
        ],
        "-ɗaa, -ɗa": [
            "you (sg.) (used with imperfect and subjunctive forms)",
            "tu (employé avec les formes imparfaites et subjonctives)"
        ],
        "-e": [
            "you (suffix used in certain grammatical conditions)",
            "te (suffixe employé dans certaines situations grammaticales)"
        ],
        "ma, maa": [
            "you (sg.)",
            "te"
        ],
        "ma, maa, maaɗa": [
            "your (sg.)",
            "ton, ta, tes"


In [13]:
len(entity_index_schema.keys())

6381

## Text Handling Validation - Fula Dictionary Text(current Corpus)  

### Python Build in Func/STR testing

#### Validate Whitespace behavior

In [14]:
white_space_chars = [k for k in char_counts if len(k.strip()) == 0]
print(white_space_chars)

[' ', '\xa0', '\t']


#### Upper/Lower Str handling

#Validate Upper/lower
conclusion: using str.upper()/lower() functions is safe. No character in the dataset causes an error when used in those functions, and the only characters that don't cooperate to a new case are non-alphabetical characters such as numbers and punctuation. 
conclusion: using str.upper()==str.lower() is a viable way to check if a character is alphabetical or not.

In [15]:
upperWITHlowerChars = set()
upper_chars = []
lower_chars = []
non_castable = []
error_casting = []
nons = []
found_as_one_case_only = []
for k in char_counts:
   try:
      up = k.upper()
      low = k.lower()
      upperWITHlowerChars.add((up,low))
      if up == low:
         non_castable.append(k)
         upperWITHlowerChars.remove((up,low))
      elif k == up:
         upper_chars.append(up)
      elif k == low:
         lower_chars.append(low)
      else:
         nons.append(k)
   except:
      error_casting.append(k)
print('\nupper_chars:     ',sorted(upper_chars))
print('\nlower_chars:     ',sorted(lower_chars))
print('\nnon_castable:    ',sorted(non_castable))
print('\nerror_casting:   ',sorted(error_casting))
print('\nsilent fails:    ',sorted(nons))

print('\nupperWITHlowerChars:    ',sorted(upperWITHlowerChars))
for u,l in upperWITHlowerChars:
   pair = [u,l]
   unseen_possible_case = False
   if l not in lower_chars and l not in non_castable:
      pair[1] = None
      unseen_possible_case = True
   if u not in upper_chars and u not in non_castable:
      pair[0] = None
      unseen_possible_case = True
   if unseen_possible_case:
      found_as_one_case_only.append(tuple(pair))
      # print("upper possible, but not present:     ",u)
print('\nfound_as_one_case_only:      ', sorted(found_as_one_case_only))


upper_chars:      ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'Ñ', 'Ŋ', 'Ɓ', 'Ɗ', 'Ƴ']

lower_chars:      ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'à', 'â', 'ç', 'è', 'é', 'ê', 'î', 'ï', 'ñ', 'ò', 'ô', 'ù', 'û', 'ŋ', 'ƴ', 'ɓ', 'ɗ']

non_castable:     ['\t', ' ', '!', '"', '&', "'", '(', ')', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '[', ']', '`', '\xa0']

error_casting:    []

silent fails:     []

upperWITHlowerChars:     [('A', 'a'), ('B', 'b'), ('C', 'c'), ('D', 'd'), ('E', 'e'), ('F', 'f'), ('G', 'g'), ('H', 'h'), ('I', 'i'), ('J', 'j'), ('K', 'k'), ('L', 'l'), ('M', 'm'), ('N', 'n'), ('O', 'o'), ('P', 'p'), ('Q', 'q'), ('R', 'r'), ('S', 's'), ('T', 't'), ('U', 'u'), ('V', 'v'), ('W', 'w'), ('X', 'x'), ('Y', 'y'), ('Z', 'z'), ('À', 'à'), ('Â', 

#### str Alphabetical determination correctness

In [16]:
alpha_chars = [x for x in char_counts.keys() if x.upper() != x.lower()]
stralpha = [x for x in alpha_chars if x.isalpha()]
assert stralpha == alpha_chars, 'note that str.isalpha does NOT work safely here'

### Regex and Unicode Testing

#### Unicode

In [17]:
def code_point(c):
   return "U+{:04X}".format(ord(c))
[(c,code_point(c),d,code_point(d)) for c,d in sorted(upperWITHlowerChars)]


[('A', 'U+0041', 'a', 'U+0061'),
 ('B', 'U+0042', 'b', 'U+0062'),
 ('C', 'U+0043', 'c', 'U+0063'),
 ('D', 'U+0044', 'd', 'U+0064'),
 ('E', 'U+0045', 'e', 'U+0065'),
 ('F', 'U+0046', 'f', 'U+0066'),
 ('G', 'U+0047', 'g', 'U+0067'),
 ('H', 'U+0048', 'h', 'U+0068'),
 ('I', 'U+0049', 'i', 'U+0069'),
 ('J', 'U+004A', 'j', 'U+006A'),
 ('K', 'U+004B', 'k', 'U+006B'),
 ('L', 'U+004C', 'l', 'U+006C'),
 ('M', 'U+004D', 'm', 'U+006D'),
 ('N', 'U+004E', 'n', 'U+006E'),
 ('O', 'U+004F', 'o', 'U+006F'),
 ('P', 'U+0050', 'p', 'U+0070'),
 ('Q', 'U+0051', 'q', 'U+0071'),
 ('R', 'U+0052', 'r', 'U+0072'),
 ('S', 'U+0053', 's', 'U+0073'),
 ('T', 'U+0054', 't', 'U+0074'),
 ('U', 'U+0055', 'u', 'U+0075'),
 ('V', 'U+0056', 'v', 'U+0076'),
 ('W', 'U+0057', 'w', 'U+0077'),
 ('X', 'U+0058', 'x', 'U+0078'),
 ('Y', 'U+0059', 'y', 'U+0079'),
 ('Z', 'U+005A', 'z', 'U+007A'),
 ('À', 'U+00C0', 'à', 'U+00E0'),
 ('Â', 'U+00C2', 'â', 'U+00E2'),
 ('Ç', 'U+00C7', 'ç', 'U+00E7'),
 ('È', 'U+00C8', 'è', 'U+00E8'),
 ('É', 'U+

In [18]:
[(c,code_point(c)) for c in sorted(non_castable)]

# ('\t', 'U+0009' -> ('`', 'U+0060')
#  ('\xa0', 'U+00A0'))
# ('A', 'U+0041' -> 'û', 'U+00FB')
# ('Ŋ', 'U+014A' -> 'ƴ', 'U+01B4')


[('\t', 'U+0009'),
 (' ', 'U+0020'),
 ('!', 'U+0021'),
 ('"', 'U+0022'),
 ('&', 'U+0026'),
 ("'", 'U+0027'),
 ('(', 'U+0028'),
 (')', 'U+0029'),
 ('+', 'U+002B'),
 (',', 'U+002C'),
 ('-', 'U+002D'),
 ('.', 'U+002E'),
 ('/', 'U+002F'),
 ('0', 'U+0030'),
 ('1', 'U+0031'),
 ('2', 'U+0032'),
 ('3', 'U+0033'),
 ('4', 'U+0034'),
 ('5', 'U+0035'),
 ('6', 'U+0036'),
 ('7', 'U+0037'),
 ('8', 'U+0038'),
 ('9', 'U+0039'),
 (':', 'U+003A'),
 (';', 'U+003B'),
 ('<', 'U+003C'),
 ('=', 'U+003D'),
 ('>', 'U+003E'),
 ('?', 'U+003F'),
 ('[', 'U+005B'),
 (']', 'U+005D'),
 ('`', 'U+0060'),
 ('\xa0', 'U+00A0')]

#### Regex

#Validate Regex Behavior

In [19]:
impossible_char = '\u0008' #utf backspace (\u0008) is unlikely to appear in a docx, and did not appear in this one.
s = impossible_char.join(char_counts.keys())
re_results = [False]*len(char_counts.keys())
for i, k in enumerate(char_counts):
   pattern = re.escape(k)
   # print(s)
   try:
      m = re.search(pattern,s) #type: ignore
      corrected_ind = m.start()/2
      # print(corrected_ind)
   except: print('exception: ',repr(i))
   # print(corrected_ind)
   if i == corrected_ind:
      re_results[i] = True
   else: print('failure: ',repr(i))
print(all(re_results))

True


### Notes on Cleaning and content

In [20]:

#these frequencies were copied from a previous run, and only from successfully parsed objects
#the lowest frequencies were reviewed and selections pulled from those
   # low_freq_odd_chars = ('\t', 72), ('5', 67), ('`', 64), ('&', 49), ('ù', 30), ('ï', 26), ('X', 25), ('!', 15), ('"', 14), ('ò', 8), ('=', 4), ('Q', 4), ('\xa0', 1)
   # low_freq_odd_chars = [x[0] for x in low_freq_odd_chars]
#numbers do not appear to be used outside of scholarly references and some multiple-root instances
   # nums = list(range(10))
#X for example, is almost only in english or french glosses, or scholarly references)
   #('X', 25),

In [21]:
#cleaning notes
# `new Kunari' - region in western Niger ; `nouveau Kounari' - région dans l'ouest du Niger
   #here the ` seems to be used at the beginning of a quotation, and a normal apostrophe at the end

In [22]:
# # # char_counts
# sorted_char_val = sorted(char_counts.items(), key=lambda item: (-item[1], item[0]))
# print(sorted_char_val)

#Inconsistencies

leading white spaces
entries with root and lemma on one line

"errors"
   whitespace paras