In [1]:
import stanza
from datetime import datetime
from pathlib import Path
from pprint import pprint

nb_start = datetime.now()
log_path = Path('data/libritts.log')
dep_path = Path('data/libritts.conllu')
input_path = Path("data/normalized_sample.txt")
#test input_path
input_path.is_file()


  from .autonotebook import tqdm as notebook_tqdm


True

You can specify the processors you want, which can speed things up. 
The processors required for dependency parsing are: 

In [2]:
restricted_nlp = stanza.Pipeline(
    lang='en',
    processors='tokenize,pos,lemma,depparse')

2022-03-29 15:17:07 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| pos       | combined |
| lemma     | combined |
| depparse  | combined |

2022-03-29 15:17:07 INFO: Use device: cpu
2022-03-29 15:17:07 INFO: Loading: tokenize
2022-03-29 15:17:07 INFO: Loading: pos
2022-03-29 15:17:07 INFO: Loading: lemma
2022-03-29 15:17:07 INFO: Loading: depparse
2022-03-29 15:17:08 INFO: Done loading processors!



>`standfordNLP` does not have multi-word token (`mwt`) expansion
>for English, so `mwt` processor is not required for 
>dependency parsing, as discussed 
>[here](https://github.com/stanfordnlp/stanza/issues/297#issuecomment-627673245)

In [3]:
# nlp = stanza.Pipeline('en')
nlp = restricted_nlp
filenames = []
sents = []

 Process file input: 
 The test file I used here is not the architecture you were using, but just the output of `head *normalized*` in one of the train data directories, `103/1241/`. So it's a little different.

In [4]:
with input_path.open(mode='r', encoding='utf8') as ifile:
    for line in ifile:
        if line.startswith('==> '): 
            filename = line.strip('\n').strip('==>').strip(' ')
            filenames.append(filename)
            continue

        # with open(filename, "r", encoding='utf-8') as sfile:
        else:
            # don't need to remove whitespace characters or line breaks for stanza
            # unless they occurr midsentence, which might mess up the parsing 
            # (i.e. if stanza thinks they are different sentences)
            sents.append(line#.strip('\n')
                         )

Print it pretty just to see what the data is...

In [5]:
for fn, s in zip(filenames,sents): 
    print(f'{fn}:\n  + {s}' )

103_1241_000000_000001.normalized.txt:
  + matthew Cuthbert is surprised

103_1241_000004_000002.normalized.txt:
  + In fact, he had looked at twenty very much as he looked at sixty, lacking a little of the grayness.

103_1241_000007_000001.normalized.txt:
  + "But there was a passenger dropped off for you-a little girl.

103_1241_000008_000001.normalized.txt:
  + "It's a boy I've come for.

103_1241_000012_000002.normalized.txt:
  + Maybe they were out of boys of the brand you wanted."

103_1241_000014_000003.normalized.txt:
  + Her face was small, white and thin, also much freckled; her mouth was large and so were her eyes, which looked green in some lights and moods and gray in others.

103_1241_000017_000000.normalized.txt:
  + "I suppose you are mr matthew Cuthbert of Green Gables?" she said in a peculiarly clear, sweet voice.

103_1241_000017_000001.normalized.txt:
  + "I'm very glad to see you.

103_1241_000020_000000.normalized.txt:
  + "Oh, I can carry it," the child responded

The parsing step can be done from a list of sentences, as in `parse_list_of_sents()` below, but it can also be done from a whole block of text, as in `parse_multisentence_string()`.

In [6]:
def parse_list_of_sents(sents, log_path, dep_path):
    with log_path.open('a', encoding='utf8') as logstream:
        
        logstream.write(datetime.now().ctime())
        logstream.write('\n# Parsing from list of individual sentence strings'
                        f'\nsaving conllu formatted parsing to {dep_path}...')
        
        for sent in sents:
            logstream.write('\n'+sent)
            # `sent` still has its original linebreak
            #// log_path.write_text('\n')
            doc = nlp(sent)
            
            # These don't help in creating the grew-ready output, 
            #   only for processing within python
            #// dicts = doc.to_dict()
            #// conll = stanza.utils.conll.CoNLL.convert_dict(dicts)
            #// logstream.write(str(len(conll))+'\n')
            logstream.write(f'\n{len(doc.sentences)}\n')

            with dep_path.open('a', encoding='utf8') as depstream:
                #// dep_path.write_text(str(conll))
                depstream.write(stanza.utils.conll.CoNLL.doc2conll_text(doc))

In [7]:
def parse_multisentence_string(textstr, log_path, dep_path):
    
    with log_path.open('a', encoding='utf8') as logstream:

        logstream.write(datetime.now().ctime())
        logstream.write('\n# Parsing from single string'
                        f'\nsaving conllu formatted parsing to {dep_path}...\n')
        logstream.write(textstr)
        # `sent` still has its original linebreak
        #// log_path.write_text('\n')
        doc = nlp(textstr)
        
        # These don't help in creating the grew-ready output, 
        #   only for processing within python
        #// dicts = doc.to_dict()
        #// conll = stanza.utils.conll.CoNLL.convert_dict(dicts)

        logstream.write(f'\n{len(doc.sentences)}\n')

        with dep_path.open('a', encoding='utf8') as depstream:
            #// dep_path.write_text(str(conll))
            depstream.write(stanza.utils.conll.CoNLL.doc2conll_text(doc))
            depstream.write('\n')

In [8]:
t0 = datetime.now()

parse_list_of_sents(
    sents, log_path, 
    dep_path.with_name(f'from-list_{dep_path.name}'))

t1 = datetime.now()

parse_multisentence_string(
    ' '.join(sents), log_path, 
    dep_path.with_name(f'from-textblock_{dep_path.name}'))

t2 = datetime.now()
nb_code_end = datetime.now()

It works the same: the output files are identical, and if you're really concerned about stanza messing up the sentence parsing that has already been done, a more obvious "sentence break" string delimiter can be used for `join`, e.g. `'\n\n'.join(sents)`. 

**Plus** it's at least 2x faster (if not more) to parse from the single string.

In [9]:
list_time = t1 - t0
block_time = t2 - t1
list_time_message = f'\ntime from sentence list:\n  {round(list_time.total_seconds(), 3)} s'
block_time_message = f'\ntime from textblock:\n  {round(block_time.total_seconds(), 3)} s'
print(list_time_message + '\n' + block_time_message)

processors_string = '\n   '.join(str(p) for p in nlp.loaded_processors)
end_note = (f"\nprocessors in model:\n  {processors_string}"
            f"\ntotal time: {round((nb_code_end-nb_start).total_seconds(), 3)} s")
print(end_note)

with log_path.open('a') as logappend: 
    logappend.write(list_time_message)
    logappend.write(block_time_message)
    logappend.write(end_note)


time from sentence list:
  4.979 s

time from textblock:
  1.816 s

processors in model:
  <stanza.pipeline.tokenize_processor.TokenizeProcessor object at 0x7f4d6bfe8280>
   <stanza.pipeline.pos_processor.POSProcessor object at 0x7f4d8c33a700>
   <stanza.pipeline.lemma_processor.LemmaProcessor object at 0x7f4d4c1ca610>
   <stanza.pipeline.depparse_processor.DepparseProcessor object at 0x7f4d4bcc81f0>
total time: 8.085
