In [None]:
# Enable IPython's autoreload extension; mode 2 picks up edits to imported
# modules without having to restart the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import nlppln

# Assemble a workflow that chains the steps
#   ls -> normalize_whitespace_punctuation -> align_texts_wf
#      -> create_word_mappings -> merge_csv
# and write it to ../cwl/word-mapping-wf.cwl.
with nlppln.WorkflowGenerator() as wf:
    # Make the steps found in ../cwl/ available as methods on wf.
    wf.load(steps_dir='../cwl/')
    print(wf.list_steps())

    # Workflow inputs (a trailing '?' marks a CWL type as optional).
    gs_dir = wf.add_input(gs_dir='Directory')
    ocr_dir = wf.add_input(ocr_dir='Directory')
    lowercase = wf.add_input(lowercase='boolean?')
    align_metadata = wf.add_input(align_m='string?')
    align_changes = wf.add_input(align_c='string?')
    word_mapping_name = wf.add_input(wm_name='string?')

    # One ls step per input directory; its output is scattered over by the
    # normalization step below.
    gs_listing = wf.ls(in_dir=gs_dir)
    ocr_listing = wf.ls(in_dir=ocr_dir)

    gs_normalized = wf.normalize_whitespace_punctuation(
        meta_in=gs_listing,
        scatter=['meta_in'],
        scatter_method='dotproduct',
    )
    ocr_normalized = wf.normalize_whitespace_punctuation(
        meta_in=ocr_listing,
        scatter=['meta_in'],
        scatter_method='dotproduct',
    )

    # Only `alignments` is consumed further down; `changes` and `metadata`
    # are not used by this workflow.
    alignments, changes, metadata = wf.align_texts_wf(
        gs=gs_normalized,
        ocr=ocr_normalized,
        align_c=align_changes,
        align_m=align_metadata,
    )

    # Scattered over `alignments` and `txt` together (dotproduct), i.e. the
    # two arrays are walked in lockstep.
    word_mappings = wf.create_word_mappings(
        alignments=alignments,
        txt=gs_normalized,
        lowercase=lowercase,
        scatter=['alignments', 'txt'],
        scatter_method='dotproduct',
    )
    merged_mapping = wf.merge_csv(in_files=word_mappings, name=word_mapping_name)

    # Expose both the merged result and the individual mapping files.
    wf.add_outputs(wm_mapping=merged_mapping)
    wf.add_outputs(txt_files=word_mappings)
    wf.save('../cwl/word-mapping-wf.cwl')

In [None]:
# Create word mappings for the test data: select the test files from both
# input directories, collect them into new directories and hand those to the
# word_mapping_wf step. The workflow is written to
# ../cwl/word-mapping-test-files-wf.cwl.
import nlppln

with nlppln.WorkflowGenerator() as wf:
    # NOTE(review): word_mapping_wf is presumably the workflow saved as
    # ../cwl/word-mapping-wf.cwl; it has to be present in ../cwl/ before this
    # cell runs -- confirm.
    wf.load(steps_dir='../cwl/')
    print(wf.list_steps())

    # Workflow inputs (a trailing '?' marks a CWL type as optional).
    gs_dir = wf.add_input(gs_dir='Directory')
    ocr_dir = wf.add_input(ocr_dir='Directory')
    gs_dir_name = wf.add_input(gs_dir_name='string', default='gs')
    ocr_dir_name = wf.add_input(ocr_dir_name='string', default='ocr')
    data_div = wf.add_input(data_div='File')
    lowercase = wf.add_input(lowercase='boolean?')
    align_metadata = wf.add_input(align_m='string?')
    align_changes = wf.add_input(align_c='string?')
    word_mapping_name = wf.add_input(wm_name='string?')

    # The same data division file drives the selection for both directories.
    test_gs = wf.select_test_files(datadivision=data_div, in_dir=gs_dir)
    test_ocr = wf.select_test_files(datadivision=data_div, in_dir=ocr_dir)

    # Bound to new names so the workflow inputs gs_dir / ocr_dir are not
    # shadowed by the step outputs.
    test_gs_dir = wf.save_files_to_dir(dir_name=gs_dir_name, in_files=test_gs)
    test_ocr_dir = wf.save_files_to_dir(dir_name=ocr_dir_name, in_files=test_ocr)

    # NOTE(review): the unpacking order (txt_files, wm_mapping) relies on the
    # order in which the library reports the outputs of word_mapping_wf --
    # verify against the saved workflow.
    txt_files, wm_mapping = wf.word_mapping_wf(
        gs_dir=test_gs_dir,
        ocr_dir=test_ocr_dir,
        lowercase=lowercase,
        align_c=align_changes,
        align_m=align_metadata,
        wm_name=word_mapping_name,
    )

    wf.add_outputs(wm_mapping=wm_mapping)
    wf.add_outputs(txt_files=txt_files)
    wf.save('../cwl/word-mapping-test-files-wf.cwl')