Store document structure from parser.

KonnexionsGmbH · Mar 31, 2022 · 732cac2 · 732cac2
1 parent 5972de3
commit 732cac2
Show file tree

Hide file tree

Showing 23 changed files with 75 additions and 47 deletions.
diff --git a/docs/developing_data_model.md b/docs/developing_data_model.md
@@ -29,7 +29,7 @@ This database table contains the document-related data:
 
 ### 2.3 Database Table **`journal`**
 
-This database table documents the document processing in detail:
+This database table contains document-related error messages and performance data:
 
 ![](img/schema_dbt_journal.png)
 

diff --git a/docs/developing_version_planning.md b/docs/developing_version_planning.md
@@ -21,7 +21,6 @@
 
 ## 2. Next Development Steps
 
-- 2 combine **`pdf`** files - scanned **`pdf`** documents - after Tesseract OCR
 - 9 API documentation: Content improvement
 - 9 API documentation: Layout improvement
 - 9 Google Styleguide implementation
@@ -34,6 +33,7 @@
 - ~~PDFlib TET processing~~
 - ~~Tesseract OCR - Installation~~  
 - ~~clean up the auxiliary files in file directory inbox_accepted - keep the base document~~
+- ~~combine **`pdf`** files - scanned **`pdf`** documents - after Tesseract OCR~~
 - ~~convert the appropriate documents into the **`pdf`** format with Pandoc and TeX Live~~
 - ~~duplicate handling~~ 
 - ~~error correction version 0.9.0~~

diff --git a/docs/img/schema_dbt_content.png b/docs/img/schema_dbt_content.png
diff --git a/docs/img/schema_dbt_document.png b/docs/img/schema_dbt_document.png
diff --git a/docs/img/schema_dbt_journal.png b/docs/img/schema_dbt_journal.png
diff --git a/docs/img/schema_dbt_language.png b/docs/img/schema_dbt_language.png
diff --git a/docs/img/schema_dbt_run.png b/docs/img/schema_dbt_run.png
diff --git a/docs/img/schema_dbt_version.png b/docs/img/schema_dbt_version.png
diff --git a/docs/index.md b/docs/index.md
@@ -44,7 +44,7 @@ The processing logic is as follows:
 
 In the first step, the file directory **`inbox`** is checked for new document files. 
 An entry is created in the **`document`** database table for each new document, showing the current processing status of the document. 
-In addition, each processing step of a document is recorded in the database table **`journal`**.
+In addition, document-related error messages and performance data are recorded in the **`journal`** database table.
 
 The association of document and language is managed via subdirectories of the file folder **`inbox`**. 
 In the database table **`language`**, the column **`directory_name_inbox`** specifies per language in which subdirectory the documents in this language are to be supplied. 

diff --git a/run_dcr_dev.bat b/run_dcr_dev.bat
@@ -18,8 +18,8 @@ if ["%1"] EQU [""] (
     echo ---------------------------------------------------------
     echo p_i   - 1. Process the inbox directory.
     echo p_2_i - 2. Convert pdf documents to image files:               Poppler.
-    echo n_2_p - 2. Convert non-pdf documents to pdf files:             Pandoc
     echo ocr   - 3. Convert image documents to pdf files:               Tesseract OCR.
+    echo n_2_p - 2. Convert non-pdf documents to pdf files:             Pandoc
     echo tet   - 4. Extract text and metdata from pdf documents:        PDFlib TET.
     echo s_f_p - 5. Store the document structure from the parser result.
     echo ---------------------------------------------------------
@@ -157,17 +157,17 @@ echo.
         if ["%DCR_CHOICE_ACTION%"] EQU ["p_2_i"] (
             set DCR_CHOICE_ACTION=p_i %DCR_CHOICE_ACTION%
         )
-        if ["%DCR_CHOICE_ACTION%"] EQU ["n_2_p"] (
+        if ["%DCR_CHOICE_ACTION%"] EQU ["ocr"] (
             set DCR_CHOICE_ACTION=p_i p_2_i %DCR_CHOICE_ACTION%
         )
-        if ["%DCR_CHOICE_ACTION%"] EQU ["ocr"] (
-            set DCR_CHOICE_ACTION=p_i p_2_i n_2_p %DCR_CHOICE_ACTION%
+        if ["%DCR_CHOICE_ACTION%"] EQU ["n_2_p"] (
+            set DCR_CHOICE_ACTION=p_i p_2_i ocr %DCR_CHOICE_ACTION%
         )
         if ["%DCR_CHOICE_ACTION%"] EQU ["tet"] (
-            set DCR_CHOICE_ACTION=p_i p_2_i n_2_p ocr %DCR_CHOICE_ACTION%
+            set DCR_CHOICE_ACTION=p_i p_2_i ocr n_2_p %DCR_CHOICE_ACTION%
         )
         if ["%DCR_CHOICE_ACTION%"] EQU ["s_f_p"] (
-            set DCR_CHOICE_ACTION=p_i p_2_i n_2_p ocr tet %DCR_CHOICE_ACTION%
+            set DCR_CHOICE_ACTION=p_i p_2_i ocr n_2_p tet %DCR_CHOICE_ACTION%
         )
         pipenv run python src\dcr\dcr.py !DCR_CHOICE_ACTION!
         if ERRORLEVEL 1 (

diff --git a/run_dcr_dev.sh b/run_dcr_dev.sh
@@ -18,8 +18,8 @@ if [ -z "$1" ]; then
     echo "------------------------------------------------------------------------------"
     echo "p_i   - 1. Process the inbox directory."
     echo "p_2_i - 2. Convert pdf documents to image files:               Poppler."
-    echo "n_2_p - 2. Convert non-pdf documents to pdf files:             Pandoc."
     echo "ocr   - 3. Convert image documents to pdf files:               Tesseract OCR."
+    echo "n_2_p - 2. Convert non-pdf documents to pdf files:             Pandoc."
     echo "tet   - 4. Extract text and metadata from pdf documents:       PDFlib TET."
     echo "s_f_p - 5. Store the document structure from the parser result."
     echo "------------------------------------------------------------------------------"
@@ -82,17 +82,17 @@ case "${DCR_CHOICE_ACTION}" in
       p_2_i)
         export DCR_CHOICE_ACTION=p_i ${DCR_CHOICE_ACTION}
         ;;
-      n_2_p)
+      ocr)
         export DCR_CHOICE_ACTION=p_i p_2_i ${DCR_CHOICE_ACTION}
         ;;
-      tet)
-        export DCR_CHOICE_ACTION=p_i p_2_i ${DCR_CHOICE_ACTION}
+      n_2_p)
+        export DCR_CHOICE_ACTION=p_i p_2_i ocr ${DCR_CHOICE_ACTION}
         ;;
       tet)
-        export DCR_CHOICE_ACTION=p_i p_2_i n_2_p ocr ${DCR_CHOICE_ACTION}
+        export DCR_CHOICE_ACTION=p_i p_2_i ocr n_2_p ${DCR_CHOICE_ACTION}
         ;;
       s_f_p)
-        export DCR_CHOICE_ACTION=p_i p_2_i n_2_p ocr tet ${DCR_CHOICE_ACTION}
+        export DCR_CHOICE_ACTION=p_i p_2_i ocr n_2_p tet ${DCR_CHOICE_ACTION}
         ;;
       *)
         ;;

diff --git a/run_dcr_prod.bat b/run_dcr_prod.bat
@@ -18,8 +18,8 @@ if ["%1"] EQU [""] (
     echo ---------------------------------------------------------
     echo p_i   - 1. Process the inbox directory.
     echo p_2_i - 2. Convert pdf documents to image files:               Poppler.
-    echo n_2_p - 2. Convert non-pdf documents to pdf files:             Pandoc
     echo ocr   - 3. Convert image documents to pdf files:               Tesseract OCR.
+    echo n_2_p - 2. Convert non-pdf documents to pdf files:             Pandoc
     echo tet   - 4. Extract text and metdata from pdf documents:        PDFlib TET.
     echo s_f_p - 5. Store the document structure from the parser result.
     echo ---------------------------------------------------------
@@ -139,17 +139,17 @@ echo.
         if ["%DCR_CHOICE_ACTION%"] EQU ["p_2_i"] (
             set DCR_CHOICE_ACTION=p_i %DCR_CHOICE_ACTION%
         )
-        if ["%DCR_CHOICE_ACTION%"] EQU ["n_2_p"] (
+        if ["%DCR_CHOICE_ACTION%"] EQU ["ocr"] (
             set DCR_CHOICE_ACTION=p_i p_2_i %DCR_CHOICE_ACTION%
         )
-        if ["%DCR_CHOICE_ACTION%"] EQU ["ocr"] (
-            set DCR_CHOICE_ACTION=p_i p_2_i n_2_p %DCR_CHOICE_ACTION%
+        if ["%DCR_CHOICE_ACTION%"] EQU ["n_2_p"] (
+            set DCR_CHOICE_ACTION=p_i p_2_i ocr %DCR_CHOICE_ACTION%
         )
         if ["%DCR_CHOICE_ACTION%"] EQU ["tet"] (
-            set DCR_CHOICE_ACTION=p_i p_2_i n_2_p ocr %DCR_CHOICE_ACTION%
+            set DCR_CHOICE_ACTION=p_i p_2_i ocr n_2_p %DCR_CHOICE_ACTION%
         )
         if ["%DCR_CHOICE_ACTION%"] EQU ["s_f_p"] (
-            set DCR_CHOICE_ACTION=p_i p_2_i n_2_p ocr tet %DCR_CHOICE_ACTION%
+            set DCR_CHOICE_ACTION=p_i p_2_i ocr n_2_p tet %DCR_CHOICE_ACTION%
         )
         pipenv run python src\dcr\dcr.py !DCR_CHOICE_ACTION!
         if ERRORLEVEL 1 (

diff --git a/run_dcr_prod.sh b/run_dcr_prod.sh
@@ -18,8 +18,8 @@ if [ -z "$1" ]; then
     echo "------------------------------------------------------------------------------"
     echo "p_i   - 1. Process the inbox directory."
     echo "p_2_i - 2. Convert pdf documents to image files:               Poppler."
-    echo "n_2_p - 2. Convert non-pdf documents to pdf files:             Pandoc."
     echo "ocr   - 3. Convert image documents to pdf files:               Tesseract OCR."
+    echo "n_2_p - 2. Convert non-pdf documents to pdf files:             Pandoc."
     echo "tet   - 4. Extract text and metadata from pdf documents:       PDFlib TET."
     echo "s_f_p - 5. Store the document structure from the parser result."
     echo "------------------------------------------------------------------------------"
@@ -87,17 +87,17 @@ case "${DCR_CHOICE_ACTION}" in
       p_2_i)
         export DCR_CHOICE_ACTION=p_i ${DCR_CHOICE_ACTION}
         ;;
-      n_2_p)
+      ocr)
         export DCR_CHOICE_ACTION=p_i p_2_i ${DCR_CHOICE_ACTION}
         ;;
-      tet)
-        export DCR_CHOICE_ACTION=p_i p_2_i ${DCR_CHOICE_ACTION}
+      n_2_p)
+        export DCR_CHOICE_ACTION=p_i p_2_i ocr ${DCR_CHOICE_ACTION}
         ;;
       tet)
-        export DCR_CHOICE_ACTION=p_i p_2_i n_2_p ocr ${DCR_CHOICE_ACTION}
+        export DCR_CHOICE_ACTION=p_i p_2_i ocr n_2_p ${DCR_CHOICE_ACTION}
         ;;
       s_f_p)
-        export DCR_CHOICE_ACTION=p_i p_2_i n_2_p ocr tet ${DCR_CHOICE_ACTION}
+        export DCR_CHOICE_ACTION=p_i p_2_i ocr n_2_p tet ${DCR_CHOICE_ACTION}
         ;;
       *)
         ;;

diff --git a/src/dcr/dcr.py b/src/dcr/dcr.py
@@ -333,7 +333,9 @@ def process_convert_image_2_pdf() -> None:
     )
     libs.utils.progress_msg("End  : Convert image documents to pdf files ...")
 
-    libs.utils.progress_msg_empty_before("Start: Reunite the related pdf files ... Tesseract OCR")
+    libs.cfg.document_current_step = libs.db.cfg.DOCUMENT_STEP_PYPDF4
+
+    libs.utils.progress_msg_empty_before("Start: Reunite the related pdf files ... PyPDF4")
     libs.cfg.run_id = libs.db.orm.dml.insert_dbt_row(
         libs.db.cfg.DBT_RUN,
         {
@@ -443,7 +445,7 @@ def process_documents(args: dict[str, bool]) -> None:
     # Process the documents in the inbox file directory.
     if args[libs.cfg.RUN_ACTION_PROCESS_INBOX]:
         start_time_process = time.perf_counter_ns()
-        libs.cfg.document_processing_step = libs.db.cfg.DOCUMENT_STEP_INBOX
+        libs.cfg.document_current_step = libs.db.cfg.DOCUMENT_STEP_INBOX
         process_inbox_directory()
         libs.utils.progress_msg(
             f"Time : {round((time.perf_counter_ns() - start_time_process)/1000000000,2) :10.2f} s"
@@ -452,7 +454,7 @@ def process_documents(args: dict[str, bool]) -> None:
     # Convert the scanned image pdf documents to image files.
     if args[libs.cfg.RUN_ACTION_PDF_2_IMAGE]:
         start_time_process = time.perf_counter_ns()
-        libs.cfg.document_processing_step = libs.db.cfg.DOCUMENT_STEP_PDF2IMAGE
+        libs.cfg.document_current_step = libs.db.cfg.DOCUMENT_STEP_PDF2IMAGE
         process_convert_pdf_2_image()
         libs.utils.progress_msg(
             f"Time : {round((time.perf_counter_ns() - start_time_process)/1000000000,2) :10.2f} s"
@@ -461,7 +463,7 @@ def process_documents(args: dict[str, bool]) -> None:
     # Convert the image documents to pdf files.
     if args[libs.cfg.RUN_ACTION_IMAGE_2_PDF]:
         start_time_process = time.perf_counter_ns()
-        libs.cfg.document_processing_step = libs.db.cfg.DOCUMENT_STEP_TESSERACT
+        libs.cfg.document_current_step = libs.db.cfg.DOCUMENT_STEP_TESSERACT
         process_convert_image_2_pdf()
         libs.utils.progress_msg(
             f"Time : {round((time.perf_counter_ns() - start_time_process)/1000000000,2) :10.2f} s"
@@ -470,7 +472,7 @@ def process_documents(args: dict[str, bool]) -> None:
     # Convert the non-pdf documents to pdf files.
     if args[libs.cfg.RUN_ACTION_NON_PDF_2_PDF]:
         start_time_process = time.perf_counter_ns()
-        libs.cfg.document_processing_step = libs.db.cfg.DOCUMENT_STEP_PANDOC
+        libs.cfg.document_current_step = libs.db.cfg.DOCUMENT_STEP_PANDOC
         process_convert_non_pdf_2_pdf()
         libs.utils.progress_msg(
             f"Time : {round((time.perf_counter_ns() - start_time_process)/1000000000,2) :10.2f} s"
@@ -479,7 +481,7 @@ def process_documents(args: dict[str, bool]) -> None:
     # Extract text and metadata from pdf documents.
     if args[libs.cfg.RUN_ACTION_TEXT_FROM_PDF]:
         start_time_process = time.perf_counter_ns()
-        libs.cfg.document_processing_step = libs.db.cfg.DOCUMENT_STEP_PDFLIB
+        libs.cfg.document_current_step = libs.db.cfg.DOCUMENT_STEP_PDFLIB
         process_extract_text_from_pdf()
         libs.utils.progress_msg(
             f"Time : {round((time.perf_counter_ns() - start_time_process)/1000000000,2) :10.2f} s"
@@ -488,7 +490,7 @@ def process_documents(args: dict[str, bool]) -> None:
     # Store the document structure from the parser result.
     if args[libs.cfg.RUN_ACTION_STORE_FROM_PARSER]:
         start_time_process = time.perf_counter_ns()
-        libs.cfg.document_processing_step = libs.db.cfg.DOCUMENT_STEP_PARSER
+        libs.cfg.document_current_step = libs.db.cfg.DOCUMENT_STEP_PARSER
         process_store_from_parser()
         libs.utils.progress_msg(
             f"Time : {round((time.perf_counter_ns() - start_time_process)/1000000000,2) :10.2f} s"

diff --git a/src/dcr/libs/cfg.py b/src/dcr/libs/cfg.py
@@ -145,6 +145,7 @@
 document_child_no: sqlalchemy.Integer | None
 document_child_status: str
 document_child_stem_name: str
+document_current_step: str
 document_directory_name: str
 document_directory_type: str
 document_error_code: str | None
@@ -155,7 +156,6 @@
 document_id_parent: sqlalchemy.Integer | None
 document_language_id: sqlalchemy.Integer
 document_next_step: str | None
-document_processing_step: str
 document_sha256: str | None
 document_status: str
 document_stem_name: str

diff --git a/src/dcr/libs/cfg.pyi b/src/dcr/libs/cfg.pyi
@@ -145,6 +145,7 @@ document_child_next_step: str | None
 document_child_no: sqlalchemy.Integer | None
 document_child_status: str
 document_child_stem_name: str
+document_current_step: str
 document_directory_name: str
 document_directory_type: str
 document_error_code: str | None
@@ -155,7 +156,6 @@ document_id_base: sqlalchemy.Integer | None
 document_id_parent: sqlalchemy.Integer | None
 document_language_id: sqlalchemy.Integer
 document_next_step: str | None
-document_processing_step: str
 document_sha256: str | None
 document_status: str
 document_stem_name: str

diff --git a/src/dcr/libs/db/cfg.py b/src/dcr/libs/db/cfg.py
@@ -18,6 +18,7 @@
 DBC_CODE_SPACY: str = "code_spacy"
 DBC_CODE_TESSERACT: str = "code_tesseract"
 DBC_CREATED_AT: str = "created_at"
+DBC_CURRENT_STEP: str = "current_step"
 DBC_DIRECTORY_NAME: str = "directory_name"
 DBC_DIRECTORY_NAME_INBOX: str = "directory_name_inbox"
 DBC_DIRECTORY_TYPE: str = "directory_type"
@@ -39,7 +40,6 @@
 DBC_NEXT_STEP: str = "next_step"
 DBC_PAGE_IN_DOCUMENT: str = "page_in_document"
 DBC_PARA_IN_PAGE: str = "para_in_page"
-DBC_PROCESSING_STEP: str = "processing_step"
 DBC_RUN_ID: str = "run_id"
 DBC_SENTENCE_IN_PARA: str = "sentence_in_para"
 DBC_SHA256: str = "sha256"
@@ -112,11 +112,12 @@
 DOCUMENT_STATUS_ERROR: str = "error"
 DOCUMENT_STATUS_START: str = "start"
 
-DOCUMENT_STEP_INBOX: str = "Inbox Processing"
+DOCUMENT_STEP_INBOX: str = "Inbox"
 DOCUMENT_STEP_PANDOC: str = "Pandoc & TeX Live"
 DOCUMENT_STEP_PARSER: str = "Parser"
 DOCUMENT_STEP_PDF2IMAGE: str = "pdf2image"
 DOCUMENT_STEP_PDFLIB: str = "PDFlib TET"
+DOCUMENT_STEP_PYPDF4: str = "PyPDF4"
 DOCUMENT_STEP_TESSERACT: str = "Tesseract OCR"
 
 ERROR_01_901: str = (
@@ -175,7 +176,7 @@
 ERROR_41_904: str = "41.904 Issue (ocr): The target file '{file_name}' already exists."
 
 ERROR_51_901: str = (
-    "51.901 Issue (tet): Issues with opening document '{file_name}' - "
+    "51.901 Issue (tet): Opening document '{file_name}' - "
     + "error no: '{error_no}' - api: '{api_name}' - error: '{error}'."
 )
 ERROR_51_902: str = (

diff --git a/src/dcr/libs/db/cfg.pyi b/src/dcr/libs/db/cfg.pyi
@@ -18,6 +18,7 @@ DBC_CODE_ISO_639_3: str
 DBC_CODE_SPACY: str
 DBC_CODE_TESSERACT: str
 DBC_CREATED_AT: str
+DBC_CURRENT_STEP: str
 DBC_DIRECTORY_NAME: str
 DBC_DIRECTORY_NAME_INBOX: str
 DBC_DIRECTORY_TYPE: str
@@ -39,7 +40,6 @@ DBC_MODULE_NAME: str
 DBC_NEXT_STEP: str
 DBC_PAGE_IN_DOCUMENT: str
 DBC_PARA_IN_PAGE: str
-DBC_PROCESSING_STEP: str
 DBC_RUN_ID: str
 DBC_SENTENCE_IN_PARA: str
 DBC_SHA256: str
@@ -98,6 +98,7 @@ DOCUMENT_STEP_PANDOC: str
 DOCUMENT_STEP_PARSER: str
 DOCUMENT_STEP_PDF2IMAGE: str
 DOCUMENT_STEP_PDFLIB: str
+DOCUMENT_STEP_PYPDF4: str
 DOCUMENT_STEP_TESSERACT: str
 
 ERROR_01_901: str

diff --git a/src/dcr/libs/db/orm/ddl.py b/src/dcr/libs/db/orm/ddl.py
@@ -250,6 +250,7 @@ def create_dbt_document(table_name: str) -> None:
             sqlalchemy.Integer,
             nullable=True,
         ),
+        sqlalchemy.Column(libs.db.cfg.DBC_CURRENT_STEP, sqlalchemy.String, nullable=False),
         sqlalchemy.Column(libs.db.cfg.DBC_DIRECTORY_NAME, sqlalchemy.String, nullable=False),
         sqlalchemy.Column(libs.db.cfg.DBC_DIRECTORY_TYPE, sqlalchemy.String, nullable=False),
         sqlalchemy.Column(
@@ -310,6 +311,7 @@ def create_dbt_journal(table_name: str) -> None:
             libs.db.cfg.DBC_CREATED_AT,
             sqlalchemy.DateTime,
         ),
+        sqlalchemy.Column(libs.db.cfg.DBC_CURRENT_STEP, sqlalchemy.String, nullable=False),
         sqlalchemy.Column(
             libs.db.cfg.DBC_DOCUMENT_ID,
             sqlalchemy.Integer,
@@ -319,7 +321,6 @@ def create_dbt_journal(table_name: str) -> None:
         sqlalchemy.Column(libs.db.cfg.DBC_DURATION_NS, sqlalchemy.BigInteger, nullable=False),
         sqlalchemy.Column(libs.db.cfg.DBC_ERROR_CODE, sqlalchemy.String, nullable=True),
         sqlalchemy.Column(libs.db.cfg.DBC_ERROR_TEXT, sqlalchemy.String, nullable=True),
-        sqlalchemy.Column(libs.db.cfg.DBC_PROCESSING_STEP, sqlalchemy.String, nullable=False),
         sqlalchemy.Column(
             libs.db.cfg.DBC_RUN_ID,
             sqlalchemy.Integer,

diff --git a/src/dcr/libs/db/orm/dml.py b/src/dcr/libs/db/orm/dml.py
@@ -63,11 +63,11 @@ def insert_journal_error(
     insert_dbt_row(
         libs.db.cfg.DBT_JOURNAL,
         {
-            libs.db.cfg.DBC_ERROR_CODE: error[0:6],
-            libs.db.cfg.DBC_ERROR_TEXT: error[7:],
+            libs.db.cfg.DBC_CURRENT_STEP: libs.cfg.document_current_step,
             libs.db.cfg.DBC_DOCUMENT_ID: document_id,
             libs.db.cfg.DBC_DURATION_NS: duration_ns,
-            libs.db.cfg.DBC_PROCESSING_STEP: libs.cfg.document_processing_step,
+            libs.db.cfg.DBC_ERROR_CODE: error[0:6],
+            libs.db.cfg.DBC_ERROR_TEXT: error[7:],
             libs.db.cfg.DBC_RUN_ID: libs.cfg.run_run_id,
         },
     )
@@ -97,9 +97,9 @@ def insert_journal_statistics(
     insert_dbt_row(
         libs.db.cfg.DBT_JOURNAL,
         {
+            libs.db.cfg.DBC_CURRENT_STEP: libs.cfg.document_current_step,
             libs.db.cfg.DBC_DOCUMENT_ID: document_id,
             libs.db.cfg.DBC_DURATION_NS: duration_ns,
-            libs.db.cfg.DBC_PROCESSING_STEP: libs.cfg.document_processing_step,
             libs.db.cfg.DBC_RUN_ID: libs.cfg.run_run_id,
         },
     )