Skip to content
This repository has been archived by the owner on May 7, 2024. It is now read-only.

Commit

Permalink
Convert pdf documents to image files.
Browse files Browse the repository at this point in the history
  • Loading branch information
walter-weinmann committed Feb 23, 2022
1 parent ab78db1 commit 0e26579
Show file tree
Hide file tree
Showing 31 changed files with 764 additions and 1,015 deletions.
4 changes: 2 additions & 2 deletions .pylintrc
Original file line number Diff line number Diff line change
Expand Up @@ -316,13 +316,13 @@ ignore-comments=yes
ignore-docstrings=yes

# Imports are removed from the similarity computation
ignore-imports=no
ignore-imports=yes

# Signatures are removed from the similarity computation
ignore-signatures=no

# Minimum number of lines for a similarity.
min-similarity-lines=4
min-similarity-lines=5


[SPELLING]
Expand Down
4 changes: 4 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,11 @@ help:
export DCR_ENVIRONMENT_TYPE=test

ifeq ($(OS),Windows_NT)
DCR_DOCKER_CONTAINER=scripts\\run_setup_postgresql.bat test
export MYPYPATH=src\\dcr
export PYTHONPATH=src\\dcr
else
DCR_DOCKER_CONTAINER=./scripts/run_setup_postgresql.sh test
export MYPYPATH=src/dcr
export PYTHONPATH=src/dcr:src/dcr
endif
Expand Down Expand Up @@ -174,12 +176,14 @@ pylint: ## Lint the code with Pylint.
# Configuration file: pyproject.toml
pytest: ## Run all tests with pytest.
@echo "Info ********** Start: pytest **************************************"
$(DCR_DOCKER_CONTAINER)
pipenv run pytest --version
pipenv run pytest --dead-fixtures tests
pipenv run pytest --cov=src --cov-report term-missing:skip-covered --random-order -v tests
@echo "Info ********** End: pytest **************************************"
pytest-ci: ## Run all tests with pytest after test tool installation.
@echo "Info ********** Start: pytest **************************************"
$(DCR_DOCKER_CONTAINER)
pipenv install pytest
pipenv install pytest-cov
pipenv install pytest-deadfixtures
Expand Down
3 changes: 2 additions & 1 deletion scripts/run_setup_postgresql.bat
Original file line number Diff line number Diff line change
Expand Up @@ -81,9 +81,10 @@ docker create -e POSTGRES_DB=dcr_db_%DCR_ENVIRONMENT_TYPE%_admin ^
-e POSTGRES_USER=dcr_user_admin ^
--name dcr_db_%DCR_ENVIRONMENT_TYPE% ^
-p %DCR_CONNECTION_PORT%:%DCR_CONTAINER_PORT% ^
--restart always ^
postgres:%DCR_VERSION%

echo Docker start dcr_db_%DCR_ENVIRONMENT_TYPE% (PostgreSQL %DCR_VERSION%) ...
echo Docker start dcr_db_%DCR_ENVIRONMENT_TYPE% (PostgreSQL %DCR_VERSION%) ...
docker start dcr_db_%DCR_ENVIRONMENT_TYPE%

ping -n 30 127.0.0.1>nul
Expand Down
1 change: 1 addition & 0 deletions scripts/run_setup_postgresql.sh
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ docker create -e POSTGRES_DB=dcr_db_${DCR_ENVIRONMENT_TYPE}_admin \
-e POSTGRES_USER=dcr_user_admin \
--name dcr_db_${DCR_ENVIRONMENT_TYPE} \
-p "${DCR_CONNECTION_PORT}":"${DCR_CONTAINER_PORT}" \
--restart always \
postgres:"${DCR_VERSION}"

echo "Docker start dcr_db_${DCR_ENVIRONMENT_TYPE} (PostgreSQL ${DCR_VERSION}) ..."
Expand Down
5 changes: 1 addition & 4 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -13,26 +13,23 @@ directory_inbox = data/inbox
directory_inbox_accepted = data/inbox_accepted
directory_inbox_rejected = data/inbox_rejected
ignore_duplicates = false
pdf2image_type = JPEG
pdf2image_type = jpeg
verbose = true

[dcr_dev]
db_connection_port = 5432
db_database = dcr_db_dev
db_database_admin = dcr_db_dev_admin
db_docker_container = dcr_db_dev

[dcr_prod]
db_connection_port = 5433
db_database = dcr_db_prod
db_database_admin = dcr_db_prod_admin
db_docker_container = dcr_db_prod

[dcr_test]
db_connection_port = 5434
db_database = dcr_db_test
db_database_admin = dcr_db_test_admin
db_docker_container = dcr_db_test

[flake8]
count = True
Expand Down
6 changes: 0 additions & 6 deletions src/dcr/dcr.py
Original file line number Diff line number Diff line change
Expand Up @@ -297,12 +297,6 @@ def process_documents(args: dict[str, bool]) -> None:
# -----------------------------------------------------------------------------
def validate_config() -> None:
"""Validate the configuration parameters."""
# -------------------------------------------------------------------------
# Parameter: db_docker_container
#
if libs.cfg.DCR_CFG_DB_DOCKER_CONTAINER not in libs.cfg.config:
libs.cfg.is_docker_container = False

# -------------------------------------------------------------------------
# Parameter: ignore_duplicates
#
Expand Down
8 changes: 4 additions & 4 deletions src/dcr/libs/cfg.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
DCR_CFG_DB_DATABASE: str = "db_database"
DCR_CFG_DB_DATABASE_ADMIN: str = "db_database_admin"
DCR_CFG_DB_DIALECT: str = "db_dialect"
DCR_CFG_DB_DOCKER_CONTAINER: str = "db_docker_container"
DCR_CFG_DB_HOST: str = "db_host"
DCR_CFG_DB_PASSWORD: str = "db_password"
DCR_CFG_DB_PASSWORD_ADMIN: str = "db_password_admin"
Expand All @@ -38,8 +37,8 @@
DCR_CFG_FILE: str = "setup.cfg"
DCR_CFG_IGNORE_DUPLICATES: str = "ignore_duplicates"
DCR_CFG_PDF2IMAGE_TYPE: str = "pdf2image_type"
DCR_CFG_PDF2IMAGE_TYPE_JPEG: str = "JPEG"
DCR_CFG_PDF2IMAGE_TYPE_PNG: str = "PNG"
DCR_CFG_PDF2IMAGE_TYPE_JPEG: str = "jpeg"
DCR_CFG_PDF2IMAGE_TYPE_PNG: str = "png"
DCR_CFG_SECTION: str = "dcr"
DCR_CFG_SECTION_DEV: str = "dcr_dev"
DCR_CFG_SECTION_PROD: str = "dcr_prod"
Expand Down Expand Up @@ -89,6 +88,7 @@
directory_inbox_accepted: PathLike[str] | str
directory_inbox_rejected: PathLike[str] | str

document_child_child_no: sqlalchemy.Integer | None
document_child_directory_name: str
document_child_directory_type: str
document_child_error_code: str | None
Expand All @@ -103,6 +103,7 @@
document_child_status: str
document_child_stem_name: str

document_child_no: sqlalchemy.Integer | None
document_directory_name: str
document_directory_type: str
document_error_code: str | None
Expand All @@ -118,7 +119,6 @@

environment_type: str

is_docker_container: bool = True
is_ignore_duplicates: bool = False
is_verbose: bool = True

Expand Down
5 changes: 2 additions & 3 deletions src/dcr/libs/cfg.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@ DCR_CFG_DCR_VERSION: str
DCR_CFG_DIRECTORY_INBOX: str
DCR_CFG_DIRECTORY_INBOX_ACCEPTED: str
DCR_CFG_DIRECTORY_INBOX_REJECTED: str
DCR_CFG_DB_DOCKER_CONTAINER: str
DCR_CFG_FILE: str
DCR_CFG_IGNORE_DUPLICATES: str
DCR_CFG_PDF2IMAGE_TYPE: str
Expand Down Expand Up @@ -89,6 +88,7 @@ directory_inbox: PathLike[str] | str
directory_inbox_accepted: PathLike[str] | str
directory_inbox_rejected: PathLike[str] | str

document_child_child_no: sqlalchemy.Integer | None
document_child_directory_name: str
document_child_directory_type: str
document_child_error_code: str | None
Expand All @@ -98,11 +98,11 @@ document_child_id: sqlalchemy.Integer
document_child_id_base: sqlalchemy.Integer | None
document_child_id_parent: sqlalchemy.Integer | None
document_child_next_step: str | None
document_child_no: sqlalchemy.Integer
document_child_sha256: str | None
document_child_status: str
document_child_stem_name: str

document_child_no: sqlalchemy.Integer | None
document_directory_name: str
document_directory_type: str
document_error_code: str | None
Expand All @@ -118,7 +118,6 @@ document_stem_name: str

environment_type: str

is_docker_container: bool
is_ignore_duplicates: bool
is_verbose: bool

Expand Down
34 changes: 21 additions & 13 deletions src/dcr/libs/db/cfg.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
"""
from typing import List

from psycopg2.extensions import connection
from psycopg2.extensions import cursor
from sqlalchemy import MetaData
from sqlalchemy.engine import Engine

Expand All @@ -16,6 +18,7 @@
DBC_ACTION: str = "action"
DBC_ACTION_CODE: str = "action_code"
DBC_ACTION_TEXT: str = "action_text"
DBC_CHILD_NO: str = "child_no"
DBC_CREATED_AT: str = "created_at"
DBC_DIRECTORY_NAME: str = "directory_name"
DBC_DIRECTORY_TYPE: str = "directory_type"
Expand Down Expand Up @@ -52,7 +55,8 @@
DOCUMENT_ERROR_CODE_REJECTED_FILE_DUPL: str = "Duplicate file"
DOCUMENT_ERROR_CODE_REJECTED_FILE_ERROR: str = "rejected_file_error"
DOCUMENT_ERROR_CODE_REJECTED_FILE_EXT: str = "Unknown file extension"
DOCUMENT_ERROR_CODE_REJECTED_FILE_PERMISSION: str = "rejected_file_permission"
DOCUMENT_ERROR_CODE_REJECTED_FILE_MOVE: str = "Issue with file move"
DOCUMENT_ERROR_CODE_REJECTED_FILE_RIGHTS: str = "Issue with file permissions"
DOCUMENT_ERROR_CODE_REJECTED_NO_PDF_FORMAT: str = "No 'pdf' format"
DOCUMENT_ERROR_CODE_REJECTED_PDF2IMAGE: str = "Issue with pdf2image"

Expand Down Expand Up @@ -100,10 +104,17 @@
DOCUMENT_STATUS_START: str = "start"

JOURNAL_ACTION_01_001: str = (
"01.001 Start (p_i): New document detected in the 'inbox' file directory."
"01.001 Start (p_i): Document file '{file_name}' detected " + "in the 'inbox' file directory."
)
JOURNAL_ACTION_01_002: str = (
"01.002 End (p_i): Document file '{source_file}' successfully moved to file '{target_file}'."
)
JOURNAL_ACTION_01_003: str = (
"01.003 Next (p_i): Ready to convert document file '{file_name}' "
+ "to '{type}' format using pdf2image."
)
JOURNAL_ACTION_01_901: str = (
"01.901 Issue: Document rejected because of unknown file extension='{extension}'."
"01.901 Issue (p_i): Document rejected because of unknown file extension='{extension}'."
)
JOURNAL_ACTION_01_902: str = (
"01.902 Issue (p_i): Moving '{source_file}' to '{target_file}' "
Expand All @@ -126,19 +137,16 @@
"11.002 Ready to convert the document to 'pdf' format using Tesseract OCR."
)
JOURNAL_ACTION_11_003: str = "11.003 Ready to process the 'pdf' document using PDFlib TET."
JOURNAL_ACTION_11_004: str = (
"11.004 Ready to convert the 'pdf' document to '{type}' format using pdf2image."
)
JOURNAL_ACTION_21_001: str = (
"21.001 Start (p_2_i): The 'pdf' document must be converted into image file(s) "
"21.001 Start (p_2_i): The document file '{file_name}' must be converted into image file(s) "
+ "for further processing."
)
JOURNAL_ACTION_21_002: str = (
"21.002 End (p_2_i): The original 'pdf' document has been successfully converted "
"21.002 End (p_2_i): The document file '{file_name}' has been successfully converted "
+ "to {child_no} image file(s)."
)
JOURNAL_ACTION_21_003: str = (
"21.003 End (p_2_i): The created image file {file_name} "
"21.003 Next (p_2_i): The created image file '{file_name}' "
+ "is ready to be processed with Tesseract OCR."
)
JOURNAL_ACTION_21_901: str = (
Expand All @@ -160,7 +168,7 @@

db_current_database: str
db_current_user: str

engine: Engine

metadata: MetaData | None = None
db_driver_conn: connection | None = None
db_driver_cur: cursor | None = None
db_orm_engine: Engine | None = None
db_orm_metadata: MetaData | None = None
17 changes: 11 additions & 6 deletions src/dcr/libs/db/cfg.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ Returns:
"""
from typing import List

from psycopg2.extensions import connection
from psycopg2.extensions import cursor
from sqlalchemy import MetaData
from sqlalchemy.engine import Engine

Expand All @@ -16,6 +18,7 @@ DB_DIALECT_POSTGRESQL: str
DBC_ACTION: str
DBC_ACTION_CODE: str
DBC_ACTION_TEXT: str
DBC_CHILD_NO: str
DBC_CREATED_AT: str
DBC_DIRECTORY_NAME: str
DBC_DIRECTORY_TYPE: str
Expand Down Expand Up @@ -52,7 +55,8 @@ DOCUMENT_ERROR_CODE_REJECTED_ERROR: str
DOCUMENT_ERROR_CODE_REJECTED_FILE_DUPL: str
DOCUMENT_ERROR_CODE_REJECTED_FILE_ERROR: str
DOCUMENT_ERROR_CODE_REJECTED_FILE_EXT: str
DOCUMENT_ERROR_CODE_REJECTED_FILE_PERMISSION: str
DOCUMENT_ERROR_CODE_REJECTED_FILE_MOVE: str
DOCUMENT_ERROR_CODE_REJECTED_FILE_RIGHTS: str
DOCUMENT_ERROR_CODE_REJECTED_NO_PDF_FORMAT: str
DOCUMENT_ERROR_CODE_REJECTED_PDF2IMAGE: str

Expand All @@ -73,6 +77,8 @@ DOCUMENT_STATUS_ERROR: str
DOCUMENT_STATUS_START: str

JOURNAL_ACTION_01_001: str
JOURNAL_ACTION_01_002: str
JOURNAL_ACTION_01_003: str
JOURNAL_ACTION_01_901: str
JOURNAL_ACTION_01_902: str
JOURNAL_ACTION_01_903: str
Expand All @@ -81,7 +87,6 @@ JOURNAL_ACTION_01_905: str
JOURNAL_ACTION_11_001: str
JOURNAL_ACTION_11_002: str
JOURNAL_ACTION_11_003: str
JOURNAL_ACTION_11_004: str
JOURNAL_ACTION_21_001: str
JOURNAL_ACTION_21_002: str
JOURNAL_ACTION_21_003: str
Expand All @@ -97,7 +102,7 @@ RUN_STATUS_START: str

db_current_database: str
db_current_user: str

engine: Engine

metadata: MetaData | None = None
db_driver_conn: connection | None = None
db_driver_cur: cursor | None = None
db_orm_engine: Engine | None = None
db_orm_metadata: MetaData | None = None
Loading

0 comments on commit 0e26579

Please sign in to comment.