From 36aec65e371c10096aa5610037319a7ad848d595 Mon Sep 17 00:00:00 2001 From: Mark Jordan Date: Tue, 20 Mar 2018 06:38:29 -0700 Subject: [PATCH 1/4] Work on #421. --- src/inputvalidators/CsvBooks.php | 65 ++++++++++++++++++++++++++++++-- src/writers/CsvBooks.php | 36 ++++++++++++++++-- 2 files changed, 94 insertions(+), 7 deletions(-) diff --git a/src/inputvalidators/CsvBooks.php b/src/inputvalidators/CsvBooks.php index 82eefc1..95a3097 100644 --- a/src/inputvalidators/CsvBooks.php +++ b/src/inputvalidators/CsvBooks.php @@ -35,6 +35,14 @@ public function __construct($settings) $this->page_sequence_separator = '-'; } $this->page_sequence_separator = preg_quote($this->page_sequence_separator); + + $this->ocr_extension = '.txt'; + // Default is to not log the absence of page-level OCR files. + if (isset($settings['WRITER']['log_missing_ocr_files'])) { + $this->log_missing_ocr_files= $settings['WRITER']['log_missing_ocr_files']; + } else { + $this->log_missing_ocr_files = false; + } } /** @@ -74,7 +82,7 @@ public function validateAll() * The package's record key. * * @param $package_path string - * The the package's input directory name (not full path). + * The package's input directory name (not full path). * * @return boolean * True if all tests pass for the package, false if any tests failed. @@ -135,6 +143,18 @@ public function validatePackage($record_key, $package_path) $cumulative_validation_results[] = false; } + if (!$this->checkOcrFiles($package_path, $pages)) { + $this->log->addError( + "Input validation failed", + array( + 'record ID' => $record_key, + 'issue directory' => $package_path, + 'error' => 'Book directory is missing one or more OCR files' + ) + ); + $cumulative_validation_results[] = false; + } + // Files in book directory must be named such that their last // filename segment is numeric. 
if (!$this->checkPageSequenceNumbers($pages)) { @@ -180,18 +200,28 @@ public function validatePackage($record_key, $package_path) */ private function getPageFiles($dir) { + $page_files = array(); $files = $this->readDir($dir); foreach ($files as &$file) { $file = basename($file); + foreach ($files as $file) { + $pathinfo = pathinfo($file); + $page_file = $pathinfo['basename']; + $ext = $pathinfo['extension']; + if (in_array($ext, array('tif','tiff', 'jp2'))) { + $page_files[] = $page_file; + } + } } - return $files; + return $page_files; } /** * Validates the extensions of the pages in the book-level directory. * * @param $files array - * A list of all the page file names. + * A list of all the page file names. Files must have one of the + * following extensions: tif, tiff, jp2. * * @return boolean * True if all files have an allowed file extension, false if not. @@ -232,4 +262,33 @@ private function checkPageSequenceNumbers($files) } return $valid; } + + /** + * Checks for the existence of page-level OCR files. + * + * @param $book_directory_path string + * The absolute path to the book-level directory. + * @param $files array + * A list of all the page file names in the directory. + * + * @return boolean + * True if all image files have corresponding OCR files. + */ + private function checkOcrFiles($book_directory_path, $files) + { + $valid = true; + if (!$this->log_missing_ocr_files) { + return $valid; + } + foreach ($files as $file) { + $pathinfo = pathinfo($file); + $filename = $pathinfo['filename']; + $path_to_ocr_file = realpath($book_directory_path) . DIRECTORY_SEPARATOR . + $filename . 
$this->ocr_extension; + if (!file_exists($path_to_ocr_file)) { + $valid = false; + } + } + return $valid; + } } diff --git a/src/writers/CsvBooks.php b/src/writers/CsvBooks.php index a7ea726..86b1b22 100644 --- a/src/writers/CsvBooks.php +++ b/src/writers/CsvBooks.php @@ -54,6 +54,14 @@ public function __construct($settings) Logger::INFO ); $this->log->pushHandler($this->logStreamHandler); + + $this->ocr_extension = '.txt'; + // Default is to not log the absence of page-level OCR files. + if (isset($settings['WRITER']['log_missing_ocr_files'])) { + $this->log_missing_ocr_files= $settings['WRITER']['log_missing_ocr_files']; + } else { + $this->log_missing_ocr_files = false; + } } /** @@ -113,9 +121,8 @@ public function writePackages($metadata, $pages, $record_id) } // @todo: Add error handling on mkdir and copy. - // @todo: Write page level MODS.xml file, after testing ingest as is. foreach ($pages as $page_path) { - // Get the page number from the filename. It is the last segment. + // Get the sequence number from the last segment of the filename. $pathinfo = pathinfo($page_path); $filename_segments = explode($this->page_sequence_separator, $pathinfo['filename']); @@ -126,9 +133,9 @@ public function writePackages($metadata, $pages, $record_id) $OBJ_expected = in_array('OBJ', $this->datastreams); if ($OBJ_expected xor $no_datastreams_setting_flag) { $extension = $pathinfo['extension']; - $page_output_file_path = $page_level_output_dir . DIRECTORY_SEPARATOR . + $page_output_path = $page_level_output_dir . DIRECTORY_SEPARATOR . 'OBJ.' . $extension; - copy($page_path, $page_output_file_path); + copy($page_path, $page_output_path); } if ($MODS_expected xor $no_datastreams_setting_flag) { @@ -136,6 +143,27 @@ public function writePackages($metadata, $pages, $record_id) $this->writePageMetadataFile($metadata, $page_number, $page_level_output_dir); } } + + // If the datastreams list is comprised of only 'MODS' we're generating metadata only. 
+ if ($this->datastreams != array('MODS')) { + $OCR_expected = in_array('OCR', $this->datastreams); + if ($OCR_expected xor $no_datastreams_setting_flag) { + $ocr_input_path = $pathinfo['dirname'] . DIRECTORY_SEPARATOR . + $pathinfo['filename'] . $this->ocr_extension; + $ocr_output_path = $page_level_output_dir . DIRECTORY_SEPARATOR . + 'OCR' . $this->ocr_extension; + if (file_exists($ocr_input_path)) { + copy($ocr_input_path, $ocr_output_path); + } else { + if ($this->log_missing_ocr_files) { + $this->log->addWarning( + "CSV Books warning", + array('Page-level OCR file does not exist' => $ocr_input_path) + ); + } + } + } + } } } From 6eb8370acd10f2bffc05b1c7c5c055cc83f169ff Mon Sep 17 00:00:00 2001 From: Mark Jordan Date: Tue, 20 Mar 2018 08:05:40 -0700 Subject: [PATCH 2/4] Work on #421. --- src/inputvalidators/CsvBooks.php | 9 +++------ tests/inputvalidators/CsvInputValidatorsTest.php | 5 +++++ 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/inputvalidators/CsvBooks.php b/src/inputvalidators/CsvBooks.php index 95a3097..380931e 100644 --- a/src/inputvalidators/CsvBooks.php +++ b/src/inputvalidators/CsvBooks.php @@ -190,13 +190,13 @@ public function validatePackage($record_key, $package_path) } /** - * Gets the filenames of the page files in the book-level directory. + * Gets the filenames of the files in the book-level directory. * * @param $dir string * The full path to the book-level directory. * * @return array - * A list of all the page file names. + * A list of all the file names (not just page images). 
*/ private function getPageFiles($dir) { @@ -207,10 +207,7 @@ private function getPageFiles($dir) foreach ($files as $file) { $pathinfo = pathinfo($file); $page_file = $pathinfo['basename']; - $ext = $pathinfo['extension']; - if (in_array($ext, array('tif','tiff', 'jp2'))) { - $page_files[] = $page_file; - } + $page_files[] = $page_file; } } return $page_files; diff --git a/tests/inputvalidators/CsvInputValidatorsTest.php b/tests/inputvalidators/CsvInputValidatorsTest.php index 280f23b..2f6f225 100644 --- a/tests/inputvalidators/CsvInputValidatorsTest.php +++ b/tests/inputvalidators/CsvInputValidatorsTest.php @@ -216,6 +216,11 @@ public function testCsvBooksInputValidator() $log_file_entries[2], "CSV Books input validator did not detect unwanted files" ); + $this->assertContains( + 'files/book3","error":"Some files in the book object directory have invalid extensions"', + $log_file_entries[3], + "CSV Books input validator did not find invalid page file extensions" + ); $this->assertContains( 'files/book4","error":"Book object directory not found"', $log_file_entries[4], From 063e6e7e484a9d85cbbed6a914a90a276a546b60 Mon Sep 17 00:00:00 2001 From: Mark Jordan Date: Tue, 20 Mar 2018 09:26:09 -0700 Subject: [PATCH 3/4] Work on #421. --- src/inputvalidators/CsvBooks.php | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/inputvalidators/CsvBooks.php b/src/inputvalidators/CsvBooks.php index 380931e..68541a2 100644 --- a/src/inputvalidators/CsvBooks.php +++ b/src/inputvalidators/CsvBooks.php @@ -39,7 +39,7 @@ public function __construct($settings) $this->ocr_extension = '.txt'; // Default is to not log the absence of page-level OCR files. 
if (isset($settings['WRITER']['log_missing_ocr_files'])) { - $this->log_missing_ocr_files= $settings['WRITER']['log_missing_ocr_files']; + $this->log_missing_ocr_files = $settings['WRITER']['log_missing_ocr_files']; } else { $this->log_missing_ocr_files = false; } @@ -229,7 +229,14 @@ private function checkPageExtensions($files) foreach ($files as $file) { $pathinfo = pathinfo($file); $ext = $pathinfo['extension']; - if (!in_array($ext, $this->fileGetter->allowed_file_extensions_for_OBJ)) { + if ($this->log_missing_ocr_files) { + $ocr_extension = ltrim($this->ocr_extension, '.'); + $allowed_extensions = array_merge($this->fileGetter->allowed_file_extensions_for_OBJ, array($ocr_extension)); + } + else { + $allowed_extensions = $this->fileGetter->allowed_file_extensions_for_OBJ; + } + if (!in_array($ext, $allowed_extensions)) { $valid = false; } } From 4cfe6b59a660a48bb2f8445ecffa5b0db00ac37e Mon Sep 17 00:00:00 2001 From: Mark Jordan Date: Tue, 20 Mar 2018 09:35:55 -0700 Subject: [PATCH 4/4] Work on #421 - coding standards. --- src/inputvalidators/CsvBooks.php | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/inputvalidators/CsvBooks.php b/src/inputvalidators/CsvBooks.php index 68541a2..7ee6c71 100644 --- a/src/inputvalidators/CsvBooks.php +++ b/src/inputvalidators/CsvBooks.php @@ -231,9 +231,11 @@ private function checkPageExtensions($files) $ext = $pathinfo['extension']; if ($this->log_missing_ocr_files) { $ocr_extension = ltrim($this->ocr_extension, '.'); - $allowed_extensions = array_merge($this->fileGetter->allowed_file_extensions_for_OBJ, array($ocr_extension)); - } - else { + $allowed_extensions = array_merge( + $this->fileGetter->allowed_file_extensions_for_OBJ, + array($ocr_extension) + ); + } else { $allowed_extensions = $this->fileGetter->allowed_file_extensions_for_OBJ; } if (!in_array($ext, $allowed_extensions)) {