Skip to content

Commit

Permalink
optional consolidated OCR for paged content items and Solr page removal
Browse files Browse the repository at this point in the history
  • Loading branch information
qadan committed Nov 4, 2015
1 parent 3468257 commit c53546b
Show file tree
Hide file tree
Showing 7 changed files with 206 additions and 14 deletions.
20 changes: 20 additions & 0 deletions includes/admin.form.inc
Expand Up @@ -117,6 +117,26 @@ function islandora_paged_content_admin_settings_form(array $form, array &$form_s
'#description' => t('The sequence number of each page will be used to set its label.'),
'#default_value' => $get_default_value('islandora_paged_content_page_label', FALSE),
),
'islandora_paged_content_solr_results_alter' => array(
'#type' => 'fieldset',
'#title' => t('Solr Results Altering'),
'islandora_paged_content_hide_pages_solr' => array(
'#type' => 'checkbox',
'#title' => t('Hide page objects from search results?'),
'#default_value' => $get_default_value('islandora_paged_content_hide_pages_solr', FALSE),
),
'islandora_paged_content_solr_fq' => array(
'#type' => 'textfield',
'#title' => t('Paged Content Solr Filter Query'),
'#description' => t('Enter a string representing a query to use to filter pages from Solr results.'),
'#default_value' => $get_default_value('islandora_paged_content_solr_fq', '-RELS_EXT_isPageOf_uri_ms:[* TO *]'),
'#states' => array(
'invisible' => array(
':input[name="islandora_paged_content_hide_pages_solr"]' => array('checked' => FALSE),
),
),
),
),
);
return system_settings_form($form);
}
Expand Down
59 changes: 53 additions & 6 deletions includes/batch.inc
Expand Up @@ -212,10 +212,11 @@ function islandora_paged_content_create_pdf_paged_content_batch_operation($paged
* The batch definition.
*/
function islandora_paged_content_create_ocr_batch(AbstractObject $paged_content, array $pages, array $options) {
return array(
$return = array(
'operations' => array(
array('islandora_paged_content_create_ocr_batch_operation', array(
$pages, $options)),
$pages, $options)),
array('islandora_paged_content_create_ocr_paged_content_batch_operation', array($paged_content->id)),
),
'finished' => 'islandora_paged_content_batch_finished',
'title' => t('Performing OCR on @label ...', array('@label' => $paged_content->label)),
Expand All @@ -224,15 +225,19 @@ function islandora_paged_content_create_ocr_batch(AbstractObject $paged_content,
'error_message' => t('An error has occurred.'),
'file' => drupal_get_path('module', 'islandora_paged_content') . '/includes/batch.inc',
);
if ($options['aggregate_ocr']) {
$return['operations'][] = array('islandora_paged_content_create_ocr_paged_content_batch_operation', array($paged_content->id));
}
return $return;
}

/**
* Batch operation for createing the OCR and HOCR.
* Batch operation for creating the OCR and HOCR.
*
* @param array $pages
* All the pages to perform OCR on.
* @param array $options
* Options to pass into tesseract/gimp.
* Options to pass into tesseract.
* @param array $context
* The context of this batch operation.
*/
Expand All @@ -241,14 +246,56 @@ function islandora_paged_content_create_ocr_batch_operation(array $pages, array
$page = islandora_paged_content_start_batch_operation($pages, $context);
if ($page) {
$page = islandora_object_load($page);
$context['results']['successful'][] = islandora_paged_content_page_derive_ocr_datastreams($page, $options);
islandora_paged_content_end_batch_operation($context, 1, array('Performed OCR on @successful of @total pages.'));
$ocr = islandora_paged_content_page_derive_ocr_datastreams($page, $options, FALSE);
foreach ($ocr as $dsid => $result) {
if ($result !== FALSE) {
$context['results']['files'][] = $result;
// OCR files will be required for concatenation.
if ($dsid == 'OCR') {
$context['results']['pages'][] = $result;
}
$context['results']['successful'][] = islandora_paged_content_update_datastream($page, $result, $dsid);
}
}
islandora_paged_content_end_batch_operation($context, 1, array('Derived @successful of @total OCR/HOCR datastreams.'));
$context['message'] = t('Performing OCR on page @page of @total ...', array(
'@page' => $context['sandbox']['progress'],
'@total' => $context['sandbox']['total']));
}
}

/**
 * Batch operation for consolidating page OCR into one OCR datastream.
 *
 * Each invocation takes up to 10 of the page OCR files collected in
 * $context['results']['pages'] and adds them to a consolidated text file in
 * the temporary directory. Once
 * islandora_paged_content_end_batch_operation() returns a truthy value, the
 * consolidated file is handed to islandora_paged_content_update_datastream()
 * as the 'OCR' datastream of the paged content object.
 *
 * @param string $paged_content_id
 *   The paged content identifier to store the OCR.
 * @param array $context
 *   The context of this batch operation.
 */
function islandora_paged_content_create_ocr_paged_content_batch_operation($paged_content_id, &$context) {
  module_load_include('inc', 'islandora_paged_content', 'includes/utilities');
  // No page produced an OCR file, so there is nothing to consolidate.
  if (empty($context['results']['pages'])) {
    return;
  }
  $paged_content = islandora_object_load($paged_content_id);
  // A failed load must not be dereferenced below or passed on as an object.
  if (!$paged_content) {
    watchdog(
      'islandora_paged_content',
      'Unable to load @pid; the consolidated OCR was not created.',
      array('@pid' => $paged_content_id),
      WATCHDOG_ERROR
    );
    return;
  }
  islandora_paged_content_start_batch_operation($context['results']['pages'], $context);
  // Opting to write the consolidated OCR to the filesystem to avoid issues with
  // PHP's memory limit on massive books.
  $consolidated_ocr = drupal_realpath("temporary://{$paged_content->id}_OCR.txt");
  $files = array_slice($context['results']['pages'], $context['sandbox']['progress'], 10);
  // The first slice creates the consolidated file; later slices extend it.
  if ($context['sandbox']['progress'] != 0) {
    $context['results']['successful'][] = islandora_paged_content_ocr_append($consolidated_ocr, $files);
  }
  else {
    $context['results']['successful'][] = islandora_paged_content_ocr_combine($files, $consolidated_ocr);
  }
  $finished_message = array('Created OCR with @pages pages.', array('@pages' => count($context['results']['pages'])));
  if (islandora_paged_content_end_batch_operation($context, count($files), $finished_message)) {
    islandora_paged_content_update_datastream($paged_content, $consolidated_ocr, 'OCR');
    // Recorded alongside the other derived files in the batch results.
    $context['results']['files'][] = $consolidated_ocr;
  }
  $context['message'] = t('Adding page @page of @total to the OCR ...', array(
    '@page' => $context['sandbox']['progress'],
    '@total' => $context['sandbox']['total']));
}


/**
* Gets the batch definition for creating TN, JPG, JP2 datastreams.
*
Expand Down
38 changes: 38 additions & 0 deletions includes/derivatives.inc
Expand Up @@ -127,6 +127,24 @@ function islandora_paged_content_aggregate_pdf_derivative(AbstractObject $object
}
}

/**
 * Creates an OCR file of all the pages on a paged content object.
 *
 * Queues the OCR batch for every page of the object, followed by an operation
 * that removes the 'create_ocr' flag. Nothing is queued when the flag is not
 * set and $force is FALSE.
 *
 * @param AbstractObject $object
 *   The paged content object whose pages will be processed.
 * @param bool $force
 *   Whether to run even if the 'create_ocr' relationship is not present.
 */
function islandora_paged_content_aggregate_ocr_derivative(AbstractObject $object, $force = TRUE) {
  module_load_include('inc', 'islandora_paged_content', 'includes/utilities');
  module_load_include('inc', 'islandora_paged_content', 'includes/batch');
  $flagged = $object->relationships->get(ISLANDORA_RELS_EXT_URI, 'create_ocr', 'true', TRUE);
  // Without the flag or an explicit force there is no batch to build, so do
  // not append to, or set, an undefined batch definition.
  if (!$flagged && !$force) {
    return;
  }

  // NOTE(review): the options array is empty, so keys such as 'aggregate_ocr'
  // and 'language' are not supplied to the batch - confirm this is intended.
  $batch = islandora_paged_content_create_ocr_batch(
    $object,
    array_keys(islandora_paged_content_get_pages($object)),
    array()
  );
  $batch['operations'][] = array('islandora_paged_content_remove_ocr_flag', array($object));
  batch_set($batch);
}

/**
* Removes the PDF derivative creation flag.
*
Expand All @@ -146,3 +164,23 @@ function islandora_paged_content_remove_pdf_flag(AbstractObject $object) {
function islandora_paged_content_set_pdf_flag(AbstractObject $object) {
$object->relationships->add(ISLANDORA_RELS_EXT_URI, 'create_pdf', 'true', RELS_TYPE_PLAIN_LITERAL);
}

/**
 * Clears the relationship that marks an object for OCR derivative creation.
 *
 * @param AbstractObject $object
 *   The object whose 'create_ocr' relationship is removed.
 */
function islandora_paged_content_remove_ocr_flag(AbstractObject $object) {
  $rels_ext = $object->relationships;
  $rels_ext->remove(
    ISLANDORA_RELS_EXT_URI,
    'create_ocr',
    'true',
    RELS_TYPE_PLAIN_LITERAL
  );
}

/**
 * Adds the relationship that marks an object for OCR derivative creation.
 *
 * @param AbstractObject $object
 *   The object that receives the 'create_ocr' relationship.
 */
function islandora_paged_content_set_ocr_flag(AbstractObject $object) {
  $rels_ext = $object->relationships;
  $rels_ext->add(
    ISLANDORA_RELS_EXT_URI,
    'create_ocr',
    'true',
    RELS_TYPE_PLAIN_LITERAL
  );
}
6 changes: 5 additions & 1 deletion includes/manage_page.inc
Expand Up @@ -118,7 +118,11 @@ function islandora_paged_content_manage_page_ocr_form_submit(array $form, array
'language' => $form_state['values']['language'],
'preprocess' => FALSE,
);
if (islandora_paged_content_page_derive_ocr_datastreams($object, $options)) {
$ocr = islandora_paged_content_page_derive_ocr_datastreams($object, $options);
if ($ocr !== FALSE) {
foreach ($ocr as $dsid => $file) {
islandora_paged_content_update_datastream($object, $file, $dsid);
}
drupal_set_message(t('Successfully performed OCR.'), 'status');
}
else {
Expand Down
8 changes: 8 additions & 0 deletions includes/manage_pages.inc
Expand Up @@ -183,6 +183,13 @@ function islandora_paged_content_manage_pages_ocr_form(array $form, array &$form
'#description' => t('Please select the language the pages are written in.'),
'#options' => $languages,
),
'aggregate_ocr' => array(
'#access' => $can_derive,
'#type' => 'checkbox',
'#title' => t('Aggregate OCR to the parent?'),
'#description' => t('Check this to aggregate a consolidated OCR datastream generated from each page and append it to the current object.'),
'#default_value' => FALSE,
),
'submit' => array(
'#disabled' => !$can_derive,
'#type' => 'submit',
Expand All @@ -205,6 +212,7 @@ function islandora_paged_content_manage_pages_ocr_form_submit(array $form, array
$options = array(
'language' => $form_state['values']['language'],
'preprocess' => FALSE,
'aggregate_ocr' => $form_state['values']['aggregate_ocr'],
);
$batch = islandora_paged_content_create_ocr_batch($object, $pages, $options);
batch_set($batch);
Expand Down
77 changes: 70 additions & 7 deletions includes/utilities.inc
Expand Up @@ -425,6 +425,53 @@ function islandora_paged_content_pdf_combine(array $files, $out) {
return TRUE;
}

/**
 * Appends a series of OCR files to a consolidated OCR file.
 *
 * The current consolidated text is copied to a temporary file, placed in
 * front of $files and the whole list is recombined into $consolidated_ocr.
 *
 * @param string $consolidated_ocr
 *   The consolidated OCR path.
 * @param array $files
 *   The files to be appended to the OCR.
 *
 * @return bool
 *   TRUE on success, FALSE on failure.
 */
function islandora_paged_content_ocr_append($consolidated_ocr, array $files) {
  $temp_file = "$consolidated_ocr.temp.txt";
  // Snapshot the existing text; without it there is nothing to append to.
  if (!copy($consolidated_ocr, $temp_file)) {
    return FALSE;
  }
  // The combine helper writes with FILE_APPEND, so empty the target first;
  // otherwise the existing text would end up in the file twice.
  if (file_put_contents($consolidated_ocr, '') === FALSE) {
    file_unmanaged_delete($temp_file);
    return FALSE;
  }
  array_unshift($files, $temp_file);
  $ret = islandora_paged_content_ocr_combine($files, $consolidated_ocr);
  file_unmanaged_delete($temp_file);
  return $ret;
}

/**
 * Combines the given OCR files into one output file.
 *
 * The text of each readable file is joined with a blank line between files,
 * trailing whitespace is trimmed from the combined text, and the result is
 * written to $out with FILE_APPEND (existing content of $out is kept).
 *
 * @param array $files
 *   The OCR files to be combined, in order.
 * @param string $out
 *   The absolute path to the consolidated OCR file.
 *
 * @return bool
 *   TRUE on success, FALSE if any file could not be read or the write failed.
 */
function islandora_paged_content_ocr_combine(array $files, $out) {
  $success = TRUE;
  $combined_ocr = '';
  foreach ($files as $file) {
    $ocr = file_get_contents($file);
    // Strict comparison: an empty (or "0") OCR file is a valid blank page,
    // only a real FALSE return means the file could not be read.
    if ($ocr === FALSE) {
      $success = FALSE;
    }
    else {
      $combined_ocr .= "$ocr\n\n";
    }
  }
  $result = file_put_contents($out, rtrim($combined_ocr), FILE_APPEND);
  return $success && ($result !== FALSE);
}

/**
* Creates a PDF derivative for the given Page object.
*
Expand Down Expand Up @@ -604,11 +651,16 @@ function islandora_paged_content_update_paged_content_thumbnail(AbstractObject $
* The page object that the derivatives will be generated for.
* @param array $options
* The options for tesseract/gimp.
*
* @return bool
* TRUE on success, FALSE otherwise.
* @param bool $ingest
* Whether or not to ingest immediately or simply return references to files.
*
* @return bool|array
* If $ingest is TRUE, returns TRUE on success, FALSE otherwise. If $ingest
* is false, returns an associative array containing 'OCR' and 'HOCR', each
* paired with a string representing the path to the derived file on success,
* or FALSE on failure.
*/
function islandora_paged_content_page_derive_ocr_datastreams(AbstractObject $object, array $options = NULL) {
function islandora_paged_content_page_derive_ocr_datastreams(AbstractObject $object, array $options = NULL, $ingest = TRUE) {
module_load_include('inc', 'islandora_ocr', 'includes/utilities');
$rels_ext = $object->relationships;
$options = isset($options) ? $options : array(
Expand All @@ -619,10 +671,8 @@ function islandora_paged_content_page_derive_ocr_datastreams(AbstractObject $obj
);

module_load_include('inc', 'islandora_ocr', 'includes/derivatives');
$ret = islandora_ocr_derive_datastreams($object['OBJ'], $options);
$ret = islandora_ocr_derive_datastreams($object['OBJ'], $options, $ingest);

// Check the depth of the image, to see if it can be processed with Tesseract.
// If it can not, reduce the depth and grayscale it...
$language = $options['language'];
islandora_paged_content_set_relationship($rels_ext,
ISLANDORA_RELS_EXT_URI,
Expand Down Expand Up @@ -768,6 +818,19 @@ function islandora_paged_content_paged_object_derivatives($context) {
)
);
}
if ($derive['ocr']) {
$derivatives = array_merge(
$derivatives,
array(
array(
'source_dsid' => NULL,
'destination_dsid' => 'OCR',
'function' => array('islandora_paged_content_aggregate_ocr_derivative'),
'file' => "$paged_content_module_path/includes/derivatives.inc",
),
)
);
}
return $derivatives;
}

Expand Down
12 changes: 12 additions & 0 deletions islandora_paged_content.module
Expand Up @@ -251,3 +251,15 @@ function islandora_paged_content_islandora_object_ingested(AbstractObject $objec
islandora_paged_content_cleanup_source_pdf($object->id);
}
}

/**
 * Implements hook_islandora_solr_query().
 *
 * Adds the configured filter query to the Solr parameters when the
 * 'islandora_paged_content_hide_pages_solr' setting is enabled.
 */
function islandora_paged_content_islandora_solr_query($islandora_solr_query) {
  $hide_pages = variable_get('islandora_paged_content_hide_pages_solr', FALSE);
  if (!$hide_pages) {
    return;
  }
  $filter = variable_get('islandora_paged_content_solr_fq', '-RELS_EXT_isPageOf_uri_ms:[* TO *]');
  if (empty($filter)) {
    return;
  }
  $islandora_solr_query->solrParams['fq'][] = $filter;
}

0 comments on commit c53546b

Please sign in to comment.