From defc515d6efc439ea3a9535d8355d14c6b6432e5 Mon Sep 17 00:00:00 2001 From: Horia Chiorean Date: Wed, 7 Dec 2016 17:36:49 +0200 Subject: [PATCH] MODE-2648 Upgrades Apache Tika to 1.14 and PdfBox to 2.0.3 to avoid CVEs The major update to PdfBox does trigger some other changes around the PDF sequencer: previous versions allowed some metadata to be extracted for encrypted PDFs, while the current version doesn't even allow reading the stream. As such, the behavior of the sequencer has been updated to reflect this. --- boms/modeshape-bom-embedded/pom.xml | 4 +- .../org/apache/tika/{1.12 => 1.14}/module.xml | 26 +++-- .../modeshape/sequencer/pdf/main/module.xml | 5 +- .../extractor/tika/TikaTextExtractor.java | 106 +++++++++--------- .../resources/assemblies/jboss-wf-dist.xml | 13 +-- modeshape-parent/pom.xml | 60 +++++----- .../sequencer/pdf/PdfBasicMetadata.java | 9 +- .../sequencer/pdf/PdfMetadataSequencer.java | 20 +++- .../pdf/PdfMetadataSequencerTest.java | 31 ++--- 9 files changed, 137 insertions(+), 137 deletions(-) rename deploy/jbossas/kit/jboss-wf/org/apache/tika/{1.12 => 1.14}/module.xml (67%) diff --git a/boms/modeshape-bom-embedded/pom.xml b/boms/modeshape-bom-embedded/pom.xml index 18ace5e4e6..dd1044a81c 100644 --- a/boms/modeshape-bom-embedded/pom.xml +++ b/boms/modeshape-bom-embedded/pom.xml @@ -63,7 +63,7 @@ 2.14.0 2.0.0-rc2 1.11.24 - 1.12 + 1.14 3.3.0-v20070426 3.3.0-v_771 3.3.0-v20070604 @@ -81,7 +81,7 @@ 2.4.1 3.18.1-GA 2.0.3 - 3.13 + 3.15 1.5 1.6.3 6.0.0 diff --git a/deploy/jbossas/kit/jboss-wf/org/apache/tika/1.12/module.xml b/deploy/jbossas/kit/jboss-wf/org/apache/tika/1.14/module.xml similarity index 67% rename from deploy/jbossas/kit/jboss-wf/org/apache/tika/1.12/module.xml rename to deploy/jbossas/kit/jboss-wf/org/apache/tika/1.14/module.xml index 0411e3867a..870890dd58 100644 --- a/deploy/jbossas/kit/jboss-wf/org/apache/tika/1.12/module.xml +++ b/deploy/jbossas/kit/jboss-wf/org/apache/tika/1.14/module.xml @@ -16,32 +16,34 @@ ~ See the License for the specific language governing permissions and ~ limitations under the License. --> - + - - + + - - + + + + - - - - + + + - + - - + + + diff --git a/deploy/jbossas/kit/jboss-wf/org/modeshape/sequencer/pdf/main/module.xml b/deploy/jbossas/kit/jboss-wf/org/modeshape/sequencer/pdf/main/module.xml index dee389e69c..70dec6f43f 100644 --- a/deploy/jbossas/kit/jboss-wf/org/modeshape/sequencer/pdf/main/module.xml +++ b/deploy/jbossas/kit/jboss-wf/org/modeshape/sequencer/pdf/main/module.xml @@ -19,13 +19,12 @@ - - - + + diff --git a/extractors/modeshape-extractor-tika/src/main/java/org/modeshape/extractor/tika/TikaTextExtractor.java b/extractors/modeshape-extractor-tika/src/main/java/org/modeshape/extractor/tika/TikaTextExtractor.java index 0f0a2879a1..ef55d1f922 100644 --- a/extractors/modeshape-extractor-tika/src/main/java/org/modeshape/extractor/tika/TikaTextExtractor.java +++ b/extractors/modeshape-extractor-tika/src/main/java/org/modeshape/extractor/tika/TikaTextExtractor.java @@ -16,15 +16,15 @@ package org.modeshape.extractor.tika; import java.io.IOException; -import java.io.InputStream; import java.util.HashSet; import java.util.Map; import java.util.Set; -import java.util.concurrent.locks.Lock; -import java.util.concurrent.locks.ReentrantLock; +import java.util.concurrent.atomic.AtomicReference; import javax.jcr.RepositoryException; +import org.apache.tika.config.ServiceLoader; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; +import org.apache.tika.mime.MediaTypeRegistry; import org.apache.tika.parser.DefaultParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; @@ -83,9 +83,9 @@ public class TikaTextExtractor extends TextExtractor { MediaType.application("x-tar"), MediaType.application("zip"), MediaType.application("vnd.teiid.vdb"), MediaType.image("*"), MediaType.audio("*"), MediaType.video("*")); - private final Set excludedMediaTypes = new HashSet(); - private final Set includedMediaTypes = new HashSet(); - private final Set parserSupportedMediaTypes = new HashSet(); + private final Set excludedMediaTypes = new HashSet<>(); + private final Set includedMediaTypes = new HashSet<>(); + private final Set parserSupportedMediaTypes = new HashSet<>(); /** * The write limit for the Tika parser, representing the maximum number of characters that should be extracted by the @@ -93,8 +93,7 @@ public class TikaTextExtractor extends TextExtractor { */ private Integer writeLimit; - private final Lock initLock = new ReentrantLock(); - private DefaultParser parser; + private final AtomicReference parser = new AtomicReference<>(); /** * No-arg constructor is required because this is instantiated by reflection. @@ -130,33 +129,30 @@ public void extractFrom( final Binary binary, final DefaultParser parser = initialize(); final Integer writeLimit = this.writeLimit; - processStream(binary, new BinaryOperation() { - @Override - public Object execute( InputStream stream ) throws Exception { - Metadata metadata = prepareMetadata(binary, context); - //TODO author=Horia Chiorean date=1/30/13 description=//TIKA 1.2 TXTParser seems to have a bug, always adding 1 ignorable whitespace to the actual chars to be parsed - //https://issues.apache.org/jira/browse/TIKA-1069 - ContentHandler textHandler = writeLimit == null ? new BodyContentHandler() : new BodyContentHandler(writeLimit + 1); - try { - LOGGER.debug("Using TikaTextExtractor to extract text"); - // Parse the input stream ... - parser.parse(stream, textHandler, metadata, new ParseContext()); - } catch (SAXException sae) { - LOGGER.warn(TikaI18n.parseExceptionWhileExtractingText, sae.getMessage()); - } catch (NoClassDefFoundError ncdfe) { - LOGGER.warn(TikaI18n.warnNoClassDefFound, ncdfe.getMessage()); - } catch (Throwable e) { - LOGGER.error(e, TikaI18n.errorWhileExtractingTextFrom, e.getMessage()); - } finally { - // Record all of the text in the body ... - String text = textHandler.toString().trim(); - if (!StringUtil.isBlank(text)) { - output.recordText(text); - LOGGER.debug("TikaTextExtractor found text: " + text); - } + processStream(binary, stream -> { + Metadata metadata = prepareMetadata(binary, context); + //TODO author=Horia Chiorean date=1/30/13 description=//TIKA 1.2 TXTParser seems to have a bug, always adding 1 ignorable whitespace to the actual chars to be parsed + //https://issues.apache.org/jira/browse/TIKA-1069 + ContentHandler textHandler = writeLimit == null ? new BodyContentHandler() : new BodyContentHandler(writeLimit + 1); + try { + LOGGER.debug("Using TikaTextExtractor to extract text"); + // Parse the input stream ... + parser.parse(stream, textHandler, metadata, new ParseContext()); + } catch (SAXException sae) { + LOGGER.warn(TikaI18n.parseExceptionWhileExtractingText, sae.getMessage()); + } catch (NoClassDefFoundError ncdfe) { + LOGGER.warn(TikaI18n.warnNoClassDefFound, ncdfe.getMessage()); + } catch (Throwable e) { + LOGGER.error(e, TikaI18n.errorWhileExtractingTextFrom, e.getMessage()); + } finally { + // Record all of the text in the body ... + String text = textHandler.toString().trim(); + if (!StringUtil.isBlank(text)) { + output.recordText(text); + LOGGER.debug("TikaTextExtractor found text: " + text); } - return null; } + return null; }); } @@ -185,36 +181,34 @@ protected final Metadata prepareMetadata( final Binary binary, } return metadata; } - + /** * This class lazily initializes the {@link DefaultParser} instance. - * + * * @return the default parser; same as {@link #parser} */ protected DefaultParser initialize() { - if (parser == null) { - initLock.lock(); - try { - if (parser == null) { - parser = new DefaultParser(this.getClass().getClassLoader()); - } - LOGGER.debug("Initializing Tika Text Extractor"); - Map parsers = parser.getParsers(); - LOGGER.debug("Tika parsers found: {0}",parsers.size()); - for (MediaType mediaType : parsers.keySet()) { - parserSupportedMediaTypes.add(mediaType); - LOGGER.debug("Tika Text Extractor will support the {0} media-type",mediaType); - } - convertStringMimeTypesToMediaTypes(getExcludedMimeTypes(), excludedMediaTypes); - convertStringMimeTypesToMediaTypes(getIncludedMimeTypes(), includedMediaTypes); - LOGGER.debug("Initialized {0}", this); - } finally { - initLock.unlock(); - } + parser.compareAndSet(null, newDefaultParser()); + return parser.get(); + } + + private DefaultParser newDefaultParser() { + ServiceLoader serviceLoader = new ServiceLoader(this.getClass().getClassLoader(), + (classname, throwable) -> LOGGER.debug(throwable, "error while loading parser for {0}", classname)); + DefaultParser defaultParser = new DefaultParser(MediaTypeRegistry.getDefaultRegistry(), serviceLoader); + LOGGER.debug("Initializing Tika Text Extractor"); + Map parsers = defaultParser.getParsers(); + LOGGER.debug("Tika parsers found: {0}",parsers.size()); + for (MediaType mediaType : parsers.keySet()) { + parserSupportedMediaTypes.add(mediaType); + LOGGER.debug("Tika Text Extractor will support the {0} media-type",mediaType); } - return parser; + convertStringMimeTypesToMediaTypes(getExcludedMimeTypes(), excludedMediaTypes); + convertStringMimeTypesToMediaTypes(getIncludedMimeTypes(), includedMediaTypes); + LOGGER.debug("Initialized {0}", this); + return defaultParser; } - + private void convertStringMimeTypesToMediaTypes(Set mimeTypes, Set mediaTypes) { for (String mimeTypeEntry : mimeTypes) { //allow each mime type entry to be an array in itself diff --git a/modeshape-assembly-descriptors/src/main/resources/assemblies/jboss-wf-dist.xml b/modeshape-assembly-descriptors/src/main/resources/assemblies/jboss-wf-dist.xml index 123b3bec82..7205ee40b5 100644 --- a/modeshape-assembly-descriptors/src/main/resources/assemblies/jboss-wf-dist.xml +++ b/modeshape-assembly-descriptors/src/main/resources/assemblies/jboss-wf-dist.xml @@ -179,6 +179,7 @@ org.ccil.cowan.tagsoup:*:jar org.gagravarr:*:jar org.ow2.asm:asm:jar + com.fasterxml.jackson.core:jackson-core:jar + 3.15 1.5 1.6.3 1.1.2 @@ -170,7 +170,7 @@ 2.2.3 2.4.1 - 1.12 + 1.14 2.6.2 6.0.0 0.2.1 @@ -184,7 +184,7 @@ 1.0.13.Final 2.0.0-rc2 2.0.3 - 1.8.10 + 2.0.3 1.4.191 9.2-1002.jdbc4 @@ -1695,11 +1695,11 @@ com.healthmarketscience.jackcess jackcess - + com.healthmarketscience.jackcess jackcess-encrypt - + org.codelibs jhighlight @@ -1707,27 +1707,31 @@ org.apache.cxf cxf-rt-rs-client - + org.apache.opennlp opennlp-tools - + org.apache.opennlp opennlp-tools - + org.json json - + com.google.code.gson gson - + + + com.googlecode.json-simple + json-simple + org.apache.sis.core sis-utility - + org.apache.sis.core sis-metadata @@ -1735,11 +1739,11 @@ org.apache.sis.storage sis-netcdf - + org.opengis geoapi - + com.googlecode.mp4parser isoparser @@ -1747,7 +1751,7 @@ com.github.junrar junrar - + org.apache.commons commons-csv @@ -1767,15 +1771,15 @@ edu.ucar netcdf4 - + edu.ucar grib - + edu.ucar cdm - + edu.ucar httpservices @@ -1816,12 +1820,12 @@ We exclude them by default. --> - org.bouncycastle - bcmail-jdk15on + org.bouncycastle + bcmail-jdk15on - org.bouncycastle - bcprov-jdk15on + org.bouncycastle + bcprov-jdk15on + Matlab files are likely not used, so exclude this library by default. + --> - net.sourceforge.jmatio + org.tallison jmatio + + + org.apache.pdfbox + pdfbox-tools +