MODE-2648 Upgrades Apache Tika to 1.14 and PdfBox to 2.0.3 to avoid CVEs

The major PdfBox update also requires changes to the PDF sequencer: previous versions allowed some metadata to be extracted from encrypted PDFs, whereas the current version does not even allow the stream to be read. The sequencer's behavior has been updated accordingly.
ModeShape · Dec 7, 2016 · defc515 · defc515
1 parent 382e8eb
commit defc515
Show file tree

Hide file tree

Showing 9 changed files with 137 additions and 137 deletions.
diff --git a/boms/modeshape-bom-embedded/pom.xml b/boms/modeshape-bom-embedded/pom.xml
@@ -63,7 +63,7 @@
         <version.org.mongodb.mongo-java-driver>2.14.0</version.org.mongodb.mongo-java-driver>
         <version.com.datastax.cassandra>2.0.0-rc2</version.com.datastax.cassandra>
         <version.com.amazonaws>1.11.24</version.com.amazonaws>
-        <version.org.apache.tika>1.12</version.org.apache.tika>
+        <version.org.apache.tika>1.14</version.org.apache.tika>
         <version.org.eclipse.equinox.common>3.3.0-v20070426</version.org.eclipse.equinox.common>
         <version.org.eclipse.jdt.core>3.3.0-v_771</version.org.eclipse.jdt.core>
         <version.org.eclipse.core.resources>3.3.0-v20070604</version.org.eclipse.core.resources>
@@ -81,7 +81,7 @@
         <version.org.eclipse.emf.ecore-xmi>2.4.1</version.org.eclipse.emf.ecore-xmi>
         <version.org.javassist>3.18.1-GA</version.org.javassist>
         <version.org.jaudiotagger>2.0.3</version.org.jaudiotagger>
-        <version.org.apache.poi>3.13</version.org.apache.poi>
+        <version.org.apache.poi>3.15</version.org.apache.poi>
         <version.com.beust.jcommander>1.5</version.com.beust.jcommander>
         <version.wsdl4j>1.6.3</version.wsdl4j>
         <version.org.apache.lucene>6.0.0</version.org.apache.lucene>

diff --git a/.../jboss-wf/org/apache/tika/1.12/module.xml → .../jboss-wf/org/apache/tika/1.14/module.xml b/.../jboss-wf/org/apache/tika/1.12/module.xml → .../jboss-wf/org/apache/tika/1.14/module.xml
@@ -16,32 +16,34 @@
   ~ See the License for the specific language governing permissions and
   ~ limitations under the License.
 -->
-<module xmlns="urn:jboss:module:1.3" name="org.apache.tika" slot="1.12">
+<module xmlns="urn:jboss:module:1.3" name="org.apache.tika" slot="${version.org.apache.tika}">
     <resources>
-        <resource-root path="tika-core-1.12.jar" />
-        <resource-root path="tika-parsers-1.12.jar" />
+        <resource-root path="tika-core-${version.org.apache.tika}.jar" />
+        <resource-root path="tika-parsers-${version.org.apache.tika}.jar" />
         <!-- All of the following are specific to Tika. Should the Tika version change, these need to change as well. -->
         <resource-root path="asm-5.0.4.jar" />
         <resource-root path="apache-mime4j-core-0.7.2.jar" />
         <resource-root path="apache-mime4j-dom-0.7.2.jar" />
-        <resource-root path="commons-codec-1.9.jar" />
-        <resource-root path="commons-compress-1.10.jar" />
+        <resource-root path="commons-codec-1.10.jar" />
+        <resource-root path="commons-compress-1.12.jar" />
+        <resource-root path="commons-collections4-4.1.jar" />
+        <resource-root path="commons-logging-1.2.jar" />
         <resource-root path="commons-exec-1.3.jar" />
-        <resource-root path="commons-io-2.4.jar" />
-        <resource-root path="commons-logging-1.1.1.jar" />
-        <resource-root path="fontbox-1.8.10.jar" />
-        <resource-root path="jempbox-1.8.10.jar" />
+        <resource-root path="commons-io-2.5.jar" />
+        <resource-root path="fontbox-${version.org.apache.pdfbox}.jar" />
+        <resource-root path="jempbox-1.8.12.jar" />
         <resource-root path="juniversalchardet-1.0.3.jar" />
-        <resource-root path="pdfbox-1.8.10.jar" />
+        <resource-root path="pdfbox-${version.org.apache.pdfbox}.jar" />
         <resource-root path="poi-${version.org.apache.poi}.jar" />
         <resource-root path="poi-ooxml-${version.org.apache.poi}.jar" />
         <resource-root path="poi-ooxml-schemas-${version.org.apache.poi}.jar" />
         <resource-root path="poi-scratchpad-${version.org.apache.poi}.jar" />
         <resource-root path="xmlbeans-2.6.0.jar" />
         <resource-root path="tagsoup-1.2.1.jar" />
-        <resource-root path="vorbis-java-core-0.6.jar" />
-        <resource-root path="vorbis-java-tika-0.6.jar" />
+        <resource-root path="vorbis-java-core-0.8.jar" />
+        <resource-root path="vorbis-java-tika-0.8.jar" />
         <resource-root path="xz-1.5.jar" />
+        <resource-root path="jackson-core-2.8.1.jar" />
     </resources>
     <dependencies>
         <module name="javax.api" export="true"/>

diff --git a/deploy/jbossas/kit/jboss-wf/org/modeshape/sequencer/pdf/main/module.xml b/deploy/jbossas/kit/jboss-wf/org/modeshape/sequencer/pdf/main/module.xml
@@ -19,13 +19,12 @@
     <resources>
         <resource-root path="modeshape-sequencer-pdf-${project.version}.jar" />
         <resource-root path="pdfbox-${version.org.apache.pdfbox}.jar" />
-        <resource-root path="jempbox-${version.org.apache.pdfbox}.jar" />
         <resource-root path="xmpbox-${version.org.apache.pdfbox}.jar" />
-        <resource-root path="fontbox-${version.org.apache.pdfbox}.jar" />
-        <resource-root path="commons-logging-1.1.1.jar" />
+        <resource-root path="commons-logging-1.2.jar" />
     </resources>
 
     <dependencies>
+        <module name="javax.xml.bind.api"/>
         <module name="org.modeshape.jcr.api"/>
         <module name="org.modeshape.common"/>
     </dependencies>

diff --git a/...odeshape-extractor-tika/src/main/java/org/modeshape/extractor/tika/TikaTextExtractor.java b/...odeshape-extractor-tika/src/main/java/org/modeshape/extractor/tika/TikaTextExtractor.java
@@ -16,15 +16,15 @@
 package org.modeshape.extractor.tika;
 
 import java.io.IOException;
-import java.io.InputStream;
 import java.util.HashSet;
 import java.util.Map;
 import java.util.Set;
-import java.util.concurrent.locks.Lock;
-import java.util.concurrent.locks.ReentrantLock;
+import java.util.concurrent.atomic.AtomicReference;
 import javax.jcr.RepositoryException;
+import org.apache.tika.config.ServiceLoader;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
+import org.apache.tika.mime.MediaTypeRegistry;
 import org.apache.tika.parser.DefaultParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
@@ -83,18 +83,17 @@ public class TikaTextExtractor extends TextExtractor {
             MediaType.application("x-tar"), MediaType.application("zip"), MediaType.application("vnd.teiid.vdb"),
             MediaType.image("*"), MediaType.audio("*"), MediaType.video("*"));
 
-    private final Set<MediaType> excludedMediaTypes = new HashSet<MediaType>();
-    private final Set<MediaType> includedMediaTypes = new HashSet<MediaType>();
-    private final Set<MediaType> parserSupportedMediaTypes = new HashSet<MediaType>();
+    private final Set<MediaType> excludedMediaTypes = new HashSet<>();
+    private final Set<MediaType> includedMediaTypes = new HashSet<>();
+    private final Set<MediaType> parserSupportedMediaTypes = new HashSet<>();
 
     /**
      * The write limit for the Tika parser, representing the maximum number of characters that should be extracted by the
      * TIKA parser; set via reflection
      */
     private Integer writeLimit;
 
-    private final Lock initLock = new ReentrantLock();
-    private DefaultParser parser;
+    private final AtomicReference<DefaultParser> parser = new AtomicReference<>();
 
     /**
      * No-arg constructor is required because this is instantiated by reflection.
@@ -130,33 +129,30 @@ public void extractFrom( final Binary binary,
 
         final DefaultParser parser = initialize();
         final Integer writeLimit = this.writeLimit;
-        processStream(binary, new BinaryOperation<Object>() {
-            @Override
-            public Object execute( InputStream stream ) throws Exception {
-                Metadata metadata = prepareMetadata(binary, context);
-                //TODO author=Horia Chiorean date=1/30/13 description=//TIKA 1.2 TXTParser seems to have a bug, always adding 1 ignorable whitespace to the actual chars to be parsed
-                //https://issues.apache.org/jira/browse/TIKA-1069
-                ContentHandler textHandler = writeLimit == null ? new BodyContentHandler() : new BodyContentHandler(writeLimit + 1);
-                try {
-                    LOGGER.debug("Using TikaTextExtractor to extract text");
-                    // Parse the input stream ...
-                    parser.parse(stream, textHandler, metadata, new ParseContext());
-                } catch (SAXException sae) {
-                    LOGGER.warn(TikaI18n.parseExceptionWhileExtractingText, sae.getMessage());
-                } catch (NoClassDefFoundError ncdfe) {
-                    LOGGER.warn(TikaI18n.warnNoClassDefFound, ncdfe.getMessage());
-                } catch (Throwable e) {
-                    LOGGER.error(e, TikaI18n.errorWhileExtractingTextFrom, e.getMessage());
-                } finally {
-                    // Record all of the text in the body ...
-                    String text = textHandler.toString().trim();
-                    if (!StringUtil.isBlank(text)) {
-                        output.recordText(text);
-                        LOGGER.debug("TikaTextExtractor found text: " + text);
-                    }
+        processStream(binary, stream -> {
+            Metadata metadata = prepareMetadata(binary, context);
+            //TODO author=Horia Chiorean date=1/30/13 description=//TIKA 1.2 TXTParser seems to have a bug, always adding 1 ignorable whitespace to the actual chars to be parsed
+            //https://issues.apache.org/jira/browse/TIKA-1069
+            ContentHandler textHandler = writeLimit == null ? new BodyContentHandler() : new BodyContentHandler(writeLimit + 1);
+            try {
+                LOGGER.debug("Using TikaTextExtractor to extract text");
+                // Parse the input stream ...
+                parser.parse(stream, textHandler, metadata, new ParseContext());
+            } catch (SAXException sae) {
+                LOGGER.warn(TikaI18n.parseExceptionWhileExtractingText, sae.getMessage());
+            } catch (NoClassDefFoundError ncdfe) {
+                LOGGER.warn(TikaI18n.warnNoClassDefFound, ncdfe.getMessage());
+            } catch (Throwable e) {
+                LOGGER.error(e, TikaI18n.errorWhileExtractingTextFrom, e.getMessage());
+            } finally {
+                // Record all of the text in the body ...
+                String text = textHandler.toString().trim();
+                if (!StringUtil.isBlank(text)) {
+                    output.recordText(text);
+                    LOGGER.debug("TikaTextExtractor found text: " + text);
                 }
-                return null;
             }
+            return null;
         });
 
     }
@@ -185,36 +181,34 @@ protected final Metadata prepareMetadata( final Binary binary,
         }
         return metadata;
     }
-
+    
     /**
-     * This class lazily initializes the {@link DefaultParser} instance.
-     * 
-     * @return the default parser; same as {@link #parser}
+     * Lazily initializes the {@link DefaultParser} instance. The parser is built by
+     * {@link #newDefaultParser()} only while {@link #parser} is still empty; once a parser has been
+     * stored, later calls return that cached instance without building another one.
+     *
+     * @return the default parser held by {@link #parser}; never null
      */
     protected DefaultParser initialize() {
-        if (parser == null) {
-            initLock.lock();
-            try {
-                if (parser == null) {
-                    parser = new DefaultParser(this.getClass().getClassLoader());
-                }
-                LOGGER.debug("Initializing Tika Text Extractor");
-                Map<MediaType, Parser> parsers = parser.getParsers();
-                LOGGER.debug("Tika parsers found: {0}",parsers.size());
-                for (MediaType mediaType : parsers.keySet()) {
-                    parserSupportedMediaTypes.add(mediaType);
-                    LOGGER.debug("Tika Text Extractor will support the {0} media-type",mediaType);
-                }
-                convertStringMimeTypesToMediaTypes(getExcludedMimeTypes(), excludedMediaTypes);
-                convertStringMimeTypesToMediaTypes(getIncludedMimeTypes(), includedMediaTypes);
-                LOGGER.debug("Initialized {0}", this);
-            } finally {
-                initLock.unlock();
-            }
+        DefaultParser existing = parser.get();
+        if (existing != null) {
+            // Fast path: the argument of compareAndSet(...) below is evaluated before the call itself,
+            // so without this check every invocation would build a new parser and re-populate the
+            // media-type sets even though the reference is already set.
+            return existing;
+        }
+        // First use: publish the new parser only if no other caller has stored one in the meantime.
+        // NOTE(review): two threads racing through the first call can both run newDefaultParser() and
+        // touch the HashSet fields concurrently - confirm whether that needs additional locking.
+        parser.compareAndSet(null, newDefaultParser());
+        return parser.get();
+    }
+
+    /**
+     * Builds a new {@link DefaultParser} backed by a Tika {@link ServiceLoader} that uses this class'
+     * class loader and logs, at debug level, any error reported while loading a parser.
+     * <p>
+     * As a side effect this records the media types of all discovered parsers in
+     * {@code parserSupportedMediaTypes} and converts the configured excluded/included mime types into
+     * {@code excludedMediaTypes} and {@code includedMediaTypes}.
+     * </p>
+     *
+     * @return a newly created default parser; never null
+     */
+    private DefaultParser newDefaultParser() {
+        ServiceLoader serviceLoader = new ServiceLoader(this.getClass().getClassLoader(),
+                                                        (classname, throwable) -> LOGGER.debug(throwable, "error while loading parser for {0}", classname));
+        DefaultParser defaultParser = new DefaultParser(MediaTypeRegistry.getDefaultRegistry(), serviceLoader);
+        LOGGER.debug("Initializing Tika Text Extractor");
+        Map<MediaType, Parser> parsers = defaultParser.getParsers();
+        LOGGER.debug("Tika parsers found: {0}",parsers.size());
+        for (MediaType mediaType : parsers.keySet()) {
+            parserSupportedMediaTypes.add(mediaType);
+            LOGGER.debug("Tika Text Extractor will support the {0} media-type",mediaType);
         }
-        return parser;
+        convertStringMimeTypesToMediaTypes(getExcludedMimeTypes(), excludedMediaTypes);
+        convertStringMimeTypesToMediaTypes(getIncludedMimeTypes(), includedMediaTypes);
+        LOGGER.debug("Initialized {0}", this);
+        return defaultParser;
     }
-
+    
     private void convertStringMimeTypesToMediaTypes(Set<String> mimeTypes, Set<MediaType> mediaTypes) {
         for (String mimeTypeEntry : mimeTypes) {
             //allow each mime type entry to be an array in itself

diff --git a/modeshape-assembly-descriptors/src/main/resources/assemblies/jboss-wf-dist.xml b/modeshape-assembly-descriptors/src/main/resources/assemblies/jboss-wf-dist.xml
@@ -179,6 +179,7 @@
                 <include>org.ccil.cowan.tagsoup:*:jar</include>
                 <include>org.gagravarr:*:jar</include>
                 <include>org.ow2.asm:asm:jar</include>
+                <include>com.fasterxml.jackson.core:jackson-core:jar</include>
                 <!-- 
                 The following are not currently included by default; see ModeShape's parent POM
                 <include>org.bouncycastle:*:jar</include>
@@ -263,16 +264,6 @@
                 <include>org:jaudiotagger:jar</include>
             </includes>
         </dependencySet>
-
-        <dependencySet>
-            <useProjectArtifact>false</useProjectArtifact>
-            <outputDirectory>modules/org/modeshape/sequencer/pdf/main</outputDirectory>
-            <includes>
-                <include>org.modeshape:modeshape-sequencer-pdf:jar</include>
-                <include>org.apache.pdfbox:*:jar</include>
-                <include>commons-logging:commons-logging:jar</include>
-            </includes>
-        </dependencySet> 
 
         <dependencySet>
             <useProjectArtifact>false</useProjectArtifact>
@@ -367,11 +358,13 @@
         </dependencySet>
 
         <dependencySet>
+            <useProjectArtifact>false</useProjectArtifact>
             <outputDirectory>modules/org/modeshape/sequencer/pdf/main</outputDirectory>
             <includes>
                 <include>org.modeshape:modeshape-sequencer-pdf:jar</include>
                 <include>org.apache.pdfbox:pdfbox:jar</include>
                 <include>org.apache.pdfbox:xmpbox:jar</include>
+                <include>commons-logging:commons-logging:jar</include>
             </includes>
         </dependencySet>