Skip to content

Commit

Permalink
MODE-2648 Upgrades Apache Tika to 1.14 and PdfBox to 2.0.3 to avoid CVEs
Browse files Browse the repository at this point in the history
The major update to PdfBox does trigger some other changes around the PDF sequencer: previous versions allowed some metadata to be extracted for encrypted PDFs, while the current version doesn't even allow reading the stream. As such, the behavior of the sequencer has been updated to reflect this.
  • Loading branch information
Horia Chiorean committed Dec 7, 2016
1 parent 382e8eb commit defc515
Show file tree
Hide file tree
Showing 9 changed files with 137 additions and 137 deletions.
4 changes: 2 additions & 2 deletions boms/modeshape-bom-embedded/pom.xml
Expand Up @@ -63,7 +63,7 @@
<version.org.mongodb.mongo-java-driver>2.14.0</version.org.mongodb.mongo-java-driver>
<version.com.datastax.cassandra>2.0.0-rc2</version.com.datastax.cassandra>
<version.com.amazonaws>1.11.24</version.com.amazonaws>
<version.org.apache.tika>1.12</version.org.apache.tika>
<version.org.apache.tika>1.14</version.org.apache.tika>
<version.org.eclipse.equinox.common>3.3.0-v20070426</version.org.eclipse.equinox.common>
<version.org.eclipse.jdt.core>3.3.0-v_771</version.org.eclipse.jdt.core>
<version.org.eclipse.core.resources>3.3.0-v20070604</version.org.eclipse.core.resources>
Expand All @@ -81,7 +81,7 @@
<version.org.eclipse.emf.ecore-xmi>2.4.1</version.org.eclipse.emf.ecore-xmi>
<version.org.javassist>3.18.1-GA</version.org.javassist>
<version.org.jaudiotagger>2.0.3</version.org.jaudiotagger>
<version.org.apache.poi>3.13</version.org.apache.poi>
<version.org.apache.poi>3.15</version.org.apache.poi>
<version.com.beust.jcommander>1.5</version.com.beust.jcommander>
<version.wsdl4j>1.6.3</version.wsdl4j>
<version.org.apache.lucene>6.0.0</version.org.apache.lucene>
Expand Down
Expand Up @@ -16,32 +16,34 @@
~ See the License for the specific language governing permissions and
~ limitations under the License.
-->
<module xmlns="urn:jboss:module:1.3" name="org.apache.tika" slot="1.12">
<module xmlns="urn:jboss:module:1.3" name="org.apache.tika" slot="${version.org.apache.tika}">
<resources>
<resource-root path="tika-core-1.12.jar" />
<resource-root path="tika-parsers-1.12.jar" />
<resource-root path="tika-core-${version.org.apache.tika}.jar" />
<resource-root path="tika-parsers-${version.org.apache.tika}.jar" />
<!-- All of the following are specific to Tika. Should the Tika version change, these need to change as well. -->
<resource-root path="asm-5.0.4.jar" />
<resource-root path="apache-mime4j-core-0.7.2.jar" />
<resource-root path="apache-mime4j-dom-0.7.2.jar" />
<resource-root path="commons-codec-1.9.jar" />
<resource-root path="commons-compress-1.10.jar" />
<resource-root path="commons-codec-1.10.jar" />
<resource-root path="commons-compress-1.12.jar" />
<resource-root path="commons-collections4-4.1.jar" />
<resource-root path="commons-logging-1.2.jar" />
<resource-root path="commons-exec-1.3.jar" />
<resource-root path="commons-io-2.4.jar" />
<resource-root path="commons-logging-1.1.1.jar" />
<resource-root path="fontbox-1.8.10.jar" />
<resource-root path="jempbox-1.8.10.jar" />
<resource-root path="commons-io-2.5.jar" />
<resource-root path="fontbox-${version.org.apache.pdfbox}.jar" />
<resource-root path="jempbox-1.8.12.jar" />
<resource-root path="juniversalchardet-1.0.3.jar" />
<resource-root path="pdfbox-1.8.10.jar" />
<resource-root path="pdfbox-${version.org.apache.pdfbox}.jar" />
<resource-root path="poi-${version.org.apache.poi}.jar" />
<resource-root path="poi-ooxml-${version.org.apache.poi}.jar" />
<resource-root path="poi-ooxml-schemas-${version.org.apache.poi}.jar" />
<resource-root path="poi-scratchpad-${version.org.apache.poi}.jar" />
<resource-root path="xmlbeans-2.6.0.jar" />
<resource-root path="tagsoup-1.2.1.jar" />
<resource-root path="vorbis-java-core-0.6.jar" />
<resource-root path="vorbis-java-tika-0.6.jar" />
<resource-root path="vorbis-java-core-0.8.jar" />
<resource-root path="vorbis-java-tika-0.8.jar" />
<resource-root path="xz-1.5.jar" />
<resource-root path="jackson-core-2.8.1.jar" />
</resources>
<dependencies>
<module name="javax.api" export="true"/>
Expand Down
Expand Up @@ -19,13 +19,12 @@
<resources>
<resource-root path="modeshape-sequencer-pdf-${project.version}.jar" />
<resource-root path="pdfbox-${version.org.apache.pdfbox}.jar" />
<resource-root path="jempbox-${version.org.apache.pdfbox}.jar" />
<resource-root path="xmpbox-${version.org.apache.pdfbox}.jar" />
<resource-root path="fontbox-${version.org.apache.pdfbox}.jar" />
<resource-root path="commons-logging-1.1.1.jar" />
<resource-root path="commons-logging-1.2.jar" />
</resources>

<dependencies>
<module name="javax.xml.bind.api"/>
<module name="org.modeshape.jcr.api"/>
<module name="org.modeshape.common"/>
</dependencies>
Expand Down
Expand Up @@ -16,15 +16,15 @@
package org.modeshape.extractor.tika;

import java.io.IOException;
import java.io.InputStream;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
import java.util.concurrent.atomic.AtomicReference;
import javax.jcr.RepositoryException;
import org.apache.tika.config.ServiceLoader;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.mime.MediaTypeRegistry;
import org.apache.tika.parser.DefaultParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
Expand Down Expand Up @@ -83,18 +83,17 @@ public class TikaTextExtractor extends TextExtractor {
MediaType.application("x-tar"), MediaType.application("zip"), MediaType.application("vnd.teiid.vdb"),
MediaType.image("*"), MediaType.audio("*"), MediaType.video("*"));

private final Set<MediaType> excludedMediaTypes = new HashSet<MediaType>();
private final Set<MediaType> includedMediaTypes = new HashSet<MediaType>();
private final Set<MediaType> parserSupportedMediaTypes = new HashSet<MediaType>();
private final Set<MediaType> excludedMediaTypes = new HashSet<>();
private final Set<MediaType> includedMediaTypes = new HashSet<>();
private final Set<MediaType> parserSupportedMediaTypes = new HashSet<>();

/**
* The write limit for the Tika parser, representing the maximum number of characters that should be extracted by the
* TIKA parser; set via reflection
*/
private Integer writeLimit;

private final Lock initLock = new ReentrantLock();
private DefaultParser parser;
private final AtomicReference<DefaultParser> parser = new AtomicReference<>();

/**
* No-arg constructor is required because this is instantiated by reflection.
Expand Down Expand Up @@ -130,33 +129,30 @@ public void extractFrom( final Binary binary,

final DefaultParser parser = initialize();
final Integer writeLimit = this.writeLimit;
processStream(binary, new BinaryOperation<Object>() {
@Override
public Object execute( InputStream stream ) throws Exception {
Metadata metadata = prepareMetadata(binary, context);
//TODO author=Horia Chiorean date=1/30/13 description=//TIKA 1.2 TXTParser seems to have a bug, always adding 1 ignorable whitespace to the actual chars to be parsed
//https://issues.apache.org/jira/browse/TIKA-1069
ContentHandler textHandler = writeLimit == null ? new BodyContentHandler() : new BodyContentHandler(writeLimit + 1);
try {
LOGGER.debug("Using TikaTextExtractor to extract text");
// Parse the input stream ...
parser.parse(stream, textHandler, metadata, new ParseContext());
} catch (SAXException sae) {
LOGGER.warn(TikaI18n.parseExceptionWhileExtractingText, sae.getMessage());
} catch (NoClassDefFoundError ncdfe) {
LOGGER.warn(TikaI18n.warnNoClassDefFound, ncdfe.getMessage());
} catch (Throwable e) {
LOGGER.error(e, TikaI18n.errorWhileExtractingTextFrom, e.getMessage());
} finally {
// Record all of the text in the body ...
String text = textHandler.toString().trim();
if (!StringUtil.isBlank(text)) {
output.recordText(text);
LOGGER.debug("TikaTextExtractor found text: " + text);
}
processStream(binary, stream -> {
Metadata metadata = prepareMetadata(binary, context);
//TODO author=Horia Chiorean date=1/30/13 description=//TIKA 1.2 TXTParser seems to have a bug, always adding 1 ignorable whitespace to the actual chars to be parsed
//https://issues.apache.org/jira/browse/TIKA-1069
ContentHandler textHandler = writeLimit == null ? new BodyContentHandler() : new BodyContentHandler(writeLimit + 1);
try {
LOGGER.debug("Using TikaTextExtractor to extract text");
// Parse the input stream ...
parser.parse(stream, textHandler, metadata, new ParseContext());
} catch (SAXException sae) {
LOGGER.warn(TikaI18n.parseExceptionWhileExtractingText, sae.getMessage());
} catch (NoClassDefFoundError ncdfe) {
LOGGER.warn(TikaI18n.warnNoClassDefFound, ncdfe.getMessage());
} catch (Throwable e) {
LOGGER.error(e, TikaI18n.errorWhileExtractingTextFrom, e.getMessage());
} finally {
// Record all of the text in the body ...
String text = textHandler.toString().trim();
if (!StringUtil.isBlank(text)) {
output.recordText(text);
LOGGER.debug("TikaTextExtractor found text: " + text);
}
return null;
}
return null;
});

}
Expand Down Expand Up @@ -185,36 +181,34 @@ protected final Metadata prepareMetadata( final Binary binary,
}
return metadata;
}

/**
* This method lazily initializes the {@link DefaultParser} instance.
*
*
* @return the default parser; same as {@link #parser}
*/
protected DefaultParser initialize() {
if (parser == null) {
initLock.lock();
try {
if (parser == null) {
parser = new DefaultParser(this.getClass().getClassLoader());
}
LOGGER.debug("Initializing Tika Text Extractor");
Map<MediaType, Parser> parsers = parser.getParsers();
LOGGER.debug("Tika parsers found: {0}",parsers.size());
for (MediaType mediaType : parsers.keySet()) {
parserSupportedMediaTypes.add(mediaType);
LOGGER.debug("Tika Text Extractor will support the {0} media-type",mediaType);
}
convertStringMimeTypesToMediaTypes(getExcludedMimeTypes(), excludedMediaTypes);
convertStringMimeTypesToMediaTypes(getIncludedMimeTypes(), includedMediaTypes);
LOGGER.debug("Initialized {0}", this);
} finally {
initLock.unlock();
}
parser.compareAndSet(null, newDefaultParser());
return parser.get();
}

private DefaultParser newDefaultParser() {
ServiceLoader serviceLoader = new ServiceLoader(this.getClass().getClassLoader(),
(classname, throwable) -> LOGGER.debug(throwable, "error while loading parser for {0}", classname));
DefaultParser defaultParser = new DefaultParser(MediaTypeRegistry.getDefaultRegistry(), serviceLoader);
LOGGER.debug("Initializing Tika Text Extractor");
Map<MediaType, Parser> parsers = defaultParser.getParsers();
LOGGER.debug("Tika parsers found: {0}",parsers.size());
for (MediaType mediaType : parsers.keySet()) {
parserSupportedMediaTypes.add(mediaType);
LOGGER.debug("Tika Text Extractor will support the {0} media-type",mediaType);
}
return parser;
convertStringMimeTypesToMediaTypes(getExcludedMimeTypes(), excludedMediaTypes);
convertStringMimeTypesToMediaTypes(getIncludedMimeTypes(), includedMediaTypes);
LOGGER.debug("Initialized {0}", this);
return defaultParser;
}

private void convertStringMimeTypesToMediaTypes(Set<String> mimeTypes, Set<MediaType> mediaTypes) {
for (String mimeTypeEntry : mimeTypes) {
//allow each mime type entry to be an array in itself
Expand Down
Expand Up @@ -179,6 +179,7 @@
<include>org.ccil.cowan.tagsoup:*:jar</include>
<include>org.gagravarr:*:jar</include>
<include>org.ow2.asm:asm:jar</include>
<include>com.fasterxml.jackson.core:jackson-core:jar</include>
<!--
The following are not currently included by default; see ModeShape's parent POM
<include>org.bouncycastle:*:jar</include>
Expand Down Expand Up @@ -263,16 +264,6 @@
<include>org:jaudiotagger:jar</include>
</includes>
</dependencySet>

<dependencySet>
<useProjectArtifact>false</useProjectArtifact>
<outputDirectory>modules/org/modeshape/sequencer/pdf/main</outputDirectory>
<includes>
<include>org.modeshape:modeshape-sequencer-pdf:jar</include>
<include>org.apache.pdfbox:*:jar</include>
<include>commons-logging:commons-logging:jar</include>
</includes>
</dependencySet>

<dependencySet>
<useProjectArtifact>false</useProjectArtifact>
Expand Down Expand Up @@ -367,11 +358,13 @@
</dependencySet>

<dependencySet>
<useProjectArtifact>false</useProjectArtifact>
<outputDirectory>modules/org/modeshape/sequencer/pdf/main</outputDirectory>
<includes>
<include>org.modeshape:modeshape-sequencer-pdf:jar</include>
<include>org.apache.pdfbox:pdfbox:jar</include>
<include>org.apache.pdfbox:xmpbox:jar</include>
<include>commons-logging:commons-logging:jar</include>
</includes>
</dependencySet>

Expand Down

0 comments on commit defc515

Please sign in to comment.