diff --git a/boms/modeshape-bom-embedded/pom.xml b/boms/modeshape-bom-embedded/pom.xml index 18ace5e4e6..dd1044a81c 100644 --- a/boms/modeshape-bom-embedded/pom.xml +++ b/boms/modeshape-bom-embedded/pom.xml @@ -63,7 +63,7 @@ 2.14.0 2.0.0-rc2 1.11.24 - 1.12 + 1.14 3.3.0-v20070426 3.3.0-v_771 3.3.0-v20070604 @@ -81,7 +81,7 @@ 2.4.1 3.18.1-GA 2.0.3 - 3.13 + 3.15 1.5 1.6.3 6.0.0 diff --git a/deploy/jbossas/kit/jboss-wf/org/apache/tika/1.12/module.xml b/deploy/jbossas/kit/jboss-wf/org/apache/tika/1.14/module.xml similarity index 67% rename from deploy/jbossas/kit/jboss-wf/org/apache/tika/1.12/module.xml rename to deploy/jbossas/kit/jboss-wf/org/apache/tika/1.14/module.xml index 0411e3867a..870890dd58 100644 --- a/deploy/jbossas/kit/jboss-wf/org/apache/tika/1.12/module.xml +++ b/deploy/jbossas/kit/jboss-wf/org/apache/tika/1.14/module.xml @@ -16,32 +16,34 @@ ~ See the License for the specific language governing permissions and ~ limitations under the License. --> - + - - + + - - + + + + - - - - + + + - + - - + + + diff --git a/deploy/jbossas/kit/jboss-wf/org/modeshape/sequencer/pdf/main/module.xml b/deploy/jbossas/kit/jboss-wf/org/modeshape/sequencer/pdf/main/module.xml index dee389e69c..70dec6f43f 100644 --- a/deploy/jbossas/kit/jboss-wf/org/modeshape/sequencer/pdf/main/module.xml +++ b/deploy/jbossas/kit/jboss-wf/org/modeshape/sequencer/pdf/main/module.xml @@ -19,13 +19,12 @@ - - - + + diff --git a/extractors/modeshape-extractor-tika/src/main/java/org/modeshape/extractor/tika/TikaTextExtractor.java b/extractors/modeshape-extractor-tika/src/main/java/org/modeshape/extractor/tika/TikaTextExtractor.java index 0f0a2879a1..ef55d1f922 100644 --- a/extractors/modeshape-extractor-tika/src/main/java/org/modeshape/extractor/tika/TikaTextExtractor.java +++ b/extractors/modeshape-extractor-tika/src/main/java/org/modeshape/extractor/tika/TikaTextExtractor.java @@ -16,15 +16,15 @@ package org.modeshape.extractor.tika; import java.io.IOException; -import java.io.InputStream; import java.util.HashSet; import java.util.Map; import java.util.Set; -import java.util.concurrent.locks.Lock; -import java.util.concurrent.locks.ReentrantLock; +import java.util.concurrent.atomic.AtomicReference; import javax.jcr.RepositoryException; +import org.apache.tika.config.ServiceLoader; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; +import org.apache.tika.mime.MediaTypeRegistry; import org.apache.tika.parser.DefaultParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; @@ -83,9 +83,9 @@ public class TikaTextExtractor extends TextExtractor { MediaType.application("x-tar"), MediaType.application("zip"), MediaType.application("vnd.teiid.vdb"), MediaType.image("*"), MediaType.audio("*"), MediaType.video("*")); - private final Set excludedMediaTypes = new HashSet(); - private final Set includedMediaTypes = new HashSet(); - private final Set parserSupportedMediaTypes = new HashSet(); + private final Set excludedMediaTypes = new HashSet<>(); + private final Set includedMediaTypes = new HashSet<>(); + private final Set parserSupportedMediaTypes = new HashSet<>(); /** * The write limit for the Tika parser, representing the maximum number of characters that should be extracted by the @@ -93,8 +93,7 @@ public class TikaTextExtractor extends TextExtractor { */ private Integer writeLimit; - private final Lock initLock = new ReentrantLock(); - private DefaultParser parser; + private final AtomicReference parser = new AtomicReference<>(); /** * No-arg constructor is required because this is instantiated by reflection. @@ -130,33 +129,30 @@ public void extractFrom( final Binary binary, final DefaultParser parser = initialize(); final Integer writeLimit = this.writeLimit; - processStream(binary, new BinaryOperation() { - @Override - public Object execute( InputStream stream ) throws Exception { - Metadata metadata = prepareMetadata(binary, context); - //TODO author=Horia Chiorean date=1/30/13 description=//TIKA 1.2 TXTParser seems to have a bug, always adding 1 ignorable whitespace to the actual chars to be parsed - //https://issues.apache.org/jira/browse/TIKA-1069 - ContentHandler textHandler = writeLimit == null ? new BodyContentHandler() : new BodyContentHandler(writeLimit + 1); - try { - LOGGER.debug("Using TikaTextExtractor to extract text"); - // Parse the input stream ... - parser.parse(stream, textHandler, metadata, new ParseContext()); - } catch (SAXException sae) { - LOGGER.warn(TikaI18n.parseExceptionWhileExtractingText, sae.getMessage()); - } catch (NoClassDefFoundError ncdfe) { - LOGGER.warn(TikaI18n.warnNoClassDefFound, ncdfe.getMessage()); - } catch (Throwable e) { - LOGGER.error(e, TikaI18n.errorWhileExtractingTextFrom, e.getMessage()); - } finally { - // Record all of the text in the body ... - String text = textHandler.toString().trim(); - if (!StringUtil.isBlank(text)) { - output.recordText(text); - LOGGER.debug("TikaTextExtractor found text: " + text); - } + processStream(binary, stream -> { + Metadata metadata = prepareMetadata(binary, context); + //TODO author=Horia Chiorean date=1/30/13 description=//TIKA 1.2 TXTParser seems to have a bug, always adding 1 ignorable whitespace to the actual chars to be parsed + //https://issues.apache.org/jira/browse/TIKA-1069 + ContentHandler textHandler = writeLimit == null ? new BodyContentHandler() : new BodyContentHandler(writeLimit + 1); + try { + LOGGER.debug("Using TikaTextExtractor to extract text"); + // Parse the input stream ... + parser.parse(stream, textHandler, metadata, new ParseContext()); + } catch (SAXException sae) { + LOGGER.warn(TikaI18n.parseExceptionWhileExtractingText, sae.getMessage()); + } catch (NoClassDefFoundError ncdfe) { + LOGGER.warn(TikaI18n.warnNoClassDefFound, ncdfe.getMessage()); + } catch (Throwable e) { + LOGGER.error(e, TikaI18n.errorWhileExtractingTextFrom, e.getMessage()); + } finally { + // Record all of the text in the body ... + String text = textHandler.toString().trim(); + if (!StringUtil.isBlank(text)) { + output.recordText(text); + LOGGER.debug("TikaTextExtractor found text: " + text); } - return null; } + return null; }); } @@ -185,36 +181,34 @@ protected final Metadata prepareMetadata( final Binary binary, } return metadata; } - + /** * This class lazily initializes the {@link DefaultParser} instance. - * + * * @return the default parser; same as {@link #parser} */ protected DefaultParser initialize() { - if (parser == null) { - initLock.lock(); - try { - if (parser == null) { - parser = new DefaultParser(this.getClass().getClassLoader()); - } - LOGGER.debug("Initializing Tika Text Extractor"); - Map parsers = parser.getParsers(); - LOGGER.debug("Tika parsers found: {0}",parsers.size()); - for (MediaType mediaType : parsers.keySet()) { - parserSupportedMediaTypes.add(mediaType); - LOGGER.debug("Tika Text Extractor will support the {0} media-type",mediaType); - } - convertStringMimeTypesToMediaTypes(getExcludedMimeTypes(), excludedMediaTypes); - convertStringMimeTypesToMediaTypes(getIncludedMimeTypes(), includedMediaTypes); - LOGGER.debug("Initialized {0}", this); - } finally { - initLock.unlock(); - } + parser.compareAndSet(null, newDefaultParser()); + return parser.get(); + } + + private DefaultParser newDefaultParser() { + ServiceLoader serviceLoader = new ServiceLoader(this.getClass().getClassLoader(), + (classname, throwable) -> LOGGER.debug(throwable, "error while loading parser for {0}", classname)); + DefaultParser defaultParser = new DefaultParser(MediaTypeRegistry.getDefaultRegistry(), serviceLoader); + LOGGER.debug("Initializing Tika Text Extractor"); + Map parsers = defaultParser.getParsers(); + LOGGER.debug("Tika parsers found: {0}",parsers.size()); + for (MediaType mediaType : parsers.keySet()) { + parserSupportedMediaTypes.add(mediaType); + LOGGER.debug("Tika Text Extractor will support the {0} media-type",mediaType); } - return parser; + convertStringMimeTypesToMediaTypes(getExcludedMimeTypes(), excludedMediaTypes); + convertStringMimeTypesToMediaTypes(getIncludedMimeTypes(), includedMediaTypes); + LOGGER.debug("Initialized {0}", this); + return defaultParser; } - + private void convertStringMimeTypesToMediaTypes(Set mimeTypes, Set mediaTypes) { for (String mimeTypeEntry : mimeTypes) { //allow each mime type entry to be an array in itself diff --git a/modeshape-assembly-descriptors/src/main/resources/assemblies/jboss-wf-dist.xml b/modeshape-assembly-descriptors/src/main/resources/assemblies/jboss-wf-dist.xml index 123b3bec82..7205ee40b5 100644 --- a/modeshape-assembly-descriptors/src/main/resources/assemblies/jboss-wf-dist.xml +++ b/modeshape-assembly-descriptors/src/main/resources/assemblies/jboss-wf-dist.xml @@ -179,6 +179,7 @@ org.ccil.cowan.tagsoup:*:jar org.gagravarr:*:jar org.ow2.asm:asm:jar + com.fasterxml.jackson.core:jackson-core:jar + 3.15 1.5 1.6.3 1.1.2 @@ -170,7 +170,7 @@ 2.2.3 2.4.1 - 1.12 + 1.14 2.6.2 6.0.0 0.2.1 @@ -184,7 +184,7 @@ 1.0.13.Final 2.0.0-rc2 2.0.3 - 1.8.10 + 2.0.3 1.4.191 9.2-1002.jdbc4 @@ -1695,11 +1695,11 @@ com.healthmarketscience.jackcess jackcess - + com.healthmarketscience.jackcess jackcess-encrypt - + org.codelibs jhighlight @@ -1707,27 +1707,31 @@ org.apache.cxf cxf-rt-rs-client - + org.apache.opennlp opennlp-tools - + org.apache.opennlp opennlp-tools - + org.json json - + com.google.code.gson gson - + + + com.googlecode.json-simple + json-simple + org.apache.sis.core sis-utility - + org.apache.sis.core sis-metadata @@ -1735,11 +1739,11 @@ org.apache.sis.storage sis-netcdf - + org.opengis geoapi - + com.googlecode.mp4parser isoparser @@ -1747,7 +1751,7 @@ com.github.junrar junrar - + org.apache.commons commons-csv @@ -1767,15 +1771,15 @@ edu.ucar netcdf4 - + edu.ucar grib - + edu.ucar cdm - + edu.ucar httpservices @@ -1816,12 +1820,12 @@ We exclude them by default. --> - org.bouncycastle - bcmail-jdk15on + org.bouncycastle + bcmail-jdk15on - org.bouncycastle - bcprov-jdk15on + org.bouncycastle + bcprov-jdk15on + Matlab files are likely not used, so exclude this library by default. + --> - net.sourceforge.jmatio + org.tallison jmatio + + + org.apache.pdfbox + pdfbox-tools +