Merge branch 'enhancement_blockUnkwon-default-true' of github.com:MarcusSorealheis/lucene-solr into enhancement_blockUnkwon-default-true

* 'enhancement_blockUnkwon-default-true' of github.com:MarcusSorealheis/lucene-solr: (51 commits)
  Harden AliasIntegrationTest.testClusterStateProviderAPI
  SOLR-13694: IndexSizeEstimator NullPointerException.
  adding <SpanPositionRange> into XML Query Parser
  SOLR-13693: Use strongly-typed setters for cache parameters.
  LUCENE-8933: Use 'expectThrows' instead of 'expected'. (apache#830)
  LUCENE-8933: Validate JapaneseTokenizer user dictionary entry (apache#809)
  SOLR-13240: make operation-not-null checks consistent in TestPolicy.testNodeLostMultipleReplica (Richard Goodman via Christine Poerschke)
  SOLR-13688: Run the bin/solr export command multithreaded
  SOLR-13464: fix javadoc typo that precommit somehow missed?
  SOLR-13464: Test work arounds
  SOLR-13399: Adding splitByPrefix param to IndexSizeTrigger; some splitByPrefix test and code cleanup
  SOLR-13647: Default solr.in.sh contains incorrect default value
  SOLR-13568: Precommit fail Java var until 9x. Fail var...
  SOLR-13573: Add SolrRangeQuery getters for bounds
  SOLR-13593: Allow to look up analyzer components by their SPI names in field type configuration.
  LUCENE-8948: Change 'name' argument in ICU factories to 'form'.
  SOLR-13680: use try-with-resource to close closeable resources
  SOLR-13682: command line option to export documents to a file
  SOLR-13682: precommit errors
  SOLR-13682: command line option to export documents to a file
  ...
MarcusSorealheis committed Aug 20, 2019
2 parents 0e6c5c5 + a4f44b1 commit 545dc36
Showing 209 changed files with 7,567 additions and 5,144 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -28,4 +28,5 @@ pom.xml
.pydevproject
__pycache__
/dev-tools/scripts/scripts.iml
.DS_Store

14 changes: 9 additions & 5 deletions dev-tools/scripts/addVersion.py
@@ -25,7 +25,7 @@
from configparser import ConfigParser, ExtendedInterpolation
from textwrap import dedent

def update_changes(filename, new_version, init_changes = '(No Changes)\n\n'):
def update_changes(filename, new_version, init_changes, headers):
print(' adding new section to %s...' % filename, end='', flush=True)
matcher = re.compile(r'\d+\.\d+\.\d+\s+===')
def edit(buffer, match, line):
@@ -35,6 +35,8 @@ def edit(buffer, match, line):
if match is not None:
buffer.append(line.replace(match.group(0), new_version.dot))
buffer.append(init_changes)
for header in headers:
buffer.append('%s\n---------------------\n(No changes)\n\n' % header)
buffer.append(line)
return match is not None

@@ -206,19 +208,21 @@ def get_solr_init_changes():
Apache ZooKeeper %(/org.apache.zookeeper/zookeeper)s
Jetty %(org.eclipse.jetty.version)s
(No Changes)\n\n
''' % parse_properties_file('lucene/ivy-versions.properties'))

def main():
if not os.path.exists('lucene/version.properties'):
sys.exit("Tool must be run from the root of a source checkout.")
current_version = Version.parse(find_current_version())
newconf = read_config(current_version)
is_bugfix = newconf.version.is_bugfix_release()

print('\nAdding new version %s' % newconf.version)
update_changes('lucene/CHANGES.txt', newconf.version)
update_changes('solr/CHANGES.txt', newconf.version, get_solr_init_changes())
# See LUCENE-8883 for some thoughts on which categories to use
update_changes('lucene/CHANGES.txt', newconf.version, '\n',
['Bug Fixes'] if is_bugfix else ['API Changes', 'New Features', 'Improvements', 'Optimizations', 'Bug Fixes', 'Other'])
update_changes('solr/CHANGES.txt', newconf.version, get_solr_init_changes(),
['Bug Fixes'] if is_bugfix else ['Upgrade Notes', 'New Features', 'Improvements', 'Optimizations', 'Bug Fixes', 'Other Changes'])

latest_or_backcompat = newconf.is_latest_version or current_version.is_back_compat_with(newconf.version)
if latest_or_backcompat:
17 changes: 17 additions & 0 deletions lucene/CHANGES.txt
@@ -30,6 +30,11 @@ API Changes
* LUCENE-8909: The deprecated IndexWriter#getFieldNames() method has been removed.
(Adrien Grand, Munendra S N)

* LUCENE-8948: Change "name" argument in ICU factories to "form". Here, "form" is
named after "Unicode Normalization Form". (Tomoko Uchida)

* LUCENE-8933: Validate JapaneseTokenizer user dictionary entry. (Tomoko Uchida)

Improvements

* LUCENE-8757: When provided with an ExecutorService to run queries across
@@ -63,6 +68,13 @@ New Features

* LUCENE-8936: Add SpanishMinimalStemFilter (vinod kumar via Tomoko Uchida)

* LUCENE-8764: Add "export all terms" feature to Luke. (Leonardo Menezes via Tomoko Uchida)

* LUCENE-8747: Composite Matches from multiple subqueries now allow access to
their submatches, and a new NamedMatches API allows marking of subqueries
and a simple way to find which subqueries have matched on a given document
(Alan Woodward, Jim Ferenczi)

Improvements

* LUCENE-8874: Show SPI names instead of class names in Luke Analysis tab. (Tomoko Uchida)
@@ -81,6 +93,8 @@ Improvements
* LUCENE-8906: Expose Lucene50PostingsFormat.IntBlockTermState as public so that other postings formats can re-use it.
(Bruno Roustant)

* SOLR-13663: Introduce <SpanPositionRange> into XML Query Parser (Alessandro Benedetti via Mikhail Khludnev)

Optimizations

* LUCENE-8922: DisjunctionMaxQuery more efficiently leverages impacts to skip
@@ -89,6 +103,9 @@ Optimizations
* LUCENE-8935: BooleanQuery with no scoring clause can now early terminate the query when
the total hits is not requested.

* LUCENE-8941: Matches on wildcard queries will defer building their full
disjunction until a MatchesIterator is pulled (Alan Woodward)

Other

* LUCENE-8778 LUCENE-8911: Define analyzer SPI names as static final fields and document the names in Javadocs.
27 changes: 27 additions & 0 deletions lucene/MIGRATE.txt
@@ -1,5 +1,32 @@
# Apache Lucene Migration Guide

## Kuromoji user dictionary now forbids illegal segmentation (LUCENE-8933) ##

The user dictionary now strictly validates that each entry's concatenated segments match its surface form. This change
avoids unexpected runtime exceptions or behaviour.
For example, the following entries are no longer allowed, and an exception is thrown when the dictionary file is loaded.

# concatenated "日本経済新聞" does not match the surface form "日経新聞"
日経新聞,日本 経済 新聞,ニホン ケイザイ シンブン,カスタム名詞

# concatenated "日経新聞" does not match the surface form "日本経済新聞"
日本経済新聞,日経 新聞,ニッケイ シンブン,カスタム名詞
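
A minimal sketch of the new fail-fast behaviour, assuming the org.apache.lucene.analysis.ja.dict.UserDictionary
API exercised in the tests further down (the class name here is illustrative):

  import java.io.IOException;
  import java.io.StringReader;

  import org.apache.lucene.analysis.ja.dict.UserDictionary;

  public class UserDictionaryValidationSketch {
    public static void main(String[] args) throws IOException {
      // Valid: "日本 経済 新聞" concatenates to the surface form "日本経済新聞".
      UserDictionary.open(new StringReader(
          "日本経済新聞,日本 経済 新聞,ニホン ケイザイ シンブン,カスタム名詞"));

      // Invalid: "日本 経済 新聞" does not concatenate to "日経新聞", so the
      // dictionary is rejected at load time instead of misbehaving later.
      try {
        UserDictionary.open(new StringReader(
            "日経新聞,日本 経済 新聞,ニホン ケイザイ シンブン,カスタム名詞"));
      } catch (RuntimeException e) {
        System.out.println(e.getMessage()); // "... does not match the surface form ..."
      }
    }
  }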

## Analysis factories now have customizable symbolic names (LUCENE-8778) ##

The SPI names for concrete subclasses of TokenizerFactory, TokenFilterFactory, and CharFilterFactory are no longer
derived from their class name. Instead, each factory must have a static "NAME" field like this:

/** o.a.l.a.standard.StandardTokenizerFactory's SPI name */
public static final String NAME = "standard";

A factory can be resolved/instantiated with its NAME by using methods such as TokenizerFactory#lookupClass(String)
or TokenizerFactory#forName(String, Map<String,String>).

If a user-defined factory class lacks a proper NAME field, an exception will be thrown
when (re)loading factories, e.g., when calling TokenizerFactory#reloadTokenizers(ClassLoader).
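
A minimal sketch of that lookup path, assuming the Lucene 8.x org.apache.lucene.analysis.util.TokenizerFactory
API (the class and variable names here are illustrative):

  import java.util.HashMap;
  import java.util.Map;

  import org.apache.lucene.analysis.util.TokenizerFactory;

  public class SpiNameLookupSketch {
    public static void main(String[] args) {
      // Resolve the factory class registered under the SPI name "standard".
      Class<? extends TokenizerFactory> clazz = TokenizerFactory.lookupClass("standard");
      System.out.println(clazz.getName());

      // Or instantiate it directly by SPI name; the map carries the factory's arguments.
      Map<String, String> factoryArgs = new HashMap<>();
      TokenizerFactory factory = TokenizerFactory.forName("standard", factoryArgs);
      System.out.println(factory.getClass().getSimpleName());
    }
  }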


## TermsEnum is now fully abstract (LUCENE-8292) ##

TermsEnum has been changed to be fully abstract, so non-abstract subclasses must implement all of its methods.
lucene/analysis/common/src/java/org/apache/lucene/analysis/util/AbstractAnalysisFactory.java
@@ -83,6 +83,7 @@ protected AbstractAnalysisFactory(Map<String,String> args) {
}
}
args.remove(CLASS_NAME); // consume the class arg
args.remove(SPI_NAME); // consume the spi arg
}

public final Map<String,String> getOriginalArgs() {
@@ -316,6 +317,8 @@ protected final List<String> splitAt(char separator, String list) {
}

private static final String CLASS_NAME = "class";

private static final String SPI_NAME = "name";

/**
* @return the string used to specify the concrete class name in a serialized representation: the class arg.
lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUNormalizer2CharFilterFactory.java
@@ -31,7 +31,7 @@
* <p>
* Supports the following attributes:
* <ul>
* <li>name: A <a href="http://unicode.org/reports/tr15/">Unicode Normalization Form</a>,
* <li>form: A <a href="http://unicode.org/reports/tr15/">Unicode Normalization Form</a>,
* one of 'nfc','nfkc', 'nfkc_cf'. Default is nfkc_cf.
* <li>mode: Either 'compose' or 'decompose'. Default is compose. Use "decompose" with nfc
* or nfkc, to get nfd or nfkd, respectively.
@@ -55,10 +55,10 @@ public class ICUNormalizer2CharFilterFactory extends CharFilterFactory {
/** Creates a new ICUNormalizer2CharFilterFactory */
public ICUNormalizer2CharFilterFactory(Map<String,String> args) {
super(args);
String name = get(args, "name", "nfkc_cf");
String form = get(args, "form", "nfkc_cf");
String mode = get(args, "mode", Arrays.asList("compose", "decompose"), "compose");
Normalizer2 normalizer = Normalizer2.getInstance
(null, name, "compose".equals(mode) ? Normalizer2.Mode.COMPOSE : Normalizer2.Mode.DECOMPOSE);
(null, form, "compose".equals(mode) ? Normalizer2.Mode.COMPOSE : Normalizer2.Mode.DECOMPOSE);

String filter = get(args, "filter");
if (filter != null) {
lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUNormalizer2FilterFactory.java
@@ -31,7 +31,7 @@
* <p>
* Supports the following attributes:
* <ul>
* <li>name: A <a href="http://unicode.org/reports/tr15/">Unicode Normalization Form</a>,
* <li>form: A <a href="http://unicode.org/reports/tr15/">Unicode Normalization Form</a>,
* one of 'nfc','nfkc', 'nfkc_cf'. Default is nfkc_cf.
* <li>mode: Either 'compose' or 'decompose'. Default is compose. Use "decompose" with nfc
* or nfkc, to get nfd or nfkd, respectively.
@@ -54,10 +54,10 @@ public class ICUNormalizer2FilterFactory extends TokenFilterFactory {
/** Creates a new ICUNormalizer2FilterFactory */
public ICUNormalizer2FilterFactory(Map<String,String> args) {
super(args);
String name = get(args, "name", "nfkc_cf");
String form = get(args, "form", "nfkc_cf");
String mode = get(args, "mode", Arrays.asList("compose", "decompose"), "compose");
Normalizer2 normalizer = Normalizer2.getInstance
(null, name, "compose".equals(mode) ? Normalizer2.Mode.COMPOSE : Normalizer2.Mode.DECOMPOSE);
(null, form, "compose".equals(mode) ? Normalizer2.Mode.COMPOSE : Normalizer2.Mode.DECOMPOSE);

String filter = get(args, "filter");
if (filter != null) {
lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2CharFilterFactory.java
@@ -20,6 +20,7 @@
import java.io.Reader;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
@@ -35,7 +36,18 @@ public void testDefaults() throws Exception {
TokenStream stream = whitespaceMockTokenizer(reader);
assertTokenStreamContents(stream, new String[] { "this", "is", "a", "test" });
}


/** Test nfkc form */
public void testFormArgument() throws Exception {
Reader reader = new StringReader("This is a Test");
Map<String, String> args = new HashMap<>();
args.put("form", "nfkc");
ICUNormalizer2CharFilterFactory factory = new ICUNormalizer2CharFilterFactory(args);
reader = factory.create(reader);
TokenStream stream = whitespaceMockTokenizer(reader);
assertTokenStreamContents(stream, new String[] { "This", "is", "a", "Test" });
}

/** Test that bogus arguments result in exception */
public void testBogusArguments() throws Exception {
IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2FilterFactory.java
@@ -20,6 +20,7 @@
import java.io.Reader;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
@@ -35,6 +36,17 @@ public void testDefaults() throws Exception {
stream = factory.create(stream);
assertTokenStreamContents(stream, new String[] { "this", "is", "a", "test" });
}

/** Test nfkc form */
public void testFormArgument() throws Exception {
Reader reader = new StringReader("This is a Test");
Map<String, String> args = new HashMap<>();
args.put("form", "nfkc");
ICUNormalizer2FilterFactory factory = new ICUNormalizer2FilterFactory(args);
TokenStream stream = whitespaceMockTokenizer(reader);
stream = factory.create(stream);
assertTokenStreamContents(stream, new String[] { "This", "is", "a", "Test" });
}

/** Test that bogus arguments result in exception */
public void testBogusArguments() throws Exception {
lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java
@@ -104,6 +104,8 @@ public int compare(String[] left, String[] right) {
long ord = 0;

for (String[] values : featureEntries) {
String surface = values[0].replaceAll("\\s", "");
String concatenatedSegment = values[1].replaceAll("\\s", "");
  String[] segmentation = values[1].replaceAll("  *", " ").split(" ");
  String[] readings = values[2].replaceAll("  *", " ").split(" ");
String pos = values[3];
@@ -113,6 +115,12 @@
" - the number of segmentations (" + segmentation.length + ")" +
  " does not match the number of readings (" + readings.length + ")");
}

if (!surface.equals(concatenatedSegment)) {
throw new RuntimeException("Illegal user dictionary entry " + values[0] +
" - the concatenated segmentation (" + concatenatedSegment + ")" +
" does not match the surface form (" + surface + ")");
}

int[] wordIdAndLength = new int[segmentation.length + 1]; // wordId offset, length, length....
wordIdAndLength[0] = wordId;
lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/dict/TestUserDictionary.java
@@ -18,6 +18,7 @@


import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.ja.TestJapaneseTokenizer;
import org.apache.lucene.util.LuceneTestCase;
@@ -77,4 +78,25 @@ public void testRead() throws IOException {
UserDictionary dictionary = TestJapaneseTokenizer.readDict();
assertNotNull(dictionary);
}

@Test
public void testReadInvalid1() throws IOException {
// the concatenated segment must be the same as the surface form
String invalidEntry = "日経新聞,日本 経済 新聞,ニホン ケイザイ シンブン,カスタム名詞";
RuntimeException e = expectThrows(RuntimeException.class,
"RuntimeException should be thrown when passed an invalid dictionary entry.",
() -> UserDictionary.open(new StringReader(invalidEntry)));
assertTrue(e.getMessage().contains("does not match the surface form"));
}

@Test
public void testReadInvalid2() throws IOException {
// the concatenated segment must be the same as the surface form
String invalidEntry = "日本経済新聞,日経 新聞,ニッケイ シンブン,カスタム名詞";
RuntimeException e = expectThrows(RuntimeException.class,
"RuntimeException should be thrown when passed an invalid dictionary entry.",
() -> UserDictionary.open(new StringReader(invalidEntry)));
assertTrue(e.getMessage().contains("does not match the surface form"));
}

}