diff --git a/.gitignore b/.gitignore
index 264d7caf87d2..d816f14d803d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,6 +15,7 @@ lucene/**/*.iml
parent.iml
*.ipr
*.iws
+/*.iml
/.project
/.classpath
/.settings
diff --git a/build.gradle b/build.gradle
index 58afa841dbc9..5fc36093668a 100644
--- a/build.gradle
+++ b/build.gradle
@@ -21,7 +21,7 @@ import java.time.format.DateTimeFormatter
plugins {
id "base"
id "com.palantir.consistent-versions" version "1.14.0"
- id 'de.thetaphi.forbiddenapis' version '3.0' apply false
+ id 'de.thetaphi.forbiddenapis' version '3.0.1' apply false
id "org.owasp.dependencycheck" version "5.3.0"
id "de.undercouch.download" version "4.0.2" apply false
}
@@ -29,6 +29,9 @@ plugins {
// Project version.
version = "9.0.0-SNAPSHOT"
+// General metadata.
+description = 'Grandparent project for Apache Lucene Core and Apache Solr'
+
// Propagate version and derived properties across projects.
allprojects {
version = rootProject.version
@@ -43,6 +46,16 @@ ext {
}
return m[0][1]
}()
+ // "majorVersion" is an integer with just the major version. Compute it.
+ majorVersion = {
+ def m = (version =~ /^(\d+)\.\d+\.\d+(-(.+))?/)
+ if (!m) {
+ throw new GradleException("Can't strip version to just major version: " + rootProject.version)
+ }
+ return m[0][1] as int
+ }()
+ // snapshot build marker used in scripts.
+ snapshotBuild = version.contains("SNAPSHOT")
// Build timestamp.
def tstamp = ZonedDateTime.now()
@@ -58,6 +71,7 @@ ext {
"javacc": "5.0",
"jflex": "1.7.0",
"jgit": "5.3.0.201903130848-r",
+ "flexmark": "0.61.24",
]
}
@@ -77,7 +91,6 @@ apply from: file('gradle/ant-compat/folder-layout.gradle')
// (java, tests)
apply from: file('gradle/defaults.gradle')
apply from: file('gradle/defaults-java.gradle')
-apply from: file('gradle/render-javadoc.gradle')
apply from: file('gradle/testing/defaults-tests.gradle')
apply from: file('gradle/testing/randomization.gradle')
apply from: file('gradle/testing/fail-on-no-tests.gradle')
@@ -104,6 +117,7 @@ apply from: file('gradle/validation/ecj-lint.gradle')
apply from: file('gradle/validation/gradlew-scripts-tweaked.gradle')
apply from: file('gradle/validation/missing-docs-check.gradle')
apply from: file('gradle/validation/validate-log-calls.gradle')
+apply from: file('gradle/validation/check-broken-links.gradle')
// Source or data regeneration tasks
apply from: file('gradle/generation/jflex.gradle')
@@ -134,3 +148,5 @@ apply from: file('gradle/ant-compat/forbidden-api-rules-in-sync.gradle')
apply from: file('gradle/documentation/documentation.gradle')
apply from: file('gradle/documentation/changes-to-html.gradle')
+apply from: file('gradle/documentation/markdown.gradle')
+apply from: file('gradle/render-javadoc.gradle')
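
For reference, the `majorVersion` closure added above just captures the first dotted group of the version string and coerces it to an int. A minimal Java transliteration of the same regex (class name hypothetical, using `find()` to mirror Groovy's `=~` semantics):

```java
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class MajorVersionSketch {
  public static void main(String[] args) {
    // Same pattern as the build script: capture the leading major digit(s).
    Pattern p = Pattern.compile("^(\\d+)\\.\\d+\\.\\d+(-(.+))?");
    Matcher m = p.matcher("9.0.0-SNAPSHOT");
    if (!m.find()) {
      throw new IllegalStateException("Can't strip version to just major version");
    }
    int majorVersion = Integer.parseInt(m.group(1));                // 9
    boolean snapshotBuild = "9.0.0-SNAPSHOT".contains("SNAPSHOT");  // true
    System.out.println(majorVersion + " snapshot=" + snapshotBuild);
  }
}
```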
diff --git a/dev-tools/doap/lucene.rdf b/dev-tools/doap/lucene.rdf
index 378562c88ff0..1cdf5521900c 100644
--- a/dev-tools/doap/lucene.rdf
+++ b/dev-tools/doap/lucene.rdf
@@ -67,6 +67,13 @@
     </release>
+    <release>
+      <Version>
+        <name>lucene-8.5.2</name>
+        <created>2020-05-26</created>
+        <revision>8.5.2</revision>
+      </Version>
+    </release>
     <release>
       <Version>
         <name>lucene-8.5.1</name>
@@ -137,6 +144,13 @@
         <revision>8.0.0</revision>
       </Version>
     </release>
+    <release>
+      <Version>
+        <name>lucene-7.7.3</name>
+        <created>2020-04-28</created>
+        <revision>7.7.3</revision>
+      </Version>
+    </release>
     <release>
       <Version>
         <name>lucene-7.7.2</name>
diff --git a/dev-tools/doap/solr.rdf b/dev-tools/doap/solr.rdf
index e25a578631fb..d3e097f98549 100644
--- a/dev-tools/doap/solr.rdf
+++ b/dev-tools/doap/solr.rdf
@@ -67,6 +67,13 @@
     </release>
+    <release>
+      <Version>
+        <name>solr-8.5.2</name>
+        <created>2020-05-26</created>
+        <revision>8.5.2</revision>
+      </Version>
+    </release>
     <release>
       <Version>
         <name>solr-8.5.1</name>
@@ -137,6 +144,13 @@
         <revision>8.0.0</revision>
       </Version>
     </release>
+    <release>
+      <Version>
+        <name>solr-7.7.3</name>
+        <created>2020-04-28</created>
+        <revision>7.7.3</revision>
+      </Version>
+    </release>
     <release>
       <Version>
         <name>solr-7.7.2</name>
diff --git a/dev-tools/maven/pom.xml.template b/dev-tools/maven/pom.xml.template
index b5bd2df1144f..b5e98b190ec1 100644
--- a/dev-tools/maven/pom.xml.template
+++ b/dev-tools/maven/pom.xml.template
@@ -159,7 +159,7 @@
        <groupId>de.thetaphi</groupId>
        <artifactId>forbiddenapis</artifactId>
-        <version>2.7</version>
+        <version>3.0.1</version>
diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene60/package.html b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene60/package.html
new file mode 100644
--- /dev/null
+++ b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene60/package.html
@@ -0,0 +1,25 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<html>
+<head>
+   <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
+</head>
+<body>
+Lucene 6.0 file format.
+</body>
+</html>
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70SegmentInfoFormat.java b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene70/Lucene70SegmentInfoFormat.java
similarity index 63%
rename from lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70SegmentInfoFormat.java
rename to lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene70/Lucene70SegmentInfoFormat.java
index ed5577011564..ab54012eb20c 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70SegmentInfoFormat.java
+++ b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene70/Lucene70SegmentInfoFormat.java
@@ -24,9 +24,9 @@
import org.apache.lucene.codecs.SegmentInfoFormat;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexFileNames;
-import org.apache.lucene.index.IndexWriter; // javadocs
-import org.apache.lucene.index.SegmentInfo; // javadocs
-import org.apache.lucene.index.SegmentInfos; // javadocs
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.SegmentInfo;
+import org.apache.lucene.index.SegmentInfos;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.SortedNumericSelector;
@@ -34,10 +34,9 @@
import org.apache.lucene.search.SortedSetSelector;
import org.apache.lucene.search.SortedSetSortField;
import org.apache.lucene.store.ChecksumIndexInput;
-import org.apache.lucene.store.DataOutput; // javadocs
+import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
-import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.Version;
/**
@@ -271,164 +270,7 @@ public SegmentInfo read(Directory dir, String segment, byte[] segmentID, IOConte
@Override
public void write(Directory dir, SegmentInfo si, IOContext ioContext) throws IOException {
- final String fileName = IndexFileNames.segmentFileName(si.name, "", Lucene70SegmentInfoFormat.SI_EXTENSION);
-
- try (IndexOutput output = dir.createOutput(fileName, ioContext)) {
- // Only add the file once we've successfully created it, else IFD assert can trip:
- si.addFile(fileName);
- CodecUtil.writeIndexHeader(output,
- Lucene70SegmentInfoFormat.CODEC_NAME,
- Lucene70SegmentInfoFormat.VERSION_CURRENT,
- si.getId(),
- "");
- Version version = si.getVersion();
- if (version.major < 7) {
- throw new IllegalArgumentException("invalid major version: should be >= 7 but got: " + version.major + " segment=" + si);
- }
- // Write the Lucene version that created this segment, since 3.1
- output.writeInt(version.major);
- output.writeInt(version.minor);
- output.writeInt(version.bugfix);
-
- // Write the min Lucene version that contributed docs to the segment, since 7.0
- if (si.getMinVersion() != null) {
- output.writeByte((byte) 1);
- Version minVersion = si.getMinVersion();
- output.writeInt(minVersion.major);
- output.writeInt(minVersion.minor);
- output.writeInt(minVersion.bugfix);
- } else {
- output.writeByte((byte) 0);
- }
-
- assert version.prerelease == 0;
- output.writeInt(si.maxDoc());
-
- output.writeByte((byte) (si.getUseCompoundFile() ? SegmentInfo.YES : SegmentInfo.NO));
- output.writeMapOfStrings(si.getDiagnostics());
- Set<String> files = si.files();
- for (String file : files) {
- if (!IndexFileNames.parseSegmentName(file).equals(si.name)) {
- throw new IllegalArgumentException("invalid files: expected segment=" + si.name + ", got=" + files);
- }
- }
- output.writeSetOfStrings(files);
- output.writeMapOfStrings(si.getAttributes());
-
- Sort indexSort = si.getIndexSort();
- int numSortFields = indexSort == null ? 0 : indexSort.getSort().length;
- output.writeVInt(numSortFields);
- for (int i = 0; i < numSortFields; ++i) {
- SortField sortField = indexSort.getSort()[i];
- SortField.Type sortType = sortField.getType();
- output.writeString(sortField.getField());
- int sortTypeID;
- switch (sortField.getType()) {
- case STRING:
- sortTypeID = 0;
- break;
- case LONG:
- sortTypeID = 1;
- break;
- case INT:
- sortTypeID = 2;
- break;
- case DOUBLE:
- sortTypeID = 3;
- break;
- case FLOAT:
- sortTypeID = 4;
- break;
- case CUSTOM:
- if (sortField instanceof SortedSetSortField) {
- sortTypeID = 5;
- sortType = SortField.Type.STRING;
- } else if (sortField instanceof SortedNumericSortField) {
- sortTypeID = 6;
- sortType = ((SortedNumericSortField) sortField).getNumericType();
- } else {
- throw new IllegalStateException("Unexpected SortedNumericSortField " + sortField);
- }
- break;
- default:
- throw new IllegalStateException("Unexpected sort type: " + sortField.getType());
- }
- output.writeVInt(sortTypeID);
- if (sortTypeID == 5) {
- SortedSetSortField ssf = (SortedSetSortField) sortField;
- if (ssf.getSelector() == SortedSetSelector.Type.MIN) {
- output.writeByte((byte) 0);
- } else if (ssf.getSelector() == SortedSetSelector.Type.MAX) {
- output.writeByte((byte) 1);
- } else if (ssf.getSelector() == SortedSetSelector.Type.MIDDLE_MIN) {
- output.writeByte((byte) 2);
- } else if (ssf.getSelector() == SortedSetSelector.Type.MIDDLE_MAX) {
- output.writeByte((byte) 3);
- } else {
- throw new IllegalStateException("Unexpected SortedSetSelector type: " + ssf.getSelector());
- }
- } else if (sortTypeID == 6) {
- SortedNumericSortField snsf = (SortedNumericSortField) sortField;
- if (snsf.getNumericType() == SortField.Type.LONG) {
- output.writeByte((byte) 0);
- } else if (snsf.getNumericType() == SortField.Type.INT) {
- output.writeByte((byte) 1);
- } else if (snsf.getNumericType() == SortField.Type.DOUBLE) {
- output.writeByte((byte) 2);
- } else if (snsf.getNumericType() == SortField.Type.FLOAT) {
- output.writeByte((byte) 3);
- } else {
- throw new IllegalStateException("Unexpected SortedNumericSelector type: " + snsf.getNumericType());
- }
- if (snsf.getSelector() == SortedNumericSelector.Type.MIN) {
- output.writeByte((byte) 0);
- } else if (snsf.getSelector() == SortedNumericSelector.Type.MAX) {
- output.writeByte((byte) 1);
- } else {
- throw new IllegalStateException("Unexpected sorted numeric selector type: " + snsf.getSelector());
- }
- }
- output.writeByte((byte) (sortField.getReverse() ? 0 : 1));
-
- // write missing value
- Object missingValue = sortField.getMissingValue();
- if (missingValue == null) {
- output.writeByte((byte) 0);
- } else {
- switch(sortType) {
- case STRING:
- if (missingValue == SortField.STRING_LAST) {
- output.writeByte((byte) 1);
- } else if (missingValue == SortField.STRING_FIRST) {
- output.writeByte((byte) 2);
- } else {
- throw new AssertionError("unrecognized missing value for STRING field \"" + sortField.getField() + "\": " + missingValue);
- }
- break;
- case LONG:
- output.writeByte((byte) 1);
- output.writeLong(((Long) missingValue).longValue());
- break;
- case INT:
- output.writeByte((byte) 1);
- output.writeInt(((Integer) missingValue).intValue());
- break;
- case DOUBLE:
- output.writeByte((byte) 1);
- output.writeLong(Double.doubleToLongBits(((Double) missingValue).doubleValue()));
- break;
- case FLOAT:
- output.writeByte((byte) 1);
- output.writeInt(Float.floatToIntBits(((Float) missingValue).floatValue()));
- break;
- default:
- throw new IllegalStateException("Unexpected sort type: " + sortField.getType());
- }
- }
- }
-
- CodecUtil.writeFooter(output);
- }
+ throw new UnsupportedOperationException("Old formats can't be used for writing");
}
/** File extension used to store {@link SegmentInfo}. */
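
The replaced `write` body above is the crux of the move to `backward-codecs`: the production format becomes read-only, and writing is reinstated only by the test-only `Lucene70RWSegmentInfoFormat` added later in this diff. A minimal sketch of the resulting contract (the null arguments are never inspected because the method throws immediately):

```java
import org.apache.lucene.codecs.SegmentInfoFormat;
import org.apache.lucene.codecs.lucene70.Lucene70SegmentInfoFormat;

public class ReadOnlyFormatSketch {
  public static void main(String[] args) throws Exception {
    SegmentInfoFormat prod = new Lucene70SegmentInfoFormat();
    try {
      prod.write(null, null, null); // any write attempt now fails fast
    } catch (UnsupportedOperationException expected) {
      System.out.println(expected.getMessage()); // "Old formats can't be used for writing"
    }
    // Reading existing 7.x segment infos via prod.read(...) remains fully supported.
  }
}
```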
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene70/package-info.java b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene70/package-info.java
similarity index 96%
rename from lucene/core/src/java/org/apache/lucene/codecs/lucene70/package-info.java
rename to lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene70/package-info.java
index e1913a0c4965..6bbf70c31d8e 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene70/package-info.java
+++ b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene70/package-info.java
@@ -16,7 +16,7 @@
*/
/**
- * Components from the Lucene 7.0 index format. See {@link org.apache.lucene.codecs.lucene80}
+ * Components from the Lucene 7.0 index format. See {@link org.apache.lucene.codecs.lucene86}
* for an overview of the current index format.
*/
package org.apache.lucene.codecs.lucene70;
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene84/Lucene84Codec.java b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene84/Lucene84Codec.java
similarity index 94%
rename from lucene/core/src/java/org/apache/lucene/codecs/lucene84/Lucene84Codec.java
rename to lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene84/Lucene84Codec.java
index e3f061ad27c0..bef563301bab 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene84/Lucene84Codec.java
+++ b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene84/Lucene84Codec.java
@@ -60,31 +60,31 @@ public class Lucene84Codec extends Codec {
private final LiveDocsFormat liveDocsFormat = new Lucene50LiveDocsFormat();
private final CompoundFormat compoundFormat = new Lucene50CompoundFormat();
private final PostingsFormat defaultFormat;
-
+
private final PostingsFormat postingsFormat = new PerFieldPostingsFormat() {
@Override
public PostingsFormat getPostingsFormatForField(String field) {
return Lucene84Codec.this.getPostingsFormatForField(field);
}
};
-
+
private final DocValuesFormat docValuesFormat = new PerFieldDocValuesFormat() {
@Override
public DocValuesFormat getDocValuesFormatForField(String field) {
return Lucene84Codec.this.getDocValuesFormatForField(field);
}
};
-
+
private final StoredFieldsFormat storedFieldsFormat;
- /**
+ /**
* Instantiates a new codec.
*/
public Lucene84Codec() {
this(Mode.BEST_SPEED);
}
-
- /**
+
+ /**
* Instantiates a new codec, specifying the stored fields compression
* mode to use.
* @param mode stored fields compression mode to use for newly
@@ -95,12 +95,12 @@ public Lucene84Codec(Mode mode) {
this.storedFieldsFormat = new Lucene50StoredFieldsFormat(Objects.requireNonNull(mode));
this.defaultFormat = new Lucene84PostingsFormat();
}
-
+
@Override
public final StoredFieldsFormat storedFieldsFormat() {
return storedFieldsFormat;
}
-
+
@Override
public final TermVectorsFormat termVectorsFormat() {
return vectorsFormat;
@@ -110,17 +110,17 @@ public final TermVectorsFormat termVectorsFormat() {
public final PostingsFormat postingsFormat() {
return postingsFormat;
}
-
+
@Override
public final FieldInfosFormat fieldInfosFormat() {
return fieldInfosFormat;
}
-
+
@Override
- public final SegmentInfoFormat segmentInfoFormat() {
+ public SegmentInfoFormat segmentInfoFormat() {
return segmentInfosFormat;
}
-
+
@Override
public final LiveDocsFormat liveDocsFormat() {
return liveDocsFormat;
@@ -132,36 +132,36 @@ public final CompoundFormat compoundFormat() {
}
@Override
- public final PointsFormat pointsFormat() {
+ public PointsFormat pointsFormat() {
return new Lucene60PointsFormat();
}
- /** Returns the postings format that should be used for writing
+ /** Returns the postings format that should be used for writing
* new segments of <code>field</code>.
- *
+ *
* The default implementation always returns "Lucene84".
*
- * <b>WARNING:</b> if you subclass, you are responsible for index
- * backwards compatibility: future versions of Lucene are only
- * guaranteed to be able to read the default implementation.
+ * <b>WARNING:</b> if you subclass, you are responsible for index
+ * backwards compatibility: future versions of Lucene are only
+ * guaranteed to be able to read the default implementation.
*/
public PostingsFormat getPostingsFormatForField(String field) {
return defaultFormat;
}
-
- /** Returns the docvalues format that should be used for writing
+
+ /** Returns the docvalues format that should be used for writing
* new segments of <code>field</code>.
- *
+ *
* The default implementation always returns "Lucene80".
*
- * <b>WARNING:</b> if you subclass, you are responsible for index
- * backwards compatibility: future versions of Lucene are only
- * guaranteed to be able to read the default implementation.
+ * <b>WARNING:</b> if you subclass, you are responsible for index
+ * backwards compatibility: future versions of Lucene are only
+ * guaranteed to be able to read the default implementation.
*/
public DocValuesFormat getDocValuesFormatForField(String field) {
return defaultDVFormat;
}
-
+
@Override
public final DocValuesFormat docValuesFormat() {
return docValuesFormat;
diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene84/package.html b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene84/package.html
new file mode 100644
index 000000000000..d0ba893dfad3
--- /dev/null
+++ b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene84/package.html
@@ -0,0 +1,25 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<html>
+<head>
+   <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
+</head>
+<body>
+Lucene 8.4 file format.
+</body>
+</html>
diff --git a/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.Codec b/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.Codec
index a818e355d1c4..cf7a945e1338 100644
--- a/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.Codec
+++ b/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.Codec
@@ -14,3 +14,4 @@
# limitations under the License.
org.apache.lucene.codecs.lucene80.Lucene80Codec
+org.apache.lucene.codecs.lucene84.Lucene84Codec
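
The service-file line added above is what keeps `Lucene84Codec` discoverable after the move: codec lookup goes through Java's SPI machinery, so the name resolves as long as the `backward-codecs` jar carrying this META-INF entry is on the classpath. A small sketch:

```java
import org.apache.lucene.codecs.Codec;

public class CodecLookupSketch {
  public static void main(String[] args) {
    // Resolved via the META-INF/services registration added above; without
    // lucene-backward-codecs on the classpath this throws IllegalArgumentException.
    Codec codec = Codec.forName("Lucene84");
    System.out.println(codec.getName());
    System.out.println(Codec.availableCodecs()); // includes "Lucene80", "Lucene84", ...
  }
}
```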
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene60/Lucene60PointsWriter.java b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene60/Lucene60PointsWriter.java
similarity index 94%
rename from lucene/core/src/java/org/apache/lucene/codecs/lucene60/Lucene60PointsWriter.java
rename to lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene60/Lucene60PointsWriter.java
index c73a9b18f59a..06e965368d89 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene60/Lucene60PointsWriter.java
+++ b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene60/Lucene60PointsWriter.java
@@ -101,9 +101,10 @@ public void writeField(FieldInfo fieldInfo, PointsReader reader) throws IOExcept
values.size())) {
if (values instanceof MutablePointValues) {
- final long fp = writer.writeField(dataOut, fieldInfo.name, (MutablePointValues) values);
- if (fp != -1) {
- indexFPs.put(fieldInfo.name, fp);
+ Runnable finalizer = writer.writeField(dataOut, dataOut, dataOut, fieldInfo.name, (MutablePointValues) values);
+ if (finalizer != null) {
+ indexFPs.put(fieldInfo.name, dataOut.getFilePointer());
+ finalizer.run();
}
return;
}
@@ -125,8 +126,10 @@ public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
});
// We could have 0 points on merge since all docs with dimensional fields may be deleted:
- if (writer.getPointCount() > 0) {
- indexFPs.put(fieldInfo.name, writer.finish(dataOut));
+ Runnable finalizer = writer.finish(dataOut, dataOut, dataOut);
+ if (finalizer != null) {
+ indexFPs.put(fieldInfo.name, dataOut.getFilePointer());
+ finalizer.run();
}
}
}
@@ -210,9 +213,10 @@ public void merge(MergeState mergeState) throws IOException {
}
}
- long fp = writer.merge(dataOut, docMaps, bkdReaders);
- if (fp != -1) {
- indexFPs.put(fieldInfo.name, fp);
+ Runnable finalizer = writer.merge(dataOut, dataOut, dataOut, docMaps, bkdReaders);
+ if (finalizer != null) {
+ indexFPs.put(fieldInfo.name, dataOut.getFilePointer());
+ finalizer.run();
}
}
} else {
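
The pattern repeated across all three hunks above reflects the BKD writer API this file now compiles against: instead of returning an index file pointer, `writeField`/`finish`/`merge` return a `Runnable` that emits the index structures, and the caller records the current file pointer before running it. A self-contained sketch of the idiom, with a hypothetical `writeBody` standing in for the real writer:

```java
public class FinalizerIdiomSketch {
  // Hypothetical stand-in for the writer's finish(): writes the data section
  // now and hands back a Runnable that appends the index section later.
  static Runnable writeBody(StringBuilder out) {
    out.append("<data>");
    return () -> out.append("<index>");
  }

  public static void main(String[] args) {
    StringBuilder out = new StringBuilder();
    Runnable finalizer = writeBody(out);
    if (finalizer != null) {
      int indexFP = out.length(); // like dataOut.getFilePointer() above
      finalizer.run();            // index section starts at indexFP
      System.out.println("index starts at " + indexFP + ": " + out);
    }
  }
}
```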
diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene60/Lucene60RWPointsFormat.java b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene60/Lucene60RWPointsFormat.java
new file mode 100644
index 000000000000..6f5127f070b6
--- /dev/null
+++ b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene60/Lucene60RWPointsFormat.java
@@ -0,0 +1,35 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.codecs.lucene60;
+
+import java.io.IOException;
+
+import org.apache.lucene.codecs.PointsWriter;
+import org.apache.lucene.index.SegmentWriteState;
+
+/** RW variant of Lucene60PointsFormat */
+public class Lucene60RWPointsFormat extends Lucene60PointsFormat {
+
+ /** Sole constructor. */
+ public Lucene60RWPointsFormat() {}
+
+ @Override
+ public PointsWriter fieldsWriter(SegmentWriteState state) throws IOException {
+ return new Lucene60PointsWriter(state);
+ }
+
+}
diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene60/TestLucene60PointsFormat.java b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene60/TestLucene60PointsFormat.java
similarity index 85%
rename from lucene/core/src/test/org/apache/lucene/codecs/lucene60/TestLucene60PointsFormat.java
rename to lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene60/TestLucene60PointsFormat.java
index 4487ed012b6c..f6130bddff6a 100644
--- a/lucene/core/src/test/org/apache/lucene/codecs/lucene60/TestLucene60PointsFormat.java
+++ b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene60/TestLucene60PointsFormat.java
@@ -21,10 +21,7 @@
import java.util.Arrays;
import org.apache.lucene.codecs.Codec;
-import org.apache.lucene.codecs.FilterCodec;
-import org.apache.lucene.codecs.PointsFormat;
-import org.apache.lucene.codecs.PointsReader;
-import org.apache.lucene.codecs.PointsWriter;
+import org.apache.lucene.codecs.lucene84.Lucene84RWCodec;
import org.apache.lucene.document.BinaryPoint;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.BasePointsFormatTestCase;
@@ -35,8 +32,6 @@
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.MockRandomMergePolicy;
import org.apache.lucene.index.PointValues;
-import org.apache.lucene.index.SegmentReadState;
-import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.PointValues.IntersectVisitor;
import org.apache.lucene.index.PointValues.Relation;
import org.apache.lucene.store.Directory;
@@ -51,38 +46,8 @@ public class TestLucene60PointsFormat extends BasePointsFormatTestCase {
private final int maxPointsInLeafNode;
public TestLucene60PointsFormat() {
- // standard issue
- Codec defaultCodec = TestUtil.getDefaultCodec();
- if (random().nextBoolean()) {
- // randomize parameters
- maxPointsInLeafNode = TestUtil.nextInt(random(), 50, 500);
- double maxMBSortInHeap = 3.0 + (3*random().nextDouble());
- if (VERBOSE) {
- System.out.println("TEST: using Lucene60PointsFormat with maxPointsInLeafNode=" + maxPointsInLeafNode + " and maxMBSortInHeap=" + maxMBSortInHeap);
- }
-
- // sneaky impersonation!
- codec = new FilterCodec(defaultCodec.getName(), defaultCodec) {
- @Override
- public PointsFormat pointsFormat() {
- return new PointsFormat() {
- @Override
- public PointsWriter fieldsWriter(SegmentWriteState writeState) throws IOException {
- return new Lucene60PointsWriter(writeState, maxPointsInLeafNode, maxMBSortInHeap);
- }
-
- @Override
- public PointsReader fieldsReader(SegmentReadState readState) throws IOException {
- return new Lucene60PointsReader(readState);
- }
- };
- }
- };
- } else {
- // standard issue
- codec = defaultCodec;
- maxPointsInLeafNode = BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE;
- }
+ codec = new Lucene84RWCodec();
+ maxPointsInLeafNode = BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE;
}
@Override
@@ -90,12 +55,6 @@ protected Codec getCodec() {
return codec;
}
- @Override
- public void testMergeStability() throws Exception {
- assumeFalse("TODO: mess with the parameters and test gets angry!", codec instanceof FilterCodec);
- super.testMergeStability();
- }
-
public void testEstimatePointCount() throws IOException {
Directory dir = newDirectory();
IndexWriterConfig iwc = newIndexWriterConfig();
@@ -239,12 +198,6 @@ public void testEstimatePointCount2Dims() throws IOException {
final LeafReader lr = getOnlyLeafReader(r);
PointValues points = lr.getPointValues("f");
- // With >1 dims, the tree is balanced
- long actualMaxPointsInLeafNode = points.size();
- while (actualMaxPointsInLeafNode > maxPointsInLeafNode) {
- actualMaxPointsInLeafNode = (actualMaxPointsInLeafNode + 1) / 2;
- }
-
IntersectVisitor allPointsVisitor = new IntersectVisitor() {
@Override
public void visit(int docID, byte[] packedValue) throws IOException {}
@@ -259,9 +212,9 @@ public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
};
// If all points match, then the point count is numLeaves * maxPointsInLeafNode
- final int numLeaves = (int) Math.max(Long.highestOneBit( ((points.size() - 1) / actualMaxPointsInLeafNode)) << 1, 1);
+ final int numLeaves = (int) Math.ceil((double) points.size() / maxPointsInLeafNode);
- assertEquals(numLeaves * actualMaxPointsInLeafNode, points.estimatePointCount(allPointsVisitor));
+ assertEquals(numLeaves * maxPointsInLeafNode, points.estimatePointCount(allPointsVisitor));
assertEquals(numDocs, points.estimateDocCount(allPointsVisitor));
IntersectVisitor noPointsVisitor = new IntersectVisitor() {
@@ -302,7 +255,7 @@ public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
final long pointCount = points.estimatePointCount(onePointMatchVisitor);
// The number of matches needs to be multiple of count per leaf
- final long countPerLeaf = (actualMaxPointsInLeafNode + 1) / 2;
+ final long countPerLeaf = (maxPointsInLeafNode + 1) / 2;
assertTrue(""+pointCount, pointCount % countPerLeaf == 0);
// in extreme cases, a point can be shared by 4 leaves
assertTrue(""+pointCount, pointCount / countPerLeaf <= 4 && pointCount / countPerLeaf >= 1);
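
The simplified assertions above lean on the fact that the rewritten test no longer randomizes `maxPointsInLeafNode`, so the expected leaf count reduces to a ceiling division. A tiny worked example (values illustrative, not taken from the test):

```java
public class LeafCountSketch {
  public static void main(String[] args) {
    long pointCount = 100_000;      // points.size() in the test
    int maxPointsInLeafNode = 512;  // illustrative leaf capacity
    int numLeaves = (int) Math.ceil((double) pointCount / maxPointsInLeafNode);
    System.out.println(numLeaves);                              // 196
    System.out.println((long) numLeaves * maxPointsInLeafNode); // 100352: the all-match estimate
  }
}
```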
diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene70/Lucene70RWSegmentInfoFormat.java b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene70/Lucene70RWSegmentInfoFormat.java
new file mode 100644
index 000000000000..75f31c294029
--- /dev/null
+++ b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene70/Lucene70RWSegmentInfoFormat.java
@@ -0,0 +1,204 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.codecs.lucene70;
+
+import java.io.IOException;
+import java.util.Set;
+
+import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.SegmentInfo;
+import org.apache.lucene.search.Sort;
+import org.apache.lucene.search.SortField;
+import org.apache.lucene.search.SortedNumericSelector;
+import org.apache.lucene.search.SortedNumericSortField;
+import org.apache.lucene.search.SortedSetSelector;
+import org.apache.lucene.search.SortedSetSortField;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IOContext;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.util.Version;
+
+/**
+ * Writable version of Lucene70SegmentInfoFormat for testing
+ */
+public class Lucene70RWSegmentInfoFormat extends Lucene70SegmentInfoFormat {
+
+ @Override
+ public void write(Directory dir, SegmentInfo si, IOContext ioContext) throws IOException {
+ final String fileName = IndexFileNames.segmentFileName(si.name, "", Lucene70SegmentInfoFormat.SI_EXTENSION);
+
+ try (IndexOutput output = dir.createOutput(fileName, ioContext)) {
+ // Only add the file once we've successfully created it, else IFD assert can trip:
+ si.addFile(fileName);
+ CodecUtil.writeIndexHeader(output,
+ Lucene70SegmentInfoFormat.CODEC_NAME,
+ Lucene70SegmentInfoFormat.VERSION_CURRENT,
+ si.getId(),
+ "");
+ Version version = si.getVersion();
+ if (version.major < 7) {
+ throw new IllegalArgumentException("invalid major version: should be >= 7 but got: " + version.major + " segment=" + si);
+ }
+ // Write the Lucene version that created this segment, since 3.1
+ output.writeInt(version.major);
+ output.writeInt(version.minor);
+ output.writeInt(version.bugfix);
+
+ // Write the min Lucene version that contributed docs to the segment, since 7.0
+ if (si.getMinVersion() != null) {
+ output.writeByte((byte) 1);
+ Version minVersion = si.getMinVersion();
+ output.writeInt(minVersion.major);
+ output.writeInt(minVersion.minor);
+ output.writeInt(minVersion.bugfix);
+ } else {
+ output.writeByte((byte) 0);
+ }
+
+ assert version.prerelease == 0;
+ output.writeInt(si.maxDoc());
+
+ output.writeByte((byte) (si.getUseCompoundFile() ? SegmentInfo.YES : SegmentInfo.NO));
+ output.writeMapOfStrings(si.getDiagnostics());
+ Set<String> files = si.files();
+ for (String file : files) {
+ if (!IndexFileNames.parseSegmentName(file).equals(si.name)) {
+ throw new IllegalArgumentException("invalid files: expected segment=" + si.name + ", got=" + files);
+ }
+ }
+ output.writeSetOfStrings(files);
+ output.writeMapOfStrings(si.getAttributes());
+
+ Sort indexSort = si.getIndexSort();
+ int numSortFields = indexSort == null ? 0 : indexSort.getSort().length;
+ output.writeVInt(numSortFields);
+ for (int i = 0; i < numSortFields; ++i) {
+ SortField sortField = indexSort.getSort()[i];
+ SortField.Type sortType = sortField.getType();
+ output.writeString(sortField.getField());
+ int sortTypeID;
+ switch (sortField.getType()) {
+ case STRING:
+ sortTypeID = 0;
+ break;
+ case LONG:
+ sortTypeID = 1;
+ break;
+ case INT:
+ sortTypeID = 2;
+ break;
+ case DOUBLE:
+ sortTypeID = 3;
+ break;
+ case FLOAT:
+ sortTypeID = 4;
+ break;
+ case CUSTOM:
+ if (sortField instanceof SortedSetSortField) {
+ sortTypeID = 5;
+ sortType = SortField.Type.STRING;
+ } else if (sortField instanceof SortedNumericSortField) {
+ sortTypeID = 6;
+ sortType = ((SortedNumericSortField) sortField).getNumericType();
+ } else {
+ throw new IllegalStateException("Unexpected SortedNumericSortField " + sortField);
+ }
+ break;
+ default:
+ throw new IllegalStateException("Unexpected sort type: " + sortField.getType());
+ }
+ output.writeVInt(sortTypeID);
+ if (sortTypeID == 5) {
+ SortedSetSortField ssf = (SortedSetSortField) sortField;
+ if (ssf.getSelector() == SortedSetSelector.Type.MIN) {
+ output.writeByte((byte) 0);
+ } else if (ssf.getSelector() == SortedSetSelector.Type.MAX) {
+ output.writeByte((byte) 1);
+ } else if (ssf.getSelector() == SortedSetSelector.Type.MIDDLE_MIN) {
+ output.writeByte((byte) 2);
+ } else if (ssf.getSelector() == SortedSetSelector.Type.MIDDLE_MAX) {
+ output.writeByte((byte) 3);
+ } else {
+ throw new IllegalStateException("Unexpected SortedSetSelector type: " + ssf.getSelector());
+ }
+ } else if (sortTypeID == 6) {
+ SortedNumericSortField snsf = (SortedNumericSortField) sortField;
+ if (snsf.getNumericType() == SortField.Type.LONG) {
+ output.writeByte((byte) 0);
+ } else if (snsf.getNumericType() == SortField.Type.INT) {
+ output.writeByte((byte) 1);
+ } else if (snsf.getNumericType() == SortField.Type.DOUBLE) {
+ output.writeByte((byte) 2);
+ } else if (snsf.getNumericType() == SortField.Type.FLOAT) {
+ output.writeByte((byte) 3);
+ } else {
+ throw new IllegalStateException("Unexpected SortedNumericSelector type: " + snsf.getNumericType());
+ }
+ if (snsf.getSelector() == SortedNumericSelector.Type.MIN) {
+ output.writeByte((byte) 0);
+ } else if (snsf.getSelector() == SortedNumericSelector.Type.MAX) {
+ output.writeByte((byte) 1);
+ } else {
+ throw new IllegalStateException("Unexpected sorted numeric selector type: " + snsf.getSelector());
+ }
+ }
+ output.writeByte((byte) (sortField.getReverse() ? 0 : 1));
+
+ // write missing value
+ Object missingValue = sortField.getMissingValue();
+ if (missingValue == null) {
+ output.writeByte((byte) 0);
+ } else {
+ switch(sortType) {
+ case STRING:
+ if (missingValue == SortField.STRING_LAST) {
+ output.writeByte((byte) 1);
+ } else if (missingValue == SortField.STRING_FIRST) {
+ output.writeByte((byte) 2);
+ } else {
+ throw new AssertionError("unrecognized missing value for STRING field \"" + sortField.getField() + "\": " + missingValue);
+ }
+ break;
+ case LONG:
+ output.writeByte((byte) 1);
+ output.writeLong(((Long) missingValue).longValue());
+ break;
+ case INT:
+ output.writeByte((byte) 1);
+ output.writeInt(((Integer) missingValue).intValue());
+ break;
+ case DOUBLE:
+ output.writeByte((byte) 1);
+ output.writeLong(Double.doubleToLongBits(((Double) missingValue).doubleValue()));
+ break;
+ case FLOAT:
+ output.writeByte((byte) 1);
+ output.writeInt(Float.floatToIntBits(((Float) missingValue).floatValue()));
+ break;
+ default:
+ throw new IllegalStateException("Unexpected sort type: " + sortField.getType());
+ }
+ }
+ }
+
+ CodecUtil.writeFooter(output);
+ }
+ }
+
+}
diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene70/TestLucene70SegmentInfoFormat.java b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene70/TestLucene70SegmentInfoFormat.java
similarity index 77%
rename from lucene/core/src/test/org/apache/lucene/codecs/lucene70/TestLucene70SegmentInfoFormat.java
rename to lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene70/TestLucene70SegmentInfoFormat.java
index 3bf6a18c28aa..ac516a121ef0 100644
--- a/lucene/core/src/test/org/apache/lucene/codecs/lucene70/TestLucene70SegmentInfoFormat.java
+++ b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene70/TestLucene70SegmentInfoFormat.java
@@ -14,22 +14,29 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+
package org.apache.lucene.codecs.lucene70;
import org.apache.lucene.codecs.Codec;
+import org.apache.lucene.codecs.FilterCodec;
+import org.apache.lucene.codecs.SegmentInfoFormat;
import org.apache.lucene.index.BaseSegmentInfoFormatTestCase;
-import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.Version;
public class TestLucene70SegmentInfoFormat extends BaseSegmentInfoFormatTestCase {
@Override
protected Version[] getVersions() {
- return new Version[] { Version.LATEST };
+ return new Version[] { Version.LUCENE_8_4_0 };
}
@Override
protected Codec getCodec() {
- return TestUtil.getDefaultCodec();
+ return new FilterCodec("Lucene84", Codec.forName("Lucene84")) {
+ @Override
+ public SegmentInfoFormat segmentInfoFormat() {
+ return new Lucene70RWSegmentInfoFormat();
+ }
+ };
}
}
diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene84/Lucene84RWCodec.java b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene84/Lucene84RWCodec.java
new file mode 100644
index 000000000000..c1fd4677f928
--- /dev/null
+++ b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene84/Lucene84RWCodec.java
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.codecs.lucene84;
+
+import org.apache.lucene.codecs.PointsFormat;
+import org.apache.lucene.codecs.SegmentInfoFormat;
+import org.apache.lucene.codecs.lucene60.Lucene60RWPointsFormat;
+import org.apache.lucene.codecs.lucene70.Lucene70RWSegmentInfoFormat;
+
+/**
+ * RW impersonation of {@link Lucene84Codec}.
+ */
+public class Lucene84RWCodec extends Lucene84Codec {
+
+ @Override
+ public PointsFormat pointsFormat() {
+ return new Lucene60RWPointsFormat();
+ }
+
+ @Override
+ public SegmentInfoFormat segmentInfoFormat() {
+ return new Lucene70RWSegmentInfoFormat();
+ }
+
+}
diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java b/lucene/backward-codecs/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java
index 245cef1b2a09..c14919462056 100644
--- a/lucene/backward-codecs/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java
+++ b/lucene/backward-codecs/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java
@@ -305,7 +305,9 @@ public void testCreateEmptyIndex() throws Exception {
"8.5.0-cfs",
"8.5.0-nocfs",
"8.5.1-cfs",
- "8.5.1-nocfs"
+ "8.5.1-nocfs",
+ "8.5.2-cfs",
+ "8.5.2-nocfs"
};
public static String[] getOldNames() {
@@ -322,7 +324,8 @@ public static String[] getOldNames() {
"sorted.8.4.0",
"sorted.8.4.1",
"sorted.8.5.0",
- "sorted.8.5.1"
+ "sorted.8.5.1",
+ "sorted.8.5.2"
};
public static String[] getOldSortedNames() {
@@ -524,7 +527,9 @@ public static String[] getOldSortedNames() {
"7.7.1-cfs",
"7.7.1-nocfs",
"7.7.2-cfs",
- "7.7.2-nocfs"
+ "7.7.2-nocfs",
+ "7.7.3-cfs",
+ "7.7.3-nocfs"
};
// TODO: on 6.0.0 release, gen the single segment indices and add here:
diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/index/index.8.5.2-cfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/index/index.8.5.2-cfs.zip
new file mode 100644
index 000000000000..06ef027031e8
Binary files /dev/null and b/lucene/backward-codecs/src/test/org/apache/lucene/index/index.8.5.2-cfs.zip differ
diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/index/index.8.5.2-nocfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/index/index.8.5.2-nocfs.zip
new file mode 100644
index 000000000000..dabe2d4ca0b4
Binary files /dev/null and b/lucene/backward-codecs/src/test/org/apache/lucene/index/index.8.5.2-nocfs.zip differ
diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/index/sorted.8.5.2.zip b/lucene/backward-codecs/src/test/org/apache/lucene/index/sorted.8.5.2.zip
new file mode 100644
index 000000000000..738f1db9938b
Binary files /dev/null and b/lucene/backward-codecs/src/test/org/apache/lucene/index/sorted.8.5.2.zip differ
diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/index/unsupported.7.7.3-cfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/index/unsupported.7.7.3-cfs.zip
new file mode 100644
index 000000000000..03f5d64bb4af
Binary files /dev/null and b/lucene/backward-codecs/src/test/org/apache/lucene/index/unsupported.7.7.3-cfs.zip differ
diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/index/unsupported.7.7.3-nocfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/index/unsupported.7.7.3-nocfs.zip
new file mode 100644
index 000000000000..94aaa74815c1
Binary files /dev/null and b/lucene/backward-codecs/src/test/org/apache/lucene/index/unsupported.7.7.3-nocfs.zip differ
diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/index/unsupported.sorted.7.7.3.zip b/lucene/backward-codecs/src/test/org/apache/lucene/index/unsupported.sorted.7.7.3.zip
new file mode 100644
index 000000000000..3468e8b942ea
Binary files /dev/null and b/lucene/backward-codecs/src/test/org/apache/lucene/index/unsupported.sorted.7.7.3.zip differ
diff --git a/lucene/benchmark/build.gradle b/lucene/benchmark/build.gradle
index 52b54d9fe692..4b34ccefdd9c 100644
--- a/lucene/benchmark/build.gradle
+++ b/lucene/benchmark/build.gradle
@@ -15,11 +15,13 @@
* limitations under the License.
*/
-apply plugin: 'java-library'
+apply plugin: 'java'
+// NOT a 'java-library'. Maybe 'application' but seems too limiting.
+
+description = 'System for benchmarking Lucene'
dependencies {
- api project(':lucene:core')
+ implementation project(':lucene:core')
implementation project(':lucene:analysis:common')
implementation project(':lucene:facet')
@@ -35,5 +37,120 @@ dependencies {
exclude module: "xml-apis"
})
+ runtimeOnly project(':lucene:analysis:icu')
+
testImplementation project(':lucene:test-framework')
}
+
+def tempDir = file("temp")
+def workDir = file("work")
+
+task run(type: JavaExec) {
+ description "Run a perf test (optional: -PtaskAlg=conf/your-algorithm-file -PmaxHeapSize=1G)"
+ main 'org.apache.lucene.benchmark.byTask.Benchmark'
+ classpath sourceSets.main.runtimeClasspath
+ // allow these to be specified on the CLI via -PtaskAlg= for example
+ args = [propertyOrDefault('taskAlg', 'conf/micro-standard.alg')]
+
+ maxHeapSize = propertyOrDefault('maxHeapSize', '1G')
+
+ String stdOutStr = propertyOrDefault('standardOutput', null)
+ if (stdOutStr != null) {
+ standardOutput = new File(stdOutStr).newOutputStream()
+ }
+
+ debugOptions {
+ enabled = false
+ port = 5005
+ suspend = true
+ }
+}
+
+/* Old "collation" Ant target:
+gradle getTop100kWikiWordFiles run -PtaskAlg=conf/collation.alg -PstandardOutput=work/collation.benchmark.output.txt
+perl -CSD scripts/collation.bm2jira.pl work/collation.benchmark.output.txt
+ */
+
+/* Old "shingle" Ant target:
+gradle getReuters run -PtaskAlg=conf/shingle.alg -PstandardOutput=work/shingle.benchmark.output.txt
+perl -CSD scripts/shingle.bm2jira.pl work/shingle.benchmark.output.txt
+ */
+
+// The remaining tasks just get / extract / prepare data
+
+task getEnWiki(type: Download) {
+ def finalName = "enwiki-20070527-pages-articles.xml"
+ src "https://home.apache.org/~dsmiley/data/" + finalName + ".bz2"
+ dest file("$tempDir/" + finalName + ".bz2")
+ overwrite false
+ compress false
+
+ doLast {
+ ant.bunzip2(src: dest, dest: tempDir)
+ }
+ outputs.file file("$tempDir/$finalName")
+}
+
+task getGeoNames(type: Download) {
+ // note: latest data is at: https://download.geonames.org/export/dump/allCountries.zip
+ // and then randomize with: gsort -R -S 1500M file.txt > file_random.txt
+ // and then compress with: bzip2 -9 -k file_random.txt
+ def finalName = "geonames_20130921_randomOrder_allCountries.txt"
+ src "https://home.apache.org/~dsmiley/data/" + finalName + ".bz2"
+ dest file("$tempDir/" + finalName + ".bz2")
+ overwrite false
+ compress false
+
+ doLast {
+ ant.bunzip2(src: dest, dest: tempDir) // will chop off .bz2
+ }
+ outputs.file file("$tempDir/$finalName")
+}
+
+task getTop100kWikiWordFiles(type: Download) {
+ src "https://home.apache.org/~rmuir/wikipedia/top.100k.words.de.en.fr.uk.wikipedia.2009-11.tar.bz2"
+ dest file("$tempDir/${src.file.split('/').last()}")
+ overwrite false
+ compress false
+
+ def finalPath = file("$workDir/top100k-out")
+
+ doLast {
+ project.sync {
+ from tarTree(dest) // defined above. Will decompress on the fly
+ into finalPath
+ }
+ }
+ outputs.dir finalPath
+}
+
+task getReuters(type: Download) {
+ // note: there is no HTTPS url and we don't care because this is merely test/perf data
+ src "http://www.daviddlewis.com/resources/testcollections/reuters21578/reuters21578.tar.gz"
+ dest file("$tempDir/${src.file.split('/').last()}")
+ overwrite false
+ compress false
+
+ def untarPath = file("$workDir/reuters")
+ def finalPath = file("$workDir/reuters-out")
+ dependsOn sourceSets.main.runtimeClasspath
+
+ doLast {
+ project.sync {
+ from(tarTree(dest)) { // defined above. Will decompress on the fly
+ exclude '*.txt'
+ }
+ into untarPath
+ }
+ println "Extracting reuters to $finalPath"
+ finalPath.deleteDir() // necessary
+ // TODO consider porting ExtractReuters to groovy?
+ project.javaexec {
+ main = 'org.apache.lucene.benchmark.utils.ExtractReuters'
+ classpath = sourceSets.main.runtimeClasspath
+ maxHeapSize = '1G'
+ args = [untarPath, finalPath]
+ }
+ }
+ outputs.dir finalPath
+}
\ No newline at end of file
diff --git a/lucene/benchmark/scripts/collation.bm2jira.pl b/lucene/benchmark/scripts/collation.bm2jira.pl
index b423f75ee8a8..41f67491bff6 100644
--- a/lucene/benchmark/scripts/collation.bm2jira.pl
+++ b/lucene/benchmark/scripts/collation.bm2jira.pl
@@ -40,17 +40,17 @@
}
# Print out platform info
-print "JAVA:\n", `java -version 2>&1`, "\nOS:\n";
-if ($^O =~ /win/i) {
- print "$^O\n";
- eval {
- require Win32;
- print Win32::GetOSName(), "\n", Win32::GetOSVersion(), "\n";
- };
- die "Error loading Win32: $@" if ($@);
-} else {
- print `uname -a 2>&1`;
-}
+#print "JAVA:\n", `java -version 2>&1`, "\nOS:\n";
+#if ($^O =~ /win/i) {
+# print "$^O\n";
+# eval {
+# require Win32;
+# print Win32::GetOSName(), "\n", Win32::GetOSVersion(), "\n";
+# };
+# die "Error loading Win32: $@" if ($@);
+#} else {
+# print `uname -a 2>&1`;
+#}
print "\n||Language||java.text||ICU4J||KeywordAnalyzer||ICU4J Improvement||\n";
diff --git a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java
index 55103284d43b..db64781cff70 100644
--- a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java
+++ b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java
@@ -29,7 +29,7 @@
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.PostingsFormat;
-import org.apache.lucene.codecs.lucene84.Lucene84Codec;
+import org.apache.lucene.codecs.lucene86.Lucene86Codec;
import org.apache.lucene.index.ConcurrentMergeScheduler;
import org.apache.lucene.index.IndexCommit;
import org.apache.lucene.index.IndexDeletionPolicy;
@@ -138,7 +138,7 @@ public static IndexWriterConfig createWriterConfig(Config config, PerfRunData ru
if (defaultCodec == null && postingsFormat != null) {
try {
final PostingsFormat postingsFormatChosen = PostingsFormat.forName(postingsFormat);
- iwConf.setCodec(new Lucene84Codec() {
+ iwConf.setCodec(new Lucene86Codec() {
@Override
public PostingsFormat getPostingsFormatForField(String field) {
return postingsFormatChosen;
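
`CreateIndexTask` uses the anonymous-subclass hook shown above to route every field to the configured postings format. Outside the benchmark the same hook looks roughly like this (a sketch; the "Direct" format lives in the lucene-codecs module, and per the javadoc warning earlier in this diff, custom routing makes index backward compatibility your responsibility):

```java
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.lucene86.Lucene86Codec;
import org.apache.lucene.index.IndexWriterConfig;

public class PerFieldPostingsSketch {
  public static void main(String[] args) {
    IndexWriterConfig iwc = new IndexWriterConfig(new StandardAnalyzer());
    iwc.setCodec(new Lucene86Codec() {
      @Override
      public PostingsFormat getPostingsFormatForField(String field) {
        // route a primary-key-like field differently; keep the default for the rest
        return "id".equals(field) ? PostingsFormat.forName("Direct")
                                  : super.getPostingsFormatForField(field);
      }
    });
    System.out.println(iwc.getCodec().postingsFormat());
  }
}
```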
diff --git a/lucene/build.gradle b/lucene/build.gradle
index 1efd5f4d6a03..1c71edb190b7 100644
--- a/lucene/build.gradle
+++ b/lucene/build.gradle
@@ -15,6 +15,8 @@
* limitations under the License.
*/
+description = 'Parent project for Apache Lucene Core'
+
subprojects {
group "org.apache.lucene"
}
\ No newline at end of file
diff --git a/lucene/classification/build.gradle b/lucene/classification/build.gradle
index 19c9ae7cc209..736dfb305a78 100644
--- a/lucene/classification/build.gradle
+++ b/lucene/classification/build.gradle
@@ -17,6 +17,8 @@
apply plugin: 'java-library'
+description = 'Classification module for Lucene'
+
dependencies {
api project(':lucene:core')
diff --git a/lucene/classification/src/test/org/apache/lucene/classification/document/DocumentClassificationTestBase.java b/lucene/classification/src/test/org/apache/lucene/classification/document/DocumentClassificationTestBase.java
index 3848151c5f87..bdf3ed8b2493 100644
--- a/lucene/classification/src/test/org/apache/lucene/classification/document/DocumentClassificationTestBase.java
+++ b/lucene/classification/src/test/org/apache/lucene/classification/document/DocumentClassificationTestBase.java
@@ -36,7 +36,7 @@
/**
* Base class for testing {@link org.apache.lucene.classification.Classifier}s
*/
-public abstract class DocumentClassificationTestBase<T> extends ClassificationTestBase<T> {
+public abstract class DocumentClassificationTestBase<T> extends ClassificationTestBase<T>{
protected static final BytesRef VIDEOGAME_RESULT = new BytesRef("videogames");
protected static final BytesRef VIDEOGAME_ANALYZED_RESULT = new BytesRef("videogam");
diff --git a/lucene/codecs/build.gradle b/lucene/codecs/build.gradle
index e39f2724af4c..ad26aae2d941 100644
--- a/lucene/codecs/build.gradle
+++ b/lucene/codecs/build.gradle
@@ -17,6 +17,8 @@
apply plugin: 'java-library'
+description = 'Lucene codecs and postings formats'
+
dependencies {
implementation project(':lucene:core')
testImplementation project(':lucene:test-framework')
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/VariableGapTermsIndexReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/VariableGapTermsIndexReader.java
index f3d373ebc793..a67f2ddc7cec 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/VariableGapTermsIndexReader.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/VariableGapTermsIndexReader.java
@@ -148,7 +148,7 @@ private final class FieldIndexData implements Accountable {
public FieldIndexData(IndexInput in, FieldInfo fieldInfo, long indexStart) throws IOException {
IndexInput clone = in.clone();
clone.seek(indexStart);
- fst = new FST<>(clone, fstOutputs);
+ fst = new FST<>(clone, clone, fstOutputs);
clone.close();
/*
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/VariableGapTermsIndexWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/VariableGapTermsIndexWriter.java
index b8785050b90b..dd327a05f1f8 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/VariableGapTermsIndexWriter.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/VariableGapTermsIndexWriter.java
@@ -280,7 +280,7 @@ public void add(BytesRef text, TermStats stats, long termsFilePointer) throws IO
public void finish(long termsFilePointer) throws IOException {
fst = fstCompiler.compile();
if (fst != null) {
- fst.save(out);
+ fst.save(out, out);
}
}
}
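
All the `new FST<>(in, in, ...)` and `save(out, out)` call sites in this diff track an FST API change that splits metadata from arc data into separate streams; passing the same stream twice preserves the old single-blob layout. A round-trip sketch assuming that two-stream API, with `FSTCompiler` used as in 8.x:

```java
import org.apache.lucene.store.ByteBuffersDataOutput;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.Util;

public class FstTwoStreamSketch {
  public static void main(String[] args) throws Exception {
    ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
    FSTCompiler<BytesRef> compiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
    compiler.add(Util.toIntsRef(new BytesRef("cat"), new IntsRefBuilder()), new BytesRef("meow"));
    FST<BytesRef> fst = compiler.compile();

    ByteBuffersDataOutput out = new ByteBuffersDataOutput();
    fst.save(out, out); // metaOut and dataOut may differ; here they are the same stream

    DataInput in = out.toDataInput();
    FST<BytesRef> loaded = new FST<>(in, in, outputs); // metadata first, then arc data
    System.out.println(Util.get(loaded, new BytesRef("cat")).utf8ToString()); // meow
  }
}
```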
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsBlockTreeTermsWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsBlockTreeTermsWriter.java
index b04977567ad3..e3bd3cad062d 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsBlockTreeTermsWriter.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsBlockTreeTermsWriter.java
@@ -832,7 +832,7 @@ public void finish() throws IOException {
// Write FST to index
indexStartFP = indexOut.getFilePointer();
- root.index.save(indexOut);
+ root.index.save(indexOut, indexOut);
//System.out.println(" write FST " + indexStartFP + " field=" + fieldInfo.name);
// if (SAVE_DOT_FILES || DEBUG) {
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsFieldReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsFieldReader.java
index 54954e85d3d2..e9772fb6f063 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsFieldReader.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsFieldReader.java
@@ -78,7 +78,7 @@ final class OrdsFieldReader extends Terms implements Accountable {
final IndexInput clone = indexIn.clone();
//System.out.println("start=" + indexStartFP + " field=" + fieldInfo.name);
clone.seek(indexStartFP);
- index = new FST<>(clone, OrdsBlockTreeTermsWriter.FST_OUTPUTS);
+ index = new FST<>(clone, clone, OrdsBlockTreeTermsWriter.FST_OUTPUTS);
/*
if (true) {
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsReader.java
index 8c232fa8d486..4cbaffce4894 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsReader.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsReader.java
@@ -176,7 +176,7 @@ final class TermsReader extends Terms implements Accountable {
this.sumTotalTermFreq = sumTotalTermFreq;
this.sumDocFreq = sumDocFreq;
this.docCount = docCount;
- this.dict = new FST<>(in, new FSTTermOutputs(fieldInfo));
+ this.dict = new FST<>(in, in, new FSTTermOutputs(fieldInfo));
}
@Override
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsWriter.java
index 751f3097e534..c16c2349b1f5 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsWriter.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsWriter.java
@@ -209,7 +209,7 @@ public void close() throws IOException {
}
out.writeVLong(field.sumDocFreq);
out.writeVInt(field.docCount);
- field.dict.save(out);
+ field.dict.save(out, out);
}
writeTrailer(out, dirStart);
CodecUtil.writeFooter(out);
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoFormat.java
index 5f22f6252c21..2acfe01618d1 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoFormat.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoFormat.java
@@ -29,17 +29,16 @@
import org.apache.lucene.codecs.SegmentInfoFormat;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.IndexSorter;
import org.apache.lucene.index.SegmentInfo;
+import org.apache.lucene.index.SortFieldProvider;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
-import org.apache.lucene.search.SortedNumericSelector;
-import org.apache.lucene.search.SortedNumericSortField;
-import org.apache.lucene.search.SortedSetSelector;
-import org.apache.lucene.search.SortedSetSortField;
+import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ChecksumIndexInput;
+import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
-import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
@@ -68,11 +67,9 @@ public class SimpleTextSegmentInfoFormat extends SegmentInfoFormat {
final static BytesRef SI_FILE = new BytesRef(" file ");
final static BytesRef SI_ID = new BytesRef(" id ");
final static BytesRef SI_SORT = new BytesRef(" sort ");
- final static BytesRef SI_SORT_FIELD = new BytesRef(" field ");
final static BytesRef SI_SORT_TYPE = new BytesRef(" type ");
- final static BytesRef SI_SELECTOR_TYPE = new BytesRef(" selector ");
- final static BytesRef SI_SORT_REVERSE = new BytesRef(" reverse ");
- final static BytesRef SI_SORT_MISSING = new BytesRef(" missing ");
+ final static BytesRef SI_SORT_NAME = new BytesRef(" name ");
+ final static BytesRef SI_SORT_BYTES = new BytesRef(" bytes ");
public static final String SI_EXTENSION = "si";
@@ -171,133 +168,18 @@ public SegmentInfo read(Directory directory, String segmentName, byte[] segmentI
SortField[] sortField = new SortField[numSortFields];
for (int i = 0; i < numSortFields; ++i) {
SimpleTextUtil.readLine(input, scratch);
- assert StringHelper.startsWith(scratch.get(), SI_SORT_FIELD);
- final String field = readString(SI_SORT_FIELD.length, scratch);
+ assert StringHelper.startsWith(scratch.get(), SI_SORT_NAME);
+ final String provider = readString(SI_SORT_NAME.length, scratch);
SimpleTextUtil.readLine(input, scratch);
assert StringHelper.startsWith(scratch.get(), SI_SORT_TYPE);
- final String typeAsString = readString(SI_SORT_TYPE.length, scratch);
-
- final SortField.Type type;
- SortedSetSelector.Type selectorSet = null;
- SortedNumericSelector.Type selectorNumeric = null;
- switch (typeAsString) {
- case "string":
- type = SortField.Type.STRING;
- break;
- case "long":
- type = SortField.Type.LONG;
- break;
- case "int":
- type = SortField.Type.INT;
- break;
- case "double":
- type = SortField.Type.DOUBLE;
- break;
- case "float":
- type = SortField.Type.FLOAT;
- break;
- case "multi_valued_string":
- type = SortField.Type.STRING;
- selectorSet = readSetSelector(input, scratch);
- break;
- case "multi_valued_long":
- type = SortField.Type.LONG;
- selectorNumeric = readNumericSelector(input, scratch);
- break;
- case "multi_valued_int":
- type = SortField.Type.INT;
- selectorNumeric = readNumericSelector(input, scratch);
- break;
- case "multi_valued_double":
- type = SortField.Type.DOUBLE;
- selectorNumeric = readNumericSelector(input, scratch);
- break;
- case "multi_valued_float":
- type = SortField.Type.FLOAT;
- selectorNumeric = readNumericSelector(input, scratch);
- break;
- default:
- throw new CorruptIndexException("unable to parse sort type string: " + typeAsString, input);
- }
SimpleTextUtil.readLine(input, scratch);
- assert StringHelper.startsWith(scratch.get(), SI_SORT_REVERSE);
- final boolean reverse = Boolean.parseBoolean(readString(SI_SORT_REVERSE.length, scratch));
-
- SimpleTextUtil.readLine(input, scratch);
- assert StringHelper.startsWith(scratch.get(), SI_SORT_MISSING);
- final String missingLastAsString = readString(SI_SORT_MISSING.length, scratch);
- final Object missingValue;
- switch (type) {
- case STRING:
- switch (missingLastAsString) {
- case "null":
- missingValue = null;
- break;
- case "first":
- missingValue = SortField.STRING_FIRST;
- break;
- case "last":
- missingValue = SortField.STRING_LAST;
- break;
- default:
- throw new CorruptIndexException("unable to parse missing string: " + typeAsString, input);
- }
- break;
- case LONG:
- switch (missingLastAsString) {
- case "null":
- missingValue = null;
- break;
- default:
- missingValue = Long.parseLong(missingLastAsString);
- break;
- }
- break;
- case INT:
- switch (missingLastAsString) {
- case "null":
- missingValue = null;
- break;
- default:
- missingValue = Integer.parseInt(missingLastAsString);
- break;
- }
- break;
- case DOUBLE:
- switch (missingLastAsString) {
- case "null":
- missingValue = null;
- break;
- default:
- missingValue = Double.parseDouble(missingLastAsString);
- break;
- }
- break;
- case FLOAT:
- switch (missingLastAsString) {
- case "null":
- missingValue = null;
- break;
- default:
- missingValue = Float.parseFloat(missingLastAsString);
- break;
- }
- break;
- default:
- throw new AssertionError();
- }
- if (selectorSet != null) {
- sortField[i] = new SortedSetSortField(field, reverse);
- } else if (selectorNumeric != null) {
- sortField[i] = new SortedNumericSortField(field, type, reverse);
- } else {
- sortField[i] = new SortField(field, type, reverse);
- }
- if (missingValue != null) {
- sortField[i].setMissingValue(missingValue);
- }
+ assert StringHelper.startsWith(scratch.get(), SI_SORT_BYTES);
+ BytesRef serializedSort = SimpleTextUtil.fromBytesRefString(readString(SI_SORT_BYTES.length, scratch));
+ final ByteArrayDataInput bytes = new ByteArrayDataInput(serializedSort.bytes, serializedSort.offset, serializedSort.length);
+ sortField[i] = SortFieldProvider.forName(provider).readSortField(bytes);
+ assert bytes.eof();
}
Sort indexSort = sortField.length == 0 ? null : new Sort(sortField);
@@ -313,38 +195,6 @@ public SegmentInfo read(Directory directory, String segmentName, byte[] segmentI
private String readString(int offset, BytesRefBuilder scratch) {
return new String(scratch.bytes(), offset, scratch.length()-offset, StandardCharsets.UTF_8);
}
-
- private SortedSetSelector.Type readSetSelector(IndexInput input, BytesRefBuilder scratch) throws IOException {
- SimpleTextUtil.readLine(input, scratch);
- assert StringHelper.startsWith(scratch.get(), SI_SELECTOR_TYPE);
- final String selectorAsString = readString(SI_SELECTOR_TYPE.length, scratch);
- switch (selectorAsString) {
- case "min":
- return SortedSetSelector.Type.MIN;
- case "middle_min":
- return SortedSetSelector.Type.MIDDLE_MIN;
- case "middle_max":
- return SortedSetSelector.Type.MIDDLE_MAX;
- case "max":
- return SortedSetSelector.Type.MAX;
- default:
- throw new CorruptIndexException("unable to parse SortedSetSelector type: " + selectorAsString, input);
- }
- }
-
- private SortedNumericSelector.Type readNumericSelector(IndexInput input, BytesRefBuilder scratch) throws IOException {
- SimpleTextUtil.readLine(input, scratch);
- assert StringHelper.startsWith(scratch.get(), SI_SELECTOR_TYPE);
- final String selectorAsString = readString(SI_SELECTOR_TYPE.length, scratch);
- switch (selectorAsString) {
- case "min":
- return SortedNumericSelector.Type.MIN;
- case "max":
- return SortedNumericSelector.Type.MAX;
- default:
- throw new CorruptIndexException("unable to parse SortedNumericSelector type: " + selectorAsString, input);
- }
- }
@Override
public void write(Directory dir, SegmentInfo si, IOContext ioContext) throws IOException {
@@ -434,120 +284,42 @@ public void write(Directory dir, SegmentInfo si, IOContext ioContext) throws IOE
SimpleTextUtil.writeNewline(output);
for (int i = 0; i < numSortFields; ++i) {
final SortField sortField = indexSort.getSort()[i];
+ IndexSorter sorter = sortField.getIndexSorter();
+ if (sorter == null) {
+ throw new IllegalStateException("Cannot serialize sort " + sortField);
+ }
- SimpleTextUtil.write(output, SI_SORT_FIELD);
- SimpleTextUtil.write(output, sortField.getField(), scratch);
+ SimpleTextUtil.write(output, SI_SORT_NAME);
+ SimpleTextUtil.write(output, sorter.getProviderName(), scratch);
SimpleTextUtil.writeNewline(output);
SimpleTextUtil.write(output, SI_SORT_TYPE);
- final String sortTypeString;
- final SortField.Type sortType;
- final boolean multiValued;
- if (sortField instanceof SortedSetSortField) {
- sortType = SortField.Type.STRING;
- multiValued = true;
- } else if (sortField instanceof SortedNumericSortField) {
- sortType = ((SortedNumericSortField) sortField).getNumericType();
- multiValued = true;
- } else {
- sortType = sortField.getType();
- multiValued = false;
- }
- switch (sortType) {
- case STRING:
- if (multiValued) {
- sortTypeString = "multi_valued_string";
- } else {
- sortTypeString = "string";
- }
- break;
- case LONG:
- if (multiValued) {
- sortTypeString = "multi_valued_long";
- } else {
- sortTypeString = "long";
- }
- break;
- case INT:
- if (multiValued) {
- sortTypeString = "multi_valued_int";
- } else {
- sortTypeString = "int";
- }
- break;
- case DOUBLE:
- if (multiValued) {
- sortTypeString = "multi_valued_double";
- } else {
- sortTypeString = "double";
- }
- break;
- case FLOAT:
- if (multiValued) {
- sortTypeString = "multi_valued_float";
- } else {
- sortTypeString = "float";
- }
- break;
- default:
- throw new IllegalStateException("Unexpected sort type: " + sortField.getType());
- }
- SimpleTextUtil.write(output, sortTypeString, scratch);
- SimpleTextUtil.writeNewline(output);
-
- if (sortField instanceof SortedSetSortField) {
- SortedSetSelector.Type selector = ((SortedSetSortField) sortField).getSelector();
- final String selectorString;
- if (selector == SortedSetSelector.Type.MIN) {
- selectorString = "min";
- } else if (selector == SortedSetSelector.Type.MIDDLE_MIN) {
- selectorString = "middle_min";
- } else if (selector == SortedSetSelector.Type.MIDDLE_MAX) {
- selectorString = "middle_max";
- } else if (selector == SortedSetSelector.Type.MAX) {
- selectorString = "max";
- } else {
- throw new IllegalStateException("Unexpected SortedSetSelector type selector: " + selector);
- }
- SimpleTextUtil.write(output, SI_SELECTOR_TYPE);
- SimpleTextUtil.write(output, selectorString, scratch);
- SimpleTextUtil.writeNewline(output);
- } else if (sortField instanceof SortedNumericSortField) {
- SortedNumericSelector.Type selector = ((SortedNumericSortField) sortField).getSelector();
- final String selectorString;
- if (selector == SortedNumericSelector.Type.MIN) {
- selectorString = "min";
- } else if (selector == SortedNumericSelector.Type.MAX) {
- selectorString = "max";
- } else {
- throw new IllegalStateException("Unexpected SortedNumericSelector type selector: " + selector);
- }
- SimpleTextUtil.write(output, SI_SELECTOR_TYPE);
- SimpleTextUtil.write(output, selectorString, scratch);
- SimpleTextUtil.writeNewline(output);
- }
-
- SimpleTextUtil.write(output, SI_SORT_REVERSE);
- SimpleTextUtil.write(output, Boolean.toString(sortField.getReverse()), scratch);
+ SimpleTextUtil.write(output, sortField.toString(), scratch);
SimpleTextUtil.writeNewline(output);
- SimpleTextUtil.write(output, SI_SORT_MISSING);
- final Object missingValue = sortField.getMissingValue();
- final String missing;
- if (missingValue == null) {
- missing = "null";
- } else if (missingValue == SortField.STRING_FIRST) {
- missing = "first";
- } else if (missingValue == SortField.STRING_LAST) {
- missing = "last";
- } else {
- missing = missingValue.toString();
- }
- SimpleTextUtil.write(output, missing, scratch);
+ SimpleTextUtil.write(output, SI_SORT_BYTES);
+ BytesRefOutput b = new BytesRefOutput();
+ SortFieldProvider.write(sortField, b);
+ SimpleTextUtil.write(output, b.bytes.get().toString(), scratch);
SimpleTextUtil.writeNewline(output);
}
SimpleTextUtil.writeChecksum(output, scratch);
}
}
+
+ static class BytesRefOutput extends DataOutput {
+
+ final BytesRefBuilder bytes = new BytesRefBuilder();
+
+ @Override
+ public void writeByte(byte b) {
+ bytes.append(b);
+ }
+
+ @Override
+ public void writeBytes(byte[] b, int offset, int length) {
+ bytes.append(b, offset, length);
+ }
+ }
}
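
The write/read pair above delegates all index-sort serialization to SortFieldProvider: the writer records the sorter's provider name plus an opaque byte blob, and the reader looks the provider up by name to decode the blob. A minimal round-trip sketch, assuming a post-change lucene-core on the classpath (SortFieldRoundTrip is an illustrative class name, not part of the patch):

```java
import org.apache.lucene.index.SortFieldProvider;
import org.apache.lucene.search.SortField;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ByteBuffersDataOutput;

public class SortFieldRoundTrip {
  public static void main(String[] args) throws Exception {
    SortField original = new SortField("timestamp", SortField.Type.LONG, true);

    // Writer side: record the provider name, then the serialized sort field.
    String providerName = original.getIndexSorter().getProviderName();
    ByteBuffersDataOutput out = new ByteBuffersDataOutput();
    SortFieldProvider.write(original, out);
    byte[] blob = out.toArrayCopy();

    // Reader side: look the provider up by name and decode the blob.
    SortField restored = SortFieldProvider.forName(providerName)
        .readSortField(new ByteArrayDataInput(blob));
    System.out.println(original.equals(restored)); // expected: true
  }
}
```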
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/FSTDictionary.java b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/FSTDictionary.java
index 026e8724f315..191799c252fa 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/FSTDictionary.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/FSTDictionary.java
@@ -71,10 +71,10 @@ public long ramBytesUsed() {
@Override
public void write(DataOutput output, BlockEncoder blockEncoder) throws IOException {
if (blockEncoder == null) {
- fst.save(output);
+ fst.save(output, output);
} else {
ByteBuffersDataOutput bytesDataOutput = ByteBuffersDataOutput.newResettableInstance();
- fst.save(bytesDataOutput);
+ fst.save(bytesDataOutput, bytesDataOutput);
BlockEncoder.WritableBytes encodedBytes = blockEncoder.encode(bytesDataOutput.toDataInput(), bytesDataOutput.size());
output.writeVLong(encodedBytes.size());
encodedBytes.writeTo(output);
@@ -98,8 +98,8 @@ protected static FSTDictionary read(DataInput input, BlockDecoder blockDecoder,
isFSTOnHeap = true;
}
PositiveIntOutputs fstOutputs = PositiveIntOutputs.getSingleton();
- FST<Long> fst = isFSTOnHeap ? new FST<>(fstDataInput, fstOutputs)
- : new FST<>(fstDataInput, fstOutputs, new OffHeapFSTStore());
+ FST<Long> fst = isFSTOnHeap ? new FST<>(fstDataInput, fstDataInput, fstOutputs)
+ : new FST<>(fstDataInput, fstDataInput, fstOutputs, new OffHeapFSTStore());
return new FSTDictionary(fst);
}
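
FST.save now takes separate outputs for metadata and data; FSTDictionary simply passes the same stream twice, and the matching constructor accepts the metadata and data inputs separately. A small round-trip sketch, assuming the FSTCompiler API present on this branch (if an older Builder API is in use, only the construction step differs; FstSaveDemo is a hypothetical name):

```java
import org.apache.lucene.store.ByteBuffersDataInput;
import org.apache.lucene.store.ByteBuffersDataOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;

public class FstSaveDemo {
  public static void main(String[] args) throws Exception {
    PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
    FSTCompiler<Long> compiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
    IntsRefBuilder scratch = new IntsRefBuilder();
    compiler.add(Util.toIntsRef(new BytesRef("cat"), scratch), 5L); // inputs in sorted order
    compiler.add(Util.toIntsRef(new BytesRef("dog"), scratch), 7L);
    FST<Long> fst = compiler.compile();

    // Metadata and data may go to different outputs; here both share one
    // buffer, exactly like FSTDictionary.write above.
    ByteBuffersDataOutput buffer = new ByteBuffersDataOutput();
    fst.save(buffer, buffer);

    // Read back: pass the same sequential input for metadata and data.
    ByteBuffersDataInput in = buffer.toDataInput();
    FST<Long> reloaded = new FST<>(in, in, outputs);
    System.out.println(Util.get(reloaded, new BytesRef("dog"))); // expected: 7
  }
}
```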
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/UniformSplitPostingsFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/UniformSplitPostingsFormat.java
index f982ed3ad2eb..a58a1de7400a 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/UniformSplitPostingsFormat.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/UniformSplitPostingsFormat.java
@@ -47,7 +47,9 @@ public class UniformSplitPostingsFormat extends PostingsFormat {
*/
public static final String TERMS_BLOCKS_EXTENSION = "ustb";
- public static final int VERSION_CURRENT = 0;
+ public static final int VERSION_START = 0;
+ public static final int VERSION_ENCODABLE_FIELDS_METADATA = 1;
+ public static final int VERSION_CURRENT = VERSION_ENCODABLE_FIELDS_METADATA;
public static final String NAME = "UniformSplit";
@@ -74,10 +76,10 @@ public UniformSplitPostingsFormat() {
* Must be greater than or equal to 0 and strictly less than {@code targetNumBlockLines}.
* The block size will be {@code targetNumBlockLines}+-{@code deltaNumLines}.
* The block size must always be less than or equal to {@link UniformSplitTermsWriter#MAX_NUM_BLOCK_LINES}.
- * @param blockEncoder Optional block encoder, may be null if none.
- * It can be used for compression or encryption.
- * @param blockDecoder Optional block decoder, may be null if none.
- * It can be used for compression or encryption.
+ * @param blockEncoder Optional block encoder, may be null if none. If present, it is used to encode all terms
+ * blocks, as well as the FST dictionary and the fields metadata.
+ * @param blockDecoder Optional block decoder, may be null if none. If present, it is used to decode all terms
+ * blocks, as well as the FST dictionary and the fields metadata.
* @param dictionaryOnHeap Whether to force loading the terms dictionary on-heap. By default it is kept off-heap without
* impact on performance. If block encoding/decoding is used, then the dictionary is always
* loaded on-heap whatever this parameter value is.
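
For reference, a pass-through pair satisfying the BlockEncoder and BlockDecoder interfaces might look as follows; a real implementation would compress or encrypt the bytes instead of copying them. This is a sketch only, and IdentityBlockCodec is a hypothetical name, not a class in the patch:

```java
import java.io.IOException;
import org.apache.lucene.codecs.uniformsplit.BlockDecoder;
import org.apache.lucene.codecs.uniformsplit.BlockEncoder;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.BytesRef;

public class IdentityBlockCodec implements BlockEncoder, BlockDecoder {

  @Override
  public WritableBytes encode(DataInput input, long length) throws IOException {
    // Copy the block bytes unchanged; a real codec would transform them here.
    byte[] copy = new byte[Math.toIntExact(length)];
    input.readBytes(copy, 0, copy.length);
    return new WritableBytes() {
      @Override public long size() { return copy.length; }
      @Override public void writeTo(DataOutput output) throws IOException {
        output.writeBytes(copy, 0, copy.length);
      }
    };
  }

  @Override
  public BytesRef decode(DataInput input, long length) throws IOException {
    byte[] copy = new byte[Math.toIntExact(length)];
    input.readBytes(copy, 0, copy.length);
    return new BytesRef(copy);
  }
}
```

Passing one instance as both blockEncoder and blockDecoder to the constructor above would exercise the encoded code paths without altering the stored bytes.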
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/UniformSplitTermsReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/UniformSplitTermsReader.java
index 9b2552b5017f..377919dc81b7 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/UniformSplitTermsReader.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/UniformSplitTermsReader.java
@@ -34,14 +34,14 @@
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.Terms;
+import org.apache.lucene.store.ByteArrayDataInput;
+import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.RamUsageEstimator;
-import static org.apache.lucene.codecs.uniformsplit.UniformSplitPostingsFormat.NAME;
-import static org.apache.lucene.codecs.uniformsplit.UniformSplitPostingsFormat.TERMS_BLOCKS_EXTENSION;
-import static org.apache.lucene.codecs.uniformsplit.UniformSplitPostingsFormat.TERMS_DICTIONARY_EXTENSION;
-import static org.apache.lucene.codecs.uniformsplit.UniformSplitPostingsFormat.VERSION_CURRENT;
+import static org.apache.lucene.codecs.uniformsplit.UniformSplitPostingsFormat.*;
/**
* A block-based terms index and dictionary based on the Uniform Split technique.
@@ -51,12 +51,11 @@
*/
public class UniformSplitTermsReader extends FieldsProducer {
- protected static final int VERSION_START = 0;
-
private static final long BASE_RAM_USAGE = RamUsageEstimator.shallowSizeOfInstance(UniformSplitTermsReader.class)
+ RamUsageEstimator.shallowSizeOfInstance(IndexInput.class) * 2;
protected final PostingsReaderBase postingsReader;
+ protected final int version;
protected final IndexInput blockInput;
protected final IndexInput dictionaryInput;
@@ -93,7 +92,7 @@ protected UniformSplitTermsReader(PostingsReaderBase postingsReader, SegmentRead
String termsName = IndexFileNames.segmentFileName(segmentName, state.segmentSuffix, termsBlocksExtension);
blockInput = state.directory.openInput(termsName, state.context);
- int version = CodecUtil.checkIndexHeader(blockInput, codecName, versionStart,
+ version = CodecUtil.checkIndexHeader(blockInput, codecName, versionStart,
versionCurrent, state.segmentInfo.getId(), state.segmentSuffix);
String indexName = IndexFileNames.segmentFileName(segmentName, state.segmentSuffix, dictionaryExtension);
dictionaryInput = state.directory.openInput(indexName, state.context);
@@ -105,7 +104,8 @@ protected UniformSplitTermsReader(PostingsReaderBase postingsReader, SegmentRead
CodecUtil.retrieveChecksum(blockInput);
seekFieldsMetadata(blockInput);
- Collection<FieldMetadata> fieldMetadataCollection = parseFieldsMetadata(blockInput, state.fieldInfos, fieldMetadataReader, state.segmentInfo.maxDoc());
+ Collection<FieldMetadata> fieldMetadataCollection =
+ readFieldsMetadata(blockInput, blockDecoder, state.fieldInfos, fieldMetadataReader, state.segmentInfo.maxDoc());
fieldToTermsMap = new HashMap<>();
this.blockInput = blockInput;
@@ -143,16 +143,36 @@ protected IndexDictionary.BrowserSupplier createDictionaryBrowserSupplier(Segmen
/**
* @param indexInput {@link IndexInput} must be positioned to the fields metadata
* details by calling {@link #seekFieldsMetadata(IndexInput)} before this call.
+ * @param blockDecoder Optional block decoder, may be null if none.
*/
- protected static Collection<FieldMetadata> parseFieldsMetadata(IndexInput indexInput, FieldInfos fieldInfos,
- FieldMetadata.Serializer fieldMetadataReader, int maxNumDocs) throws IOException {
+ protected Collection<FieldMetadata> readFieldsMetadata(IndexInput indexInput, BlockDecoder blockDecoder, FieldInfos fieldInfos,
+ FieldMetadata.Serializer fieldMetadataReader, int maxNumDocs) throws IOException {
int numFields = indexInput.readVInt();
if (numFields < 0) {
throw new CorruptIndexException("Illegal number of fields= " + numFields, indexInput);
}
+ return (blockDecoder != null && version >= VERSION_ENCODABLE_FIELDS_METADATA) ?
+ readEncodedFieldsMetadata(numFields, indexInput, blockDecoder, fieldInfos, fieldMetadataReader, maxNumDocs)
+ : readUnencodedFieldsMetadata(numFields, indexInput, fieldInfos, fieldMetadataReader, maxNumDocs);
+ }
+
+ protected Collection<FieldMetadata> readEncodedFieldsMetadata(int numFields, DataInput metadataInput, BlockDecoder blockDecoder,
+ FieldInfos fieldInfos, FieldMetadata.Serializer fieldMetadataReader,
+ int maxNumDocs) throws IOException {
+ long encodedLength = metadataInput.readVLong();
+ if (encodedLength < 0) {
+ throw new CorruptIndexException("Illegal encoded length: " + encodedLength, metadataInput);
+ }
+ BytesRef decodedBytes = blockDecoder.decode(metadataInput, encodedLength);
+ DataInput decodedMetadataInput = new ByteArrayDataInput(decodedBytes.bytes, 0, decodedBytes.length);
+ return readUnencodedFieldsMetadata(numFields, decodedMetadataInput, fieldInfos, fieldMetadataReader, maxNumDocs);
+ }
+
+ protected Collection<FieldMetadata> readUnencodedFieldsMetadata(int numFields, DataInput metadataInput, FieldInfos fieldInfos,
+ FieldMetadata.Serializer fieldMetadataReader, int maxNumDocs) throws IOException {
Collection<FieldMetadata> fieldMetadataCollection = new ArrayList<>(numFields);
for (int i = 0; i < numFields; i++) {
- fieldMetadataCollection.add(fieldMetadataReader.read(indexInput, fieldInfos, maxNumDocs));
+ fieldMetadataCollection.add(fieldMetadataReader.read(metadataInput, fieldInfos, maxNumDocs));
}
return fieldMetadataCollection;
}
@@ -212,7 +232,7 @@ protected long getTermsRamBytesUsed() {
/**
* Positions the given {@link IndexInput} at the beginning of the fields metadata.
*/
- protected static void seekFieldsMetadata(IndexInput indexInput) throws IOException {
+ protected void seekFieldsMetadata(IndexInput indexInput) throws IOException {
indexInput.seek(indexInput.length() - CodecUtil.footerLength() - 8);
indexInput.seek(indexInput.readLong());
}
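
seekFieldsMetadata relies on the writer placing an 8-byte absolute pointer immediately before the codec footer, so the reader can jump to the fields metadata without parsing the whole file. A standalone sketch of that fixed-length trailer pattern, assuming a stock lucene-core on the classpath (file and codec names here are invented):

```java
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;

public class TrailerSeekDemo {
  public static void main(String[] args) throws Exception {
    try (Directory dir = new ByteBuffersDirectory()) {
      try (IndexOutput out = dir.createOutput("block", IOContext.DEFAULT)) {
        CodecUtil.writeHeader(out, "demo", 0);
        out.writeVInt(123);                 // pretend terms blocks
        long metaStart = out.getFilePointer();
        out.writeVInt(2);                   // pretend fields metadata
        out.writeLong(metaStart);           // fixed-length back pointer
        CodecUtil.writeFooter(out);
      }
      try (IndexInput in = dir.openInput("block", IOContext.DEFAULT)) {
        in.seek(in.length() - CodecUtil.footerLength() - 8);
        in.seek(in.readLong());             // jump to the fields metadata
        System.out.println(in.readVInt());  // prints 2
      }
    }
  }
}
```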
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/UniformSplitTermsWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/UniformSplitTermsWriter.java
index 101b6b5942f0..c4e089f56274 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/UniformSplitTermsWriter.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/UniformSplitTermsWriter.java
@@ -249,11 +249,26 @@ public void write(Fields fields, NormsProducer normsProducer) throws IOException
protected void writeFieldsMetadata(int fieldsNumber, ByteBuffersDataOutput fieldsOutput) throws IOException {
long fieldsStartPosition = blockOutput.getFilePointer();
blockOutput.writeVInt(fieldsNumber);
- fieldsOutput.copyTo(blockOutput);
+ if (blockEncoder == null) {
+ writeUnencodedFieldsMetadata(fieldsOutput);
+ } else {
+ writeEncodedFieldsMetadata(fieldsOutput);
+ }
+ // Must be a fixed length. Read by UniformSplitTermsReader when seeking fields metadata.
blockOutput.writeLong(fieldsStartPosition);
CodecUtil.writeFooter(blockOutput);
}
+ protected void writeUnencodedFieldsMetadata(ByteBuffersDataOutput fieldsOutput) throws IOException {
+ fieldsOutput.copyTo(blockOutput);
+ }
+
+ protected void writeEncodedFieldsMetadata(ByteBuffersDataOutput fieldsOutput) throws IOException {
+ BlockEncoder.WritableBytes encodedBytes = blockEncoder.encode(fieldsOutput.toDataInput(), fieldsOutput.size());
+ blockOutput.writeVLong(encodedBytes.size());
+ encodedBytes.writeTo(blockOutput);
+ }
+
/**
* @return 1 if the field was written; 0 otherwise.
*/
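
The encoded path buffers the metadata, encodes it, and writes it length-prefixed so the reader's readEncodedFieldsMetadata knows how many bytes to hand to the decoder. A sketch of that framing, reusing the hypothetical IdentityBlockCodec from earlier (EncodedMetadataDemo is likewise an invented name):

```java
import org.apache.lucene.codecs.uniformsplit.BlockEncoder;
import org.apache.lucene.store.ByteBuffersDataOutput;

public class EncodedMetadataDemo {
  public static void main(String[] args) throws Exception {
    ByteBuffersDataOutput fieldsOutput = ByteBuffersDataOutput.newResettableInstance();
    fieldsOutput.writeVInt(42); // pretend fields metadata

    BlockEncoder encoder = new IdentityBlockCodec();
    BlockEncoder.WritableBytes encoded =
        encoder.encode(fieldsOutput.toDataInput(), fieldsOutput.size());

    ByteBuffersDataOutput blockOutput = new ByteBuffersDataOutput();
    blockOutput.writeVLong(encoded.size()); // reader needs the length to decode
    encoded.writeTo(blockOutput);
    System.out.println(blockOutput.size() + " bytes written");
  }
}
```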
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STUniformSplitPostingsFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STUniformSplitPostingsFormat.java
index 57c154099045..730728ba7f45 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STUniformSplitPostingsFormat.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STUniformSplitPostingsFormat.java
@@ -54,7 +54,7 @@ public class STUniformSplitPostingsFormat extends UniformSplitPostingsFormat {
*/
public static final String TERMS_BLOCKS_EXTENSION = "stustb";
- public static final int VERSION_CURRENT = 0;
+ public static final int VERSION_CURRENT = UniformSplitPostingsFormat.VERSION_CURRENT;
public static final String NAME = "SharedTermsUniformSplit";
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STUniformSplitTermsReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STUniformSplitTermsReader.java
index cc25a30cef6b..5c2b24b5fca4 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STUniformSplitTermsReader.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/STUniformSplitTermsReader.java
@@ -30,10 +30,7 @@
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.store.IndexInput;
-import static org.apache.lucene.codecs.uniformsplit.sharedterms.STUniformSplitPostingsFormat.NAME;
-import static org.apache.lucene.codecs.uniformsplit.sharedterms.STUniformSplitPostingsFormat.TERMS_BLOCKS_EXTENSION;
-import static org.apache.lucene.codecs.uniformsplit.sharedterms.STUniformSplitPostingsFormat.TERMS_DICTIONARY_EXTENSION;
-import static org.apache.lucene.codecs.uniformsplit.sharedterms.STUniformSplitPostingsFormat.VERSION_CURRENT;
+import static org.apache.lucene.codecs.uniformsplit.sharedterms.STUniformSplitPostingsFormat.*;
/**
* A block-based terms index and dictionary based on the Uniform Split technique,
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/UnionFieldMetadataBuilder.java b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/UnionFieldMetadataBuilder.java
index 85b6a27fd3bb..4cf5c2623ae2 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/UnionFieldMetadataBuilder.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/sharedterms/UnionFieldMetadataBuilder.java
@@ -33,15 +33,9 @@ public class UnionFieldMetadataBuilder {
private BytesRef maxLastTerm;
public UnionFieldMetadataBuilder() {
- reset();
- }
-
- public UnionFieldMetadataBuilder reset() {
dictionaryStartFP = -1;
minStartBlockFP = Long.MAX_VALUE;
maxEndBlockFP = Long.MIN_VALUE;
- maxLastTerm = null;
- return this;
}
public UnionFieldMetadataBuilder addFieldMetadata(FieldMetadata fieldMetadata) {
diff --git a/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/TestUniformSplitPostingFormat.java b/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/TestUniformSplitPostingFormat.java
index db1d6c12e61a..9a68a14c21a2 100644
--- a/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/TestUniformSplitPostingFormat.java
+++ b/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/TestUniformSplitPostingFormat.java
@@ -51,17 +51,26 @@ protected Codec getCodec() {
@Before
public void initialize() {
+ initializeInner();
+ }
+
+ protected void initializeInner() {
UniformSplitRot13PostingsFormat.resetEncodingFlags();
}
@After
public void checkEncodingCalled() {
if (checkEncoding) {
- assertTrue(UniformSplitRot13PostingsFormat.blocksEncoded);
- assertTrue(UniformSplitRot13PostingsFormat.dictionaryEncoded);
- if (shouldCheckDecoderWasCalled) {
- assertTrue(UniformSplitRot13PostingsFormat.decoderCalled);
- }
+ checkEncodingCalledInner();
+ }
+ }
+
+ protected void checkEncodingCalledInner() {
+ assertTrue(UniformSplitRot13PostingsFormat.blocksEncoded);
+ assertTrue(UniformSplitRot13PostingsFormat.fieldsMetadataEncoded);
+ assertTrue(UniformSplitRot13PostingsFormat.dictionaryEncoded);
+ if (shouldCheckDecoderWasCalled) {
+ assertTrue(UniformSplitRot13PostingsFormat.decoderCalled);
}
}
diff --git a/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/sharedterms/STBlockReaderTest.java b/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/sharedterms/TestSTBlockReader.java
similarity index 98%
rename from lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/sharedterms/STBlockReaderTest.java
rename to lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/sharedterms/TestSTBlockReader.java
index 6d09fe36e16b..5707fb4f6a03 100644
--- a/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/sharedterms/STBlockReaderTest.java
+++ b/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/sharedterms/TestSTBlockReader.java
@@ -51,9 +51,9 @@
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
-public class STBlockReaderTest extends LuceneTestCase {
+public class TestSTBlockReader extends LuceneTestCase {
- private static final String MOCK_BLOCK_OUTPUT_NAME = "STBlockReaderTest.tmp";
+ private static final String MOCK_BLOCK_OUTPUT_NAME = "TestSTBlockReader.tmp";
private FieldInfos fieldInfos;
private List<MockSTBlockLine> blockLines;
diff --git a/lucene/common-build.xml b/lucene/common-build.xml
index e7fc4174de81..7bb6e55081e2 100644
--- a/lucene/common-build.xml
+++ b/lucene/common-build.xml
@@ -2342,7 +2342,7 @@ ${ant.project.name}.test.dependencies=${test.classpath.list}
-
diff --git a/lucene/core/build.gradle b/lucene/core/build.gradle
index f5609bdb7037..989c57f09357 100644
--- a/lucene/core/build.gradle
+++ b/lucene/core/build.gradle
@@ -15,9 +15,10 @@
* limitations under the License.
*/
-
apply plugin: 'java-library'
+description = 'Lucene core library'
+
dependencies {
testImplementation project(':lucene:codecs')
testImplementation project(':lucene:test-framework')
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/Codec.java b/lucene/core/src/java/org/apache/lucene/codecs/Codec.java
index 07797c6f95d0..8b5ca14ff898 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/Codec.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/Codec.java
@@ -57,7 +57,7 @@ static NamedSPILoader<Codec> getLoader() {
}
// TODO: should we use this, or maybe a system property is better?
- static Codec defaultCodec = LOADER.lookup("Lucene84");
+ static Codec defaultCodec = LOADER.lookup("Lucene86");
}
private final String name;
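
Codec resolution goes through NamedSPILoader, so the default codec is just a name lookup against the SPI registry. A quick sketch, assuming a build where the Lucene86 codec is registered:

```java
import org.apache.lucene.codecs.Codec;

public class DefaultCodecDemo {
  public static void main(String[] args) {
    // Explicit lookup by SPI name; throws IllegalArgumentException if unknown.
    Codec codec = Codec.forName("Lucene86");
    System.out.println(codec.getName());              // Lucene86
    System.out.println(Codec.getDefault().getName()); // also Lucene86 after this change
  }
}
```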
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/CodecUtil.java b/lucene/core/src/java/org/apache/lucene/codecs/CodecUtil.java
index c49946b7ffbf..8c40e2adcd62 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/CodecUtil.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/CodecUtil.java
@@ -448,24 +448,27 @@ public static void checkFooter(ChecksumIndexInput in, Throwable priorException)
checkFooter(in);
} else {
try {
+ // If we have evidence of corruption then we return the corruption as the
+ // main exception and the prior exception gets suppressed. Otherwise we
+ // return the prior exception with a suppressed exception that notifies
+ // the user that checksums matched.
long remaining = in.length() - in.getFilePointer();
if (remaining < footerLength()) {
// corruption caused us to read into the checksum footer already: we can't proceed
- priorException.addSuppressed(new CorruptIndexException("checksum status indeterminate: remaining=" + remaining +
- ", please run checkindex for more details", in));
+ throw new CorruptIndexException("checksum status indeterminate: remaining=" + remaining +
+ "; please run checkindex for more details", in);
} else {
// otherwise, skip any unread bytes.
in.skipBytes(remaining - footerLength());
// now check the footer
- try {
- long checksum = checkFooter(in);
- priorException.addSuppressed(new CorruptIndexException("checksum passed (" + Long.toHexString(checksum) +
- "). possibly transient resource issue, or a Lucene or JVM bug", in));
- } catch (CorruptIndexException t) {
- priorException.addSuppressed(t);
- }
+ long checksum = checkFooter(in);
+ priorException.addSuppressed(new CorruptIndexException("checksum passed (" + Long.toHexString(checksum) +
+ "). possibly transient resource issue, or a Lucene or JVM bug", in));
}
+ } catch (CorruptIndexException corruptException) {
+ corruptException.addSuppressed(priorException);
+ throw corruptException;
} catch (Throwable t) {
// catch-all for things that shouldn't go wrong (e.g. OOM during readInt) but could...
priorException.addSuppressed(new CorruptIndexException("checksum status indeterminate: unexpected exception", in, t));
@@ -487,7 +490,25 @@ public static long retrieveChecksum(IndexInput in) throws IOException {
validateFooter(in);
return readCRC(in);
}
-
+
+ /**
+ * Returns (but does not validate) the checksum previously written by {@link #writeFooter},
+ * after verifying that the file has the expected length.
+ * @return actual checksum value
+ * @throws IOException if the footer is invalid or the length does not match
+ */
+ public static long retrieveChecksum(IndexInput in, long expectedLength) throws IOException {
+ if (expectedLength < footerLength()) {
+ throw new IllegalArgumentException("expectedLength cannot be less than the footer length");
+ }
+ if (in.length() < expectedLength) {
+ throw new CorruptIndexException("truncated file: length=" + in.length() + " but expectedLength==" + expectedLength, in);
+ } else if (in.length() > expectedLength) {
+ throw new CorruptIndexException("file too long: length=" + in.length() + " but expectedLength==" + expectedLength, in);
+ }
+
+ return retrieveChecksum(in);
+ }
+
private static void validateFooter(IndexInput in) throws IOException {
long remaining = in.length() - in.getFilePointer();
long expected = footerLength();
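
The new overload couples the cheap footer check with a file-length check, so truncation or trailing garbage surfaces as a CorruptIndexException up front rather than as a checksum mismatch later. A minimal sketch, with invented file and codec names:

```java
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;

public class FooterChecksumDemo {
  public static void main(String[] args) throws Exception {
    try (Directory dir = new ByteBuffersDirectory()) {
      long expectedLength;
      try (IndexOutput out = dir.createOutput("demo.bin", IOContext.DEFAULT)) {
        CodecUtil.writeHeader(out, "demo", 0);
        out.writeVInt(42);
        CodecUtil.writeFooter(out);
        expectedLength = out.getFilePointer();
      }
      try (IndexInput in = dir.openInput("demo.bin", IOContext.DEFAULT)) {
        // Throws CorruptIndexException if the file was truncated or extended.
        long checksum = CodecUtil.retrieveChecksum(in, expectedLength);
        System.out.println(Long.toHexString(checksum));
      }
    }
  }
}
```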
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/DocValuesConsumer.java b/lucene/core/src/java/org/apache/lucene/codecs/DocValuesConsumer.java
index 408d97814785..c4bae5c0b18d 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/DocValuesConsumer.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/DocValuesConsumer.java
@@ -393,7 +393,7 @@ public SortedNumericDocValues getSortedNumeric(FieldInfo fieldInfo) throws IOExc
}
}
if (values == null) {
- values = DocValues.emptySortedNumeric(mergeState.maxDocs[i]);
+ values = DocValues.emptySortedNumeric();
}
cost += values.cost();
subs.add(new SortedNumericDocValuesSub(mergeState.docMaps[i], values));
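
DocValues.emptySortedNumeric no longer needs maxDoc: the empty instance is simply a doc-values iterator that exhausts immediately, whatever the segment size. A tiny sketch:

```java
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.SortedNumericDocValues;
import org.apache.lucene.search.DocIdSetIterator;

public class EmptySortedNumericDemo {
  public static void main(String[] args) throws Exception {
    SortedNumericDocValues empty = DocValues.emptySortedNumeric();
    // The empty iterator matches no documents regardless of segment size.
    System.out.println(empty.nextDoc() == DocIdSetIterator.NO_MORE_DOCS); // true
  }
}
```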
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsReader.java
index 4847017ca80a..bee914ba0d02 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsReader.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsReader.java
@@ -16,7 +16,6 @@
*/
package org.apache.lucene.codecs.blocktree;
-
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
@@ -35,6 +34,7 @@
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.Terms;
+import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.Accountables;
@@ -97,13 +97,20 @@ public final class BlockTreeTermsReader extends FieldsProducer {
/** Suffixes are compressed to save space. */
public static final int VERSION_COMPRESSED_SUFFIXES = 5;
+ /** Metadata is written to its own file. */
+ public static final int VERSION_META_FILE = 6;
+
/** Current terms format. */
- public static final int VERSION_CURRENT = VERSION_COMPRESSED_SUFFIXES;
+ public static final int VERSION_CURRENT = VERSION_META_FILE;
/** Extension of terms index file */
static final String TERMS_INDEX_EXTENSION = "tip";
final static String TERMS_INDEX_CODEC_NAME = "BlockTreeTermsIndex";
+ /** Extension of terms meta file */
+ static final String TERMS_META_EXTENSION = "tmd";
+ final static String TERMS_META_CODEC_NAME = "BlockTreeTermsMeta";
+
// Open input to the main terms dict file (_X.tim)
final IndexInput termsIn;
// Open input to the terms index file (_X.tip)
@@ -128,9 +135,9 @@ public BlockTreeTermsReader(PostingsReaderBase postingsReader, SegmentReadState
this.postingsReader = postingsReader;
this.segment = state.segmentInfo.name;
-
- String termsName = IndexFileNames.segmentFileName(segment, state.segmentSuffix, TERMS_EXTENSION);
+
try {
+ String termsName = IndexFileNames.segmentFileName(segment, state.segmentSuffix, TERMS_EXTENSION);
termsIn = state.directory.openInput(termsName, state.context);
version = CodecUtil.checkIndexHeader(termsIn, TERMS_CODEC_NAME, VERSION_START, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
@@ -138,66 +145,106 @@ public BlockTreeTermsReader(PostingsReaderBase postingsReader, SegmentReadState
indexIn = state.directory.openInput(indexName, state.context);
CodecUtil.checkIndexHeader(indexIn, TERMS_INDEX_CODEC_NAME, version, version, state.segmentInfo.getId(), state.segmentSuffix);
- // Have PostingsReader init itself
- postingsReader.init(termsIn, state);
+ if (version < VERSION_META_FILE) {
+ // Have PostingsReader init itself
+ postingsReader.init(termsIn, state);
- // Verifying the checksum against all bytes would be too costly, but for now we at least
- // verify proper structure of the checksum footer. This is cheap and can detect some forms
- // of corruption such as file truncation.
- CodecUtil.retrieveChecksum(indexIn);
- CodecUtil.retrieveChecksum(termsIn);
+ // Verifying the checksum against all bytes would be too costly, but for now we at least
+ // verify proper structure of the checksum footer. This is cheap and can detect some forms
+ // of corruption such as file truncation.
+ CodecUtil.retrieveChecksum(indexIn);
+ CodecUtil.retrieveChecksum(termsIn);
+ }
// Read per-field details
- seekDir(termsIn);
- seekDir(indexIn);
+ String metaName = IndexFileNames.segmentFileName(segment, state.segmentSuffix, TERMS_META_EXTENSION);
+ Map<String, FieldReader> fieldMap = null;
+ Throwable priorE = null;
+ long indexLength = -1, termsLength = -1;
+ try (ChecksumIndexInput metaIn = version >= VERSION_META_FILE ? state.directory.openChecksumInput(metaName, state.context) : null) {
+ try {
+ final IndexInput indexMetaIn, termsMetaIn;
+ if (version >= VERSION_META_FILE) {
+ CodecUtil.checkIndexHeader(metaIn, TERMS_META_CODEC_NAME, version, version, state.segmentInfo.getId(), state.segmentSuffix);
+ indexMetaIn = termsMetaIn = metaIn;
+ postingsReader.init(metaIn, state);
+ } else {
+ seekDir(termsIn);
+ seekDir(indexIn);
+ indexMetaIn = indexIn;
+ termsMetaIn = termsIn;
+ }
- final int numFields = termsIn.readVInt();
- if (numFields < 0) {
- throw new CorruptIndexException("invalid numFields: " + numFields, termsIn);
- }
- fieldMap = new HashMap<>((int) (numFields / 0.75f) + 1);
- for (int i = 0; i < numFields; ++i) {
- final int field = termsIn.readVInt();
- final long numTerms = termsIn.readVLong();
- if (numTerms <= 0) {
- throw new CorruptIndexException("Illegal numTerms for field number: " + field, termsIn);
- }
- final BytesRef rootCode = readBytesRef(termsIn);
- final FieldInfo fieldInfo = state.fieldInfos.fieldInfo(field);
- if (fieldInfo == null) {
- throw new CorruptIndexException("invalid field number: " + field, termsIn);
- }
- final long sumTotalTermFreq = termsIn.readVLong();
- // when frequencies are omitted, sumDocFreq=sumTotalTermFreq and only one value is written.
- final long sumDocFreq = fieldInfo.getIndexOptions() == IndexOptions.DOCS ? sumTotalTermFreq : termsIn.readVLong();
- final int docCount = termsIn.readVInt();
- if (version < VERSION_META_LONGS_REMOVED) {
- final int longsSize = termsIn.readVInt();
- if (longsSize < 0) {
- throw new CorruptIndexException("invalid longsSize for field: " + fieldInfo.name + ", longsSize=" + longsSize, termsIn);
+ final int numFields = termsMetaIn.readVInt();
+ if (numFields < 0) {
+ throw new CorruptIndexException("invalid numFields: " + numFields, termsMetaIn);
+ }
+ fieldMap = new HashMap<>((int) (numFields / 0.75f) + 1);
+ for (int i = 0; i < numFields; ++i) {
+ final int field = termsMetaIn.readVInt();
+ final long numTerms = termsMetaIn.readVLong();
+ if (numTerms <= 0) {
+ throw new CorruptIndexException("Illegal numTerms for field number: " + field, termsMetaIn);
+ }
+ final BytesRef rootCode = readBytesRef(termsMetaIn);
+ final FieldInfo fieldInfo = state.fieldInfos.fieldInfo(field);
+ if (fieldInfo == null) {
+ throw new CorruptIndexException("invalid field number: " + field, termsMetaIn);
+ }
+ final long sumTotalTermFreq = termsMetaIn.readVLong();
+ // when frequencies are omitted, sumDocFreq=sumTotalTermFreq and only one value is written.
+ final long sumDocFreq = fieldInfo.getIndexOptions() == IndexOptions.DOCS ? sumTotalTermFreq : termsMetaIn.readVLong();
+ final int docCount = termsMetaIn.readVInt();
+ if (version < VERSION_META_LONGS_REMOVED) {
+ final int longsSize = termsMetaIn.readVInt();
+ if (longsSize < 0) {
+ throw new CorruptIndexException("invalid longsSize for field: " + fieldInfo.name + ", longsSize=" + longsSize, termsMetaIn);
+ }
+ }
+ BytesRef minTerm = readBytesRef(termsMetaIn);
+ BytesRef maxTerm = readBytesRef(termsMetaIn);
+ if (docCount < 0 || docCount > state.segmentInfo.maxDoc()) { // #docs with field must be <= #docs
+ throw new CorruptIndexException("invalid docCount: " + docCount + " maxDoc: " + state.segmentInfo.maxDoc(), termsMetaIn);
+ }
+ if (sumDocFreq < docCount) { // #postings must be >= #docs with field
+ throw new CorruptIndexException("invalid sumDocFreq: " + sumDocFreq + " docCount: " + docCount, termsMetaIn);
+ }
+ if (sumTotalTermFreq < sumDocFreq) { // #positions must be >= #postings
+ throw new CorruptIndexException("invalid sumTotalTermFreq: " + sumTotalTermFreq + " sumDocFreq: " + sumDocFreq, termsMetaIn);
+ }
+ final long indexStartFP = indexMetaIn.readVLong();
+ FieldReader previous = fieldMap.put(fieldInfo.name,
+ new FieldReader(this, fieldInfo, numTerms, rootCode, sumTotalTermFreq, sumDocFreq, docCount,
+ indexStartFP, indexMetaIn, indexIn, minTerm, maxTerm));
+ if (previous != null) {
+ throw new CorruptIndexException("duplicate field: " + fieldInfo.name, termsMetaIn);
+ }
+ }
+ if (version >= VERSION_META_FILE) {
+ indexLength = metaIn.readLong();
+ termsLength = metaIn.readLong();
+ }
+ } catch (Throwable exception) {
+ priorE = exception;
+ } finally {
+ if (metaIn != null) {
+ CodecUtil.checkFooter(metaIn, priorE);
+ } else if (priorE != null) {
+ IOUtils.rethrowAlways(priorE);
}
}
- BytesRef minTerm = readBytesRef(termsIn);
- BytesRef maxTerm = readBytesRef(termsIn);
- if (docCount < 0 || docCount > state.segmentInfo.maxDoc()) { // #docs with field must be <= #docs
- throw new CorruptIndexException("invalid docCount: " + docCount + " maxDoc: " + state.segmentInfo.maxDoc(), termsIn);
- }
- if (sumDocFreq < docCount) { // #postings must be >= #docs with field
- throw new CorruptIndexException("invalid sumDocFreq: " + sumDocFreq + " docCount: " + docCount, termsIn);
- }
- if (sumTotalTermFreq < sumDocFreq) { // #positions must be >= #postings
- throw new CorruptIndexException("invalid sumTotalTermFreq: " + sumTotalTermFreq + " sumDocFreq: " + sumDocFreq, termsIn);
- }
- final long indexStartFP = indexIn.readVLong();
- FieldReader previous = fieldMap.put(fieldInfo.name,
- new FieldReader(this, fieldInfo, numTerms, rootCode, sumTotalTermFreq, sumDocFreq, docCount,
- indexStartFP, indexIn, minTerm, maxTerm));
- if (previous != null) {
- throw new CorruptIndexException("duplicate field: " + fieldInfo.name, termsIn);
- }
+ }
+ if (version >= VERSION_META_FILE) {
+ // At this point the checksum of the meta file has been verified so the lengths are likely correct
+ CodecUtil.retrieveChecksum(indexIn, indexLength);
+ CodecUtil.retrieveChecksum(termsIn, termsLength);
+ } else {
+ assert indexLength == -1 : indexLength;
+ assert termsLength == -1 : termsLength;
}
List<String> fieldList = new ArrayList<>(fieldMap.keySet());
fieldList.sort(null);
+ this.fieldMap = fieldMap;
this.fieldList = Collections.unmodifiableList(fieldList);
success = true;
} finally {
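
With the meta file, each segment's BlockTree state now splits across .tmd (metadata), .tim (terms) and .tip (index). A sketch that makes the new extension visible in a directory listing, assuming a post-change snapshot build (class and field names are invented):

```java
import java.util.Arrays;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;

public class TmdFileDemo {
  public static void main(String[] args) throws Exception {
    try (Directory dir = new ByteBuffersDirectory()) {
      IndexWriterConfig cfg = new IndexWriterConfig(new StandardAnalyzer())
          .setUseCompoundFile(false); // keep per-format files visible
      try (IndexWriter writer = new IndexWriter(dir, cfg)) {
        Document doc = new Document();
        doc.add(new TextField("body", "hello block tree", Store.NO));
        writer.addDocument(doc);
        writer.commit();
      }
      // The listing should include "_0.tmd" alongside "_0.tim" and "_0.tip".
      System.out.println(Arrays.toString(dir.listAll()));
    }
  }
}
```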
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsWriter.java
index d56a45e9a87e..bb7df7b11617 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsWriter.java
@@ -211,6 +211,7 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
//private final static boolean SAVE_DOT_FILES = false;
+ private final IndexOutput metaOut;
private final IndexOutput termsOut;
private final IndexOutput indexOut;
final int maxDoc;
@@ -220,34 +221,7 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
final PostingsWriterBase postingsWriter;
final FieldInfos fieldInfos;
- private static class FieldMetaData {
- public final FieldInfo fieldInfo;
- public final BytesRef rootCode;
- public final long numTerms;
- public final long indexStartFP;
- public final long sumTotalTermFreq;
- public final long sumDocFreq;
- public final int docCount;
- public final BytesRef minTerm;
- public final BytesRef maxTerm;
-
- public FieldMetaData(FieldInfo fieldInfo, BytesRef rootCode, long numTerms, long indexStartFP, long sumTotalTermFreq, long sumDocFreq, int docCount,
- BytesRef minTerm, BytesRef maxTerm) {
- assert numTerms > 0;
- this.fieldInfo = fieldInfo;
- assert rootCode != null: "field=" + fieldInfo.name + " numTerms=" + numTerms;
- this.rootCode = rootCode;
- this.indexStartFP = indexStartFP;
- this.numTerms = numTerms;
- this.sumTotalTermFreq = sumTotalTermFreq;
- this.sumDocFreq = sumDocFreq;
- this.docCount = docCount;
- this.minTerm = minTerm;
- this.maxTerm = maxTerm;
- }
- }
-
- private final List<FieldMetaData> fields = new ArrayList<>();
+ private final List<ByteBuffersDataOutput> fields = new ArrayList<>();
/** Create a new writer. The number of items (terms or
* sub-blocks) per block will aim to be between
@@ -272,7 +246,7 @@ public BlockTreeTermsWriter(SegmentWriteState state,
final String termsName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, BlockTreeTermsReader.TERMS_EXTENSION);
termsOut = state.directory.createOutput(termsName, state.context);
boolean success = false;
- IndexOutput indexOut = null;
+ IndexOutput metaOut = null, indexOut = null;
try {
CodecUtil.writeIndexHeader(termsOut, BlockTreeTermsReader.TERMS_CODEC_NAME, BlockTreeTermsReader.VERSION_CURRENT,
state.segmentInfo.getId(), state.segmentSuffix);
@@ -283,27 +257,23 @@ public BlockTreeTermsWriter(SegmentWriteState state,
state.segmentInfo.getId(), state.segmentSuffix);
//segment = state.segmentInfo.name;
- postingsWriter.init(termsOut, state); // have consumer write its format/header
-
+ final String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, BlockTreeTermsReader.TERMS_META_EXTENSION);
+ metaOut = state.directory.createOutput(metaName, state.context);
+ CodecUtil.writeIndexHeader(metaOut, BlockTreeTermsReader.TERMS_META_CODEC_NAME, BlockTreeTermsReader.VERSION_CURRENT,
+ state.segmentInfo.getId(), state.segmentSuffix);
+
+ postingsWriter.init(metaOut, state); // have consumer write its format/header
+
+ this.metaOut = metaOut;
this.indexOut = indexOut;
success = true;
} finally {
if (!success) {
- IOUtils.closeWhileHandlingException(termsOut, indexOut);
+ IOUtils.closeWhileHandlingException(metaOut, termsOut, indexOut);
}
}
}
- /** Writes the terms file trailer. */
- private void writeTrailer(IndexOutput out, long dirStart) throws IOException {
- out.writeLong(dirStart);
- }
-
- /** Writes the index file trailer. */
- private void writeIndexTrailer(IndexOutput indexOut, long dirStart) throws IOException {
- indexOut.writeLong(dirStart);
- }
-
/** Throws {@code IllegalArgumentException} if any of these settings
* is invalid. */
public static void validateSettings(int minItemsInBlock, int maxItemsInBlock) {
@@ -548,7 +518,6 @@ class TermsWriter {
final FixedBitSet docsSeen;
long sumTotalTermFreq;
long sumDocFreq;
- long indexStartFP;
// Records index into pending where the current prefix at that
// length "started"; for example, if current term starts with 't',
@@ -1006,11 +975,27 @@ public void finish() throws IOException {
assert pending.size() == 1 && !pending.get(0).isTerm: "pending.size()=" + pending.size() + " pending=" + pending;
final PendingBlock root = (PendingBlock) pending.get(0);
assert root.prefix.length == 0;
- assert root.index.getEmptyOutput() != null;
-
+ final BytesRef rootCode = root.index.getEmptyOutput();
+ assert rootCode != null;
+
+ ByteBuffersDataOutput metaOut = new ByteBuffersDataOutput();
+ fields.add(metaOut);
+
+ metaOut.writeVInt(fieldInfo.number);
+ metaOut.writeVLong(numTerms);
+ metaOut.writeVInt(rootCode.length);
+ metaOut.writeBytes(rootCode.bytes, rootCode.offset, rootCode.length);
+ assert fieldInfo.getIndexOptions() != IndexOptions.NONE;
+ if (fieldInfo.getIndexOptions() != IndexOptions.DOCS) {
+ metaOut.writeVLong(sumTotalTermFreq);
+ }
+ metaOut.writeVLong(sumDocFreq);
+ metaOut.writeVInt(docsSeen.cardinality());
+ writeBytesRef(metaOut, new BytesRef(firstPendingTerm.termBytes));
+ writeBytesRef(metaOut, new BytesRef(lastPendingTerm.termBytes));
+ metaOut.writeVLong(indexOut.getFilePointer());
// Write FST to index
- indexStartFP = indexOut.getFilePointer();
- root.index.save(indexOut);
+ root.index.save(metaOut, indexOut);
//System.out.println(" write FST " + indexStartFP + " field=" + fieldInfo.name);
/*
@@ -1022,20 +1007,7 @@ public void finish() throws IOException {
w.close();
}
*/
- assert firstPendingTerm != null;
- BytesRef minTerm = new BytesRef(firstPendingTerm.termBytes);
-
- assert lastPendingTerm != null;
- BytesRef maxTerm = new BytesRef(lastPendingTerm.termBytes);
-
- fields.add(new FieldMetaData(fieldInfo,
- ((PendingBlock) pending.get(0)).index.getEmptyOutput(),
- numTerms,
- indexStartFP,
- sumTotalTermFreq,
- sumDocFreq,
- docsSeen.cardinality(),
- minTerm, maxTerm));
+
} else {
assert sumTotalTermFreq == 0 || fieldInfo.getIndexOptions() == IndexOptions.DOCS && sumTotalTermFreq == -1;
assert sumDocFreq == 0;
@@ -1060,47 +1032,29 @@ public void close() throws IOException {
return;
}
closed = true;
-
+
boolean success = false;
try {
-
- final long dirStart = termsOut.getFilePointer();
- final long indexDirStart = indexOut.getFilePointer();
-
- termsOut.writeVInt(fields.size());
-
- for(FieldMetaData field : fields) {
- //System.out.println(" field " + field.fieldInfo.name + " " + field.numTerms + " terms");
- termsOut.writeVInt(field.fieldInfo.number);
- assert field.numTerms > 0;
- termsOut.writeVLong(field.numTerms);
- termsOut.writeVInt(field.rootCode.length);
- termsOut.writeBytes(field.rootCode.bytes, field.rootCode.offset, field.rootCode.length);
- assert field.fieldInfo.getIndexOptions() != IndexOptions.NONE;
- if (field.fieldInfo.getIndexOptions() != IndexOptions.DOCS) {
- termsOut.writeVLong(field.sumTotalTermFreq);
- }
- termsOut.writeVLong(field.sumDocFreq);
- termsOut.writeVInt(field.docCount);
- indexOut.writeVLong(field.indexStartFP);
- writeBytesRef(termsOut, field.minTerm);
- writeBytesRef(termsOut, field.maxTerm);
+ metaOut.writeVInt(fields.size());
+ for (ByteBuffersDataOutput fieldMeta : fields) {
+ fieldMeta.copyTo(metaOut);
}
- writeTrailer(termsOut, dirStart);
- CodecUtil.writeFooter(termsOut);
- writeIndexTrailer(indexOut, indexDirStart);
CodecUtil.writeFooter(indexOut);
+ metaOut.writeLong(indexOut.getFilePointer());
+ CodecUtil.writeFooter(termsOut);
+ metaOut.writeLong(termsOut.getFilePointer());
+ CodecUtil.writeFooter(metaOut);
success = true;
} finally {
if (success) {
- IOUtils.close(termsOut, indexOut, postingsWriter);
+ IOUtils.close(metaOut, termsOut, indexOut, postingsWriter);
} else {
- IOUtils.closeWhileHandlingException(termsOut, indexOut, postingsWriter);
+ IOUtils.closeWhileHandlingException(metaOut, termsOut, indexOut, postingsWriter);
}
}
}
- private static void writeBytesRef(IndexOutput out, BytesRef bytes) throws IOException {
+ private static void writeBytesRef(DataOutput out, BytesRef bytes) throws IOException {
out.writeVInt(bytes.length);
out.writeBytes(bytes.bytes, bytes.offset, bytes.length);
}
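
The writer now buffers each field's metadata in its own ByteBuffersDataOutput and concatenates the buffers at close time, instead of carrying a FieldMetaData object per field. The buffering pattern in isolation (a sketch; MetaBufferDemo is not part of the patch):

```java
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.store.ByteBuffersDataOutput;

public class MetaBufferDemo {
  public static void main(String[] args) throws Exception {
    List<ByteBuffersDataOutput> fields = new ArrayList<>();

    for (int field = 0; field < 3; field++) {
      ByteBuffersDataOutput fieldMeta = new ByteBuffersDataOutput();
      fieldMeta.writeVInt(field);        // field number
      fieldMeta.writeVLong(100 + field); // pretend numTerms
      fields.add(fieldMeta);
    }

    ByteBuffersDataOutput metaOut = new ByteBuffersDataOutput();
    metaOut.writeVInt(fields.size());
    for (ByteBuffersDataOutput fieldMeta : fields) {
      fieldMeta.copyTo(metaOut); // sequential concatenation, no per-field framing
    }
    System.out.println(metaOut.size() + " bytes of metadata");
  }
}
```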
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/FieldReader.java b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/FieldReader.java
index 01b9fa86dbd3..748fbbb97650 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/FieldReader.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/FieldReader.java
@@ -52,7 +52,6 @@ public final class FieldReader extends Terms implements Accountable {
final long sumTotalTermFreq;
final long sumDocFreq;
final int docCount;
- final long indexStartFP;
final long rootBlockFP;
final BytesRef rootCode;
final BytesRef minTerm;
@@ -63,7 +62,7 @@ public final class FieldReader extends Terms implements Accountable {
//private boolean DEBUG;
FieldReader(BlockTreeTermsReader parent, FieldInfo fieldInfo, long numTerms, BytesRef rootCode, long sumTotalTermFreq, long sumDocFreq, int docCount,
- long indexStartFP, IndexInput indexIn, BytesRef minTerm, BytesRef maxTerm) throws IOException {
+ long indexStartFP, IndexInput metaIn, IndexInput indexIn, BytesRef minTerm, BytesRef maxTerm) throws IOException {
assert numTerms > 0;
this.fieldInfo = fieldInfo;
//DEBUG = BlockTreeTermsReader.DEBUG && fieldInfo.name.equals("id");
@@ -72,7 +71,6 @@ public final class FieldReader extends Terms implements Accountable {
this.sumTotalTermFreq = sumTotalTermFreq;
this.sumDocFreq = sumDocFreq;
this.docCount = docCount;
- this.indexStartFP = indexStartFP;
this.rootCode = rootCode;
this.minTerm = minTerm;
this.maxTerm = maxTerm;
@@ -81,22 +79,22 @@ public final class FieldReader extends Terms implements Accountable {
// }
rootBlockFP = (new ByteArrayDataInput(rootCode.bytes, rootCode.offset, rootCode.length)).readVLong() >>> BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS;
// Initialize FST always off-heap.
- if (indexIn != null) {
- final IndexInput clone = indexIn.clone();
- clone.seek(indexStartFP);
- index = new FST<>(clone, ByteSequenceOutputs.getSingleton(), new OffHeapFSTStore());
- /*
- if (false) {
- final String dotFileName = segment + "_" + fieldInfo.name + ".dot";
- Writer w = new OutputStreamWriter(new FileOutputStream(dotFileName));
- Util.toDot(index, w, false, false);
- System.out.println("FST INDEX: SAVED to " + dotFileName);
- w.close();
- }
- */
+ final IndexInput clone = indexIn.clone();
+ clone.seek(indexStartFP);
+ if (metaIn == indexIn) { // Only true before Lucene 8.6
+ index = new FST<>(clone, clone, ByteSequenceOutputs.getSingleton(), new OffHeapFSTStore());
} else {
- index = null;
+ index = new FST<>(metaIn, clone, ByteSequenceOutputs.getSingleton(), new OffHeapFSTStore());
}
+ /*
+ if (false) {
+ final String dotFileName = segment + "_" + fieldInfo.name + ".dot";
+ Writer w = new OutputStreamWriter(new FileOutputStream(dotFileName));
+ Util.toDot(index, w, false, false);
+ System.out.println("FST INDEX: SAVED to " + dotFileName);
+ w.close();
+ }
+ */
}
@Override
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene60/package-info.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene60/package-info.java
index b7145ccea94a..d807058f6468 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene60/package-info.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene60/package-info.java
@@ -16,7 +16,7 @@
*/
/**
- * Components from the Lucene 6.0 index format. See {@link org.apache.lucene.codecs.lucene80}
+ * Components from the Lucene 6.0 index format. See {@link org.apache.lucene.codecs.lucene86}
* for an overview of the current index format.
*/
package org.apache.lucene.codecs.lucene60;
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene84/package-info.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene84/package-info.java
index 91ee2e2cba63..5940a47dca83 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene84/package-info.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene84/package-info.java
@@ -16,399 +16,7 @@
*/
/**
- * Lucene 8.4 file format.
- *
- *
This document defines the index file formats used in this version of Lucene.
- * If you are using a different version of Lucene, please consult the copy of
- * docs/ that was distributed with
- * the version you are using.
- *
This document attempts to provide a high-level definition of the Apache
- * Lucene file formats.
- *
- *
- *
Definitions
- *
- *
The fundamental concepts in Lucene are index, document, field and term.
- *
An index contains a sequence of documents.
- *
- *
A document is a sequence of fields.
- *
A field is a named sequence of terms.
- *
A term is a sequence of bytes.
- *
- *
The same sequence of bytes in two different fields is considered a different
- * term. Thus terms are represented as a pair: the string naming the field, and the
- * bytes within the field.
- *
- *
Inverted Indexing
- *
The index stores statistics about terms in order to make term-based search
- * more efficient. Lucene's index falls into the family of indexes known as an
- * inverted index. This is because it can list, for a term, the documents
- * that contain it. This is the inverse of the natural relationship, in which
- * documents list terms.
- *
- *
Types of Fields
- *
In Lucene, fields may be stored, in which case their text is stored
- * in the index literally, in a non-inverted manner. Fields that are inverted are
- * called indexed. A field may be both stored and indexed.
- *
The text of a field may be tokenized into terms to be indexed, or the
- * text of a field may be used literally as a term to be indexed. Most fields are
- * tokenized, but sometimes it is useful for certain identifier fields to be
- * indexed literally.
- *
See the {@link org.apache.lucene.document.Field Field}
- * java docs for more information on Fields.
- *
- *
Segments
- *
Lucene indexes may be composed of multiple sub-indexes, or segments.
- * Each segment is a fully independent index, which could be searched separately.
- * Indexes evolve by:
- *
- *
Creating new segments for newly added documents.
- *
Merging existing segments.
- *
- *
Searches may involve multiple segments and/or multiple indexes, each index
- * potentially composed of a set of segments.
- *
- *
Document Numbers
- *
Internally, Lucene refers to documents by an integer document number.
- * The first document added to an index is numbered zero, and each subsequent
- * document added gets a number one greater than the previous.
- *
Note that a document's number may change, so caution should be taken when
- * storing these numbers outside of Lucene. In particular, numbers may change in
- * the following situations:
- *
- *
- *
The numbers stored in each segment are unique only within the segment, and
- * must be converted before they can be used in a larger context. The standard
- * technique is to allocate each segment a range of values, based on the range of
- * numbers used in that segment. To convert a document number from a segment to an
- * external value, the segment's base document number is added. To convert
- * an external value back to a segment-specific value, the segment is identified
- * by the range that the external value is in, and the segment's base value is
- * subtracted. For example two five document segments might be combined, so that
- * the first segment has a base value of zero, and the second of five. Document
- * three from the second segment would have an external value of eight.
- *
- *
- *
When documents are deleted, gaps are created in the numbering. These are
- * eventually removed as the index evolves through merging. Deleted documents are
- * dropped when segments are merged. A freshly-merged segment thus has no gaps in
- * its numbering.
- *
- *
- *
- *
- *
Index Structure Overview
- *
- *
Each segment index maintains the following:
- *
- *
- * {@link org.apache.lucene.codecs.lucene70.Lucene70SegmentInfoFormat Segment info}.
- * This contains metadata about a segment, such as the number of documents,
- * what files it uses,
- *
- *
- * {@link org.apache.lucene.codecs.lucene60.Lucene60FieldInfosFormat Field names}.
- * This contains the set of field names used in the index.
- *
- *
- * {@link org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat Stored Field values}.
- * This contains, for each document, a list of attribute-value pairs, where the attributes
- * are field names. These are used to store auxiliary information about the document, such as
- * its title, url, or an identifier to access a database. The set of stored fields are what is
- * returned for each hit when searching. This is keyed by document number.
- *
- *
- * {@link org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat Term dictionary}.
- * A dictionary containing all of the terms used in all of the
- * indexed fields of all of the documents. The dictionary also contains the number
- * of documents which contain the term, and pointers to the term's frequency and
- * proximity data.
- *
- *
- * {@link org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat Term Frequency data}.
- * For each term in the dictionary, the numbers of all the
- * documents that contain that term, and the frequency of the term in that
- * document, unless frequencies are omitted ({@link org.apache.lucene.index.IndexOptions#DOCS IndexOptions.DOCS})
- *
- *
- * {@link org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat Term Proximity data}.
- * For each term in the dictionary, the positions that the
- * term occurs in each document. Note that this will not exist if all fields in
- * all documents omit position data.
- *
- *
- * {@link org.apache.lucene.codecs.lucene80.Lucene80NormsFormat Normalization factors}.
- * For each field in each document, a value is stored
- * that is multiplied into the score for hits on that field.
- *
- *
- * {@link org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat Term Vectors}.
- * For each field in each document, the term vector (sometimes
- * called document vector) may be stored. A term vector consists of term text and
- * term frequency. To add Term Vectors to your index see the
- * {@link org.apache.lucene.document.Field Field} constructors
- *
- *
- * {@link org.apache.lucene.codecs.lucene80.Lucene80DocValuesFormat Per-document values}.
- * Like stored values, these are also keyed by document
- * number, but are generally intended to be loaded into main memory for fast
- * access. Whereas stored values are generally intended for summary results from
- * searches, per-document values are useful for things like scoring factors.
- *
- *
- * {@link org.apache.lucene.codecs.lucene50.Lucene50LiveDocsFormat Live documents}.
- * An optional file indicating which documents are live.
- *
- *
- * {@link org.apache.lucene.codecs.lucene60.Lucene60PointsFormat Point values}.
- * Optional pair of files, recording dimensionally indexed fields, to enable fast
- * numeric range filtering and large numeric values like BigInteger and BigDecimal (1D)
- * and geographic shape intersection (2D, 3D).
- *
- *
- *
- * Details on each of these are provided in their linked pages.
- *
- *
- *
- * File Naming
- *
- * All files belonging to a segment have the same name with varying extensions.
- * The extensions correspond to the different file formats described below. When
- * using the Compound File format (default for small segments) these files (except
- * for the Segment info file, the Lock file, and Deleted documents file) are collapsed
- * into a single .cfs file (see below for details)
- *
- * Typically, all segments in an index are stored in a single directory,
- * although this is not required.
- *
- * File names are never re-used. That is, when any file is saved
- * to the Directory it is given a never before used filename. This is achieved
- * using a simple generations approach. For example, the first segments file is
- * segments_1, then segments_2, etc. The generation is a sequential long integer
- * represented in alpha-numeric (base 36) form.
- *
- *
- *
- * Summary of File Extensions
- *
- * The following table summarizes the names and extensions of the files in
- * Lucene:
- *
- * Name | Extension | Brief Description
- * {@link org.apache.lucene.codecs.lucene80.Lucene80DocValuesFormat Per-Document Values} | .dvd, .dvm | Encodes additional scoring factors or other per-document information.
- * {@link org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat Term Vector Index} | .tvx | Stores offset into the document data file
- * {@link org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat Term Vector Data} | .tvd | Contains term vector data.
- * {@link org.apache.lucene.codecs.lucene50.Lucene50LiveDocsFormat Live Documents} | .liv | Info about what documents are live
- * {@link org.apache.lucene.codecs.lucene60.Lucene60PointsFormat Point values} | .dii, .dim | Holds indexed points, if any
- *
- *
- *
- *
- *
- * Lock File
- * The write lock, which is stored in the index directory by default, is named
- * "write.lock". If the lock directory is different from the index directory then
- * the write lock will be named "XXXX-write.lock" where XXXX is a unique prefix
- * derived from the full path to the index directory. When this file is present, a
- * writer is currently modifying the index (adding or removing documents). This
- * lock file ensures that only one writer is modifying the index at a time.
- *
- *
- * History
- *
- * Compatibility notes are provided in this document, describing how file
- * formats have changed from prior versions:
- *
- *
- * In version 2.1, the file format was changed to allow lock-less commits (ie,
- * no more commit lock). The change is fully backwards compatible: you can open a
- * pre-2.1 index for searching or adding/deleting of docs. When the new segments
- * file is saved (committed), it will be written in the new file format (meaning
- * no specific "upgrade" process is needed). But note that once a commit has
- * occurred, pre-2.1 Lucene will not be able to read the index.
- *
- * In version 2.3, the file format was changed to allow segments to share a
- * single set of doc store (vectors & stored fields) files. This allows for
- * faster indexing in certain cases. The change is fully backwards compatible (in
- * the same way as the lock-less commits change in 2.1).
- *
- * In version 2.4, Strings are now written as a true UTF-8 byte sequence, not
- * Java's modified UTF-8. See
- * LUCENE-510 for details.
- *
- * In version 2.9, an optional opaque Map<String,String> CommitUserData
- * may be passed to IndexWriter's commit methods (and later retrieved), which is
- * recorded in the segments_N file. See
- * LUCENE-1382 for details. Also,
- * diagnostics were added to each segment written recording details about why it
- * was written (due to flush, merge; which OS/JRE was used; etc.). See issue
- * LUCENE-1654 for details.
- *
- * In version 3.0, compressed fields are no longer written to the index (they
- * can still be read, but on merge the new segment will write them, uncompressed).
- * See issue LUCENE-1960
- * for details.
- *
- * In version 3.1, segments record the code version that created them. See
- * LUCENE-2720 for details.
- * Additionally segments track explicitly whether or not they have term vectors.
- * See LUCENE-2811
- * for details.
- *
- * In version 3.2, numeric fields are written natively to the stored fields
- * file; previously they were stored in text format only.
- *
- * In version 3.4, fields can omit position data while still indexing term
- * frequencies.
- *
- * In version 4.0, the format of the inverted index became extensible via
- * the {@link org.apache.lucene.codecs.Codec Codec} api. Fast per-document storage
- * ({@code DocValues}) was introduced. Normalization factors need no longer be a
- * single byte, they can be any {@link org.apache.lucene.index.NumericDocValues NumericDocValues}.
- * Terms need not be unicode strings, they can be any byte sequence. Term offsets
- * can optionally be indexed into the postings lists. Payloads can be stored in the
- * term vectors.
- *
- * In version 4.1, the format of the postings list changed to use either
- * FOR compression or variable-byte encoding, depending upon the frequency
- * of the term. Terms appearing only once were changed to inline directly into
- * the term dictionary. Stored fields are compressed by default.
- *
- * In version 4.2, term vectors are compressed by default. DocValues has
- * a new multi-valued type (SortedSet), that can be used for faceting/grouping/joining
- * on multi-valued fields.
- *
- * In version 4.5, DocValues were extended to explicitly represent missing values.
- *
- * In version 4.6, FieldInfos were extended to support per-field DocValues generation, to
- * allow updating NumericDocValues fields.
- *
- * In version 4.8, checksum footers were added to the end of each index file
- * for improved data integrity. Specifically, the last 8 bytes of every index file
- * contain the zlib-crc32 checksum of the file.
- *
- * In version 4.9, DocValues has a new multi-valued numeric type (SortedNumeric)
- * that is suitable for faceting/sorting/analytics.
- *
- * In version 5.4, DocValues have been improved to store more information on disk:
- * addresses for binary fields and ord indexes for multi-valued fields.
- *
- * In version 6.0, Points were added, for multi-dimensional range/distance search.
- *
- * In version 6.2, a new Segment info format reads/writes the index sort, to support index sorting.
- *
- * In version 7.0, DocValues have been improved to better support sparse doc values
- * thanks to an iterator API.
- *
- * In version 8.0, postings have been enhanced to record, for each block of
- * doc ids, the (term freq, normalization factor) pairs that may trigger the
- * maximum score of the block. This information is recorded alongside skip data
- * in order to be able to skip blocks of doc ids if they may not produce high
- * enough scores.
- * Additionally doc values and norms have been extended with jump-tables to make access O(1)
- * instead of O(n), where n is the number of elements to skip when advancing in the data.
- *
- * In version 8.4, postings, positions, offsets and payload lengths have moved to a more
- * performant encoding that is vectorized.
- *
- *
- *
- * Limitations
- *
- * Lucene uses a Java int to refer to
- * document numbers, and the index file format uses an Int32
- * on-disk to store document numbers. This is a limitation
- * of both the index file format and the current implementation. Eventually these
- * should be replaced with either UInt64 values, or
- * better yet, {@link org.apache.lucene.store.DataOutput#writeVInt VInt} values which have no limit.
- *
+ * Components from the Lucene 8.4 index format. See {@link org.apache.lucene.codecs.lucene86}
+ * for an overview of the current index format.
*/
package org.apache.lucene.codecs.lucene84;
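The segment-base conversion described in the numbering section above is plain offset arithmetic. A minimal sketch in Java (the two five-document segments are the example's own, not real index data):

    // Two five-document segments: the first gets base 0, the second base 5.
    int[] segmentBases = {0, 5};
    int localDocId = 3;                                 // doc 3 within the second segment
    int externalDocId = segmentBases[1] + localDocId;   // -> 8, as in the docs above
    int backToLocal = externalDocId - segmentBases[1];  // -> 3 again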
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene86/Lucene86Codec.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene86/Lucene86Codec.java
new file mode 100644
index 000000000000..3f69874ef200
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene86/Lucene86Codec.java
@@ -0,0 +1,178 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.codecs.lucene86;
+
+import java.util.Objects;
+
+import org.apache.lucene.codecs.Codec;
+import org.apache.lucene.codecs.CompoundFormat;
+import org.apache.lucene.codecs.DocValuesFormat;
+import org.apache.lucene.codecs.FieldInfosFormat;
+import org.apache.lucene.codecs.FilterCodec;
+import org.apache.lucene.codecs.LiveDocsFormat;
+import org.apache.lucene.codecs.NormsFormat;
+import org.apache.lucene.codecs.PointsFormat;
+import org.apache.lucene.codecs.PostingsFormat;
+import org.apache.lucene.codecs.SegmentInfoFormat;
+import org.apache.lucene.codecs.StoredFieldsFormat;
+import org.apache.lucene.codecs.TermVectorsFormat;
+import org.apache.lucene.codecs.lucene50.Lucene50CompoundFormat;
+import org.apache.lucene.codecs.lucene50.Lucene50LiveDocsFormat;
+import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat;
+import org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat;
+import org.apache.lucene.codecs.lucene60.Lucene60FieldInfosFormat;
+import org.apache.lucene.codecs.lucene80.Lucene80NormsFormat;
+import org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat;
+import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat;
+import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
+
+/**
+ * Implements the Lucene 8.6 index format, with configurable per-field postings
+ * and docvalues formats.
+ *
+ * If you want to reuse functionality of this codec in another codec, extend
+ * {@link FilterCodec}.
+ *
+ * @see org.apache.lucene.codecs.lucene86 package documentation for file format details.
+ *
+ * @lucene.experimental
+ */
+public class Lucene86Codec extends Codec {
+ private final TermVectorsFormat vectorsFormat = new Lucene50TermVectorsFormat();
+ private final FieldInfosFormat fieldInfosFormat = new Lucene60FieldInfosFormat();
+ private final SegmentInfoFormat segmentInfosFormat = new Lucene86SegmentInfoFormat();
+ private final LiveDocsFormat liveDocsFormat = new Lucene50LiveDocsFormat();
+ private final CompoundFormat compoundFormat = new Lucene50CompoundFormat();
+ private final PointsFormat pointsFormat = new Lucene86PointsFormat();
+ private final PostingsFormat defaultFormat;
+
+ private final PostingsFormat postingsFormat = new PerFieldPostingsFormat() {
+ @Override
+ public PostingsFormat getPostingsFormatForField(String field) {
+ return Lucene86Codec.this.getPostingsFormatForField(field);
+ }
+ };
+
+ private final DocValuesFormat docValuesFormat = new PerFieldDocValuesFormat() {
+ @Override
+ public DocValuesFormat getDocValuesFormatForField(String field) {
+ return Lucene86Codec.this.getDocValuesFormatForField(field);
+ }
+ };
+
+ private final StoredFieldsFormat storedFieldsFormat;
+
+ /**
+ * Instantiates a new codec.
+ */
+ public Lucene86Codec() {
+ this(Lucene50StoredFieldsFormat.Mode.BEST_SPEED);
+ }
+
+ /**
+ * Instantiates a new codec, specifying the stored fields compression
+ * mode to use.
+ * @param mode stored fields compression mode to use for newly
+ * flushed/merged segments.
+ */
+ public Lucene86Codec(Lucene50StoredFieldsFormat.Mode mode) {
+ super("Lucene86");
+ this.storedFieldsFormat = new Lucene50StoredFieldsFormat(Objects.requireNonNull(mode));
+ this.defaultFormat = new Lucene84PostingsFormat();
+ }
+
+ @Override
+ public final StoredFieldsFormat storedFieldsFormat() {
+ return storedFieldsFormat;
+ }
+
+ @Override
+ public final TermVectorsFormat termVectorsFormat() {
+ return vectorsFormat;
+ }
+
+ @Override
+ public final PostingsFormat postingsFormat() {
+ return postingsFormat;
+ }
+
+ @Override
+ public final FieldInfosFormat fieldInfosFormat() {
+ return fieldInfosFormat;
+ }
+
+ @Override
+ public final SegmentInfoFormat segmentInfoFormat() {
+ return segmentInfosFormat;
+ }
+
+ @Override
+ public final LiveDocsFormat liveDocsFormat() {
+ return liveDocsFormat;
+ }
+
+ @Override
+ public final CompoundFormat compoundFormat() {
+ return compoundFormat;
+ }
+
+ @Override
+ public final PointsFormat pointsFormat() {
+ return pointsFormat;
+ }
+
+ /** Returns the postings format that should be used for writing
+ * new segments of field.
+ *
+ * The default implementation always returns "Lucene84".
+ *
+ * WARNING: if you subclass, you are responsible for index
+ * backwards compatibility: future versions of Lucene are only
+ * guaranteed to be able to read the default implementation.
+ */
+ public PostingsFormat getPostingsFormatForField(String field) {
+ return defaultFormat;
+ }
+
+ /** Returns the docvalues format that should be used for writing
+ * new segments of field.
+ *
+ * The default implementation always returns "Lucene80".
+ *
+ * WARNING: if you subclass, you are responsible for index
+ * backwards compatibility: future versions of Lucene are only
+ * guaranteed to be able to read the default implementation.
+ */
+ public DocValuesFormat getDocValuesFormatForField(String field) {
+ return defaultDVFormat;
+ }
+
+ @Override
+ public final DocValuesFormat docValuesFormat() {
+ return docValuesFormat;
+ }
+
+ private final DocValuesFormat defaultDVFormat = DocValuesFormat.forName("Lucene80");
+
+ private final NormsFormat normsFormat = new Lucene80NormsFormat();
+
+ @Override
+ public final NormsFormat normsFormat() {
+ return normsFormat;
+ }
+}
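The two getXxxFormatForField hooks above are the supported way to pick formats per field. A hedged sketch of wiring that up through IndexWriterConfig (the "id" field name is a made-up example, and the "Direct" postings format is an assumption that requires the separate lucene-codecs module on the classpath; the analyzer is assumed to be in scope):

    IndexWriterConfig config = new IndexWriterConfig(analyzer);
    config.setCodec(new Lucene86Codec() {
      @Override
      public PostingsFormat getPostingsFormatForField(String field) {
        // Per the WARNING above, overriding this makes you responsible
        // for index backwards compatibility.
        return "id".equals(field) ? PostingsFormat.forName("Direct")
                                  : super.getPostingsFormatForField(field);
      }
    });

As the javadoc warns, only the default implementation is guaranteed to remain readable by future Lucene versions.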
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene86/Lucene86PointsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene86/Lucene86PointsFormat.java
new file mode 100644
index 000000000000..8cd63a790c4c
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene86/Lucene86PointsFormat.java
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.codecs.lucene86;
+
+
+import java.io.IOException;
+
+import org.apache.lucene.codecs.PointsFormat;
+import org.apache.lucene.codecs.PointsReader;
+import org.apache.lucene.codecs.PointsWriter;
+import org.apache.lucene.index.SegmentReadState;
+import org.apache.lucene.index.SegmentWriteState;
+
+/**
+ * Lucene 8.6 point format, which encodes dimensional values in a block KD-tree structure
+ * for fast 1D range and N dimensional shape intersection filtering.
+ * See this paper for details.
+ *
+ *
+ * Data is stored across three files:
+ *
+ *   - A .kdm file that records metadata about the fields, such as numbers of
+ *     dimensions or numbers of bytes per dimension.
+ *   - A .kdi file that stores inner nodes of the tree.
+ *   - A .kdd file that stores leaf nodes, where most of the data lives.
+ *
+ *
+ * @lucene.experimental
+ */
+public final class Lucene86PointsFormat extends PointsFormat {
+
+ static final String DATA_CODEC_NAME = "Lucene86PointsFormatData";
+ static final String INDEX_CODEC_NAME = "Lucene86PointsFormatIndex";
+ static final String META_CODEC_NAME = "Lucene86PointsFormatMeta";
+
+ /**
+ * Filename extension for the leaf blocks
+ */
+ public static final String DATA_EXTENSION = "kdd";
+
+ /**
+ * Filename extension for the index per field
+ */
+ public static final String INDEX_EXTENSION = "kdi";
+
+ /**
+ * Filename extension for the meta per field
+ */
+ public static final String META_EXTENSION = "kdm";
+
+ static final int VERSION_START = 0;
+ static final int VERSION_CURRENT = VERSION_START;
+
+ /** Sole constructor */
+ public Lucene86PointsFormat() {
+ }
+
+ @Override
+ public PointsWriter fieldsWriter(SegmentWriteState state) throws IOException {
+ return new Lucene86PointsWriter(state);
+ }
+
+ @Override
+ public PointsReader fieldsReader(SegmentReadState state) throws IOException {
+ return new Lucene86PointsReader(state);
+ }
+}
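Given the extensions defined above, each segment ends up with three point files. A small illustrative sketch of the names this produces, using the IndexFileNames helper that the reader and writer below rely on (the segment name "_0" is made up):

    String meta  = IndexFileNames.segmentFileName("_0", "", Lucene86PointsFormat.META_EXTENSION);  // "_0.kdm"
    String index = IndexFileNames.segmentFileName("_0", "", Lucene86PointsFormat.INDEX_EXTENSION); // "_0.kdi"
    String data  = IndexFileNames.segmentFileName("_0", "", Lucene86PointsFormat.DATA_EXTENSION);  // "_0.kdd"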
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene86/Lucene86PointsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene86/Lucene86PointsReader.java
new file mode 100644
index 000000000000..fdc3cbd78b1b
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene86/Lucene86PointsReader.java
@@ -0,0 +1,152 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.codecs.lucene86;
+
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.codecs.PointsReader;
+import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.PointValues;
+import org.apache.lucene.index.SegmentReadState;
+import org.apache.lucene.store.ChecksumIndexInput;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.bkd.BKDReader;
+
+/** Reads point values previously written with {@link Lucene86PointsWriter} */
+public class Lucene86PointsReader extends PointsReader implements Closeable {
+ final IndexInput indexIn, dataIn;
+ final SegmentReadState readState;
+ final Map<Integer,BKDReader> readers = new HashMap<>();
+
+ /** Sole constructor */
+ public Lucene86PointsReader(SegmentReadState readState) throws IOException {
+ this.readState = readState;
+
+ String metaFileName = IndexFileNames.segmentFileName(readState.segmentInfo.name,
+ readState.segmentSuffix,
+ Lucene86PointsFormat.META_EXTENSION);
+ String indexFileName = IndexFileNames.segmentFileName(readState.segmentInfo.name,
+ readState.segmentSuffix,
+ Lucene86PointsFormat.INDEX_EXTENSION);
+ String dataFileName = IndexFileNames.segmentFileName(readState.segmentInfo.name,
+ readState.segmentSuffix,
+ Lucene86PointsFormat.DATA_EXTENSION);
+
+ boolean success = false;
+ try {
+ indexIn = readState.directory.openInput(indexFileName, readState.context);
+ CodecUtil.checkIndexHeader(indexIn,
+ Lucene86PointsFormat.INDEX_CODEC_NAME,
+ Lucene86PointsFormat.VERSION_START,
+ Lucene86PointsFormat.VERSION_CURRENT,
+ readState.segmentInfo.getId(),
+ readState.segmentSuffix);
+
+ dataIn = readState.directory.openInput(dataFileName, readState.context);
+ CodecUtil.checkIndexHeader(dataIn,
+ Lucene86PointsFormat.DATA_CODEC_NAME,
+ Lucene86PointsFormat.VERSION_START,
+ Lucene86PointsFormat.VERSION_CURRENT,
+ readState.segmentInfo.getId(),
+ readState.segmentSuffix);
+
+ long indexLength = -1, dataLength = -1;
+ try (ChecksumIndexInput metaIn = readState.directory.openChecksumInput(metaFileName, readState.context)) {
+ Throwable priorE = null;
+ try {
+ CodecUtil.checkIndexHeader(metaIn,
+ Lucene86PointsFormat.META_CODEC_NAME,
+ Lucene86PointsFormat.VERSION_START,
+ Lucene86PointsFormat.VERSION_CURRENT,
+ readState.segmentInfo.getId(),
+ readState.segmentSuffix);
+
+ while (true) {
+ int fieldNumber = metaIn.readInt();
+ if (fieldNumber == -1) {
+ break;
+ } else if (fieldNumber < 0) {
+ throw new CorruptIndexException("Illegal field number: " + fieldNumber, metaIn);
+ }
+ BKDReader reader = new BKDReader(metaIn, indexIn, dataIn);
+ readers.put(fieldNumber, reader);
+ }
+ indexLength = metaIn.readLong();
+ dataLength = metaIn.readLong();
+ } catch (Throwable t) {
+ priorE = t;
+ } finally {
+ CodecUtil.checkFooter(metaIn, priorE);
+ }
+ }
+ // At this point, checksums of the meta file have been validated so we
+ // know that indexLength and dataLength are very likely correct.
+ CodecUtil.retrieveChecksum(indexIn, indexLength);
+ CodecUtil.retrieveChecksum(dataIn, dataLength);
+ success = true;
+ } finally {
+ if (success == false) {
+ IOUtils.closeWhileHandlingException(this);
+ }
+ }
+
+ }
+
+ /** Returns the underlying {@link BKDReader}.
+ *
+ * @lucene.internal */
+ @Override
+ public PointValues getValues(String fieldName) {
+ FieldInfo fieldInfo = readState.fieldInfos.fieldInfo(fieldName);
+ if (fieldInfo == null) {
+ throw new IllegalArgumentException("field=\"" + fieldName + "\" is unrecognized");
+ }
+ if (fieldInfo.getPointDimensionCount() == 0) {
+ throw new IllegalArgumentException("field=\"" + fieldName + "\" did not index point values");
+ }
+
+ return readers.get(fieldInfo.number);
+ }
+
+ @Override
+ public long ramBytesUsed() {
+ return 0L;
+ }
+
+ @Override
+ public void checkIntegrity() throws IOException {
+ CodecUtil.checksumEntireFile(indexIn);
+ CodecUtil.checksumEntireFile(dataIn);
+ }
+
+ @Override
+ public void close() throws IOException {
+ IOUtils.close(indexIn, dataIn);
+ // Free up heap:
+ readers.clear();
+ }
+
+}
+
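A hedged usage sketch for the reader above: visiting every indexed point of one field via getValues. The pointsReader variable and the "location" field name are assumptions for illustration, not part of this patch; IOException handling is elided:

    PointValues values = pointsReader.getValues("location");
    if (values != null) {
      values.intersect(new PointValues.IntersectVisitor() {
        @Override
        public void visit(int docID) {
          // only called when a whole cell matches; CELL_CROSSES_QUERY below avoids this path
        }
        @Override
        public void visit(int docID, byte[] packedValue) {
          // one callback per indexed point in the field
        }
        @Override
        public PointValues.Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
          return PointValues.Relation.CELL_CROSSES_QUERY; // force per-point visits
        }
      });
    }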
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene86/Lucene86PointsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene86/Lucene86PointsWriter.java
new file mode 100644
index 000000000000..6fe35710c500
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene86/Lucene86PointsWriter.java
@@ -0,0 +1,265 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.codecs.lucene86;
+
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.codecs.MutablePointValues;
+import org.apache.lucene.codecs.PointsReader;
+import org.apache.lucene.codecs.PointsWriter;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.FieldInfos;
+import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.MergeState;
+import org.apache.lucene.index.PointValues;
+import org.apache.lucene.index.PointValues.IntersectVisitor;
+import org.apache.lucene.index.PointValues.Relation;
+import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.bkd.BKDReader;
+import org.apache.lucene.util.bkd.BKDWriter;
+
+/** Writes dimensional values */
+public class Lucene86PointsWriter extends PointsWriter implements Closeable {
+
+ /** Outputs used to write the BKD tree data files. */
+ protected final IndexOutput metaOut, indexOut, dataOut;
+
+ final SegmentWriteState writeState;
+ final int maxPointsInLeafNode;
+ final double maxMBSortInHeap;
+ private boolean finished;
+
+ /** Full constructor */
+ public Lucene86PointsWriter(SegmentWriteState writeState, int maxPointsInLeafNode, double maxMBSortInHeap) throws IOException {
+ assert writeState.fieldInfos.hasPointValues();
+ this.writeState = writeState;
+ this.maxPointsInLeafNode = maxPointsInLeafNode;
+ this.maxMBSortInHeap = maxMBSortInHeap;
+ String dataFileName = IndexFileNames.segmentFileName(writeState.segmentInfo.name,
+ writeState.segmentSuffix,
+ Lucene86PointsFormat.DATA_EXTENSION);
+ dataOut = writeState.directory.createOutput(dataFileName, writeState.context);
+ boolean success = false;
+ try {
+ CodecUtil.writeIndexHeader(dataOut,
+ Lucene86PointsFormat.DATA_CODEC_NAME,
+ Lucene86PointsFormat.VERSION_CURRENT,
+ writeState.segmentInfo.getId(),
+ writeState.segmentSuffix);
+
+ String metaFileName = IndexFileNames.segmentFileName(writeState.segmentInfo.name,
+ writeState.segmentSuffix,
+ Lucene86PointsFormat.META_EXTENSION);
+ metaOut = writeState.directory.createOutput(metaFileName, writeState.context);
+ CodecUtil.writeIndexHeader(metaOut,
+ Lucene86PointsFormat.META_CODEC_NAME,
+ Lucene86PointsFormat.VERSION_CURRENT,
+ writeState.segmentInfo.getId(),
+ writeState.segmentSuffix);
+
+ String indexFileName = IndexFileNames.segmentFileName(writeState.segmentInfo.name,
+ writeState.segmentSuffix,
+ Lucene86PointsFormat.INDEX_EXTENSION);
+ indexOut = writeState.directory.createOutput(indexFileName, writeState.context);
+ CodecUtil.writeIndexHeader(indexOut,
+ Lucene86PointsFormat.INDEX_CODEC_NAME,
+ Lucene86PointsFormat.VERSION_CURRENT,
+ writeState.segmentInfo.getId(),
+ writeState.segmentSuffix);
+
+ success = true;
+ } finally {
+ if (success == false) {
+ IOUtils.closeWhileHandlingException(this);
+ }
+ }
+ }
+
+ /** Uses the default values for {@code maxPointsInLeafNode} (1024) and {@code maxMBSortInHeap} (16.0) */
+ public Lucene86PointsWriter(SegmentWriteState writeState) throws IOException {
+ this(writeState, BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE, BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP);
+ }
+
+ @Override
+ public void writeField(FieldInfo fieldInfo, PointsReader reader) throws IOException {
+
+ PointValues values = reader.getValues(fieldInfo.name);
+
+ try (BKDWriter writer = new BKDWriter(writeState.segmentInfo.maxDoc(),
+ writeState.directory,
+ writeState.segmentInfo.name,
+ fieldInfo.getPointDimensionCount(),
+ fieldInfo.getPointIndexDimensionCount(),
+ fieldInfo.getPointNumBytes(),
+ maxPointsInLeafNode,
+ maxMBSortInHeap,
+ values.size())) {
+
+ if (values instanceof MutablePointValues) {
+ Runnable finalizer = writer.writeField(metaOut, indexOut, dataOut, fieldInfo.name, (MutablePointValues) values);
+ if (finalizer != null) {
+ metaOut.writeInt(fieldInfo.number);
+ finalizer.run();
+ }
+ return;
+ }
+
+ values.intersect(new IntersectVisitor() {
+ @Override
+ public void visit(int docID) {
+ throw new IllegalStateException();
+ }
+
+ public void visit(int docID, byte[] packedValue) throws IOException {
+ writer.add(packedValue, docID);
+ }
+
+ @Override
+ public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
+ return Relation.CELL_CROSSES_QUERY;
+ }
+ });
+
+ // We could have 0 points on merge since all docs with dimensional fields may be deleted:
+ Runnable finalizer = writer.finish(metaOut, indexOut, dataOut);
+ if (finalizer != null) {
+ metaOut.writeInt(fieldInfo.number);
+ finalizer.run();
+ }
+ }
+ }
+
+ @Override
+ public void merge(MergeState mergeState) throws IOException {
+ /*
+ * If indexSort is activated and some of the leaves are not sorted, the next test will catch that and the non-optimized merge will run.
+ * If the readers are all sorted then it's safe to perform a bulk merge of the points.
+ */
+ for(PointsReader reader : mergeState.pointsReaders) {
+ if (reader instanceof Lucene86PointsReader == false) {
+ // We can only bulk merge when all to-be-merged segments use our format:
+ super.merge(mergeState);
+ return;
+ }
+ }
+ for (PointsReader reader : mergeState.pointsReaders) {
+ if (reader != null) {
+ reader.checkIntegrity();
+ }
+ }
+
+ for (FieldInfo fieldInfo : mergeState.mergeFieldInfos) {
+ if (fieldInfo.getPointDimensionCount() != 0) {
+ if (fieldInfo.getPointDimensionCount() == 1) {
+
+ // Worst case total maximum size (if none of the points are deleted):
+ long totMaxSize = 0;
+ for(int i=0;i<mergeState.pointsReaders.length;i++) {
+ PointsReader reader = mergeState.pointsReaders[i];
+ if (reader != null) {
+ FieldInfos readerFieldInfos = mergeState.fieldInfos[i];
+ FieldInfo readerFieldInfo = readerFieldInfos.fieldInfo(fieldInfo.name);
+ if (readerFieldInfo != null && readerFieldInfo.getPointDimensionCount() > 0) {
+ PointValues values = reader.getValues(fieldInfo.name);
+ if (values != null) {
+ totMaxSize += values.size();
+ }
+ }
+ }
+ }
+
+ //System.out.println("MERGE: field=" + fieldInfo.name);
+ // Optimize the 1D case to use BKDWriter.merge, which does a single merge sort of the
+ // already sorted incoming segments, instead of trying to sort all points again as if
+ // we were simply reindexing them:
+ try (BKDWriter writer = new BKDWriter(writeState.segmentInfo.maxDoc(),
+ writeState.directory,
+ writeState.segmentInfo.name,
+ fieldInfo.getPointDimensionCount(),
+ fieldInfo.getPointIndexDimensionCount(),
+ fieldInfo.getPointNumBytes(),
+ maxPointsInLeafNode,
+ maxMBSortInHeap,
+ totMaxSize)) {
+ List<BKDReader> bkdReaders = new ArrayList<>();
+ List<MergeState.DocMap> docMaps = new ArrayList<>();
+ for(int i=0;i<mergeState.pointsReaders.length;i++) {
+ PointsReader reader = mergeState.pointsReaders[i];
+ if (reader != null) {
+ // we confirmed this up above
+ assert reader instanceof Lucene86PointsReader;
+ Lucene86PointsReader reader60 = (Lucene86PointsReader) reader;
+ FieldInfos readerFieldInfos = mergeState.fieldInfos[i];
+ FieldInfo readerFieldInfo = readerFieldInfos.fieldInfo(fieldInfo.name);
+ if (readerFieldInfo != null && readerFieldInfo.getPointDimensionCount() > 0) {
+ BKDReader bkdReader = reader60.readers.get(readerFieldInfo.number);
+ if (bkdReader != null) {
+ bkdReaders.add(bkdReader);
+ docMaps.add(mergeState.docMaps[i]);
+ }
+ }
+ }
+ }
+
+ Runnable finalizer = writer.merge(metaOut, indexOut, dataOut, docMaps, bkdReaders);
+ if (finalizer != null) {
+ metaOut.writeInt(fieldInfo.number);
+ finalizer.run();
+ }
+ }
+ } else {
+ mergeOneField(mergeState, fieldInfo);
+ }
+ }
+ }
+
+ finish();
+ }
+
+ @Override
+ public void finish() throws IOException {
+ if (finished) {
+ throw new IllegalStateException("already finished");
+ }
+ finished = true;
+ metaOut.writeInt(-1);
+ CodecUtil.writeFooter(indexOut);
+ CodecUtil.writeFooter(dataOut);
+ metaOut.writeLong(indexOut.getFilePointer());
+ metaOut.writeLong(dataOut.getFilePointer());
+ CodecUtil.writeFooter(metaOut);
+ }
+
+ @Override
+ public void close() throws IOException {
+ IOUtils.close(metaOut, indexOut, dataOut);
+ }
+}
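The convenience constructor above falls back to the BKDWriter defaults of 1024 points per leaf and 16 MB of sort heap. A sketch of choosing different tuning via the full constructor (512 and 32.0 are illustrative values only, and writeState is assumed to be a live SegmentWriteState):

    PointsWriter pointsWriter = new Lucene86PointsWriter(writeState, 512, 32.0);

Roughly, smaller leaves deepen the tree and shrink the unit of work per visited leaf, at the cost of a larger .kdi index.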
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene86/Lucene86SegmentInfoFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene86/Lucene86SegmentInfoFormat.java
new file mode 100644
index 000000000000..b2bcdc2282ec
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene86/Lucene86SegmentInfoFormat.java
@@ -0,0 +1,217 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.codecs.lucene86;
+
+import java.io.IOException;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.codecs.SegmentInfoFormat;
+import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.IndexSorter;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.SegmentInfo;
+import org.apache.lucene.index.SegmentInfos;
+import org.apache.lucene.index.SortFieldProvider;
+import org.apache.lucene.search.Sort;
+import org.apache.lucene.search.SortField;
+import org.apache.lucene.store.ChecksumIndexInput;
+import org.apache.lucene.store.DataInput;
+import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IOContext;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.util.Version;
+
+/**
+ * Lucene 8.6 Segment info format.
+ *
+ *   - IndexSort --> {@link DataOutput#writeVInt Int32} count, followed by {@code count} SortField
+ *   - SortField --> {@link DataOutput#writeString String} sort class, followed by a per-sort bytestream
+ *     (see {@link SortFieldProvider#readSortField(DataInput)})
+ *   - SegVersion is the code version that created the segment.
+ *   - SegMinVersion is the minimum code version that contributed documents to the segment.
+ *   - SegSize is the number of documents contained in the segment index.
+ *   - IsCompoundFile records whether the segment is written as a compound file or
+ *     not. If this is -1, the segment is not a compound file. If it is 1, the segment
+ *     is a compound file.
+ *   - The Diagnostics Map is privately written by {@link IndexWriter}, as a debugging aid,
+ *     for each segment it creates. It includes metadata like the current Lucene
+ *     version, OS, Java version, why the segment was created (merge, flush,
+ *     addIndexes), etc.
+ *   - Files is a list of files referred to by this segment.
+ *
+ *
+ * @see SegmentInfos
+ * @lucene.experimental
+ */
+public class Lucene86SegmentInfoFormat extends SegmentInfoFormat {
+
+ /** Sole constructor. */
+ public Lucene86SegmentInfoFormat() {
+ }
+
+ @Override
+ public SegmentInfo read(Directory dir, String segment, byte[] segmentID, IOContext context) throws IOException {
+ final String fileName = IndexFileNames.segmentFileName(segment, "", SI_EXTENSION);
+ try (ChecksumIndexInput input = dir.openChecksumInput(fileName, context)) {
+ Throwable priorE = null;
+ SegmentInfo si = null;
+ try {
+ int format = CodecUtil.checkIndexHeader(input, CODEC_NAME,
+ VERSION_START,
+ VERSION_CURRENT,
+ segmentID, "");
+ final Version version = Version.fromBits(input.readInt(), input.readInt(), input.readInt());
+ byte hasMinVersion = input.readByte();
+ final Version minVersion;
+ switch (hasMinVersion) {
+ case 0:
+ minVersion = null;
+ break;
+ case 1:
+ minVersion = Version.fromBits(input.readInt(), input.readInt(), input.readInt());
+ break;
+ default:
+ throw new CorruptIndexException("Illegal boolean value " + hasMinVersion, input);
+ }
+
+ final int docCount = input.readInt();
+ if (docCount < 0) {
+ throw new CorruptIndexException("invalid docCount: " + docCount, input);
+ }
+ final boolean isCompoundFile = input.readByte() == SegmentInfo.YES;
+
+ final Map<String,String> diagnostics = input.readMapOfStrings();
+ final Set<String> files = input.readSetOfStrings();
+ final Map<String,String> attributes = input.readMapOfStrings();
+
+ int numSortFields = input.readVInt();
+ Sort indexSort;
+ if (numSortFields > 0) {
+ SortField[] sortFields = new SortField[numSortFields];
+ for(int i=0;i<numSortFields;i++) {
+ String provider = input.readString();
+ sortFields[i] = SortFieldProvider.forName(provider).readSortField(input);
+ }
+ indexSort = new Sort(sortFields);
+ } else if (numSortFields < 0) {
+ throw new CorruptIndexException("invalid index sort field count: " + numSortFields, input);
+ } else {
+ indexSort = null;
+ }
+
+ si = new SegmentInfo(dir, version, minVersion, segment, docCount, isCompoundFile, null, diagnostics, segmentID, attributes, indexSort);
+ si.setFiles(files);
+ } catch (Throwable exception) {
+ priorE = exception;
+ } finally {
+ CodecUtil.checkFooter(input, priorE);
+ }
+ return si;
+ }
+ }
+
+ @Override
+ public void write(Directory dir, SegmentInfo si, IOContext ioContext) throws IOException {
+ final String fileName = IndexFileNames.segmentFileName(si.name, "", SI_EXTENSION);
+
+ try (IndexOutput output = dir.createOutput(fileName, ioContext)) {
+ // Only add the file once we've successfully created it, else IFD assert can trip:
+ si.addFile(fileName);
+ CodecUtil.writeIndexHeader(output, CODEC_NAME, VERSION_CURRENT, si.getId(), "");
+
+ Version version = si.getVersion();
+ if (version.major < 7) {
+ throw new IllegalArgumentException("invalid major version: should be >= 7 but got: " + version.major + " segment=" + si);
+ }
+ // Write the Lucene version that created this segment, since 3.1
+ output.writeInt(version.major);
+ output.writeInt(version.minor);
+ output.writeInt(version.bugfix);
+
+ // Write the min Lucene version that contributed docs to the segment, since 7.0
+ if (si.getMinVersion() != null) {
+ output.writeByte((byte) 1);
+ Version minVersion = si.getMinVersion();
+ output.writeInt(minVersion.major);
+ output.writeInt(minVersion.minor);
+ output.writeInt(minVersion.bugfix);
+ } else {
+ output.writeByte((byte) 0);
+ }
+
+ assert version.prerelease == 0;
+ output.writeInt(si.maxDoc());
+
+ output.writeByte((byte) (si.getUseCompoundFile() ? SegmentInfo.YES : SegmentInfo.NO));
+ output.writeMapOfStrings(si.getDiagnostics());
+ Set files = si.files();
+ for (String file : files) {
+ if (!IndexFileNames.parseSegmentName(file).equals(si.name)) {
+ throw new IllegalArgumentException("invalid files: expected segment=" + si.name + ", got=" + files);
+ }
+ }
+ output.writeSetOfStrings(files);
+ output.writeMapOfStrings(si.getAttributes());
+
+ Sort indexSort = si.getIndexSort();
+ int numSortFields = indexSort == null ? 0 : indexSort.getSort().length;
+ output.writeVInt(numSortFields);
+ for (int i = 0; i < numSortFields; ++i) {
+ SortField sortField = indexSort.getSort()[i];
+ IndexSorter sorter = sortField.getIndexSorter();
+ if (sorter == null) {
+ throw new IllegalArgumentException("cannot serialize SortField " + sortField);
+ }
+ output.writeString(sorter.getProviderName());
+ SortFieldProvider.write(sortField, output);
+ }
+
+ CodecUtil.writeFooter(output);
+ }
+ }
+
+ /** File extension used to store {@link SegmentInfo}. */
+ public final static String SI_EXTENSION = "si";
+ static final String CODEC_NAME = "Lucene86SegmentInfo";
+ static final int VERSION_START = 0;
+ static final int VERSION_CURRENT = VERSION_START;
+}
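Because write() above serializes each sort field by its SortFieldProvider name, any SortField whose getIndexSorter() is non-null round-trips through the .si file. A hedged sketch of the kind of index sort this covers (the "timestamp" field name and the analyzer are assumptions for illustration):

    Sort indexSort = new Sort(new SortedNumericSortField("timestamp", SortField.Type.LONG));
    IndexWriterConfig config = new IndexWriterConfig(analyzer);
    config.setIndexSort(indexSort); // serialized into the .si file at flush time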
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene86/package-info.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene86/package-info.java
new file mode 100644
index 000000000000..19be7eb66157
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene86/package-info.java
@@ -0,0 +1,416 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Lucene 8.6 file format.
+ *
+ *
+ * This document defines the index file formats used in this version of Lucene.
+ * If you are using a different version of Lucene, please consult the copy of
+ * docs/ that was distributed with
+ * the version you are using.
+ *
+ * This document attempts to provide a high-level definition of the Apache
+ * Lucene file formats.
+ *
+ *
+ *
+ * Definitions
+ *
+ * The fundamental concepts in Lucene are index, document, field and term.
+ *
+ *   - An index contains a sequence of documents.
+ *   - A document is a sequence of fields.
+ *   - A field is a named sequence of terms.
+ *   - A term is a sequence of bytes.
+ *
+ * The same sequence of bytes in two different fields is considered a different
+ * term. Thus terms are represented as a pair: the string naming the field, and the
+ * bytes within the field.
+ *
+ *
+ * Inverted Indexing
+ *
+ * The index stores statistics about terms in order to make term-based search
+ * more efficient. Lucene's index falls into the family of indexes known as an
+ * inverted index. This is because it can list, for a term, the documents
+ * that contain it. This is the inverse of the natural relationship, in which
+ * documents list terms.
+ *
+ *
+ * Types of Fields
+ *
+ * In Lucene, fields may be stored, in which case their text is stored
+ * in the index literally, in a non-inverted manner. Fields that are inverted are
+ * called indexed. A field may be both stored and indexed.
+ *
+ * The text of a field may be tokenized into terms to be indexed, or the
+ * text of a field may be used literally as a term to be indexed. Most fields are
+ * tokenized, but sometimes it is useful for certain identifier fields to be
+ * indexed literally.
+ *
+ * See the {@link org.apache.lucene.document.Field Field}
+ * java docs for more information on Fields.
+ *
+ *
+ * Segments
+ *
+ * Lucene indexes may be composed of multiple sub-indexes, or segments.
+ * Each segment is a fully independent index, which could be searched separately.
+ * Indexes evolve by:
+ *
+ *   - Creating new segments for newly added documents.
+ *   - Merging existing segments.
+ *
+ * Searches may involve multiple segments and/or multiple indexes, each index
+ * potentially composed of a set of segments.
+ *
+ *
+ * Document Numbers
+ *
+ * Internally, Lucene refers to documents by an integer document number.
+ * The first document added to an index is numbered zero, and each subsequent
+ * document added gets a number one greater than the previous.
+ *
+ * Note that a document's number may change, so caution should be taken when
+ * storing these numbers outside of Lucene. In particular, numbers may change in
+ * the following situations:
+ *
+ *
+ *
+ * The numbers stored in each segment are unique only within the segment, and
+ * must be converted before they can be used in a larger context. The standard
+ * technique is to allocate each segment a range of values, based on the range of
+ * numbers used in that segment. To convert a document number from a segment to an
+ * external value, the segment's base document number is added. To convert
+ * an external value back to a segment-specific value, the segment is identified
+ * by the range that the external value is in, and the segment's base value is
+ * subtracted. For example two five document segments might be combined, so that
+ * the first segment has a base value of zero, and the second of five. Document
+ * three from the second segment would have an external value of eight.
+ *
+ *
+ *
+ * When documents are deleted, gaps are created in the numbering. These are
+ * eventually removed as the index evolves through merging. Deleted documents are
+ * dropped when segments are merged. A freshly-merged segment thus has no gaps in
+ * its numbering.
+ *
+ *
+ *
+ *
+ *
+ * Index Structure Overview
+ *
+ * Each segment index maintains the following:
+ *
+ *
+ * {@link org.apache.lucene.codecs.lucene86.Lucene86SegmentInfoFormat Segment info}.
+ * This contains metadata about a segment, such as the number of documents,
+ * what files it uses, and information about how the segment is sorted
+ *
+ *
+ * {@link org.apache.lucene.codecs.lucene60.Lucene60FieldInfosFormat Field names}.
+ * This contains the set of field names used in the index.
+ *
+ *
+ * {@link org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat Stored Field values}.
+ * This contains, for each document, a list of attribute-value pairs, where the attributes
+ * are field names. These are used to store auxiliary information about the document, such as
+ * its title, url, or an identifier to access a database. The set of stored fields are what is
+ * returned for each hit when searching. This is keyed by document number.
+ *
+ *
+ * {@link org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat Term dictionary}.
+ * A dictionary containing all of the terms used in all of the
+ * indexed fields of all of the documents. The dictionary also contains the number
+ * of documents which contain the term, and pointers to the term's frequency and
+ * proximity data.
+ *
+ *
+ * {@link org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat Term Frequency data}.
+ * For each term in the dictionary, the numbers of all the
+ * documents that contain that term, and the frequency of the term in that
+ * document, unless frequencies are omitted ({@link org.apache.lucene.index.IndexOptions#DOCS IndexOptions.DOCS})
+ *
+ *
+ * {@link org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat Term Proximity data}.
+ * For each term in the dictionary, the positions that the
+ * term occurs in each document. Note that this will not exist if all fields in
+ * all documents omit position data.
+ *
+ *
+ * {@link org.apache.lucene.codecs.lucene80.Lucene80NormsFormat Normalization factors}.
+ * For each field in each document, a value is stored
+ * that is multiplied into the score for hits on that field.
+ *
+ *
+ * {@link org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat Term Vectors}.
+ * For each field in each document, the term vector (sometimes
+ * called document vector) may be stored. A term vector consists of term text and
+ * term frequency. To add Term Vectors to your index see the
+ * {@link org.apache.lucene.document.Field Field} constructors
+ *
+ *
+ * {@link org.apache.lucene.codecs.lucene80.Lucene80DocValuesFormat Per-document values}.
+ * Like stored values, these are also keyed by document
+ * number, but are generally intended to be loaded into main memory for fast
+ * access. Whereas stored values are generally intended for summary results from
+ * searches, per-document values are useful for things like scoring factors.
+ *
+ *
+ * {@link org.apache.lucene.codecs.lucene50.Lucene50LiveDocsFormat Live documents}.
+ * An optional file indicating which documents are live.
+ *
+ *
+ * {@link org.apache.lucene.codecs.lucene86.Lucene86PointsFormat Point values}.
+ * Optional pair of files, recording dimensionally indexed fields, to enable fast
+ * numeric range filtering and large numeric values like BigInteger and BigDecimal (1D)
+ * and geographic shape intersection (2D, 3D).
+ *
+ *
+ *
+ * Details on each of these are provided in their linked pages.
+ *
+ *
+ *
+ * File Naming
+ *
+ * All files belonging to a segment have the same name with varying extensions.
+ * The extensions correspond to the different file formats described below. When
+ * using the Compound File format (default for small segments) these files (except
+ * for the Segment info file, the Lock file, and Deleted documents file) are collapsed
+ * into a single .cfs file (see below for details)
+ *
+ * Typically, all segments in an index are stored in a single directory,
+ * although this is not required.
+ *
+ * File names are never re-used. That is, when any file is saved
+ * to the Directory it is given a never before used filename. This is achieved
+ * using a simple generations approach. For example, the first segments file is
+ * segments_1, then segments_2, etc. The generation is a sequential long integer
+ * represented in alpha-numeric (base 36) form.
+ *
+ *
+ *
+ * Summary of File Extensions
+ *
+ * The following table summarizes the names and extensions of the files in
+ * Lucene:
+ *
+ * Name | Extension | Brief Description
+ * {@link org.apache.lucene.codecs.lucene80.Lucene80DocValuesFormat Per-Document Values} | .dvd, .dvm | Encodes additional scoring factors or other per-document information.
+ * {@link org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat Term Vector Index} | .tvx | Stores offset into the document data file
+ * {@link org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat Term Vector Data} | .tvd | Contains term vector data.
+ * {@link org.apache.lucene.codecs.lucene50.Lucene50LiveDocsFormat Live Documents} | .liv | Info about what documents are live
+ * {@link org.apache.lucene.codecs.lucene86.Lucene86PointsFormat Point values} | .kdd, .kdi, .kdm | Holds indexed points, if any
+ *
+ *
+ *
+ *
+ *
+ * Lock File
+ * The write lock, which is stored in the index directory by default, is named
+ * "write.lock". If the lock directory is different from the index directory then
+ * the write lock will be named "XXXX-write.lock" where XXXX is a unique prefix
+ * derived from the full path to the index directory. When this file is present, a
+ * writer is currently modifying the index (adding or removing documents). This
+ * lock file ensures that only one writer is modifying the index at a time.
+ *
+ *
+ * History
+ *
+ * Compatibility notes are provided in this document, describing how file
+ * formats have changed from prior versions:
+ *
+ *
+ * In version 2.1, the file format was changed to allow lock-less commits (ie,
+ * no more commit lock). The change is fully backwards compatible: you can open a
+ * pre-2.1 index for searching or adding/deleting of docs. When the new segments
+ * file is saved (committed), it will be written in the new file format (meaning
+ * no specific "upgrade" process is needed). But note that once a commit has
+ * occurred, pre-2.1 Lucene will not be able to read the index.
+ *
+ * In version 2.3, the file format was changed to allow segments to share a
+ * single set of doc store (vectors & stored fields) files. This allows for
+ * faster indexing in certain cases. The change is fully backwards compatible (in
+ * the same way as the lock-less commits change in 2.1).
+ *
+ * In version 2.4, Strings are now written as a true UTF-8 byte sequence, not
+ * Java's modified UTF-8. See
+ * LUCENE-510 for details.
+ *
+ * In version 2.9, an optional opaque Map<String,String> CommitUserData
+ * may be passed to IndexWriter's commit methods (and later retrieved), which is
+ * recorded in the segments_N file. See
+ * LUCENE-1382 for details. Also,
+ * diagnostics were added to each segment written recording details about why it
+ * was written (due to flush, merge; which OS/JRE was used; etc.). See issue
+ * LUCENE-1654 for details.
+ *
+ * In version 3.0, compressed fields are no longer written to the index (they
+ * can still be read, but on merge the new segment will write them, uncompressed).
+ * See issue LUCENE-1960
+ * for details.
+ *
+ * In version 3.1, segments record the code version that created them. See
+ * LUCENE-2720 for details.
+ * Additionally segments track explicitly whether or not they have term vectors.
+ * See LUCENE-2811
+ * for details.
+ *
+ * In version 3.2, numeric fields are written natively to the stored fields
+ * file; previously they were stored in text format only.
+ *
+ * In version 3.4, fields can omit position data while still indexing term
+ * frequencies.
+ *
+ * In version 4.0, the format of the inverted index became extensible via
+ * the {@link org.apache.lucene.codecs.Codec Codec} api. Fast per-document storage
+ * ({@code DocValues}) was introduced. Normalization factors need no longer be a
+ * single byte, they can be any {@link org.apache.lucene.index.NumericDocValues NumericDocValues}.
+ * Terms need not be unicode strings, they can be any byte sequence. Term offsets
+ * can optionally be indexed into the postings lists. Payloads can be stored in the
+ * term vectors.
+ *
+ * In version 4.1, the format of the postings list changed to use either
+ * FOR compression or variable-byte encoding, depending upon the frequency
+ * of the term. Terms appearing only once were changed to inline directly into
+ * the term dictionary. Stored fields are compressed by default.
+ *
+ * In version 4.2, term vectors are compressed by default. DocValues has
+ * a new multi-valued type (SortedSet), that can be used for faceting/grouping/joining
+ * on multi-valued fields.
+ *
+ * In version 4.5, DocValues were extended to explicitly represent missing values.
+ *
+ * In version 4.6, FieldInfos were extended to support per-field DocValues generation, to
+ * allow updating NumericDocValues fields.
+ *
+ * In version 4.8, checksum footers were added to the end of each index file
+ * for improved data integrity. Specifically, the last 8 bytes of every index file
+ * contain the zlib-crc32 checksum of the file.
+ *
+ * In version 4.9, DocValues has a new multi-valued numeric type (SortedNumeric)
+ * that is suitable for faceting/sorting/analytics.
+ *
+ * In version 5.4, DocValues have been improved to store more information on disk:
+ * addresses for binary fields and ord indexes for multi-valued fields.
+ *
+ * In version 6.0, Points were added, for multi-dimensional range/distance search.
+ *
+ * In version 6.2, a new Segment info format reads/writes the index sort, to support index sorting.
+ *
+ * In version 7.0, DocValues have been improved to better support sparse doc values
+ * thanks to an iterator API.
+ *
+ * In version 8.0, postings have been enhanced to record, for each block of
+ * doc ids, the (term freq, normalization factor) pairs that may trigger the
+ * maximum score of the block. This information is recorded alongside skip data
+ * in order to be able to skip blocks of doc ids if they may not produce high
+ * enough scores.
+ * Additionally doc values and norms have been extended with jump-tables to make access O(1)
+ * instead of O(n), where n is the number of elements to skip when advancing in the data.
+ *
+ * In version 8.4, postings, positions, offsets and payload lengths have moved to a more
+ * performant encoding that is vectorized.
+ *
+ * In version 8.6, index sort serialization is delegated to the sorts themselves, to
+ * allow user-defined sorts to be used.
+ *
+ *
+ *
+ * Limitations
+ *
+ * Lucene uses a Java int to refer to
+ * document numbers, and the index file format uses an Int32
+ * on-disk to store document numbers. This is a limitation
+ * of both the index file format and the current implementation. Eventually these
+ * should be replaced with either UInt64 values, or
+ * better yet, {@link org.apache.lucene.store.DataOutput#writeVInt VInt} values which have no limit.
+ *
+ */
+package org.apache.lucene.codecs.lucene86;
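A tiny illustration of the base-36 generation naming described in the File Naming section above (pure arithmetic, no Lucene API involved):

    long generation = 36;
    // Character.MAX_RADIX is 36, so generation 36 renders as the two-digit base-36 string "10".
    String segmentsFile = "segments_" + Long.toString(generation, Character.MAX_RADIX); // "segments_10"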
diff --git a/lucene/core/src/java/org/apache/lucene/geo/Tessellator.java b/lucene/core/src/java/org/apache/lucene/geo/Tessellator.java
index c61fba938c3a..1600955b1726 100644
--- a/lucene/core/src/java/org/apache/lucene/geo/Tessellator.java
+++ b/lucene/core/src/java/org/apache/lucene/geo/Tessellator.java
@@ -376,7 +376,12 @@ private static Node getSharedVertex(final Node polygon, final Node vertex) {
Node next = polygon;
do {
if (isVertexEquals(next, vertex)) {
- return next;
+ // make sure we are not crossing the polygon. This might happen when several holes share the same polygon vertex.
+ boolean crosses = GeoUtils.lineCrossesLine(next.previous.getX(), next.previous.getY(), vertex.next.getX(), vertex.next.getY(),
+ next.next.getX(), next.next.getY(), vertex.previous.getX(), vertex.previous.getY());
+ if (crosses == false) {
+ return next;
+ }
}
next = next.next;
} while(next != polygon);
diff --git a/lucene/core/src/java/org/apache/lucene/index/BinaryDocValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/BinaryDocValuesWriter.java
index 1aeab4c98026..e213a485ea0f 100644
--- a/lucene/core/src/java/org/apache/lucene/index/BinaryDocValuesWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/BinaryDocValuesWriter.java
@@ -21,7 +21,6 @@
import org.apache.lucene.codecs.DocValuesConsumer;
import org.apache.lucene.search.DocIdSetIterator;
-import org.apache.lucene.search.SortField;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.ArrayUtil;
@@ -37,7 +36,7 @@
/** Buffers up pending byte[] per doc, then flushes when
* segment flushes. */
-class BinaryDocValuesWriter extends DocValuesWriter {
+class BinaryDocValuesWriter extends DocValuesWriter<BinaryDocValues> {
/** Maximum length for a binary field. */
private static final int MAX_LENGTH = ArrayUtil.MAX_ARRAY_LENGTH;
@@ -56,6 +55,8 @@ class BinaryDocValuesWriter extends DocValuesWriter {
private int lastDocID = -1;
private int maxLength = 0;
+ private PackedLongValues finalLengths;
+
public BinaryDocValuesWriter(FieldInfo fieldInfo, Counter iwBytesUsed) {
this.fieldInfo = fieldInfo;
this.bytes = new PagedBytes(BLOCK_BITS);
@@ -98,10 +99,6 @@ private void updateBytesUsed() {
bytesUsed = newBytesUsed;
}
- @Override
- public void finish(int maxDoc) {
- }
-
private SortingLeafReader.CachedBinaryDVs sortDocValues(int maxDoc, Sorter.DocMap sortMap, BinaryDocValues oldValues) throws IOException {
FixedBitSet docsWithField = new FixedBitSet(maxDoc);
BytesRef[] values = new BytesRef[maxDoc];
@@ -118,18 +115,23 @@ private SortingLeafReader.CachedBinaryDVs sortDocValues(int maxDoc, Sorter.DocMa
}
@Override
- Sorter.DocComparator getDocComparator(int numDoc, SortField sortField) throws IOException {
- throw new IllegalArgumentException("It is forbidden to sort on a binary field");
+ BinaryDocValues getDocValues() {
+ if (finalLengths == null) {
+ finalLengths = this.lengths.build();
+ }
+ return new BufferedBinaryDocValues(finalLengths, maxLength, bytes.getDataInput(), docsWithField.iterator());
}
@Override
public void flush(SegmentWriteState state, Sorter.DocMap sortMap, DocValuesConsumer dvConsumer) throws IOException {
bytes.freeze(false);
- final PackedLongValues lengths = this.lengths.build();
+ if (finalLengths == null) {
+ finalLengths = this.lengths.build();
+ }
final SortingLeafReader.CachedBinaryDVs sorted;
if (sortMap != null) {
sorted = sortDocValues(state.segmentInfo.maxDoc(), sortMap,
- new BufferedBinaryDocValues(lengths, maxLength, bytes.getDataInput(), docsWithField.iterator()));
+ new BufferedBinaryDocValues(finalLengths, maxLength, bytes.getDataInput(), docsWithField.iterator()));
} else {
sorted = null;
}
@@ -141,7 +143,7 @@ public BinaryDocValues getBinary(FieldInfo fieldInfoIn) {
throw new IllegalArgumentException("wrong fieldInfo");
}
if (sorted == null) {
- return new BufferedBinaryDocValues(lengths, maxLength, bytes.getDataInput(), docsWithField.iterator());
+ return new BufferedBinaryDocValues(finalLengths, maxLength, bytes.getDataInput(), docsWithField.iterator());
} else {
return new SortingLeafReader.SortingBinaryDocValues(sorted);
}
@@ -200,9 +202,4 @@ public BytesRef binaryValue() {
return value.get();
}
}
-
- @Override
- DocIdSetIterator getDocIdSet() {
- return docsWithField.iterator();
- }
}
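The new finalLengths field makes the built PackedLongValues a build-once, shared artifact: getDocValues() (used by flush-time index sorting) and flush(...) can run in either order, and whichever runs first freezes the builder while the other reuses the result. The same idiom in isolation, as a minimal single-threaded sketch with hypothetical names:

import java.util.function.Supplier;

/** Build-once holder: the first caller freezes the value, later callers reuse it. */
final class Lazy<T> {
  private final Supplier<T> builder;
  private T value;                 // built at most once; not thread-safe, like the per-thread writer

  Lazy(Supplier<T> builder) { this.builder = builder; }

  T get() {
    if (value == null) {
      value = builder.get();       // freeze on first use
    }
    return value;
  }
}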
diff --git a/lucene/core/src/java/org/apache/lucene/index/ByteSliceWriter.java b/lucene/core/src/java/org/apache/lucene/index/ByteSliceWriter.java
index b96f7fe1f041..75650049aa2c 100644
--- a/lucene/core/src/java/org/apache/lucene/index/ByteSliceWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/ByteSliceWriter.java
@@ -26,7 +26,6 @@
* byte[]. This is used by DocumentsWriter to hold the
* posting list for many terms in RAM.
*/
-
final class ByteSliceWriter extends DataOutput {
private byte[] slice;
diff --git a/lucene/core/src/java/org/apache/lucene/index/DefaultIndexingChain.java b/lucene/core/src/java/org/apache/lucene/index/DefaultIndexingChain.java
index 03cabc13ba03..128aee028323 100644
--- a/lucene/core/src/java/org/apache/lucene/index/DefaultIndexingChain.java
+++ b/lucene/core/src/java/org/apache/lucene/index/DefaultIndexingChain.java
@@ -22,11 +22,11 @@
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
-import java.util.HashSet;
import java.util.List;
import java.util.Map;
-import java.util.Set;
+import java.util.Objects;
+import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.codecs.DocValuesConsumer;
import org.apache.lucene.codecs.DocValuesFormat;
@@ -39,8 +39,6 @@
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
-import org.apache.lucene.search.SortedNumericSortField;
-import org.apache.lucene.search.SortedSetSortField;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.util.ArrayUtil;
@@ -48,13 +46,13 @@
import org.apache.lucene.util.BytesRefHash.MaxBytesLengthExceededException;
import org.apache.lucene.util.Counter;
import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.InfoStream;
import org.apache.lucene.util.RamUsageEstimator;
/** Default general purpose indexing chain, which handles
* indexing all types of fields. */
final class DefaultIndexingChain extends DocConsumer {
final Counter bytesUsed;
- final DocumentsWriterPerThread.DocState docState;
final DocumentsWriterPerThread docWriter;
final FieldInfos.Builder fieldInfos;
@@ -74,14 +72,13 @@ final class DefaultIndexingChain extends DocConsumer {
// Holds fields seen in each document
private PerField[] fields = new PerField[1];
-
- private final Set<String> finishedDocValues = new HashSet<>();
+ private final InfoStream infoStream;
public DefaultIndexingChain(DocumentsWriterPerThread docWriter) {
this.docWriter = docWriter;
this.fieldInfos = docWriter.getFieldInfosBuilder();
- this.docState = docWriter.docState;
this.bytesUsed = docWriter.bytesUsed;
+ this.infoStream = docWriter.getIndexWriterConfig().getInfoStream();
final TermsHash termVectorsWriter;
if (docWriter.getSegmentInfo().getIndexSort() == null) {
@@ -94,29 +91,96 @@ public DefaultIndexingChain(DocumentsWriterPerThread docWriter) {
termsHash = new FreqProxTermsWriter(docWriter, termVectorsWriter);
}
+ private LeafReader getDocValuesLeafReader() {
+ return new DocValuesLeafReader() {
+ @Override
+ public NumericDocValues getNumericDocValues(String field) {
+ PerField pf = getPerField(field);
+ if (pf == null) {
+ return null;
+ }
+ if (pf.fieldInfo.getDocValuesType() == DocValuesType.NUMERIC) {
+ return (NumericDocValues) pf.docValuesWriter.getDocValues();
+ }
+ return null;
+ }
+
+ @Override
+ public BinaryDocValues getBinaryDocValues(String field) {
+ PerField pf = getPerField(field);
+ if (pf == null) {
+ return null;
+ }
+ if (pf.fieldInfo.getDocValuesType() == DocValuesType.BINARY) {
+ return (BinaryDocValues) pf.docValuesWriter.getDocValues();
+ }
+ return null;
+ }
+
+ @Override
+ public SortedDocValues getSortedDocValues(String field) throws IOException {
+ PerField pf = getPerField(field);
+ if (pf == null) {
+ return null;
+ }
+ if (pf.fieldInfo.getDocValuesType() == DocValuesType.SORTED) {
+ return (SortedDocValues) pf.docValuesWriter.getDocValues();
+ }
+ return null;
+ }
+
+ @Override
+ public SortedNumericDocValues getSortedNumericDocValues(String field) throws IOException {
+ PerField pf = getPerField(field);
+ if (pf == null) {
+ return null;
+ }
+ if (pf.fieldInfo.getDocValuesType() == DocValuesType.SORTED_NUMERIC) {
+ return (SortedNumericDocValues) pf.docValuesWriter.getDocValues();
+ }
+ return null;
+ }
+
+ @Override
+ public SortedSetDocValues getSortedSetDocValues(String field) throws IOException {
+ PerField pf = getPerField(field);
+ if (pf == null) {
+ return null;
+ }
+ if (pf.fieldInfo.getDocValuesType() == DocValuesType.SORTED_SET) {
+ return (SortedSetDocValues) pf.docValuesWriter.getDocValues();
+ }
+ return null;
+ }
+
+ @Override
+ public FieldInfos getFieldInfos() {
+ return fieldInfos.finish();
+ }
+
+ };
+ }
+
private Sorter.DocMap maybeSortSegment(SegmentWriteState state) throws IOException {
Sort indexSort = state.segmentInfo.getIndexSort();
if (indexSort == null) {
return null;
}
- List<Sorter.DocComparator> comparators = new ArrayList<>();
+ LeafReader docValuesReader = getDocValuesLeafReader();
+
+ List<IndexSorter.DocComparator> comparators = new ArrayList<>();
for (int i = 0; i < indexSort.getSort().length; i++) {
SortField sortField = indexSort.getSort()[i];
- PerField perField = getPerField(sortField.getField());
- if (perField != null && perField.docValuesWriter != null &&
- finishedDocValues.contains(perField.fieldInfo.name) == false) {
- perField.docValuesWriter.finish(state.segmentInfo.maxDoc());
- Sorter.DocComparator cmp = perField.docValuesWriter.getDocComparator(state.segmentInfo.maxDoc(), sortField);
- comparators.add(cmp);
- finishedDocValues.add(perField.fieldInfo.name);
- } else {
- // safe to ignore, sort field with no values or already seen before
+ IndexSorter sorter = sortField.getIndexSorter();
+ if (sorter == null) {
+ throw new UnsupportedOperationException("Cannot sort index using sort field " + sortField);
}
+ comparators.add(sorter.getDocComparator(docValuesReader, state.segmentInfo.maxDoc()));
}
Sorter sorter = new Sorter(indexSort);
// returns null if the documents are already sorted
- return sorter.sort(state.segmentInfo.maxDoc(), comparators.toArray(new Sorter.DocComparator[comparators.size()]));
+ return sorter.sort(state.segmentInfo.maxDoc(), comparators.toArray(IndexSorter.DocComparator[]::new));
}
@Override
@@ -128,29 +192,29 @@ public Sorter.DocMap flush(SegmentWriteState state) throws IOException {
int maxDoc = state.segmentInfo.maxDoc();
long t0 = System.nanoTime();
writeNorms(state, sortMap);
- if (docState.infoStream.isEnabled("IW")) {
- docState.infoStream.message("IW", ((System.nanoTime()-t0)/1000000) + " msec to write norms");
+ if (infoStream.isEnabled("IW")) {
+ infoStream.message("IW", ((System.nanoTime()-t0)/1000000) + " msec to write norms");
}
SegmentReadState readState = new SegmentReadState(state.directory, state.segmentInfo, state.fieldInfos, IOContext.READ, state.segmentSuffix);
t0 = System.nanoTime();
writeDocValues(state, sortMap);
- if (docState.infoStream.isEnabled("IW")) {
- docState.infoStream.message("IW", ((System.nanoTime()-t0)/1000000) + " msec to write docValues");
+ if (infoStream.isEnabled("IW")) {
+ infoStream.message("IW", ((System.nanoTime()-t0)/1000000) + " msec to write docValues");
}
t0 = System.nanoTime();
writePoints(state, sortMap);
- if (docState.infoStream.isEnabled("IW")) {
- docState.infoStream.message("IW", ((System.nanoTime()-t0)/1000000) + " msec to write points");
+ if (infoStream.isEnabled("IW")) {
+ infoStream.message("IW", ((System.nanoTime()-t0)/1000000) + " msec to write points");
}
// it's possible all docs hit non-aborting exceptions...
t0 = System.nanoTime();
storedFieldsConsumer.finish(maxDoc);
storedFieldsConsumer.flush(state, sortMap);
- if (docState.infoStream.isEnabled("IW")) {
- docState.infoStream.message("IW", ((System.nanoTime()-t0)/1000000) + " msec to finish stored fields");
+ if (infoStream.isEnabled("IW")) {
+ infoStream.message("IW", ((System.nanoTime()-t0)/1000000) + " msec to finish stored fields");
}
t0 = System.nanoTime();
@@ -175,8 +239,8 @@ public Sorter.DocMap flush(SegmentWriteState state) throws IOException {
}
termsHash.flush(fieldsToFlush, state, sortMap, normsMergeInstance);
}
- if (docState.infoStream.isEnabled("IW")) {
- docState.infoStream.message("IW", ((System.nanoTime()-t0)/1000000) + " msec to write postings and finish vectors");
+ if (infoStream.isEnabled("IW")) {
+ infoStream.message("IW", ((System.nanoTime()-t0)/1000000) + " msec to write postings and finish vectors");
}
// Important to save after asking consumer to flush so
@@ -185,8 +249,8 @@ public Sorter.DocMap flush(SegmentWriteState state) throws IOException {
// FieldInfo.storePayload.
t0 = System.nanoTime();
docWriter.codec.fieldInfosFormat().write(state.directory, state.segmentInfo, "", state.fieldInfos, IOContext.DEFAULT);
- if (docState.infoStream.isEnabled("IW")) {
- docState.infoStream.message("IW", ((System.nanoTime()-t0)/1000000) + " msec to write fieldInfos");
+ if (infoStream.isEnabled("IW")) {
+ infoStream.message("IW", ((System.nanoTime()-t0)/1000000) + " msec to write fieldInfos");
}
return sortMap;
@@ -255,10 +319,6 @@ private void writeDocValues(SegmentWriteState state, Sorter.DocMap sortMap) thro
DocValuesFormat fmt = state.segmentInfo.getCodec().docValuesFormat();
dvConsumer = fmt.fieldsConsumer(state);
}
-
- if (finishedDocValues.contains(perField.fieldInfo.name) == false) {
- perField.docValuesWriter.finish(maxDoc);
- }
perField.docValuesWriter.flush(state, sortMap, dvConsumer);
perField.docValuesWriter = null;
} else if (perField.fieldInfo.getDocValuesType() != DocValuesType.NONE) {
@@ -382,7 +442,7 @@ private void finishStoredFields() throws IOException {
}
@Override
- public void processDocument() throws IOException {
+ public void processDocument(int docID, Iterable<? extends IndexableField> document) throws IOException {
// How many indexed field names we've seen (collapses
// multiple field instances by the same name):
@@ -399,23 +459,23 @@ public void processDocument() throws IOException {
termsHash.startDocument();
- startStoredFields(docState.docID);
+ startStoredFields(docID);
try {
- for (IndexableField field : docState.doc) {
- fieldCount = processField(field, fieldGen, fieldCount);
+ for (IndexableField field : document) {
+ fieldCount = processField(docID, field, fieldGen, fieldCount);
}
} finally {
if (docWriter.hasHitAbortingException() == false) {
// Finish each indexed field name seen in the document:
for (int i=0;i<fieldCount;i++) {
        attributes.forEach((k, v) -> fi.putAttribute(k, v));
}
- fp = new PerField(docWriter.getIndexCreatedVersionMajor(), fi, invert);
+ LiveIndexWriterConfig indexWriterConfig = docWriter.getIndexWriterConfig();
+ fp = new PerField(docWriter.getIndexCreatedVersionMajor(), fi, invert,
+ indexWriterConfig.getSimilarity(), indexWriterConfig.getInfoStream(), indexWriterConfig.getAnalyzer());
fp.next = fieldHash[hashPos];
fieldHash[hashPos] = fp;
totalFieldCount++;
@@ -713,7 +786,7 @@ private final class PerField implements Comparable {
// Non-null if this field ever had doc values in this
// segment:
- DocValuesWriter docValuesWriter;
+ DocValuesWriter<?> docValuesWriter;
// Non-null if this field ever had points in this segment:
PointValuesWriter pointValuesWriter;
@@ -730,11 +803,15 @@ private final class PerField implements Comparable {
// reused
TokenStream tokenStream;
+ private final InfoStream infoStream;
+ private final Analyzer analyzer;
- public PerField(int indexCreatedVersionMajor, FieldInfo fieldInfo, boolean invert) {
+ PerField(int indexCreatedVersionMajor, FieldInfo fieldInfo, boolean invert, Similarity similarity, InfoStream infoStream, Analyzer analyzer) {
this.indexCreatedVersionMajor = indexCreatedVersionMajor;
this.fieldInfo = fieldInfo;
- similarity = docState.similarity;
+ this.similarity = similarity;
+ this.infoStream = infoStream;
+ this.analyzer = analyzer;
if (invert) {
setInvertState();
}
@@ -746,7 +823,7 @@ void setInvertState() {
if (fieldInfo.omitsNorms() == false) {
assert norms == null;
// Even if no documents actually succeed in setting a norm, we still write norms for this segment:
- norms = new NormValuesWriter(fieldInfo, docState.docWriter.bytesUsed);
+ norms = new NormValuesWriter(fieldInfo, bytesUsed);
}
}
@@ -755,7 +832,7 @@ public int compareTo(PerField other) {
return this.fieldInfo.name.compareTo(other.fieldInfo.name);
}
- public void finish() throws IOException {
+ public void finish(int docID) throws IOException {
if (fieldInfo.omitsNorms() == false) {
long normValue;
if (invertState.length == 0) {
@@ -769,7 +846,7 @@ public void finish() throws IOException {
throw new IllegalStateException("Similarity " + similarity + " return 0 for non-empty field");
}
}
- norms.addValue(docState.docID, normValue);
+ norms.addValue(docID, normValue);
}
termsHashPerField.finish();
@@ -778,7 +855,7 @@ public void finish() throws IOException {
/** Inverts one field for one document; first is true
* if this is the first time we are seeing this field
* name in this document. */
- public void invert(IndexableField field, boolean first) throws IOException {
+ public void invert(int docID, IndexableField field, boolean first) throws IOException {
if (first) {
// First time we're seeing this field (indexed) in
// this document:
@@ -794,7 +871,7 @@ public void invert(IndexableField field, boolean first) throws IOException {
fieldInfo.setOmitsNorms();
}
- final boolean analyzed = fieldType.tokenized() && docState.analyzer != null;
+ final boolean analyzed = fieldType.tokenized() && analyzer != null;
/*
* To assist people in tracking down problems in analysis components, we wish to write the field name to the infostream
@@ -802,7 +879,7 @@ public void invert(IndexableField field, boolean first) throws IOException {
* but rather a finally that takes note of the problem.
*/
boolean succeededInProcessingField = false;
- try (TokenStream stream = tokenStream = field.tokenStream(docState.analyzer, tokenStream)) {
+ try (TokenStream stream = tokenStream = field.tokenStream(analyzer, tokenStream)) {
// reset the TokenStream to the first token
stream.reset();
invertState.setAttributeSource(stream);
@@ -858,14 +935,14 @@ public void invert(IndexableField field, boolean first) throws IOException {
// corrupt and should not be flushed to a
// new segment:
try {
- termsHashPerField.add();
+ termsHashPerField.add(invertState.termAttribute.getBytesRef(), docID);
} catch (MaxBytesLengthExceededException e) {
byte[] prefix = new byte[30];
BytesRef bigTerm = invertState.termAttribute.getBytesRef();
System.arraycopy(bigTerm.bytes, bigTerm.offset, prefix, 0, 30);
String msg = "Document contains at least one immense term in field=\"" + fieldInfo.name + "\" (whose UTF8 encoding is longer than the max length " + DocumentsWriterPerThread.MAX_TERM_LENGTH_UTF8 + "), all of which were skipped. Please correct the analyzer to not produce such terms. The prefix of the first immense term is: '" + Arrays.toString(prefix) + "...', original message: " + e.getMessage();
- if (docState.infoStream.isEnabled("IW")) {
- docState.infoStream.message("IW", "ERROR: " + msg);
+ if (infoStream.isEnabled("IW")) {
+ infoStream.message("IW", "ERROR: " + msg);
}
// Document will be deleted above:
throw new IllegalArgumentException(msg, e);
@@ -886,14 +963,14 @@ public void invert(IndexableField field, boolean first) throws IOException {
/* if there is an exception coming through, we won't set this to true here:*/
succeededInProcessingField = true;
} finally {
- if (!succeededInProcessingField && docState.infoStream.isEnabled("DW")) {
- docState.infoStream.message("DW", "An exception was thrown while processing field " + fieldInfo.name);
+ if (!succeededInProcessingField && infoStream.isEnabled("DW")) {
+ infoStream.message("DW", "An exception was thrown while processing field " + fieldInfo.name);
}
}
if (analyzed) {
- invertState.position += docState.analyzer.getPositionIncrementGap(fieldInfo.name);
- invertState.offset += docState.analyzer.getOffsetGap(fieldInfo.name);
+ invertState.position += analyzer.getPositionIncrementGap(fieldInfo.name);
+ invertState.offset += analyzer.getOffsetGap(fieldInfo.name);
}
}
}
@@ -907,7 +984,7 @@ DocIdSetIterator getHasDocValues(String field) {
return null;
}
- return perField.docValuesWriter.getDocIdSet();
+ return perField.docValuesWriter.getDocValues();
}
}
return null;
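With the finishedDocValues bookkeeping gone, flush-time sorting reduces to: wrap the buffered doc-values writers in the leaf reader above, ask each SortField's IndexSorter for a DocComparator, and sort doc IDs with it. A toy sketch of the comparator-driven permutation, with plain arrays standing in for the reader:

import java.util.Arrays;

public class SortPermutation {
  /** A doc-ID comparator in the shape of IndexSorter.DocComparator. */
  interface DocComparator {
    int compare(int docID1, int docID2);
  }

  /** Returns newToOld: slot i holds the old doc ID that sorts into position i. */
  static Integer[] sortDocs(int maxDoc, DocComparator cmp) {
    Integer[] newToOld = new Integer[maxDoc];
    for (int i = 0; i < maxDoc; i++) {
      newToOld[i] = i;
    }
    Arrays.sort(newToOld, (d1, d2) -> cmp.compare(d1, d2));
    return newToOld;
  }

  public static void main(String[] args) {
    long[] values = {30L, 10L, 20L};   // pretend doc values for docs 0, 1, 2
    Integer[] order = sortDocs(values.length,
        (d1, d2) -> Long.compare(values[d1], values[d2]));
    System.out.println(Arrays.toString(order)); // prints [1, 2, 0]
  }
}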
diff --git a/lucene/core/src/java/org/apache/lucene/index/DocConsumer.java b/lucene/core/src/java/org/apache/lucene/index/DocConsumer.java
index d124434a5f9c..90a98f43e547 100644
--- a/lucene/core/src/java/org/apache/lucene/index/DocConsumer.java
+++ b/lucene/core/src/java/org/apache/lucene/index/DocConsumer.java
@@ -22,7 +22,7 @@
import org.apache.lucene.search.DocIdSetIterator;
abstract class DocConsumer {
- abstract void processDocument() throws IOException;
+ abstract void processDocument(int docId, Iterable<? extends IndexableField> document) throws IOException;
abstract Sorter.DocMap flush(final SegmentWriteState state) throws IOException;
abstract void abort() throws IOException;
diff --git a/lucene/core/src/java/org/apache/lucene/index/DocValues.java b/lucene/core/src/java/org/apache/lucene/index/DocValues.java
index 63488d038084..f90d715e69cc 100644
--- a/lucene/core/src/java/org/apache/lucene/index/DocValues.java
+++ b/lucene/core/src/java/org/apache/lucene/index/DocValues.java
@@ -168,7 +168,7 @@ public int getValueCount() {
/**
* An empty SortedNumericDocValues which returns zero values for every document
*/
- public static final SortedNumericDocValues emptySortedNumeric(int maxDoc) {
+ public static final SortedNumericDocValues emptySortedNumeric() {
return new SortedNumericDocValues() {
private int doc = -1;
@@ -387,7 +387,7 @@ public static SortedNumericDocValues getSortedNumeric(LeafReader reader, String
NumericDocValues single = reader.getNumericDocValues(field);
if (single == null) {
checkField(reader, field, DocValuesType.SORTED_NUMERIC, DocValuesType.NUMERIC);
- return emptySortedNumeric(reader.maxDoc());
+ return emptySortedNumeric();
}
return singleton(single);
}
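emptySortedNumeric can drop its maxDoc argument because the iterator API never needs the doc count up front: an empty instance simply reports exhaustion on the first advance. The contract in miniature, sketched on the plain DocIdSetIterator base class:

import org.apache.lucene.search.DocIdSetIterator;

/** Minimal empty iterator: advances straight to NO_MORE_DOCS, regardless of segment size. */
final class EmptyIterator extends DocIdSetIterator {
  private int doc = -1;

  @Override public int docID() { return doc; }

  @Override public int nextDoc() { return doc = NO_MORE_DOCS; }

  @Override public int advance(int target) { return doc = NO_MORE_DOCS; }

  @Override public long cost() { return 0; }
}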
diff --git a/lucene/core/src/java/org/apache/lucene/index/DocValuesLeafReader.java b/lucene/core/src/java/org/apache/lucene/index/DocValuesLeafReader.java
new file mode 100644
index 000000000000..93b7f4988d68
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/index/DocValuesLeafReader.java
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.index;
+
+import java.io.IOException;
+
+import org.apache.lucene.util.Bits;
+
+abstract class DocValuesLeafReader extends LeafReader {
+ @Override
+ public final CacheHelper getCoreCacheHelper() {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public final Terms terms(String field) throws IOException {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public final NumericDocValues getNormValues(String field) throws IOException {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public final Bits getLiveDocs() {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public final PointValues getPointValues(String field) throws IOException {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public final void checkIntegrity() throws IOException {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public final LeafMetaData getMetaData() {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public final Fields getTermVectors(int docID) throws IOException {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public final int numDocs() {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public final int maxDoc() {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public final void document(int docID, StoredFieldVisitor visitor) throws IOException {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ protected final void doClose() throws IOException {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public final CacheHelper getReaderCacheHelper() {
+ throw new UnsupportedOperationException();
+ }
+}
diff --git a/lucene/core/src/java/org/apache/lucene/index/DocValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/DocValuesWriter.java
index b739b14a2a77..4098cb05cf7f 100644
--- a/lucene/core/src/java/org/apache/lucene/index/DocValuesWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/DocValuesWriter.java
@@ -21,12 +21,8 @@
import org.apache.lucene.codecs.DocValuesConsumer;
import org.apache.lucene.search.DocIdSetIterator;
-import org.apache.lucene.search.SortField;
-abstract class DocValuesWriter {
- abstract void finish(int numDoc);
+abstract class DocValuesWriter<T extends DocIdSetIterator> {
abstract void flush(SegmentWriteState state, Sorter.DocMap sortMap, DocValuesConsumer consumer) throws IOException;
- abstract Sorter.DocComparator getDocComparator(int numDoc, SortField sortField) throws IOException;
- abstract DocIdSetIterator getDocIdSet();
-
+ abstract T getDocValues();
}
diff --git a/lucene/core/src/java/org/apache/lucene/index/DocumentsWriter.java b/lucene/core/src/java/org/apache/lucene/index/DocumentsWriter.java
index d4083c44e3ef..9f01f884b8c9 100644
--- a/lucene/core/src/java/org/apache/lucene/index/DocumentsWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/DocumentsWriter.java
@@ -137,7 +137,7 @@ final class DocumentsWriter implements Closeable, Accountable {
final FieldInfos.Builder infos = new FieldInfos.Builder(globalFieldNumberMap);
return new DocumentsWriterPerThread(indexCreatedVersionMajor,
segmentNameSupplier.get(), directoryOrig,
- directory, config, infoStream, deleteQueue, infos,
+ directory, config, deleteQueue, infos,
pendingNumDocs, enableTestPoints);
});
this.pendingNumDocs = pendingNumDocs;
diff --git a/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java b/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java
index d810234d568e..48f676e56226 100644
--- a/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java
+++ b/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java
@@ -26,11 +26,9 @@
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.locks.ReentrantLock;
-import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.index.DocumentsWriterDeleteQueue.DeleteSlice;
import org.apache.lucene.search.DocIdSetIterator;
-import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FlushInfo;
import org.apache.lucene.store.IOContext;
@@ -52,6 +50,10 @@
final class DocumentsWriterPerThread {
+ LiveIndexWriterConfig getIndexWriterConfig() {
+ return indexWriterConfig;
+ }
+
/**
* The IndexingChain must define the {@link #getChain(DocumentsWriterPerThread)} method
* which returns the DocConsumer that the DocumentsWriter calls to process the
@@ -85,27 +87,6 @@ DocConsumer getChain(DocumentsWriterPerThread documentsWriterPerThread) {
}
};
- static class DocState {
- final DocumentsWriterPerThread docWriter;
- final Analyzer analyzer;
- InfoStream infoStream;
- Similarity similarity;
- int docID;
- Iterable<? extends IndexableField> doc;
-
- DocState(DocumentsWriterPerThread docWriter, Analyzer analyzer, InfoStream infoStream) {
- this.docWriter = docWriter;
- this.infoStream = infoStream;
- this.analyzer = analyzer;
- }
-
- public void clear() {
- // don't hold onto doc nor analyzer, in case it is
- // largish:
- doc = null;
- }
- }
-
static final class FlushedSegment {
final SegmentCommitInfo segmentInfo;
final FieldInfos fieldInfos;
@@ -150,7 +131,6 @@ void abort() throws IOException{
private final static boolean INFO_VERBOSE = false;
final Codec codec;
final TrackingDirectoryWrapper directory;
- final DocState docState;
private final DocConsumer consumer;
final Counter bytesUsed;
@@ -179,15 +159,13 @@ void abort() throws IOException{
private int numDeletedDocIds = 0;
- DocumentsWriterPerThread(int indexVersionCreated, String segmentName, Directory directoryOrig, Directory directory, LiveIndexWriterConfig indexWriterConfig, InfoStream infoStream, DocumentsWriterDeleteQueue deleteQueue,
+ DocumentsWriterPerThread(int indexVersionCreated, String segmentName, Directory directoryOrig, Directory directory, LiveIndexWriterConfig indexWriterConfig, DocumentsWriterDeleteQueue deleteQueue,
FieldInfos.Builder fieldInfos, AtomicLong pendingNumDocs, boolean enableTestPoints) throws IOException {
this.directory = new TrackingDirectoryWrapper(directory);
this.fieldInfos = fieldInfos;
this.indexWriterConfig = indexWriterConfig;
- this.infoStream = infoStream;
+ this.infoStream = indexWriterConfig.getInfoStream();
this.codec = indexWriterConfig.getCodec();
- this.docState = new DocState(this, indexWriterConfig.getAnalyzer(), infoStream);
- this.docState.similarity = indexWriterConfig.getSimilarity();
this.pendingNumDocs = pendingNumDocs;
bytesUsed = Counter.newCounter();
byteBlockAllocator = new DirectTrackingAllocator(bytesUsed);
@@ -239,7 +217,7 @@ long updateDocuments(Iterable<? extends Iterable<? extends IndexableField>> docs
testPoint("DocumentsWriterPerThread addDocuments start");
assert hasHitAbortingException() == false: "DWPT has hit aborting exception but is still indexing";
if (INFO_VERBOSE && infoStream.isEnabled("DWPT")) {
- infoStream.message("DWPT", Thread.currentThread().getName() + " update delTerm=" + deleteNode + " docID=" + docState.docID + " seg=" + segmentInfo.name);
+ infoStream.message("DWPT", Thread.currentThread().getName() + " update delTerm=" + deleteNode + " docID=" + numDocsInRAM + " seg=" + segmentInfo.name);
}
final int docsInRamBefore = numDocsInRAM;
boolean allDocsIndexed = false;
@@ -252,13 +230,7 @@ long updateDocuments(Iterable<? extends Iterable<? extends IndexableField>> docs
// it's very hard to fix (we can't easily distinguish aborting
// vs non-aborting exceptions):
reserveOneDoc();
- docState.doc = doc;
- docState.docID = numDocsInRAM;
- try {
- consumer.processDocument();
- } finally {
- numDocsInRAM++; // we count the doc anyway even in the case of an exception
- }
+ consumer.processDocument(numDocsInRAM++, doc);
}
allDocsIndexed = true;
return finishDocuments(deleteNode, docsInRamBefore);
@@ -268,7 +240,6 @@ long updateDocuments(Iterable<? extends Iterable<? extends IndexableField>> docs
// go and mark all docs from this block as deleted
deleteLastDocs(numDocsInRAM - docsInRamBefore);
}
- docState.clear();
}
} finally {
maybeAbort("updateDocuments", flushNotifications);
@@ -400,8 +371,8 @@ FlushedSegment flush(DocumentsWriter.FlushNotifications flushNotifications) thro
final Sorter.DocMap sortMap;
try {
DocIdSetIterator softDeletedDocs;
- if (indexWriterConfig.getSoftDeletesField() != null) {
- softDeletedDocs = consumer.getHasDocValues(indexWriterConfig.getSoftDeletesField());
+ if (getIndexWriterConfig().getSoftDeletesField() != null) {
+ softDeletedDocs = consumer.getHasDocValues(getIndexWriterConfig().getSoftDeletesField());
} else {
softDeletedDocs = null;
}
@@ -509,7 +480,7 @@ void sealFlushedSegment(FlushedSegment flushedSegment, Sorter.DocMap sortMap, Do
boolean success = false;
try {
- if (indexWriterConfig.getUseCompoundFile()) {
+ if (getIndexWriterConfig().getUseCompoundFile()) {
Set<String> originalFiles = newSegment.info.files();
// TODO: like addIndexes, we are relying on createCompoundFile to successfully cleanup...
IndexWriter.createCompoundFile(infoStream, new TrackingDirectoryWrapper(directory), newSegment.info, context, flushNotifications::deleteUnusedFiles);
diff --git a/lucene/core/src/java/org/apache/lucene/index/FilterMergePolicy.java b/lucene/core/src/java/org/apache/lucene/index/FilterMergePolicy.java
index eb634b48a6b7..b4e33f8f6b4f 100644
--- a/lucene/core/src/java/org/apache/lucene/index/FilterMergePolicy.java
+++ b/lucene/core/src/java/org/apache/lucene/index/FilterMergePolicy.java
@@ -57,6 +57,11 @@ public MergeSpecification findForcedDeletesMerges(SegmentInfos segmentInfos, Mer
return in.findForcedDeletesMerges(segmentInfos, mergeContext);
}
+ @Override
+ public MergeSpecification findFullFlushMerges(MergeTrigger mergeTrigger, SegmentInfos segmentInfos, MergeContext mergeContext) throws IOException {
+ return in.findFullFlushMerges(mergeTrigger, segmentInfos, mergeContext);
+ }
+
@Override
public boolean useCompoundFile(SegmentInfos infos, SegmentCommitInfo mergedInfo, MergeContext mergeContext)
throws IOException {
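findFullFlushMerges is the new MergePolicy hook that lets a policy propose merges while a full flush or commit is in flight; FilterMergePolicy simply delegates it. As an illustration only, a hypothetical policy that asks to merge every sub-megabyte segment at such moments could look like the sketch below (the class name and size threshold are invented; a production policy would also consult the MergeContext for segments that are already merging):

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.index.FilterMergePolicy;
import org.apache.lucene.index.MergePolicy;
import org.apache.lucene.index.MergeTrigger;
import org.apache.lucene.index.SegmentCommitInfo;
import org.apache.lucene.index.SegmentInfos;

/** Sketch: propose merging all small segments whenever a full flush happens. */
public class SmallSegmentsOnFlushPolicy extends FilterMergePolicy {

  public SmallSegmentsOnFlushPolicy(MergePolicy in) {
    super(in);
  }

  @Override
  public MergeSpecification findFullFlushMerges(MergeTrigger trigger, SegmentInfos infos,
                                                MergeContext ctx) throws IOException {
    List<SegmentCommitInfo> small = new ArrayList<>();
    for (SegmentCommitInfo sci : infos) {
      if (sci.sizeInBytes() < 1L << 20) {  // "small" = under 1 MB, arbitrary for the sketch
        small.add(sci);
      }
    }
    if (small.size() < 2) {
      return null;                          // nothing worth merging
    }
    MergeSpecification spec = new MergeSpecification();
    spec.add(new OneMerge(small));
    return spec;
  }
}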
diff --git a/lucene/core/src/java/org/apache/lucene/index/FreqProxFields.java b/lucene/core/src/java/org/apache/lucene/index/FreqProxFields.java
index 4ec9fd5662bb..db1748fcb9a8 100644
--- a/lucene/core/src/java/org/apache/lucene/index/FreqProxFields.java
+++ b/lucene/core/src/java/org/apache/lucene/index/FreqProxFields.java
@@ -39,7 +39,7 @@ class FreqProxFields extends Fields {
public FreqProxFields(List<FreqProxTermsWriterPerField> fieldList) {
// NOTE: fields are already sorted by field name
for(FreqProxTermsWriterPerField field : fieldList) {
- fields.put(field.fieldInfo.name, field);
+ fields.put(field.getFieldName(), field);
}
}
@@ -55,7 +55,6 @@ public Terms terms(String field) throws IOException {
@Override
public int size() {
- //return fields.size();
throw new UnsupportedOperationException();
}
@@ -75,31 +74,27 @@ public TermsEnum iterator() {
@Override
public long size() {
- //return terms.termsHashPerField.bytesHash.size();
throw new UnsupportedOperationException();
}
@Override
public long getSumTotalTermFreq() {
- //return terms.sumTotalTermFreq;
throw new UnsupportedOperationException();
}
@Override
public long getSumDocFreq() {
- //return terms.sumDocFreq;
throw new UnsupportedOperationException();
}
@Override
public int getDocCount() {
- //return terms.docCount;
throw new UnsupportedOperationException();
}
@Override
public boolean hasFreqs() {
- return terms.fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
+ return terms.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
}
@Override
@@ -107,7 +102,7 @@ public boolean hasOffsets() {
// NOTE: the in-memory buffer may have indexed offsets
// because that's what FieldInfo said when we started,
// but during indexing this may have been downgraded:
- return terms.fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
+ return terms.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
}
@Override
@@ -115,7 +110,7 @@ public boolean hasPositions() {
// NOTE: the in-memory buffer may have indexed positions
// because that's what FieldInfo said when we started,
// but during indexing this may have been downgraded:
- return terms.fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
+ return terms.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
}
@Override
@@ -132,10 +127,10 @@ private static class FreqProxTermsEnum extends BaseTermsEnum {
final int numTerms;
int ord;
- public FreqProxTermsEnum(FreqProxTermsWriterPerField terms) {
+ FreqProxTermsEnum(FreqProxTermsWriterPerField terms) {
this.terms = terms;
- this.numTerms = terms.bytesHash.size();
- sortedTermIDs = terms.sortedTermIDs;
+ this.numTerms = terms.getNumTerms();
+ sortedTermIDs = terms.getSortedTermIDs();
assert sortedTermIDs != null;
postingsArray = (FreqProxPostingsArray) terms.postingsArray;
}
diff --git a/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriter.java b/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriter.java
index 5180aafd63b1..bbc7b18dcad2 100644
--- a/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriter.java
@@ -75,9 +75,9 @@ public void flush(Map<String,TermsHashPerField> fieldsToFlush, final SegmentWrit
for (TermsHashPerField f : fieldsToFlush.values()) {
final FreqProxTermsWriterPerField perField = (FreqProxTermsWriterPerField) f;
- if (perField.bytesHash.size() > 0) {
- perField.sortPostings();
- assert perField.fieldInfo.getIndexOptions() != IndexOptions.NONE;
+ if (perField.getNumTerms() > 0) {
+ perField.sortTerms();
+ assert perField.indexOptions != IndexOptions.NONE;
allFields.add(perField);
}
}
diff --git a/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java b/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java
index 7d77d0b62dde..1b87cbb5c0d6 100644
--- a/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java
+++ b/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java
@@ -20,6 +20,7 @@
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute;
import org.apache.lucene.util.BytesRef;
// TODO: break into separate freq and prox writers as
@@ -28,26 +29,25 @@
final class FreqProxTermsWriterPerField extends TermsHashPerField {
private FreqProxPostingsArray freqProxPostingsArray;
+ private final FieldInvertState fieldState;
+ private final FieldInfo fieldInfo;
final boolean hasFreq;
final boolean hasProx;
final boolean hasOffsets;
PayloadAttribute payloadAttribute;
OffsetAttribute offsetAttribute;
- long sumTotalTermFreq;
- long sumDocFreq;
-
- // How many docs have this field:
- int docCount;
+ TermFrequencyAttribute termFreqAtt;
/** Set to true if any token had a payload in the current
* segment. */
boolean sawPayloads;
- public FreqProxTermsWriterPerField(FieldInvertState invertState, TermsHash termsHash, FieldInfo fieldInfo, TermsHashPerField nextPerField) {
- super(fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0 ? 2 : 1, invertState, termsHash, nextPerField, fieldInfo);
- IndexOptions indexOptions = fieldInfo.getIndexOptions();
- assert indexOptions != IndexOptions.NONE;
+ FreqProxTermsWriterPerField(FieldInvertState invertState, TermsHash termsHash, FieldInfo fieldInfo, TermsHashPerField nextPerField) {
+ super(fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0 ? 2 : 1,
+ termsHash.intPool, termsHash.bytePool, termsHash.termBytePool, termsHash.bytesUsed, nextPerField, fieldInfo.name, fieldInfo.getIndexOptions());
+ this.fieldState = invertState;
+ this.fieldInfo = fieldInfo;
hasFreq = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
hasProx = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
hasOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
@@ -56,12 +56,6 @@ public FreqProxTermsWriterPerField(FieldInvertState invertState, TermsHash terms
@Override
void finish() throws IOException {
super.finish();
- sumDocFreq += fieldState.uniqueTermCount;
- sumTotalTermFreq += fieldState.length;
- if (fieldState.length > 0) {
- docCount++;
- }
-
if (sawPayloads) {
fieldInfo.setStorePayloads();
}
@@ -70,6 +64,7 @@ void finish() throws IOException {
@Override
boolean start(IndexableField f, boolean first) {
super.start(f, first);
+ termFreqAtt = fieldState.termFreqAttribute;
payloadAttribute = fieldState.payloadAttribute;
offsetAttribute = fieldState.offsetAttribute;
return true;
@@ -104,18 +99,18 @@ void writeOffsets(int termID, int offsetAccum) {
}
@Override
- void newTerm(final int termID) {
+ void newTerm(final int termID, final int docID) {
// First time we're seeing this term since the last
// flush
final FreqProxPostingsArray postings = freqProxPostingsArray;
- postings.lastDocIDs[termID] = docState.docID;
+ postings.lastDocIDs[termID] = docID;
if (!hasFreq) {
assert postings.termFreqs == null;
- postings.lastDocCodes[termID] = docState.docID;
+ postings.lastDocCodes[termID] = docID;
fieldState.maxTermFrequency = Math.max(1, fieldState.maxTermFrequency);
} else {
- postings.lastDocCodes[termID] = docState.docID << 1;
+ postings.lastDocCodes[termID] = docID << 1;
postings.termFreqs[termID] = getTermFreq();
if (hasProx) {
writeProx(termID, fieldState.position);
@@ -131,25 +126,25 @@ void newTerm(final int termID) {
}
@Override
- void addTerm(final int termID) {
+ void addTerm(final int termID, final int docID) {
final FreqProxPostingsArray postings = freqProxPostingsArray;
assert !hasFreq || postings.termFreqs[termID] > 0;
if (!hasFreq) {
assert postings.termFreqs == null;
if (termFreqAtt.getTermFrequency() != 1) {
- throw new IllegalStateException("field \"" + fieldInfo.name + "\": must index term freq while using custom TermFrequencyAttribute");
+ throw new IllegalStateException("field \"" + getFieldName() + "\": must index term freq while using custom TermFrequencyAttribute");
}
- if (docState.docID != postings.lastDocIDs[termID]) {
+ if (docID != postings.lastDocIDs[termID]) {
// New document; now encode docCode for previous doc:
- assert docState.docID > postings.lastDocIDs[termID];
+ assert docID > postings.lastDocIDs[termID];
writeVInt(0, postings.lastDocCodes[termID]);
- postings.lastDocCodes[termID] = docState.docID - postings.lastDocIDs[termID];
- postings.lastDocIDs[termID] = docState.docID;
+ postings.lastDocCodes[termID] = docID - postings.lastDocIDs[termID];
+ postings.lastDocIDs[termID] = docID;
fieldState.uniqueTermCount++;
}
- } else if (docState.docID != postings.lastDocIDs[termID]) {
- assert docState.docID > postings.lastDocIDs[termID]:"id: "+docState.docID + " postings ID: "+ postings.lastDocIDs[termID] + " termID: "+termID;
+ } else if (docID != postings.lastDocIDs[termID]) {
+ assert docID > postings.lastDocIDs[termID]:"id: "+docID + " postings ID: "+ postings.lastDocIDs[termID] + " termID: "+termID;
// Term not yet seen in the current doc but previously
// seen in other doc(s) since the last flush
@@ -165,8 +160,8 @@ void addTerm(final int termID) {
// Init freq for the current document
postings.termFreqs[termID] = getTermFreq();
fieldState.maxTermFrequency = Math.max(postings.termFreqs[termID], fieldState.maxTermFrequency);
- postings.lastDocCodes[termID] = (docState.docID - postings.lastDocIDs[termID]) << 1;
- postings.lastDocIDs[termID] = docState.docID;
+ postings.lastDocCodes[termID] = (docID - postings.lastDocIDs[termID]) << 1;
+ postings.lastDocIDs[termID] = docID;
if (hasProx) {
writeProx(termID, fieldState.position);
if (hasOffsets) {
@@ -193,7 +188,7 @@ private int getTermFreq() {
int freq = termFreqAtt.getTermFrequency();
if (freq != 1) {
if (hasProx) {
- throw new IllegalStateException("field \"" + fieldInfo.name + "\": cannot index positions while using custom TermFrequencyAttribute");
+ throw new IllegalStateException("field \"" + getFieldName() + "\": cannot index positions while using custom TermFrequencyAttribute");
}
}
@@ -207,8 +202,6 @@ public void newPostingsArray() {
@Override
ParallelPostingsArray createPostingsArray(int size) {
- IndexOptions indexOptions = fieldInfo.getIndexOptions();
- assert indexOptions != IndexOptions.NONE;
boolean hasFreq = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
boolean hasProx = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
boolean hasOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
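The lastDocCodes arithmetic above is the classic buffered-postings encoding: the delta to the term's previous document is shifted left one bit, and a set low bit means the frequency was 1, so no separate freq value is written. A hedged sketch of that scheme, separate from the real byte-slice writer:

import java.io.ByteArrayOutputStream;

/** Sketch of the (docDelta << 1 | freqIsOne) posting encoding used by the in-memory postings. */
public class DocCodeSketch {

  static void writeVInt(ByteArrayOutputStream out, int i) {
    while ((i & ~0x7F) != 0) {
      out.write((i & 0x7F) | 0x80);      // low 7 bits, continuation bit set
      i >>>= 7;
    }
    out.write(i);
  }

  /** Append one (docID, freq) posting; lastDocID is the previous doc seen for this term. */
  static void writePosting(ByteArrayOutputStream out, int lastDocID, int docID, int freq) {
    int delta = docID - lastDocID;
    if (freq == 1) {
      writeVInt(out, (delta << 1) | 1);  // low bit set: freq is implicitly 1
    } else {
      writeVInt(out, delta << 1);        // low bit clear: explicit freq follows
      writeVInt(out, freq);
    }
  }
}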
diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexSorter.java b/lucene/core/src/java/org/apache/lucene/index/IndexSorter.java
new file mode 100644
index 000000000000..81fdf6207d2f
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/index/IndexSorter.java
@@ -0,0 +1,448 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.index;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Comparator;
+import java.util.List;
+
+import org.apache.lucene.search.FieldComparator;
+import org.apache.lucene.search.SortField;
+import org.apache.lucene.util.LongValues;
+import org.apache.lucene.util.NumericUtils;
+import org.apache.lucene.util.packed.PackedInts;
+
+import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
+
+/**
+ * Handles how documents should be sorted in an index, both within a segment and between
+ * segments.
+ *
+ * Implementers must provide the following methods:
+ * {@link #getDocComparator(LeafReader,int)} - an object that determines how documents within a segment are to be sorted
+ * {@link #getComparableProviders(List)} - an array of objects that return a sortable long value per document and segment
+ * {@link #getProviderName()} - the SPI-registered name of a {@link SortFieldProvider} to serialize the sort
+ *
+ * The companion {@link SortFieldProvider} should be registered with SPI via {@code META-INF/services}
+ */
+public interface IndexSorter {
+
+ /** Used for sorting documents across segments */
+ interface ComparableProvider {
+ /**
+ * Returns a long so that the natural ordering of long values matches the
+ * ordering of doc IDs for the given comparator
+ */
+ long getAsComparableLong(int docID) throws IOException;
+ }
+
+ /** A comparator of doc IDs, used for sorting documents within a segment */
+ interface DocComparator {
+ /** Compare docID1 against docID2. The contract for the return value is the
+ * same as {@link Comparator#compare(Object, Object)}. */
+ int compare(int docID1, int docID2);
+ }
+
+ /**
+ * Get an array of {@link ComparableProvider}, one per segment, for merge sorting documents in different segments
+ * @param readers the readers to be merged
+ */
+ ComparableProvider[] getComparableProviders(List<? extends LeafReader> readers) throws IOException;
+
+ /**
+ * Get a comparator that determines the sort order of docs within a single Reader.
+ *
+ * NB We cannot simply use the {@link FieldComparator} API because it requires docIDs to be sent
+ * in-order. The default implementations allocate array[maxDoc] to hold native values for comparison,
+ * but 1) they are transient (only alive while sorting this one segment) and 2) in the typical
+ * index sorting case, they are only used to sort newly flushed segments, which will be smaller
+ * than merged segments
+ *
+ * @param reader the Reader to sort
+ * @param maxDoc the number of documents in the Reader
+ */
+ DocComparator getDocComparator(LeafReader reader, int maxDoc) throws IOException;
+
+ /**
+ * The SPI-registered name of a {@link SortFieldProvider} that will deserialize the parent SortField
+ */
+ String getProviderName();
+
+ /**
+ * Provide a NumericDocValues instance for a LeafReader
+ */
+ interface NumericDocValuesProvider {
+ /**
+ * Returns the NumericDocValues instance for this LeafReader
+ */
+ NumericDocValues get(LeafReader reader) throws IOException;
+ }
+
+ /**
+ * Provide a SortedDocValues instance for a LeafReader
+ */
+ interface SortedDocValuesProvider {
+ /**
+ * Returns the SortedDocValues instance for this LeafReader
+ */
+ SortedDocValues get(LeafReader reader) throws IOException;
+ }
+
+ /**
+ * Sorts documents based on integer values from a NumericDocValues instance
+ */
+ final class IntSorter implements IndexSorter {
+
+ private final Integer missingValue;
+ private final int reverseMul;
+ private final NumericDocValuesProvider valuesProvider;
+ private final String providerName;
+
+ /**
+ * Creates a new IntSorter
+ */
+ public IntSorter(String providerName, Integer missingValue, boolean reverse, NumericDocValuesProvider valuesProvider) {
+ this.missingValue = missingValue;
+ this.reverseMul = reverse ? -1 : 1;
+ this.valuesProvider = valuesProvider;
+ this.providerName = providerName;
+ }
+
+ @Override
+ public ComparableProvider[] getComparableProviders(List<? extends LeafReader> readers) throws IOException {
+ ComparableProvider[] providers = new ComparableProvider[readers.size()];
+ final long missingValue;
+ if (this.missingValue != null) {
+ missingValue = this.missingValue;
+ } else {
+ missingValue = 0L;
+ }
+
+ for(int readerIndex=0;readerIndex<readers.size();readerIndex++) {
+ final NumericDocValues values = valuesProvider.get(readers.get(readerIndex));
+ providers[readerIndex] = docID -> {
+ if (values.advanceExact(docID)) {
+ return values.longValue();
+ } else {
+ return missingValue;
+ }
+ };
+ }
+ return providers;
+ }
+
+ @Override
+ public DocComparator getDocComparator(LeafReader reader, int maxDoc) throws IOException {
+ final NumericDocValues dvs = valuesProvider.get(reader);
+ int[] values = new int[maxDoc];
+ if (this.missingValue != null) {
+ Arrays.fill(values, this.missingValue);
+ }
+ while (true) {
+ int docID = dvs.nextDoc();
+ if (docID == NO_MORE_DOCS) {
+ break;
+ }
+ values[docID] = (int) dvs.longValue();
+ }
+
+ return (docID1, docID2) -> reverseMul * Integer.compare(values[docID1], values[docID2]);
+ }
+
+ @Override
+ public String getProviderName() {
+ return providerName;
+ }
+ }
+
+ /**
+ * Sorts documents based on long values from a NumericDocValues instance
+ */
+ final class LongSorter implements IndexSorter {
+
+ private final String providerName;
+ private final Long missingValue;
+ private final int reverseMul;
+ private final NumericDocValuesProvider valuesProvider;
+
+ /** Creates a new LongSorter */
+ public LongSorter(String providerName, Long missingValue, boolean reverse, NumericDocValuesProvider valuesProvider) {
+ this.providerName = providerName;
+ this.missingValue = missingValue;
+ this.reverseMul = reverse ? -1 : 1;
+ this.valuesProvider = valuesProvider;
+ }
+
+ @Override
+ public ComparableProvider[] getComparableProviders(List<? extends LeafReader> readers) throws IOException {
+ ComparableProvider[] providers = new ComparableProvider[readers.size()];
+ final long missingValue;
+ if (this.missingValue != null) {
+ missingValue = this.missingValue;
+ } else {
+ missingValue = 0L;
+ }
+
+ for(int readerIndex=0;readerIndex<readers.size();readerIndex++) {
+ final NumericDocValues values = valuesProvider.get(readers.get(readerIndex));
+ providers[readerIndex] = docID -> {
+ if (values.advanceExact(docID)) {
+ return values.longValue();
+ } else {
+ return missingValue;
+ }
+ };
+ }
+ return providers;
+ }
+
+ @Override
+ public DocComparator getDocComparator(LeafReader reader, int maxDoc) throws IOException {
+ final NumericDocValues dvs = valuesProvider.get(reader);
+ long[] values = new long[maxDoc];
+ if (this.missingValue != null) {
+ Arrays.fill(values, this.missingValue);
+ }
+ while (true) {
+ int docID = dvs.nextDoc();
+ if (docID == NO_MORE_DOCS) {
+ break;
+ }
+ values[docID] = dvs.longValue();
+ }
+
+ return (docID1, docID2) -> reverseMul * Long.compare(values[docID1], values[docID2]);
+ }
+
+ @Override
+ public String getProviderName() {
+ return providerName;
+ }
+ }
+
+ /**
+ * Sorts documents based on float values from a NumericDocValues instance
+ */
+ final class FloatSorter implements IndexSorter {
+
+ private final String providerName;
+ private final Float missingValue;
+ private final int reverseMul;
+ private final NumericDocValuesProvider valuesProvider;
+
+ /** Creates a new FloatSorter */
+ public FloatSorter(String providerName, Float missingValue, boolean reverse, NumericDocValuesProvider valuesProvider) {
+ this.providerName = providerName;
+ this.missingValue = missingValue;
+ this.reverseMul = reverse ? -1 : 1;
+ this.valuesProvider = valuesProvider;
+ }
+
+ @Override
+ public ComparableProvider[] getComparableProviders(List<? extends LeafReader> readers) throws IOException {
+ ComparableProvider[] providers = new ComparableProvider[readers.size()];
+ final float missingValue;
+ if (this.missingValue != null) {
+ missingValue = this.missingValue;
+ } else {
+ missingValue = 0.0f;
+ }
+
+ for(int readerIndex=0;readerIndex<readers.size();readerIndex++) {
+ final NumericDocValues values = valuesProvider.get(readers.get(readerIndex));
+ providers[readerIndex] = docID -> {
+ float value = missingValue;
+ if (values.advanceExact(docID)) {
+ value = Float.intBitsToFloat((int) values.longValue());
+ }
+ return NumericUtils.floatToSortableInt(value);
+ };
+ }
+ return providers;
+ }
+
+ @Override
+ public DocComparator getDocComparator(LeafReader reader, int maxDoc) throws IOException {
+ final NumericDocValues dvs = valuesProvider.get(reader);
+ float[] values = new float[maxDoc];
+ if (this.missingValue != null) {
+ Arrays.fill(values, this.missingValue);
+ }
+ while (true) {
+ int docID = dvs.nextDoc();
+ if (docID == NO_MORE_DOCS) {
+ break;
+ }
+ values[docID] = Float.intBitsToFloat((int) dvs.longValue());
+ }
+
+ return (docID1, docID2) -> reverseMul * Float.compare(values[docID1], values[docID2]);
+ }
+
+ @Override
+ public String getProviderName() {
+ return providerName;
+ }
+ }
+
+ /**
+ * Sorts documents based on double values from a NumericDocValues instance
+ */
+ final class DoubleSorter implements IndexSorter {
+
+ private final String providerName;
+ private final Double missingValue;
+ private final int reverseMul;
+ private final NumericDocValuesProvider valuesProvider;
+
+ /** Creates a new DoubleSorter */
+ public DoubleSorter(String providerName, Double missingValue, boolean reverse, NumericDocValuesProvider valuesProvider) {
+ this.providerName = providerName;
+ this.missingValue = missingValue;
+ this.reverseMul = reverse ? -1 : 1;
+ this.valuesProvider = valuesProvider;
+ }
+
+ @Override
+ public ComparableProvider[] getComparableProviders(List<? extends LeafReader> readers) throws IOException {
+ ComparableProvider[] providers = new ComparableProvider[readers.size()];
+ final double missingValue;
+ if (this.missingValue != null) {
+ missingValue = this.missingValue;
+ } else {
+ missingValue = 0.0f;
+ }
+
+ for(int readerIndex=0;readerIndex<readers.size();readerIndex++) {
+ final NumericDocValues values = valuesProvider.get(readers.get(readerIndex));
+ providers[readerIndex] = docID -> {
+ double value = missingValue;
+ if (values.advanceExact(docID)) {
+ value = Double.longBitsToDouble(values.longValue());
+ }
+ return NumericUtils.doubleToSortableLong(value);
+ };
+ }
+ return providers;
+ }
+
+ @Override
+ public DocComparator getDocComparator(LeafReader reader, int maxDoc) throws IOException {
+ final NumericDocValues dvs = valuesProvider.get(reader);
+ double[] values = new double[maxDoc];
+ if (missingValue != null) {
+ Arrays.fill(values, missingValue);
+ }
+ while (true) {
+ int docID = dvs.nextDoc();
+ if (docID == NO_MORE_DOCS) {
+ break;
+ }
+ values[docID] = Double.longBitsToDouble(dvs.longValue());
+ }
+
+ return (docID1, docID2) -> reverseMul * Double.compare(values[docID1], values[docID2]);
+ }
+
+ @Override
+ public String getProviderName() {
+ return providerName;
+ }
+ }
+
+ /**
+ * Sorts documents based on terms from a SortedDocValues instance
+ */
+ final class StringSorter implements IndexSorter {
+
+ private final String providerName;
+ private final Object missingValue;
+ private final int reverseMul;
+ private final SortedDocValuesProvider valuesProvider;
+
+ /** Creates a new StringSorter */
+ public StringSorter(String providerName, Object missingValue, boolean reverse, SortedDocValuesProvider valuesProvider) {
+ this.providerName = providerName;
+ this.missingValue = missingValue;
+ this.reverseMul = reverse ? -1 : 1;
+ this.valuesProvider = valuesProvider;
+ }
+
+ @Override
+ public ComparableProvider[] getComparableProviders(List<? extends LeafReader> readers) throws IOException {
+ final ComparableProvider[] providers = new ComparableProvider[readers.size()];
+ final SortedDocValues[] values = new SortedDocValues[readers.size()];
+ for(int i=0;i<readers.size();i++) {
+ final SortedDocValues sorted = valuesProvider.get(readers.get(i));
+ values[i] = sorted;
+ }
+ final OrdinalMap ordinalMap = OrdinalMap.build(null, values, PackedInts.DEFAULT);
+ final int missingOrd;
+ if (missingValue == SortField.STRING_LAST) {
+ missingOrd = Integer.MAX_VALUE;
+ } else {
+ missingOrd = Integer.MIN_VALUE;
+ }
+ for(int readerIndex=0;readerIndex<readers.size();readerIndex++) {
+ final SortedDocValues readerValues = values[readerIndex];
+ final LongValues globalOrds = ordinalMap.getGlobalOrds(readerIndex);
+ providers[readerIndex] = docID -> {
+ if (readerValues.advanceExact(docID)) {
+ // translate segment's ord to global ord space:
+ return globalOrds.get(readerValues.ordValue());
+ } else {
+ return missingOrd;
+ }
+ };
+ }
+ return providers;
+ }
+
+ @Override
+ public DocComparator getDocComparator(LeafReader reader, int maxDoc) throws IOException {
+ final SortedDocValues sorted = valuesProvider.get(reader);
+ final int missingOrd;
+ if (missingValue == SortField.STRING_LAST) {
+ missingOrd = Integer.MAX_VALUE;
+ } else {
+ missingOrd = Integer.MIN_VALUE;
+ }
+
+ final int[] ords = new int[maxDoc];
+ Arrays.fill(ords, missingOrd);
+ int docID;
+ while ((docID = sorted.nextDoc()) != NO_MORE_DOCS) {
+ ords[docID] = sorted.ordValue();
+ }
+
+ return (docID1, docID2) -> reverseMul * Integer.compare(ords[docID1], ords[docID2]);
+ }
+
+ @Override
+ public String getProviderName() {
+ return providerName;
+ }
+ }
+
+}
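End to end, the new interface works like this: a sorter resolves per-segment values through its provider, getDocComparator materializes them into an array, and the returned lambda compares doc IDs. A small usage sketch against the classes in this patch; the provider name and field name are made up, and error handling is omitted:

import java.io.IOException;

import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.IndexSorter;
import org.apache.lucene.index.LeafReader;

public class IndexSorterExample {
  /** Builds a sorter equivalent to an ascending numeric sort on "timestamp" (hypothetical field). */
  static IndexSorter timestampSorter() {
    return new IndexSorter.LongSorter(
        "LongSort",                                       // assumed SPI name of the owning SortFieldProvider
        0L,                                               // value to use when a doc has no timestamp
        false,                                            // not reversed
        reader -> DocValues.getNumeric(reader, "timestamp"));
  }

  /** Sorts the doc IDs of one segment into index-sort order. */
  static Integer[] sortedDocIDs(LeafReader leaf) throws IOException {
    IndexSorter.DocComparator cmp = timestampSorter().getDocComparator(leaf, leaf.maxDoc());
    Integer[] docs = new Integer[leaf.maxDoc()];
    for (int i = 0; i < docs.length; i++) {
      docs[i] = i;
    }
    java.util.Arrays.sort(docs, (d1, d2) -> cmp.compare(d1, d2));
    return docs;
  }
}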
diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java
index 464be01d97ae..bd273a1344fe 100644
--- a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java
@@ -33,6 +33,7 @@
import java.util.Set;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.Semaphore;
+import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
@@ -2129,12 +2130,12 @@ public final void maybeMerge() throws IOException {
private final void maybeMerge(MergePolicy mergePolicy, MergeTrigger trigger, int maxNumSegments) throws IOException {
ensureOpen(false);
- if (updatePendingMerges(mergePolicy, trigger, maxNumSegments)) {
+ if (updatePendingMerges(mergePolicy, trigger, maxNumSegments) != null) {
mergeScheduler.merge(mergeSource, trigger);
}
}
- private synchronized boolean updatePendingMerges(MergePolicy mergePolicy, MergeTrigger trigger, int maxNumSegments)
+ private synchronized MergePolicy.MergeSpecification updatePendingMerges(MergePolicy mergePolicy, MergeTrigger trigger, int maxNumSegments)
throws IOException {
// In case infoStream was disabled on init, but then enabled at some
@@ -2144,22 +2145,21 @@ private synchronized boolean updatePendingMerges(MergePolicy mergePolicy, MergeT
assert maxNumSegments == UNBOUNDED_MAX_MERGE_SEGMENTS || maxNumSegments > 0;
assert trigger != null;
if (stopMerges) {
- return false;
+ return null;
}
// Do not start new merges if disaster struck
if (tragedy.get() != null) {
- return false;
+ return null;
}
- boolean newMergesFound = false;
+
final MergePolicy.MergeSpecification spec;
if (maxNumSegments != UNBOUNDED_MAX_MERGE_SEGMENTS) {
assert trigger == MergeTrigger.EXPLICIT || trigger == MergeTrigger.MERGE_FINISHED :
"Expected EXPLICT or MERGE_FINISHED as trigger even with maxNumSegments set but was: " + trigger.name();
spec = mergePolicy.findForcedMerges(segmentInfos, maxNumSegments, Collections.unmodifiableMap(segmentsToMerge), this);
- newMergesFound = spec != null;
- if (newMergesFound) {
+ if (spec != null) {
final int numMerges = spec.merges.size();
for(int i=0;i<numMerges;i++) {
+ if (anyChanges && maxCommitMergeWaitSeconds > 0) {
+ SegmentInfos committingSegmentInfos = toCommit;
+ onCommitMerges = updatePendingMerges(new OneMergeWrappingMergePolicy(config.getMergePolicy(), toWrap ->
+ new MergePolicy.OneMerge(toWrap.segments) {
+ @Override
+ public void mergeFinished(boolean committed) throws IOException {
+ assert Thread.holdsLock(IndexWriter.this);
+ if (committed && includeInCommit.get()) {
+ deleter.incRef(info.files());
+ Set<String> mergedSegmentNames = new HashSet<>();
+ for (SegmentCommitInfo sci : segments) {
+ mergedSegmentNames.add(sci.info.name);
+ }
+ List<SegmentCommitInfo> toCommitMergedAwaySegments = new ArrayList<>();
+ for (SegmentCommitInfo sci : committingSegmentInfos) {
+ if (mergedSegmentNames.contains(sci.info.name)) {
+ toCommitMergedAwaySegments.add(sci);
+ deleter.decRef(sci.files());
+ }
+ }
+ // Construct a OneMerge that applies to toCommit
+ MergePolicy.OneMerge applicableMerge = new MergePolicy.OneMerge(toCommitMergedAwaySegments);
+ applicableMerge.info = info.clone();
+ long segmentCounter = Long.parseLong(info.info.name.substring(1), Character.MAX_RADIX);
+ committingSegmentInfos.counter = Math.max(committingSegmentInfos.counter, segmentCounter + 1);
+ committingSegmentInfos.applyMergeChanges(applicableMerge, false);
+ }
+ toWrap.mergeFinished(committed);
+ super.mergeFinished(committed);
+ }
+
+ @Override
+ public CodecReader wrapForMerge(CodecReader reader) throws IOException {
+ return toWrap.wrapForMerge(reader);
+ }
+ }
+ ), MergeTrigger.COMMIT, UNBOUNDED_MAX_MERGE_SEGMENTS);
+ }
+
pendingCommitChangeCount = changeCount.get();
// This protects the segmentInfos we are now going
@@ -3236,8 +3281,7 @@ private long prepareCommitInternal() throws IOException {
// we are trying to sync all referenced files, a
// merge completes which would otherwise have
// removed the files we are now syncing.
- filesToCommit = toCommit.files(false);
- deleter.incRef(filesToCommit);
+ deleter.incRef(toCommit.files(false));
}
success = true;
} finally {
@@ -3258,7 +3302,16 @@ private long prepareCommitInternal() throws IOException {
} finally {
maybeCloseOnTragicEvent();
}
-
+
+ if (onCommitMerges != null) {
+ mergeScheduler.merge(mergeSource, MergeTrigger.COMMIT);
+ onCommitMerges.await(maxCommitMergeWaitSeconds, TimeUnit.SECONDS);
+ synchronized (this) {
+ // we need to call this under lock since mergeFinished above is also called under the IW lock
+ includeInCommit.set(false);
+ }
+ }
+ filesToCommit = toCommit.files(false);
try {
if (anyChanges) {
maybeMerge.set(true);
@@ -4290,7 +4343,7 @@ private synchronized void mergeFinish(MergePolicy.OneMerge merge) {
@SuppressWarnings("try")
private synchronized void closeMergeReaders(MergePolicy.OneMerge merge, boolean suppressExceptions) throws IOException {
final boolean drop = suppressExceptions == false;
- try (Closeable finalizer = merge::mergeFinished) {
+ try (Closeable finalizer = () -> merge.mergeFinished(suppressExceptions == false)) {
IOUtils.applyToAll(merge.readers, sr -> {
final ReadersAndUpdates rld = getPooledInstance(sr.getOriginalSegmentInfo(), false);
// We still hold a ref so it should not have been removed:
@@ -4484,6 +4537,7 @@ public int length() {
// Merge would produce a 0-doc segment, so we do nothing except commit the merge to remove all the 0-doc segments that we "merged":
assert merge.info.info.maxDoc() == 0;
commitMerge(merge, mergeState);
+ success = true;
return 0;
}
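
The commit path above never cancels a merge; it only bounds how long the commit thread waits for it. A sketch of that wait-with-deadline idiom in isolation (the CompletableFuture parameter is a stand-in for the package-private OneMerge future, not Lucene API):

    import java.util.concurrent.CompletableFuture;
    import java.util.concurrent.ExecutionException;
    import java.util.concurrent.TimeUnit;
    import java.util.concurrent.TimeoutException;

    public class CommitMergeWait {
      // Returns true only if the merge completed in time; a timed-out merge keeps
      // running in the background, but its result is left out of the commit.
      static boolean awaitMerge(CompletableFuture<Boolean> mergeCompleted, long seconds) {
        try {
          mergeCompleted.get(seconds, TimeUnit.SECONDS);
          return true;
        } catch (InterruptedException e) {
          Thread.currentThread().interrupt();
          throw new RuntimeException(e);
        } catch (ExecutionException | TimeoutException e) {
          return false;
        }
      }
    }
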
diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexWriterConfig.java b/lucene/core/src/java/org/apache/lucene/index/IndexWriterConfig.java
index 4cdc9c02529f..12379328bc13 100644
--- a/lucene/core/src/java/org/apache/lucene/index/IndexWriterConfig.java
+++ b/lucene/core/src/java/org/apache/lucene/index/IndexWriterConfig.java
@@ -32,9 +32,9 @@
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.util.InfoStream;
import org.apache.lucene.util.PrintStreamInfoStream;
+import org.apache.lucene.util.SetOnce;
import org.apache.lucene.util.SetOnce.AlreadySetException;
import org.apache.lucene.util.Version;
-import org.apache.lucene.util.SetOnce;
/**
* Holds all the configuration that is used to create an {@link IndexWriter}.
@@ -110,6 +110,9 @@ public static enum OpenMode {
/** Default value for whether calls to {@link IndexWriter#close()} include a commit. */
public final static boolean DEFAULT_COMMIT_ON_CLOSE = true;
+
+ /** Default value for time to wait for merges on commit (when using a {@link MergePolicy} that implements findFullFlushMerges). */
+ public static final long DEFAULT_MAX_COMMIT_MERGE_WAIT_SECONDS = 0;
// indicates whether this config instance is already attached to a writer.
// not final so that it can be cloned properly.
@@ -460,6 +463,20 @@ public IndexWriterConfig setCommitOnClose(boolean commitOnClose) {
return this;
}
+ /**
+ * Expert: sets the amount of time to wait for merges returned by MergePolicy.findFullFlushMerges(...).
+ * If this time is reached, we proceed with the commit based on segments merged up to that point.
+ * The merges are not cancelled, and will still run to completion independent of the commit
+ * like normal segment merges. The default is {@value IndexWriterConfig#DEFAULT_MAX_COMMIT_MERGE_WAIT_SECONDS}.
+ *
+ * <p>Note: This setting has no effect unless {@link MergePolicy#findFullFlushMerges(MergeTrigger, SegmentInfos, MergePolicy.MergeContext)}
+ * has an implementation that actually returns merges; the default implementation returns none.
+ */
+ public IndexWriterConfig setMaxCommitMergeWaitSeconds(long maxCommitMergeWaitSeconds) {
+ this.maxCommitMergeWaitSeconds = maxCommitMergeWaitSeconds;
+ return this;
+ }
+
/** We only allow sorting on these types */
private static final EnumSet<SortField.Type> ALLOWED_INDEX_SORT_TYPES = EnumSet.of(SortField.Type.STRING,
SortField.Type.LONG,
@@ -471,10 +488,9 @@ public IndexWriterConfig setCommitOnClose(boolean commitOnClose) {
* Set the {@link Sort} order to use for all (flushed and merged) segments.
*/
public IndexWriterConfig setIndexSort(Sort sort) {
- for(SortField sortField : sort.getSort()) {
- final SortField.Type sortType = Sorter.getSortFieldType(sortField);
- if (ALLOWED_INDEX_SORT_TYPES.contains(sortType) == false) {
- throw new IllegalArgumentException("invalid SortField type: must be one of " + ALLOWED_INDEX_SORT_TYPES + " but got: " + sortField);
+ for (SortField sortField : sort.getSort()) {
+ if (sortField.getIndexSorter() == null) {
+ throw new IllegalArgumentException("Cannot sort index with sort field " + sortField);
}
}
this.indexSort = sort;
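
Both IndexWriterConfig changes above are user-visible: index sorts are now validated via SortField.getIndexSorter(), and commits can be given a merge deadline. A minimal usage sketch (directory, analyzer, and field name are arbitrary choices; with the default merge policy, findFullFlushMerges returns null, so the wait value only takes effect once a policy overrides that hook):

    import java.io.IOException;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.search.Sort;
    import org.apache.lucene.search.SortField;
    import org.apache.lucene.store.ByteBuffersDirectory;

    public class ConfigSketch {
      public static void main(String[] args) throws IOException {
        IndexWriterConfig iwc = new IndexWriterConfig(new StandardAnalyzer())
            .setIndexSort(new Sort(new SortField("timestamp", SortField.Type.LONG)))
            .setMaxCommitMergeWaitSeconds(30); // upper bound on commit-time merge waiting
        try (IndexWriter writer = new IndexWriter(new ByteBuffersDirectory(), iwc)) {
          writer.commit(); // merges from findFullFlushMerges (if any) run, bounded, here
        }
      }
    }
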
diff --git a/lucene/core/src/java/org/apache/lucene/index/LiveIndexWriterConfig.java b/lucene/core/src/java/org/apache/lucene/index/LiveIndexWriterConfig.java
index 1f48acc8d5f6..9b1d56c0a96d 100644
--- a/lucene/core/src/java/org/apache/lucene/index/LiveIndexWriterConfig.java
+++ b/lucene/core/src/java/org/apache/lucene/index/LiveIndexWriterConfig.java
@@ -109,6 +109,8 @@ public class LiveIndexWriterConfig {
/** soft deletes field */
protected String softDeletesField = null;
+ /** Amount of time to wait for merges returned by MergePolicy.findFullFlushMerges(...) */
+ protected volatile long maxCommitMergeWaitSeconds;
// used by IndexWriterConfig
LiveIndexWriterConfig(Analyzer analyzer) {
@@ -132,6 +134,7 @@ public class LiveIndexWriterConfig {
flushPolicy = new FlushByRamOrCountsPolicy();
readerPooling = IndexWriterConfig.DEFAULT_READER_POOLING;
perThreadHardLimitMB = IndexWriterConfig.DEFAULT_RAM_PER_THREAD_HARD_LIMIT_MB;
+ maxCommitMergeWaitSeconds = IndexWriterConfig.DEFAULT_MAX_COMMIT_MERGE_WAIT_SECONDS;
}
/** Returns the default analyzer to use for indexing documents. */
@@ -461,6 +464,15 @@ public String getSoftDeletesField() {
return softDeletesField;
}
+ /**
+ * Expert: return the amount of time to wait for merges returned by MergePolicy.findFullFlushMerges(...).
+ * If this time is reached, we proceed with the commit based on segments merged up to that point.
+ * The merges are not cancelled, and may still run to completion independent of the commit.
+ */
+ public long getMaxCommitMergeWaitSeconds() {
+ return maxCommitMergeWaitSeconds;
+ }
+
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
@@ -484,6 +496,7 @@ public String toString() {
sb.append("indexSort=").append(getIndexSort()).append("\n");
sb.append("checkPendingFlushOnUpdate=").append(isCheckPendingFlushOnUpdate()).append("\n");
sb.append("softDeletesField=").append(getSoftDeletesField()).append("\n");
+ sb.append("maxCommitMergeWaitSeconds=").append(getMaxCommitMergeWaitSeconds()).append("\n");
return sb.toString();
}
}
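
Because the new field is volatile on the live config, the current deadline can be read from an open writer; a one-line sketch:

    import org.apache.lucene.index.IndexWriter;

    public class ConfigProbe {
      // Reads the commit-merge deadline currently in force on an open writer.
      static long commitMergeDeadlineSeconds(IndexWriter writer) {
        return writer.getConfig().getMaxCommitMergeWaitSeconds();
      }
    }
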
diff --git a/lucene/core/src/java/org/apache/lucene/index/MergePolicy.java b/lucene/core/src/java/org/apache/lucene/index/MergePolicy.java
index 3ac391451911..5a090da82967 100644
--- a/lucene/core/src/java/org/apache/lucene/index/MergePolicy.java
+++ b/lucene/core/src/java/org/apache/lucene/index/MergePolicy.java
@@ -23,7 +23,12 @@
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
+import java.util.Optional;
import java.util.Set;
+import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.ReentrantLock;
@@ -37,6 +42,7 @@
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.IOSupplier;
import org.apache.lucene.util.InfoStream;
+import org.apache.lucene.util.ThreadInterruptedException;
/**
* <p>Expert: a MergePolicy determines the sequence of
@@ -76,7 +82,7 @@ public abstract class MergePolicy {
* @lucene.experimental */
public static class OneMergeProgress {
/** Reason for pausing the merge thread. */
- public static enum PauseReason {
+ public enum PauseReason {
/** Stopped (because of throughput rate set to 0, typically). */
STOPPED,
/** Temporarily paused because of exceeded throughput rate. */
@@ -196,6 +202,7 @@ final void setMergeThread(Thread owner) {
*
* @lucene.experimental */
public static class OneMerge {
+ private final CompletableFuture<Boolean> mergeCompleted = new CompletableFuture<>();
SegmentCommitInfo info; // used by IndexWriter
boolean registerDone; // used by IndexWriter
long mergeGen; // used by IndexWriter
@@ -222,7 +229,7 @@ public static class OneMerge {
volatile long mergeStartNS = -1;
/** Total number of documents in segments to be merged, not accounting for deletions. */
- public final int totalMaxDoc;
+ final int totalMaxDoc;
Throwable error;
/** Sole constructor.
@@ -233,13 +240,8 @@ public OneMerge(List<SegmentCommitInfo> segments) {
throw new RuntimeException("segments must include at least one segment");
}
// clone the list, as the in list may be based off original SegmentInfos and may be modified
- this.segments = new ArrayList<>(segments);
- int count = 0;
- for(SegmentCommitInfo info : segments) {
- count += info.info.maxDoc();
- }
- totalMaxDoc = count;
-
+ this.segments = List.copyOf(segments);
+ totalMaxDoc = segments.stream().mapToInt(i -> i.info.maxDoc()).sum();
mergeProgress = new OneMergeProgress();
}
@@ -250,9 +252,15 @@ public OneMerge(List<SegmentCommitInfo> segments) {
public void mergeInit() throws IOException {
mergeProgress.setMergeThread(Thread.currentThread());
}
-
- /** Called by {@link IndexWriter} after the merge is done and all readers have been closed. */
- public void mergeFinished() throws IOException {
+
+ /** Called by {@link IndexWriter} after the merge is done and all readers have been closed.
+ * @param success true iff the merge finished successfully, i.e. it was committed */
+ public void mergeFinished(boolean success) throws IOException {
+ mergeCompleted.complete(success);
+ // https://issues.apache.org/jira/browse/LUCENE-9408
+ // if (mergeCompleted.complete(success) == false) {
+ // throw new IllegalStateException("merge has already finished");
+ // }
}
/** Wrap the reader in order to add/remove information to the merged segment. */
@@ -362,6 +370,37 @@ public void checkAborted() throws MergeAbortedException {
public OneMergeProgress getMergeProgress() {
return mergeProgress;
}
+
+ /**
+ * Waits for this merge to be completed
+ * @return true if the merge finished within the specified timeout
+ */
+ boolean await(long timeout, TimeUnit timeUnit) {
+ try {
+ mergeCompleted.get(timeout, timeUnit);
+ return true;
+ } catch (InterruptedException e) {
+ throw new ThreadInterruptedException(e);
+ } catch (ExecutionException | TimeoutException e) {
+ return false;
+ }
+ }
+
+ /**
+ * Returns true if the merge has finished or false if it's still running or
+ * has not been started. This method will not block.
+ */
+ boolean isDone() {
+ return mergeCompleted.isDone();
+ }
+
+ /**
+ * Returns true iff the merge completed successfully or false if the merge exited with a failure.
+ * This method will not block and returns an empty Optional if the merge has not finished yet.
+ */
+ Optional<Boolean> hasCompletedSuccessfully() {
+ return Optional.ofNullable(mergeCompleted.getNow(null));
+ }
}
/**
@@ -399,6 +438,22 @@ public String segString(Directory dir) {
}
return b.toString();
}
+
+ /**
+ * Waits if necessary for at most the given time for all merges.
+ */
+ boolean await(long timeout, TimeUnit unit) {
+ try {
+ CompletableFuture<Void> future = CompletableFuture.allOf(merges.stream()
+ .map(m -> m.mergeCompleted).collect(Collectors.toList()).toArray(CompletableFuture<?>[]::new));
+ future.get(timeout, unit);
+ return true;
+ } catch (InterruptedException e) {
+ throw new ThreadInterruptedException(e);
+ } catch (ExecutionException | TimeoutException e) {
+ return false;
+ }
+ }
}
/** Exception thrown if there are any problems while executing a merge. */
@@ -500,7 +555,7 @@ public abstract MergeSpecification findMerges(MergeTrigger mergeTrigger, Segment
* an original segment present in the
* to-be-merged index; else, it was a segment
* produced by a cascaded merge.
- * @param mergeContext the IndexWriter to find the merges on
+ * @param mergeContext the MergeContext to find the merges on
*/
public abstract MergeSpecification findForcedMerges(
SegmentInfos segmentInfos, int maxSegmentCount, Map<SegmentCommitInfo,Boolean> segmentsToMerge, MergeContext mergeContext)
@@ -511,11 +566,33 @@ public abstract MergeSpecification findForcedMerges(
* deletes from the index.
* @param segmentInfos
* the total set of segments in the index
- * @param mergeContext the IndexWriter to find the merges on
+ * @param mergeContext the MergeContext to find the merges on
*/
public abstract MergeSpecification findForcedDeletesMerges(
SegmentInfos segmentInfos, MergeContext mergeContext) throws IOException;
+ /**
+ * Identifies merges that we want to execute (synchronously) on commit. By default, do not synchronously merge on commit.
+ *
+ * <p>Any merges returned here will make {@link IndexWriter#commit()} or {@link IndexWriter#prepareCommit()} block until
+ * the merges complete or until {@link IndexWriterConfig#getMaxCommitMergeWaitSeconds()} have elapsed. This may be
+ * used to merge small segments that have just been flushed as part of the commit, reducing the number of segments in
+ * the commit. If a merge does not complete in the allotted time, it will continue to execute, but will not be reflected
+ * in the commit.
+ *
+ * <p>If a {@link OneMerge} in the returned {@link MergeSpecification} includes a segment already included in a registered
+ * merge, then {@link IndexWriter#commit()} or {@link IndexWriter#prepareCommit()} will throw an {@link IllegalStateException}.
+ * Use {@link MergeContext#getMergingSegments()} to determine which segments are currently registered to merge.
+ *
+ * @param mergeTrigger the event that triggered the merge (COMMIT or FULL_FLUSH).
+ * @param segmentInfos the total set of segments in the index (while preparing the commit)
+ * @param mergeContext the MergeContext to find the merges on, which should be used to determine which segments are
+ * already in a registered merge (see {@link MergeContext#getMergingSegments()}).
+ */
+ public MergeSpecification findFullFlushMerges(MergeTrigger mergeTrigger, SegmentInfos segmentInfos, MergeContext mergeContext) throws IOException {
+ return null;
+ }
+
/**
* Returns true if a new segment (regardless of its origin) should use the
* compound file format. The default implementation returns true
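
Because findFullFlushMerges is a no-op by default, merge-on-commit is strictly opt-in via a custom policy. A hypothetical sketch of such a policy (the class name and the merge-everything heuristic are illustrative only; a real policy would bound segment sizes):

    import java.io.IOException;
    import java.util.ArrayList;
    import java.util.List;
    import org.apache.lucene.index.FilterMergePolicy;
    import org.apache.lucene.index.MergePolicy;
    import org.apache.lucene.index.MergeTrigger;
    import org.apache.lucene.index.SegmentCommitInfo;
    import org.apache.lucene.index.SegmentInfos;

    public class MergeOnCommitPolicySketch extends FilterMergePolicy {
      public MergeOnCommitPolicySketch(MergePolicy in) {
        super(in);
      }

      @Override
      public MergeSpecification findFullFlushMerges(MergeTrigger trigger, SegmentInfos infos,
                                                    MergeContext context) throws IOException {
        // Skip segments already claimed by a registered merge, per the javadoc above.
        List<SegmentCommitInfo> candidates = new ArrayList<>();
        for (SegmentCommitInfo sci : infos) {
          if (context.getMergingSegments().contains(sci) == false) {
            candidates.add(sci);
          }
        }
        if (candidates.size() < 2) {
          return null; // nothing worth merging on this commit
        }
        MergeSpecification spec = new MergeSpecification();
        spec.add(new OneMerge(candidates));
        return spec;
      }
    }
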
diff --git a/lucene/core/src/java/org/apache/lucene/index/MergeTrigger.java b/lucene/core/src/java/org/apache/lucene/index/MergeTrigger.java
index d165a27008f4..01a6b15a0358 100644
--- a/lucene/core/src/java/org/apache/lucene/index/MergeTrigger.java
+++ b/lucene/core/src/java/org/apache/lucene/index/MergeTrigger.java
@@ -47,5 +47,10 @@ public enum MergeTrigger {
/**
* Merge was triggered by a closing IndexWriter.
*/
- CLOSING
+ CLOSING,
+
+ /**
+ * Merge was triggered on commit.
+ */
+ COMMIT,
}
diff --git a/lucene/core/src/java/org/apache/lucene/index/MultiDocValues.java b/lucene/core/src/java/org/apache/lucene/index/MultiDocValues.java
index b9ee2f58bb73..d515b6dfc9e1 100644
--- a/lucene/core/src/java/org/apache/lucene/index/MultiDocValues.java
+++ b/lucene/core/src/java/org/apache/lucene/index/MultiDocValues.java
@@ -434,7 +434,7 @@ public static SortedNumericDocValues getSortedNumericValues(final IndexReader r,
LeafReaderContext context = leaves.get(i);
SortedNumericDocValues v = context.reader().getSortedNumericDocValues(field);
if (v == null) {
- v = DocValues.emptySortedNumeric(context.reader().maxDoc());
+ v = DocValues.emptySortedNumeric();
} else {
anyReal = true;
}
diff --git a/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java b/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java
index 23487014ea3b..35d1441155a7 100644
--- a/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java
@@ -24,8 +24,6 @@
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.util.Bits;
-import org.apache.lucene.util.LongValues;
-import org.apache.lucene.util.NumericUtils;
import org.apache.lucene.util.PriorityQueue;
import org.apache.lucene.util.packed.PackedInts;
import org.apache.lucene.util.packed.PackedLongValues;
@@ -41,10 +39,14 @@ static MergeState.DocMap[] sort(Sort sort, List<CodecReader> readers) throws IOE
// TODO: optimize if only 1 reader is incoming, though that's a rare case
SortField fields[] = sort.getSort();
- final ComparableProvider[][] comparables = new ComparableProvider[fields.length][];
+ final IndexSorter.ComparableProvider[][] comparables = new IndexSorter.ComparableProvider[fields.length][];
final int[] reverseMuls = new int[fields.length];
for(int i=0;i<fields.length;i++) {
- private static ComparableProvider[] getComparableProviders(List<CodecReader> readers, SortField sortField) throws IOException {
-
- ComparableProvider[] providers = new ComparableProvider[readers.size()];
- final SortField.Type sortType = Sorter.getSortFieldType(sortField);
-
- switch(sortType) {
-
- case STRING:
- {
- // this uses the efficient segment-local ordinal map:
- final SortedDocValues[] values = new SortedDocValues[readers.size()];
- for(int i=0;i<readers.size();i++) {
diff --git a/lucene/core/src/java/org/apache/lucene/index/NumericDocValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/NumericDocValuesWriter.java
--- a/lucene/core/src/java/org/apache/lucene/index/NumericDocValuesWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/NumericDocValuesWriter.java
-class NumericDocValuesWriter extends DocValuesWriter {
+class NumericDocValuesWriter extends DocValuesWriter<NumericDocValues> {
private PackedLongValues.Builder pending;
private PackedLongValues finalValues;
@@ -70,21 +69,11 @@ private void updateBytesUsed() {
}
@Override
- public void finish(int maxDoc) {
- }
-
- @Override
- Sorter.DocComparator getDocComparator(int maxDoc, SortField sortField) throws IOException {
- assert finalValues == null;
- finalValues = pending.build();
- final BufferedNumericDocValues docValues =
- new BufferedNumericDocValues(finalValues, docsWithField.iterator());
- return Sorter.getDocComparator(maxDoc, sortField, () -> null, () -> docValues);
- }
-
- @Override
- DocIdSetIterator getDocIdSet() {
- return docsWithField.iterator();
+ NumericDocValues getDocValues() {
+ if (finalValues == null) {
+ finalValues = pending.build();
+ }
+ return new BufferedNumericDocValues(finalValues, docsWithField.iterator());
}
static SortingLeafReader.CachedNumericDVs sortDocValues(int maxDoc, Sorter.DocMap sortMap, NumericDocValues oldDocValues) throws IOException {
@@ -104,16 +93,12 @@ static SortingLeafReader.CachedNumericDVs sortDocValues(int maxDoc, Sorter.DocMa
@Override
public void flush(SegmentWriteState state, Sorter.DocMap sortMap, DocValuesConsumer dvConsumer) throws IOException {
- final PackedLongValues values;
if (finalValues == null) {
- values = pending.build();
- } else {
- values = finalValues;
+ finalValues = pending.build();
}
-
final SortingLeafReader.CachedNumericDVs sorted;
if (sortMap != null) {
- NumericDocValues oldValues = new BufferedNumericDocValues(values, docsWithField.iterator());
+ NumericDocValues oldValues = new BufferedNumericDocValues(finalValues, docsWithField.iterator());
sorted = sortDocValues(state.segmentInfo.maxDoc(), sortMap, oldValues);
} else {
sorted = null;
@@ -127,7 +112,7 @@ public NumericDocValues getNumeric(FieldInfo fieldInfo) {
throw new IllegalArgumentException("wrong fieldInfo");
}
if (sorted == null) {
- return new BufferedNumericDocValues(values, docsWithField.iterator());
+ return new BufferedNumericDocValues(finalValues, docsWithField.iterator());
} else {
return new SortingLeafReader.SortingNumericDocValues(sorted);
}
diff --git a/lucene/core/src/java/org/apache/lucene/index/OneMergeWrappingMergePolicy.java b/lucene/core/src/java/org/apache/lucene/index/OneMergeWrappingMergePolicy.java
index d08711eb061b..a5fd66a7c0a0 100644
--- a/lucene/core/src/java/org/apache/lucene/index/OneMergeWrappingMergePolicy.java
+++ b/lucene/core/src/java/org/apache/lucene/index/OneMergeWrappingMergePolicy.java
@@ -59,6 +59,11 @@ public MergeSpecification findForcedDeletesMerges(SegmentInfos segmentInfos, Mer
return wrapSpec(in.findForcedDeletesMerges(segmentInfos, mergeContext));
}
+ @Override
+ public MergeSpecification findFullFlushMerges(MergeTrigger mergeTrigger, SegmentInfos segmentInfos, MergeContext mergeContext) throws IOException {
+ return wrapSpec(in.findFullFlushMerges(mergeTrigger, segmentInfos, mergeContext));
+ }
+
private MergeSpecification wrapSpec(MergeSpecification spec) {
MergeSpecification wrapped = spec == null ? null : new MergeSpecification();
if (wrapped != null) {
diff --git a/lucene/core/src/java/org/apache/lucene/index/ParallelPostingsArray.java b/lucene/core/src/java/org/apache/lucene/index/ParallelPostingsArray.java
index 35e8e4f89217..245216f198b3 100644
--- a/lucene/core/src/java/org/apache/lucene/index/ParallelPostingsArray.java
+++ b/lucene/core/src/java/org/apache/lucene/index/ParallelPostingsArray.java
@@ -22,14 +22,14 @@ class ParallelPostingsArray {
final static int BYTES_PER_POSTING = 3 * Integer.BYTES;
final int size;
- final int[] textStarts;
- final int[] intStarts;
- final int[] byteStarts;
+ final int[] textStarts; // maps term ID to the terms's text start in the bytesHash
+ final int[] addressOffset; // maps term ID to current stream address
+ final int[] byteStarts; // maps term ID to stream start offset in the byte pool
ParallelPostingsArray(final int size) {
this.size = size;
textStarts = new int[size];
- intStarts = new int[size];
+ addressOffset = new int[size];
byteStarts = new int[size];
}
@@ -50,7 +50,7 @@ final ParallelPostingsArray grow() {
void copyTo(ParallelPostingsArray toArray, int numToCopy) {
System.arraycopy(textStarts, 0, toArray.textStarts, 0, numToCopy);
- System.arraycopy(intStarts, 0, toArray.intStarts, 0, numToCopy);
+ System.arraycopy(addressOffset, 0, toArray.addressOffset, 0, numToCopy);
System.arraycopy(byteStarts, 0, toArray.byteStarts, 0, numToCopy);
}
}
diff --git a/lucene/core/src/java/org/apache/lucene/index/SegmentInfos.java b/lucene/core/src/java/org/apache/lucene/index/SegmentInfos.java
index f9edccd46f58..dc379ab3cd91 100644
--- a/lucene/core/src/java/org/apache/lucene/index/SegmentInfos.java
+++ b/lucene/core/src/java/org/apache/lucene/index/SegmentInfos.java
@@ -304,136 +304,146 @@ public static final SegmentInfos readCommit(Directory directory, String segmentF
/** Read the commit from the provided {@link ChecksumIndexInput}. */
public static final SegmentInfos readCommit(Directory directory, ChecksumIndexInput input, long generation) throws IOException {
+ Throwable priorE = null;
+ int format = -1;
+ try {
+ // NOTE: as long as we want to throw indexformattooold (vs corruptindexexception), we need
+ // to read the magic ourselves.
+ int magic = input.readInt();
+ if (magic != CodecUtil.CODEC_MAGIC) {
+ throw new IndexFormatTooOldException(input, magic, CodecUtil.CODEC_MAGIC, CodecUtil.CODEC_MAGIC);
+ }
+ format = CodecUtil.checkHeaderNoMagic(input, "segments", VERSION_70, VERSION_CURRENT);
+ byte id[] = new byte[StringHelper.ID_LENGTH];
+ input.readBytes(id, 0, id.length);
+ CodecUtil.checkIndexHeaderSuffix(input, Long.toString(generation, Character.MAX_RADIX));
+
+ Version luceneVersion = Version.fromBits(input.readVInt(), input.readVInt(), input.readVInt());
+ int indexCreatedVersion = input.readVInt();
+ if (luceneVersion.major < indexCreatedVersion) {
+ throw new CorruptIndexException("Creation version [" + indexCreatedVersion
+ + ".x] can't be greater than the version that wrote the segment infos: [" + luceneVersion + "]" , input);
+ }
- // NOTE: as long as we want to throw indexformattooold (vs corruptindexexception), we need
- // to read the magic ourselves.
- int magic = input.readInt();
- if (magic != CodecUtil.CODEC_MAGIC) {
- throw new IndexFormatTooOldException(input, magic, CodecUtil.CODEC_MAGIC, CodecUtil.CODEC_MAGIC);
- }
- int format = CodecUtil.checkHeaderNoMagic(input, "segments", VERSION_70, VERSION_CURRENT);
- byte id[] = new byte[StringHelper.ID_LENGTH];
- input.readBytes(id, 0, id.length);
- CodecUtil.checkIndexHeaderSuffix(input, Long.toString(generation, Character.MAX_RADIX));
-
- Version luceneVersion = Version.fromBits(input.readVInt(), input.readVInt(), input.readVInt());
- int indexCreatedVersion = input.readVInt();
- if (luceneVersion.major < indexCreatedVersion) {
- throw new CorruptIndexException("Creation version [" + indexCreatedVersion
- + ".x] can't be greater than the version that wrote the segment infos: [" + luceneVersion + "]" , input);
- }
-
- if (indexCreatedVersion < Version.LATEST.major - 1) {
- throw new IndexFormatTooOldException(input, "This index was initially created with Lucene "
- + indexCreatedVersion + ".x while the current version is " + Version.LATEST
- + " and Lucene only supports reading the current and previous major versions.");
- }
-
- SegmentInfos infos = new SegmentInfos(indexCreatedVersion);
- infos.id = id;
- infos.generation = generation;
- infos.lastGeneration = generation;
- infos.luceneVersion = luceneVersion;
-
- infos.version = input.readLong();
- //System.out.println("READ sis version=" + infos.version);
- if (format > VERSION_70) {
- infos.counter = input.readVLong();
- } else {
- infos.counter = input.readInt();
- }
- int numSegments = input.readInt();
- if (numSegments < 0) {
- throw new CorruptIndexException("invalid segment count: " + numSegments, input);
- }
+ if (indexCreatedVersion < Version.LATEST.major - 1) {
+ throw new IndexFormatTooOldException(input, "This index was initially created with Lucene "
+ + indexCreatedVersion + ".x while the current version is " + Version.LATEST
+ + " and Lucene only supports reading the current and previous major versions.");
+ }
- if (numSegments > 0) {
- infos.minSegmentLuceneVersion = Version.fromBits(input.readVInt(), input.readVInt(), input.readVInt());
- } else {
- // else leave as null: no segments
- }
+ SegmentInfos infos = new SegmentInfos(indexCreatedVersion);
+ infos.id = id;
+ infos.generation = generation;
+ infos.lastGeneration = generation;
+ infos.luceneVersion = luceneVersion;
- long totalDocs = 0;
- for (int seg = 0; seg < numSegments; seg++) {
- String segName = input.readString();
- byte[] segmentID = new byte[StringHelper.ID_LENGTH];
- input.readBytes(segmentID, 0, segmentID.length);
- Codec codec = readCodec(input);
- SegmentInfo info = codec.segmentInfoFormat().read(directory, segName, segmentID, IOContext.READ);
- info.setCodec(codec);
- totalDocs += info.maxDoc();
- long delGen = input.readLong();
- int delCount = input.readInt();
- if (delCount < 0 || delCount > info.maxDoc()) {
- throw new CorruptIndexException("invalid deletion count: " + delCount + " vs maxDoc=" + info.maxDoc(), input);
- }
- long fieldInfosGen = input.readLong();
- long dvGen = input.readLong();
- int softDelCount = format > VERSION_72 ? input.readInt() : 0;
- if (softDelCount < 0 || softDelCount > info.maxDoc()) {
- throw new CorruptIndexException("invalid deletion count: " + softDelCount + " vs maxDoc=" + info.maxDoc(), input);
+ infos.version = input.readLong();
+ //System.out.println("READ sis version=" + infos.version);
+ if (format > VERSION_70) {
+ infos.counter = input.readVLong();
+ } else {
+ infos.counter = input.readInt();
}
- if (softDelCount + delCount > info.maxDoc()) {
- throw new CorruptIndexException("invalid deletion count: " + softDelCount + delCount + " vs maxDoc=" + info.maxDoc(), input);
+ int numSegments = input.readInt();
+ if (numSegments < 0) {
+ throw new CorruptIndexException("invalid segment count: " + numSegments, input);
}
- final byte[] sciId;
- if (format > VERSION_74) {
- byte marker = input.readByte();
- switch (marker) {
- case 1:
- sciId = new byte[StringHelper.ID_LENGTH];
- input.readBytes(sciId, 0, sciId.length);
- break;
- case 0:
- sciId = null;
- break;
- default:
- throw new CorruptIndexException("invalid SegmentCommitInfo ID marker: " + marker, input);
- }
+
+ if (numSegments > 0) {
+ infos.minSegmentLuceneVersion = Version.fromBits(input.readVInt(), input.readVInt(), input.readVInt());
} else {
- sciId = null;
+ // else leave as null: no segments
}
- SegmentCommitInfo siPerCommit = new SegmentCommitInfo(info, delCount, softDelCount, delGen, fieldInfosGen, dvGen, sciId);
- siPerCommit.setFieldInfosFiles(input.readSetOfStrings());
- final Map<Integer,Set<String>> dvUpdateFiles;
- final int numDVFields = input.readInt();
- if (numDVFields == 0) {
- dvUpdateFiles = Collections.emptyMap();
- } else {
- Map<Integer,Set<String>> map = new HashMap<>(numDVFields);
- for (int i = 0; i < numDVFields; i++) {
- map.put(input.readInt(), input.readSetOfStrings());
+
+ long totalDocs = 0;
+ for (int seg = 0; seg < numSegments; seg++) {
+ String segName = input.readString();
+ byte[] segmentID = new byte[StringHelper.ID_LENGTH];
+ input.readBytes(segmentID, 0, segmentID.length);
+ Codec codec = readCodec(input);
+ SegmentInfo info = codec.segmentInfoFormat().read(directory, segName, segmentID, IOContext.READ);
+ info.setCodec(codec);
+ totalDocs += info.maxDoc();
+ long delGen = input.readLong();
+ int delCount = input.readInt();
+ if (delCount < 0 || delCount > info.maxDoc()) {
+ throw new CorruptIndexException("invalid deletion count: " + delCount + " vs maxDoc=" + info.maxDoc(), input);
}
- dvUpdateFiles = Collections.unmodifiableMap(map);
- }
- siPerCommit.setDocValuesUpdatesFiles(dvUpdateFiles);
- infos.add(siPerCommit);
+ long fieldInfosGen = input.readLong();
+ long dvGen = input.readLong();
+ int softDelCount = format > VERSION_72 ? input.readInt() : 0;
+ if (softDelCount < 0 || softDelCount > info.maxDoc()) {
+ throw new CorruptIndexException("invalid deletion count: " + softDelCount + " vs maxDoc=" + info.maxDoc(), input);
+ }
+ if (softDelCount + delCount > info.maxDoc()) {
+ throw new CorruptIndexException("invalid deletion count: " + softDelCount + delCount + " vs maxDoc=" + info.maxDoc(), input);
+ }
+ final byte[] sciId;
+ if (format > VERSION_74) {
+ byte marker = input.readByte();
+ switch (marker) {
+ case 1:
+ sciId = new byte[StringHelper.ID_LENGTH];
+ input.readBytes(sciId, 0, sciId.length);
+ break;
+ case 0:
+ sciId = null;
+ break;
+ default:
+ throw new CorruptIndexException("invalid SegmentCommitInfo ID marker: " + marker, input);
+ }
+ } else {
+ sciId = null;
+ }
+ SegmentCommitInfo siPerCommit = new SegmentCommitInfo(info, delCount, softDelCount, delGen, fieldInfosGen, dvGen, sciId);
+ siPerCommit.setFieldInfosFiles(input.readSetOfStrings());
+ final Map<Integer,Set<String>> dvUpdateFiles;
+ final int numDVFields = input.readInt();
+ if (numDVFields == 0) {
+ dvUpdateFiles = Collections.emptyMap();
+ } else {
+ Map<Integer,Set<String>> map = new HashMap<>(numDVFields);
+ for (int i = 0; i < numDVFields; i++) {
+ map.put(input.readInt(), input.readSetOfStrings());
+ }
+ dvUpdateFiles = Collections.unmodifiableMap(map);
+ }
+ siPerCommit.setDocValuesUpdatesFiles(dvUpdateFiles);
+ infos.add(siPerCommit);
- Version segmentVersion = info.getVersion();
+ Version segmentVersion = info.getVersion();
- if (segmentVersion.onOrAfter(infos.minSegmentLuceneVersion) == false) {
- throw new CorruptIndexException("segments file recorded minSegmentLuceneVersion=" + infos.minSegmentLuceneVersion + " but segment=" + info + " has older version=" + segmentVersion, input);
- }
+ if (segmentVersion.onOrAfter(infos.minSegmentLuceneVersion) == false) {
+ throw new CorruptIndexException("segments file recorded minSegmentLuceneVersion=" + infos.minSegmentLuceneVersion + " but segment=" + info + " has older version=" + segmentVersion, input);
+ }
- if (infos.indexCreatedVersionMajor >= 7 && segmentVersion.major < infos.indexCreatedVersionMajor) {
- throw new CorruptIndexException("segments file recorded indexCreatedVersionMajor=" + infos.indexCreatedVersionMajor + " but segment=" + info + " has older version=" + segmentVersion, input);
- }
+ if (infos.indexCreatedVersionMajor >= 7 && segmentVersion.major < infos.indexCreatedVersionMajor) {
+ throw new CorruptIndexException("segments file recorded indexCreatedVersionMajor=" + infos.indexCreatedVersionMajor + " but segment=" + info + " has older version=" + segmentVersion, input);
+ }
- if (infos.indexCreatedVersionMajor >= 7 && info.getMinVersion() == null) {
- throw new CorruptIndexException("segments infos must record minVersion with indexCreatedVersionMajor=" + infos.indexCreatedVersionMajor, input);
+ if (infos.indexCreatedVersionMajor >= 7 && info.getMinVersion() == null) {
+ throw new CorruptIndexException("segments infos must record minVersion with indexCreatedVersionMajor=" + infos.indexCreatedVersionMajor, input);
+ }
}
- }
- infos.userData = input.readMapOfStrings();
+ infos.userData = input.readMapOfStrings();
- CodecUtil.checkFooter(input);
+ // LUCENE-6299: check we are in bounds
+ if (totalDocs > IndexWriter.getActualMaxDocs()) {
+ throw new CorruptIndexException("Too many documents: an index cannot exceed " + IndexWriter.getActualMaxDocs() + " but readers have total maxDoc=" + totalDocs, input);
+ }
- // LUCENE-6299: check we are in bounds
- if (totalDocs > IndexWriter.getActualMaxDocs()) {
- throw new CorruptIndexException("Too many documents: an index cannot exceed " + IndexWriter.getActualMaxDocs() + " but readers have total maxDoc=" + totalDocs, input);
+ return infos;
+ } catch (Throwable t) {
+ priorE = t;
+ } finally {
+ if (format >= VERSION_70) { // oldest supported version
+ CodecUtil.checkFooter(input, priorE);
+ } else {
+ throw IOUtils.rethrowAlways(priorE);
+ }
}
-
- return infos;
+ throw new Error("Unreachable code");
}
private static Codec readCodec(DataInput input) throws IOException {
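
The restructured readCommit above follows a standard Lucene error-handling shape: remember any decode-time exception and hand it to CodecUtil.checkFooter, which validates the checksum and rethrows with corruption context attached. A sketch of the pattern with a trivial body (readLong stands in for the real decoding):

    import java.io.IOException;
    import org.apache.lucene.codecs.CodecUtil;
    import org.apache.lucene.store.ChecksumIndexInput;

    public class FooterCheckSketch {
      static long readBody(ChecksumIndexInput input) throws IOException {
        Throwable priorE = null;
        try {
          return input.readLong(); // stand-in for the real body decoding
        } catch (Throwable t) {
          priorE = t;
        } finally {
          // verifies the checksum; rethrows priorE wrapped with corruption info if set
          CodecUtil.checkFooter(input, priorE);
        }
        throw new AssertionError("unreachable: checkFooter rethrows priorE");
      }
    }
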
diff --git a/lucene/core/src/java/org/apache/lucene/index/SortFieldProvider.java b/lucene/core/src/java/org/apache/lucene/index/SortFieldProvider.java
new file mode 100644
index 000000000000..290decd73c3b
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/index/SortFieldProvider.java
@@ -0,0 +1,118 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.index;
+
+import java.io.IOException;
+import java.util.Set;
+
+import org.apache.lucene.search.SortField;
+import org.apache.lucene.store.DataInput;
+import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.util.NamedSPILoader;
+
+/**
+ * Reads/Writes a named SortField from a segment info file, used to record index sorts
+ */
+public abstract class SortFieldProvider implements NamedSPILoader.NamedSPI {
+
+ private static class Holder {
+ private static final NamedSPILoader<SortFieldProvider> LOADER = new NamedSPILoader<>(SortFieldProvider.class);
+
+ static NamedSPILoader<SortFieldProvider> getLoader() {
+ if (LOADER == null) {
+ throw new IllegalStateException("You tried to lookup a SortFieldProvider by name before all SortFieldProviders could be initialized. "+
+ "This likely happens if you call SortFieldProvider#forName from a SortFieldProviders's ctor.");
+ }
+ return LOADER;
+ }
+ }
+
+ /**
+ * Looks up a SortFieldProvider by name
+ */
+ public static SortFieldProvider forName(String name) {
+ return Holder.getLoader().lookup(name);
+ }
+
+ /**
+ * Lists all available SortFieldProviders
+ */
+ public static Set<String> availableSortFieldProviders() {
+ return Holder.getLoader().availableServices();
+ }
+
+ /**
+ * Reloads the SortFieldProvider list from the given {@link ClassLoader}.
+ * Changes to the list are visible after the method ends, all
+ * iterators ({@link #availableSortFieldProviders()},...) stay consistent.
+ *
+ * <p><em>NOTE:</em> Only new SortFieldProviders are added, existing ones are
+ * never removed or replaced.
+ *
+ * <p><em>This method is expensive and should only be called for discovery
+ * of new SortFieldProviders on the given classpath/classloader!</em>
+ */
+ public static void reloadSortFieldProviders(ClassLoader classLoader) {
+ Holder.getLoader().reload(classLoader);
+ }
+
+ /**
+ * Writes a SortField to a DataOutput
+ */
+ public static void write(SortField sf, DataOutput output) throws IOException {
+ IndexSorter sorter = sf.getIndexSorter();
+ if (sorter == null) {
+ throw new IllegalArgumentException("Cannot serialize sort field " + sf);
+ }
+ SortFieldProvider provider = SortFieldProvider.forName(sorter.getProviderName());
+ provider.writeSortField(sf, output);
+ }
+
+ /** The name this SortFieldProvider is registered under */
+ protected final String name;
+
+ /**
+ * Creates a new SortFieldProvider.
+ *
+ * The provided name will be written into the index segment: in order
+ * for the segment to be read, this class should be registered with Java's
+ * SPI mechanism (registered in META-INF/ of your jar file, etc).
+ * @param name must be all ASCII alphanumeric, and less than 128 characters in length.
+ */
+ protected SortFieldProvider(String name) {
+ this.name = name;
+ }
+
+ @Override
+ public String getName() {
+ return name;
+ }
+
+ /**
+ * Reads a SortField from serialized bytes
+ */
+ public abstract SortField readSortField(DataInput in) throws IOException;
+
+ /**
+ * Writes a SortField to a DataOutput
+ *
+ * This is used to record index sort information in segment headers
+ */
+ public abstract void writeSortField(SortField sf, DataOutput out) throws IOException;
+
+}
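
A sketch of the round trip this SPI enables, mirroring what the segment-info format does when recording an index sort (the ByteBuffers classes are just a convenient in-memory DataOutput/DataInput pair):

    import java.io.IOException;
    import org.apache.lucene.index.SortFieldProvider;
    import org.apache.lucene.search.SortField;
    import org.apache.lucene.store.ByteBuffersDataInput;
    import org.apache.lucene.store.ByteBuffersDataOutput;

    public class SortFieldRoundTrip {
      static SortField roundTrip(SortField sf) throws IOException {
        ByteBuffersDataOutput out = new ByteBuffersDataOutput();
        SortFieldProvider.write(sf, out); // resolves the provider via sf.getIndexSorter()
        SortFieldProvider provider = SortFieldProvider.forName(sf.getIndexSorter().getProviderName());
        return provider.readSortField(new ByteBuffersDataInput(out.toBufferList()));
      }
    }
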
diff --git a/lucene/core/src/java/org/apache/lucene/index/SortedDocValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/SortedDocValuesWriter.java
index 86d0f0bab338..2252f003cb2d 100644
--- a/lucene/core/src/java/org/apache/lucene/index/SortedDocValuesWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/SortedDocValuesWriter.java
@@ -21,7 +21,6 @@
import org.apache.lucene.codecs.DocValuesConsumer;
import org.apache.lucene.search.DocIdSetIterator;
-import org.apache.lucene.search.SortField;
import org.apache.lucene.util.ByteBlockPool;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefHash;
@@ -35,7 +34,7 @@
/** Buffers up pending byte[] per doc, deref and sorting via
* int ord, then flushes when segment flushes. */
-class SortedDocValuesWriter extends DocValuesWriter {
+class SortedDocValuesWriter extends DocValuesWriter<SortedDocValues> {
final BytesRefHash hash;
private PackedLongValues.Builder pending;
private DocsWithFieldSet docsWithField;
@@ -79,11 +78,6 @@ public void addValue(int docID, BytesRef value) {
lastDocID = docID;
}
- @Override
- public void finish(int maxDoc) {
- updateBytesUsed();
- }
-
private void addOneValue(BytesRef value) {
int termID = hash.add(value);
if (termID < 0) {
@@ -107,20 +101,20 @@ private void updateBytesUsed() {
}
@Override
- Sorter.DocComparator getDocComparator(int maxDoc, SortField sortField) throws IOException {
- assert sortField.getType().equals(SortField.Type.STRING);
- assert finalSortedValues == null && finalOrdMap == null &&finalOrds == null;
+ SortedDocValues getDocValues() {
int valueCount = hash.size();
- finalSortedValues = hash.sort();
- finalOrds = pending.build();
- finalOrdMap = new int[valueCount];
+ if (finalSortedValues == null) {
+ updateBytesUsed();
+ assert finalOrdMap == null && finalOrds == null;
+ finalSortedValues = hash.sort();
+ finalOrds = pending.build();
+ finalOrdMap = new int[valueCount];
+ }
for (int ord = 0; ord < valueCount; ord++) {
finalOrdMap[finalSortedValues[ord]] = ord;
}
- final SortedDocValues docValues =
- new BufferedSortedDocValues(hash, valueCount, finalOrds, finalSortedValues, finalOrdMap,
+ return new BufferedSortedDocValues(hash, valueCount, finalOrds, finalSortedValues, finalOrdMap,
docsWithField.iterator());
- return Sorter.getDocComparator(maxDoc, sortField, () -> docValues, () -> null);
}
private int[] sortDocValues(int maxDoc, Sorter.DocMap sortMap, SortedDocValues oldValues) throws IOException {
@@ -137,26 +131,20 @@ private int[] sortDocValues(int maxDoc, Sorter.DocMap sortMap, SortedDocValues o
@Override
public void flush(SegmentWriteState state, Sorter.DocMap sortMap, DocValuesConsumer dvConsumer) throws IOException {
final int valueCount = hash.size();
- final PackedLongValues ords;
- final int[] sortedValues;
- final int[] ordMap;
if (finalOrds == null) {
- sortedValues = hash.sort();
- ords = pending.build();
- ordMap = new int[valueCount];
+ updateBytesUsed();
+ finalSortedValues = hash.sort();
+ finalOrds = pending.build();
+ finalOrdMap = new int[valueCount];
for (int ord = 0; ord < valueCount; ord++) {
- ordMap[sortedValues[ord]] = ord;
+ finalOrdMap[finalSortedValues[ord]] = ord;
}
- } else {
- sortedValues = finalSortedValues;
- ords = finalOrds;
- ordMap = finalOrdMap;
}
final int[] sorted;
if (sortMap != null) {
sorted = sortDocValues(state.segmentInfo.maxDoc(), sortMap,
- new BufferedSortedDocValues(hash, valueCount, ords, sortedValues, ordMap, docsWithField.iterator()));
+ new BufferedSortedDocValues(hash, valueCount, finalOrds, finalSortedValues, finalOrdMap, docsWithField.iterator()));
} else {
sorted = null;
}
@@ -168,7 +156,7 @@ public SortedDocValues getSorted(FieldInfo fieldInfoIn) {
throw new IllegalArgumentException("wrong fieldInfo");
}
final SortedDocValues buf =
- new BufferedSortedDocValues(hash, valueCount, ords, sortedValues, ordMap, docsWithField.iterator());
+ new BufferedSortedDocValues(hash, valueCount, finalOrds, finalSortedValues, finalOrdMap, docsWithField.iterator());
if (sorted == null) {
return buf;
}
@@ -245,8 +233,4 @@ public int getValueCount() {
}
}
- @Override
- DocIdSetIterator getDocIdSet() {
- return docsWithField.iterator();
- }
}
diff --git a/lucene/core/src/java/org/apache/lucene/index/SortedNumericDocValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/SortedNumericDocValuesWriter.java
index bdc65cc80574..83c394fc207b 100644
--- a/lucene/core/src/java/org/apache/lucene/index/SortedNumericDocValuesWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/SortedNumericDocValuesWriter.java
@@ -22,9 +22,6 @@
import org.apache.lucene.codecs.DocValuesConsumer;
import org.apache.lucene.search.DocIdSetIterator;
-import org.apache.lucene.search.SortField;
-import org.apache.lucene.search.SortedNumericSelector;
-import org.apache.lucene.search.SortedNumericSortField;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Counter;
import org.apache.lucene.util.RamUsageEstimator;
@@ -34,7 +31,7 @@
import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
/** Buffers up pending long[] per doc, sorts, then flushes when segment flushes. */
-class SortedNumericDocValuesWriter extends DocValuesWriter {
+class SortedNumericDocValuesWriter extends DocValuesWriter<SortedNumericDocValues> {
private PackedLongValues.Builder pending; // stream of all values
private PackedLongValues.Builder pendingCounts; // count of values per doc
private DocsWithFieldSet docsWithField;
@@ -85,11 +82,6 @@ private void finishCurrentDoc() {
docsWithField.add(currentDoc);
}
- @Override
- public void finish(int maxDoc) {
- finishCurrentDoc();
- }
-
private void addOneValue(long value) {
if (currentUpto == currentValues.length) {
currentValues = ArrayUtil.grow(currentValues, currentValues.length+1);
@@ -106,16 +98,14 @@ private void updateBytesUsed() {
}
@Override
- Sorter.DocComparator getDocComparator(int maxDoc, SortField sortField) throws IOException {
- assert sortField instanceof SortedNumericSortField;
- assert finalValues == null && finalValuesCount == null;
- finalValues = pending.build();
- finalValuesCount = pendingCounts.build();
- final SortedNumericDocValues docValues =
- new BufferedSortedNumericDocValues(finalValues, finalValuesCount, docsWithField.iterator());
- SortedNumericSortField sf = (SortedNumericSortField) sortField;
- return Sorter.getDocComparator(maxDoc, sf, () -> null,
- () -> SortedNumericSelector.wrap(docValues, sf.getSelector(), sf.getNumericType()));
+ SortedNumericDocValues getDocValues() {
+ if (finalValues == null) {
+ assert finalValuesCount == null;
+ finishCurrentDoc();
+ finalValues = pending.build();
+ finalValuesCount = pendingCounts.build();
+ }
+ return new BufferedSortedNumericDocValues(finalValues, finalValuesCount, docsWithField.iterator());
}
private long[][] sortDocValues(int maxDoc, Sorter.DocMap sortMap, SortedNumericDocValues oldValues) throws IOException {
@@ -137,6 +127,7 @@ public void flush(SegmentWriteState state, Sorter.DocMap sortMap, DocValuesConsu
final PackedLongValues values;
final PackedLongValues valueCounts;
if (finalValues == null) {
+ finishCurrentDoc();
values = pending.build();
valueCounts = pendingCounts.build();
} else {
@@ -232,8 +223,4 @@ public long cost() {
}
}
- @Override
- DocIdSetIterator getDocIdSet() {
- return docsWithField.iterator();
- }
}
diff --git a/lucene/core/src/java/org/apache/lucene/index/SortedSetDocValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/SortedSetDocValuesWriter.java
index 71a14a5cb7cb..022b17da0956 100644
--- a/lucene/core/src/java/org/apache/lucene/index/SortedSetDocValuesWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/SortedSetDocValuesWriter.java
@@ -21,9 +21,6 @@
import org.apache.lucene.codecs.DocValuesConsumer;
import org.apache.lucene.search.DocIdSetIterator;
-import org.apache.lucene.search.SortField;
-import org.apache.lucene.search.SortedSetSelector;
-import org.apache.lucene.search.SortedSetSortField;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.ByteBlockPool;
import org.apache.lucene.util.BytesRef;
@@ -39,7 +36,7 @@
/** Buffers up pending byte[]s per doc, deref and sorting via
* int ord, then flushes when segment flushes. */
-class SortedSetDocValuesWriter extends DocValuesWriter {
+class SortedSetDocValuesWriter extends DocValuesWriter<SortedSetDocValues> {
final BytesRefHash hash;
private PackedLongValues.Builder pending; // stream of all termIDs
private PackedLongValues.Builder pendingCounts; // termIDs per doc
@@ -115,11 +112,6 @@ private void finishCurrentDoc() {
docsWithField.add(currentDoc);
}
- @Override
- public void finish(int maxDoc) {
- finishCurrentDoc();
- }
-
private void addOneValue(BytesRef value) {
int termID = hash.add(value);
if (termID < 0) {
@@ -170,22 +162,20 @@ private long[][] sortDocValues(int maxDoc, Sorter.DocMap sortMap, SortedSetDocVa
}
@Override
- Sorter.DocComparator getDocComparator(int maxDoc, SortField sortField) throws IOException {
- assert sortField instanceof SortedSetSortField;
- assert finalOrds == null && finalOrdCounts == null && finalSortedValues == null && finalOrdMap == null;
- int valueCount = hash.size();
- finalOrds = pending.build();
- finalOrdCounts = pendingCounts.build();
- finalSortedValues = hash.sort();
- finalOrdMap = new int[valueCount];
- for (int ord = 0; ord < valueCount; ord++) {
+ SortedSetDocValues getDocValues() {
+ if (finalOrds == null) {
+ assert finalOrdCounts == null && finalSortedValues == null && finalOrdMap == null;
+ finishCurrentDoc();
+ int valueCount = hash.size();
+ finalOrds = pending.build();
+ finalOrdCounts = pendingCounts.build();
+ finalSortedValues = hash.sort();
+ finalOrdMap = new int[valueCount];
+ }
+ for (int ord = 0; ord < finalOrdMap.length; ord++) {
finalOrdMap[finalSortedValues[ord]] = ord;
}
-
- SortedSetSortField sf = (SortedSetSortField) sortField;
- final SortedSetDocValues dvs =
- new BufferedSortedSetDocValues(finalSortedValues, finalOrdMap, hash, finalOrds, finalOrdCounts, maxCount, docsWithField.iterator());
- return Sorter.getDocComparator(maxDoc, sf, () -> SortedSetSelector.wrap(dvs, sf.getSelector()), () -> null);
+ return new BufferedSortedSetDocValues(finalSortedValues, finalOrdMap, hash, finalOrds, finalOrdCounts, maxCount, docsWithField.iterator());
}
@Override
@@ -196,7 +186,9 @@ public void flush(SegmentWriteState state, Sorter.DocMap sortMap, DocValuesConsu
final int[] sortedValues;
final int[] ordMap;
- if (finalOrdCounts == null) {
+ if (finalOrds == null) {
+ assert finalOrdCounts == null && finalSortedValues == null && finalOrdMap == null;
+ finishCurrentDoc();
ords = pending.build();
ordCounts = pendingCounts.build();
sortedValues = hash.sort();
@@ -315,8 +307,5 @@ public BytesRef lookupOrd(long ord) {
return scratch;
}
}
- @Override
- DocIdSetIterator getDocIdSet() {
- return docsWithField.iterator();
- }
+
}
diff --git a/lucene/core/src/java/org/apache/lucene/index/Sorter.java b/lucene/core/src/java/org/apache/lucene/index/Sorter.java
index 5f43c5ad1566..c8605696fd35 100644
--- a/lucene/core/src/java/org/apache/lucene/index/Sorter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/Sorter.java
@@ -17,22 +17,13 @@
package org.apache.lucene.index;
import java.io.IOException;
-import java.util.Arrays;
-import java.util.Comparator;
-import org.apache.lucene.search.FieldComparator;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
-import org.apache.lucene.search.SortedNumericSelector;
-import org.apache.lucene.search.SortedNumericSortField;
-import org.apache.lucene.search.SortedSetSelector;
-import org.apache.lucene.search.SortedSetSortField;
import org.apache.lucene.util.TimSorter;
import org.apache.lucene.util.packed.PackedInts;
import org.apache.lucene.util.packed.PackedLongValues;
-import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
-
/**
* Sorts documents of a given index by returning a permutation on the document
* IDs.
@@ -84,21 +75,13 @@ static boolean isConsistent(DocMap docMap) {
return true;
}
- /** A comparator of doc IDs. */
- static abstract class DocComparator {
-
- /** Compare docID1 against docID2. The contract for the return value is the
- * same as {@link Comparator#compare(Object, Object)}. */
- public abstract int compare(int docID1, int docID2);
- }
-
private static final class DocValueSorter extends TimSorter {
private final int[] docs;
- private final Sorter.DocComparator comparator;
+ private final IndexSorter.DocComparator comparator;
private final int[] tmp;
- DocValueSorter(int[] docs, Sorter.DocComparator comparator) {
+ DocValueSorter(int[] docs, IndexSorter.DocComparator comparator) {
super(docs.length / 64);
this.docs = docs;
this.comparator = comparator;
@@ -139,7 +122,7 @@ protected int compareSaved(int i, int j) {
}
/** Computes the old-to-new permutation over the given comparator. */
- private static Sorter.DocMap sort(final int maxDoc, DocComparator comparator) {
+ private static Sorter.DocMap sort(final int maxDoc, IndexSorter.DocComparator comparator) {
// check if the index is sorted
boolean sorted = true;
for (int i = 1; i < maxDoc; ++i) {
@@ -202,196 +185,10 @@ public int size() {
};
}
- /** Returns the native sort type for {@link SortedSetSortField} and {@link SortedNumericSortField},
- * {@link SortField#getType()} otherwise */
- static SortField.Type getSortFieldType(SortField sortField) {
- if (sortField instanceof SortedSetSortField) {
- return SortField.Type.STRING;
- } else if (sortField instanceof SortedNumericSortField) {
- return ((SortedNumericSortField) sortField).getNumericType();
- } else {
- return sortField.getType();
- }
- }
-
- /** Wraps a {@link SortedNumericDocValues} as a single-valued view if the field is an instance of {@link SortedNumericSortField},
- * returns {@link NumericDocValues} for the field otherwise. */
- static NumericDocValues getOrWrapNumeric(LeafReader reader, SortField sortField) throws IOException {
- if (sortField instanceof SortedNumericSortField) {
- SortedNumericSortField sf = (SortedNumericSortField) sortField;
- return SortedNumericSelector.wrap(DocValues.getSortedNumeric(reader, sf.getField()), sf.getSelector(), sf.getNumericType());
- } else {
- return DocValues.getNumeric(reader, sortField.getField());
- }
- }
-
- /** Wraps a {@link SortedSetDocValues} as a single-valued view if the field is an instance of {@link SortedSetSortField},
- * returns {@link SortedDocValues} for the field otherwise. */
- static SortedDocValues getOrWrapSorted(LeafReader reader, SortField sortField) throws IOException {
- if (sortField instanceof SortedSetSortField) {
- SortedSetSortField sf = (SortedSetSortField) sortField;
- return SortedSetSelector.wrap(DocValues.getSortedSet(reader, sf.getField()), sf.getSelector());
- } else {
- return DocValues.getSorted(reader, sortField.getField());
- }
- }
-
- static DocComparator getDocComparator(LeafReader reader, SortField sortField) throws IOException {
- return getDocComparator(reader.maxDoc(), sortField,
- () -> getOrWrapSorted(reader, sortField),
- () -> getOrWrapNumeric(reader, sortField));
- }
-
- interface NumericDocValuesSupplier {
- NumericDocValues get() throws IOException;
- }
-
- interface SortedDocValuesSupplier {
- SortedDocValues get() throws IOException;
- }
-
- /** We cannot use the {@link FieldComparator} API because that API requires that you send it docIDs in order. Note that this API
- * allocates arrays[maxDoc] to hold the native values needed for comparison, but 1) they are transient (only alive while sorting this one
- * segment), and 2) in the typical index sorting case, they are only used to sort newly flushed segments, which will be smaller than
- * merged segments. */
- static DocComparator getDocComparator(int maxDoc,
- SortField sortField,
- SortedDocValuesSupplier sortedProvider,
- NumericDocValuesSupplier numericProvider) throws IOException {
-
- final int reverseMul = sortField.getReverse() ? -1 : 1;
- final SortField.Type sortType = getSortFieldType(sortField);
-
- switch(sortType) {
-
- case STRING:
- {
- final SortedDocValues sorted = sortedProvider.get();
- final int missingOrd;
- if (sortField.getMissingValue() == SortField.STRING_LAST) {
- missingOrd = Integer.MAX_VALUE;
- } else {
- missingOrd = Integer.MIN_VALUE;
- }
-
- final int[] ords = new int[maxDoc];
- Arrays.fill(ords, missingOrd);
- int docID;
- while ((docID = sorted.nextDoc()) != NO_MORE_DOCS) {
- ords[docID] = sorted.ordValue();
- }
-
- return new DocComparator() {
- @Override
- public int compare(int docID1, int docID2) {
- return reverseMul * Integer.compare(ords[docID1], ords[docID2]);
- }
- };
- }
-
- case LONG:
- {
- final NumericDocValues dvs = numericProvider.get();
- long[] values = new long[maxDoc];
- if (sortField.getMissingValue() != null) {
- Arrays.fill(values, (Long) sortField.getMissingValue());
- }
- while (true) {
- int docID = dvs.nextDoc();
- if (docID == NO_MORE_DOCS) {
- break;
- }
- values[docID] = dvs.longValue();
- }
-
- return new DocComparator() {
- @Override
- public int compare(int docID1, int docID2) {
- return reverseMul * Long.compare(values[docID1], values[docID2]);
- }
- };
- }
-
- case INT:
- {
- final NumericDocValues dvs = numericProvider.get();
- int[] values = new int[maxDoc];
- if (sortField.getMissingValue() != null) {
- Arrays.fill(values, (Integer) sortField.getMissingValue());
- }
-
- while (true) {
- int docID = dvs.nextDoc();
- if (docID == NO_MORE_DOCS) {
- break;
- }
- values[docID] = (int) dvs.longValue();
- }
-
- return new DocComparator() {
- @Override
- public int compare(int docID1, int docID2) {
- return reverseMul * Integer.compare(values[docID1], values[docID2]);
- }
- };
- }
-
- case DOUBLE:
- {
- final NumericDocValues dvs = numericProvider.get();
- double[] values = new double[maxDoc];
- if (sortField.getMissingValue() != null) {
- Arrays.fill(values, (Double) sortField.getMissingValue());
- }
- while (true) {
- int docID = dvs.nextDoc();
- if (docID == NO_MORE_DOCS) {
- break;
- }
- values[docID] = Double.longBitsToDouble(dvs.longValue());
- }
-
- return new DocComparator() {
- @Override
- public int compare(int docID1, int docID2) {
- return reverseMul * Double.compare(values[docID1], values[docID2]);
- }
- };
- }
-
- case FLOAT:
- {
- final NumericDocValues dvs = numericProvider.get();
- float[] values = new float[maxDoc];
- if (sortField.getMissingValue() != null) {
- Arrays.fill(values, (Float) sortField.getMissingValue());
- }
- while (true) {
- int docID = dvs.nextDoc();
- if (docID == NO_MORE_DOCS) {
- break;
- }
- values[docID] = Float.intBitsToFloat((int) dvs.longValue());
- }
-
- return new DocComparator() {
- @Override
- public int compare(int docID1, int docID2) {
- return reverseMul * Float.compare(values[docID1], values[docID2]);
- }
- };
- }
-
- default:
- throw new IllegalArgumentException("unhandled SortField.getType()=" + sortField.getType());
- }
- }
-
-
/**
* Returns a mapping from the old document ID to its new location in the
* sorted index. Implementations can use the auxiliary
- * {@link #sort(int, DocComparator)} to compute the old-to-new permutation
+ * {@link #sort(int, IndexSorter.DocComparator)} to compute the old-to-new permutation
* given a list of documents and their corresponding values.
*
* A return value of null is allowed and means that
@@ -401,28 +198,29 @@ public int compare(int docID1, int docID2) {
* well, they will however be marked as deleted in the sorted view.
*/
DocMap sort(LeafReader reader) throws IOException {
- SortField fields[] = sort.getSort();
- final DocComparator comparators[] = new DocComparator[fields.length];
+ SortField[] fields = sort.getSort();
+ final IndexSorter.DocComparator[] comparators = new IndexSorter.DocComparator[fields.length];
for (int i = 0; i < fields.length; i++) {
- comparators[i] = getDocComparator(reader, fields[i]);
+ IndexSorter sorter = fields[i].getIndexSorter();
+ if (sorter == null) {
+ throw new IllegalArgumentException("Cannot use sortfield + " + fields[i] + " to sort indexes");
+ }
+ comparators[i] = sorter.getDocComparator(reader, reader.maxDoc());
}
return sort(reader.maxDoc(), comparators);
}
- DocMap sort(int maxDoc, DocComparator[] comparators) throws IOException {
- final DocComparator comparator = new DocComparator() {
- @Override
- public int compare(int docID1, int docID2) {
- for (int i = 0; i < comparators.length; i++) {
- int comp = comparators[i].compare(docID1, docID2);
- if (comp != 0) {
- return comp;
- }
+ DocMap sort(int maxDoc, IndexSorter.DocComparator[] comparators) throws IOException {
+ final IndexSorter.DocComparator comparator = (docID1, docID2) -> {
+ for (int i = 0; i < comparators.length; i++) {
+ int comp = comparators[i].compare(docID1, docID2);
+ if (comp != 0) {
+ return comp;
}
- return Integer.compare(docID1, docID2); // docid order tiebreak
}
+ return Integer.compare(docID1, docID2); // docid order tiebreak
};
return sort(maxDoc, comparator);
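The lambda above is the heart of index sorting: each field's comparator is consulted in turn, and ties fall back to docID order so the permutation is stable. A minimal standalone sketch of that chaining logic, using a stand-in interface rather than the real IndexSorter.DocComparator:

```java
import java.util.Arrays;

public class ChainedComparatorSketch {
  // stand-in for IndexSorter.DocComparator
  interface DocComparator {
    int compare(int docID1, int docID2);
  }

  // combine field comparators; ties break on docID, matching the lambda in the diff
  static DocComparator chain(DocComparator[] comparators) {
    return (docID1, docID2) -> {
      for (DocComparator c : comparators) {
        int cmp = c.compare(docID1, docID2);
        if (cmp != 0) {
          return cmp;
        }
      }
      return Integer.compare(docID1, docID2); // docid order tiebreak
    };
  }

  public static void main(String[] args) {
    long[] primary = {5, 3, 5};   // per-doc values for the first sort field
    long[] secondary = {1, 9, 1}; // per-doc values for the second sort field
    DocComparator cmp = chain(new DocComparator[] {
        (a, b) -> Long.compare(primary[a], primary[b]),
        (a, b) -> Long.compare(secondary[a], secondary[b])
    });
    Integer[] docs = {0, 1, 2};
    Arrays.sort(docs, (a, b) -> cmp.compare(a, b));
    // [1, 0, 2]: doc 1 sorts first; docs 0 and 2 tie on both fields and keep docID order
    System.out.println(Arrays.toString(docs));
  }
}
```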
diff --git a/lucene/core/src/java/org/apache/lucene/index/TermVectorsConsumer.java b/lucene/core/src/java/org/apache/lucene/index/TermVectorsConsumer.java
index f9b851760945..80213f92a9e9 100644
--- a/lucene/core/src/java/org/apache/lucene/index/TermVectorsConsumer.java
+++ b/lucene/core/src/java/org/apache/lucene/index/TermVectorsConsumer.java
@@ -44,11 +44,11 @@ class TermVectorsConsumer extends TermsHash {
final ByteSliceReader vectorSliceReaderOff = new ByteSliceReader();
boolean hasVectors;
- int numVectorFields;
+ private int numVectorFields;
int lastDocID;
private TermVectorsConsumerPerField[] perFields = new TermVectorsConsumerPerField[1];
- public TermVectorsConsumer(DocumentsWriterPerThread docWriter) {
+ TermVectorsConsumer(DocumentsWriterPerThread docWriter) {
super(docWriter, false, null);
this.docWriter = docWriter;
}
@@ -91,7 +91,7 @@ void initTermVectorsWriter() throws IOException {
}
@Override
- void finishDocument() throws IOException {
+ void finishDocument(int docID) throws IOException {
if (!hasVectors) {
return;
@@ -102,7 +102,7 @@ void finishDocument() throws IOException {
initTermVectorsWriter();
- fill(docState.docID);
+ fill(docID);
// Append term vectors to the real outputs:
writer.startDocument(numVectorFields);
@@ -111,7 +111,7 @@ void finishDocument() throws IOException {
}
writer.finishDocument();
- assert lastDocID == docState.docID: "lastDocID=" + lastDocID + " docState.docID=" + docState.docID;
+ assert lastDocID == docID: "lastDocID=" + lastDocID + " docID=" + docID;
lastDocID++;
diff --git a/lucene/core/src/java/org/apache/lucene/index/TermVectorsConsumerPerField.java b/lucene/core/src/java/org/apache/lucene/index/TermVectorsConsumerPerField.java
index 4e0aa3cdaadb..a1abd985f31c 100644
--- a/lucene/core/src/java/org/apache/lucene/index/TermVectorsConsumerPerField.java
+++ b/lucene/core/src/java/org/apache/lucene/index/TermVectorsConsumerPerField.java
@@ -20,27 +20,37 @@
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute;
import org.apache.lucene.codecs.TermVectorsWriter;
+import org.apache.lucene.util.ByteBlockPool;
import org.apache.lucene.util.BytesRef;
final class TermVectorsConsumerPerField extends TermsHashPerField {
private TermVectorsPostingsArray termVectorsPostingsArray;
- final TermVectorsConsumer termsWriter;
+ private final TermVectorsConsumer termsWriter;
+ private final FieldInvertState fieldState;
+ private final FieldInfo fieldInfo;
- boolean doVectors;
- boolean doVectorPositions;
- boolean doVectorOffsets;
- boolean doVectorPayloads;
+ private boolean doVectors;
+ private boolean doVectorPositions;
+ private boolean doVectorOffsets;
+ private boolean doVectorPayloads;
- OffsetAttribute offsetAttribute;
- PayloadAttribute payloadAttribute;
- boolean hasPayloads; // if enabled, and we actually saw any for this field
+ private OffsetAttribute offsetAttribute;
+ private PayloadAttribute payloadAttribute;
+ private TermFrequencyAttribute termFreqAtt;
+ private final ByteBlockPool termBytePool;
- public TermVectorsConsumerPerField(FieldInvertState invertState, TermVectorsConsumer termsWriter, FieldInfo fieldInfo) {
- super(2, invertState, termsWriter, null, fieldInfo);
- this.termsWriter = termsWriter;
+ private boolean hasPayloads; // if enabled, and we actually saw any for this field
+
+ TermVectorsConsumerPerField(FieldInvertState invertState, TermVectorsConsumer termsHash, FieldInfo fieldInfo) {
+ super(2, termsHash.intPool, termsHash.bytePool, termsHash.termBytePool, termsHash.bytesUsed, null, fieldInfo.name, fieldInfo.getIndexOptions());
+ this.termsWriter = termsHash;
+ this.fieldInfo = fieldInfo;
+ this.fieldState = invertState;
+ termBytePool = termsHash.termBytePool;
}
/** Called once per field per document if term vectors
@@ -48,7 +58,7 @@ public TermVectorsConsumerPerField(FieldInvertState invertState, TermVectorsCons
* RAMOutputStream, which is then quickly flushed to
* the real term vectors files in the Directory. */
@Override
void finish() {
- if (!doVectors || bytesHash.size() == 0) {
+ if (!doVectors || getNumTerms() == 0) {
return;
}
termsWriter.addFieldToFlush(this);
@@ -61,7 +71,7 @@ void finishDocument() throws IOException {
doVectors = false;
- final int numPostings = bytesHash.size();
+ final int numPostings = getNumTerms();
final BytesRef flushTerm = termsWriter.flushTerm;
@@ -74,7 +84,8 @@ void finishDocument() throws IOException {
TermVectorsPostingsArray postings = termVectorsPostingsArray;
final TermVectorsWriter tv = termsWriter.writer;
- final int[] termIDs = sortPostings();
+ sortTerms();
+ final int[] termIDs = getSortedTermIDs();
tv.startField(fieldInfo, numPostings, doVectorPositions, doVectorOffsets, hasPayloads);
@@ -110,18 +121,19 @@ void finishDocument() throws IOException {
@Override
boolean start(IndexableField field, boolean first) {
super.start(field, first);
+ termFreqAtt = fieldState.termFreqAttribute;
assert field.fieldType().indexOptions() != IndexOptions.NONE;
if (first) {
- if (bytesHash.size() != 0) {
+ if (getNumTerms() != 0) {
// Only necessary if previous doc hit a
// non-aborting exception while writing vectors in
// this field:
reset();
}
- bytesHash.reinit();
+ reinitHash();
hasPayloads = false;
@@ -189,8 +201,8 @@ boolean start(IndexableField field, boolean first) {
return doVectors;
}
-
- void writeProx(TermVectorsPostingsArray postings, int termID) {
+
+ void writeProx(TermVectorsPostingsArray postings, int termID) {
if (doVectorOffsets) {
int startOffset = fieldState.offset + offsetAttribute.startOffset();
int endOffset = fieldState.offset + offsetAttribute.endOffset();
@@ -222,7 +234,7 @@ void writeProx(TermVectorsPostingsArray postings, int termID) {
}
@Override
- void newTerm(final int termID) {
+ void newTerm(final int termID, final int docID) {
TermVectorsPostingsArray postings = termVectorsPostingsArray;
postings.freqs[termID] = getTermFreq();
@@ -233,7 +245,7 @@ void newTerm(final int termID) {
}
@Override
- void addTerm(final int termID) {
+ void addTerm(final int termID, final int docID) {
TermVectorsPostingsArray postings = termVectorsPostingsArray;
postings.freqs[termID] += getTermFreq();
@@ -245,10 +257,10 @@ private int getTermFreq() {
int freq = termFreqAtt.getTermFrequency();
if (freq != 1) {
if (doVectorPositions) {
- throw new IllegalArgumentException("field \"" + fieldInfo.name + "\": cannot index term vector positions while using custom TermFrequencyAttribute");
+ throw new IllegalArgumentException("field \"" + getFieldName() + "\": cannot index term vector positions while using custom TermFrequencyAttribute");
}
if (doVectorOffsets) {
- throw new IllegalArgumentException("field \"" + fieldInfo.name + "\": cannot index term vector offsets while using custom TermFrequencyAttribute");
+ throw new IllegalArgumentException("field \"" + getFieldName() + "\": cannot index term vector offsets while using custom TermFrequencyAttribute");
}
}
@@ -266,7 +278,7 @@ ParallelPostingsArray createPostingsArray(int size) {
}
static final class TermVectorsPostingsArray extends ParallelPostingsArray {
- public TermVectorsPostingsArray(int size) {
+ TermVectorsPostingsArray(int size) {
super(size);
freqs = new int[size];
lastOffsets = new int[size];
diff --git a/lucene/core/src/java/org/apache/lucene/index/TermsHash.java b/lucene/core/src/java/org/apache/lucene/index/TermsHash.java
index f420aca65b23..0f702d925b8e 100644
--- a/lucene/core/src/java/org/apache/lucene/index/TermsHash.java
+++ b/lucene/core/src/java/org/apache/lucene/index/TermsHash.java
@@ -40,14 +40,10 @@ abstract class TermsHash {
final ByteBlockPool bytePool;
ByteBlockPool termBytePool;
final Counter bytesUsed;
-
- final DocumentsWriterPerThread.DocState docState;
-
final boolean trackAllocations;
TermsHash(final DocumentsWriterPerThread docWriter, boolean trackAllocations, TermsHash nextTermsHash) {
- this.docState = docWriter.docState;
- this.trackAllocations = trackAllocations;
+ this.trackAllocations = trackAllocations;
this.nextTermsHash = nextTermsHash;
this.bytesUsed = trackAllocations ? docWriter.bytesUsed : Counter.newCounter();
intPool = new IntBlockPool(docWriter.intBlockAllocator);
@@ -82,7 +78,7 @@ void flush(Map<String,TermsHashPerField> fieldsToFlush, final SegmentWriteState
if (nextTermsHash != null) {
Map<String,TermsHashPerField> nextChildFields = new HashMap<>();
for (final Map.Entry<String,TermsHashPerField> entry : fieldsToFlush.entrySet()) {
- nextChildFields.put(entry.getKey(), entry.getValue().nextPerField);
+ nextChildFields.put(entry.getKey(), entry.getValue().getNextPerField());
}
nextTermsHash.flush(nextChildFields, state, sortMap, norms);
}
@@ -90,9 +86,9 @@ void flush(Map fieldsToFlush, final SegmentWriteState
abstract TermsHashPerField addField(FieldInvertState fieldInvertState, FieldInfo fieldInfo);
- void finishDocument() throws IOException {
+ void finishDocument(int docID) throws IOException {
if (nextTermsHash != null) {
- nextTermsHash.finishDocument();
+ nextTermsHash.finishDocument(docID);
}
}
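finishDocument now threads the docID down the consumer chain explicitly instead of reading it from a shared DocState. A toy sketch of that chain shape (the Stage class is hypothetical; in the real chain a FreqProxTermsWriter forwards to a TermVectorsConsumer):

```java
import java.io.IOException;

public class ChainSketch {
  // each stage does its own per-document work, then forwards the docID downstream
  static abstract class Stage {
    private final Stage next;

    Stage(Stage next) {
      this.next = next;
    }

    void finishDocument(int docID) throws IOException {
      onFinish(docID);              // stage-local work
      if (next != null) {
        next.finishDocument(docID); // explicit propagation, no shared mutable doc state
      }
    }

    abstract void onFinish(int docID) throws IOException;
  }

  public static void main(String[] args) throws IOException {
    Stage vectors = new Stage(null) {
      @Override void onFinish(int docID) { System.out.println("vectors finished doc " + docID); }
    };
    Stage postings = new Stage(vectors) {
      @Override void onFinish(int docID) { System.out.println("postings finished doc " + docID); }
    };
    postings.finishDocument(42);
  }
}
```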
diff --git a/lucene/core/src/java/org/apache/lucene/index/TermsHashPerField.java b/lucene/core/src/java/org/apache/lucene/index/TermsHashPerField.java
index 2586378267da..d3e048703d2d 100644
--- a/lucene/core/src/java/org/apache/lucene/index/TermsHashPerField.java
+++ b/lucene/core/src/java/org/apache/lucene/index/TermsHashPerField.java
@@ -19,182 +19,186 @@
import java.io.IOException;
-import org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute;
-import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.util.ByteBlockPool;
+import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefHash.BytesStartArray;
import org.apache.lucene.util.BytesRefHash;
import org.apache.lucene.util.Counter;
import org.apache.lucene.util.IntBlockPool;
+/**
+ * This class stores streams of information per term without knowing
+ * the size of the stream ahead of time. Each stream typically encodes one level
+ * of information like term frequency per document or term proximity. Internally
+ * this class allocates a linked list of slices that can be read by a {@link ByteSliceReader}
+ * for each term. Terms are first deduplicated in a {@link BytesRefHash}; once this is done,
+ * internal data structures point to the current offset of each stream that can be written to.
+ */
abstract class TermsHashPerField implements Comparable<TermsHashPerField> {
private static final int HASH_INIT_SIZE = 4;
- final TermsHash termsHash;
-
- final TermsHashPerField nextPerField;
- protected final DocumentsWriterPerThread.DocState docState;
- protected final FieldInvertState fieldState;
- TermToBytesRefAttribute termAtt;
- protected TermFrequencyAttribute termFreqAtt;
-
- // Copied from our perThread
- final IntBlockPool intPool;
+ private final TermsHashPerField nextPerField;
+ private final IntBlockPool intPool;
final ByteBlockPool bytePool;
- final ByteBlockPool termBytePool;
-
- final int streamCount;
- final int numPostingInt;
-
- protected final FieldInfo fieldInfo;
-
- final BytesRefHash bytesHash;
+ // For each term we store an integer per stream that points into the bytePool above.
+ // The address is updated once data is written to the stream, so that it points to the next free offset
+ // in the term's stream. The start address of the stream is stored in postingsArray.byteStarts[termId].
+ // This is initialized in the #addTerm method, either to a brand-new per-term stream if the term is new,
+ // or to the addresses where the term's stream was written to when we last saw it.
+ private int[] termStreamAddressBuffer;
+ private int streamAddressOffset;
+ private final int streamCount;
+ private final String fieldName;
+ final IndexOptions indexOptions;
+ /* This stores the actual term bytes for postings and offsets into the parent hash in the case that this
+ * TermsHashPerField is hashing term vectors.*/
+ private final BytesRefHash bytesHash;
ParallelPostingsArray postingsArray;
- private final Counter bytesUsed;
+ private int lastDocID; // only with assert
/** streamCount: how many streams this field stores per term.
* E.g. doc(+freq) is 1 stream, prox+offset is a second. */
-
- public TermsHashPerField(int streamCount, FieldInvertState fieldState, TermsHash termsHash, TermsHashPerField nextPerField, FieldInfo fieldInfo) {
- intPool = termsHash.intPool;
- bytePool = termsHash.bytePool;
- termBytePool = termsHash.termBytePool;
- docState = termsHash.docState;
- this.termsHash = termsHash;
- bytesUsed = termsHash.bytesUsed;
- this.fieldState = fieldState;
+ TermsHashPerField(int streamCount, IntBlockPool intPool, ByteBlockPool bytePool, ByteBlockPool termBytePool,
+ Counter bytesUsed, TermsHashPerField nextPerField, String fieldName, IndexOptions indexOptions) {
+ this.intPool = intPool;
+ this.bytePool = bytePool;
this.streamCount = streamCount;
- numPostingInt = 2*streamCount;
- this.fieldInfo = fieldInfo;
+ this.fieldName = fieldName;
this.nextPerField = nextPerField;
+ assert indexOptions != IndexOptions.NONE;
+ this.indexOptions = indexOptions;
PostingsBytesStartArray byteStarts = new PostingsBytesStartArray(this, bytesUsed);
bytesHash = new BytesRefHash(termBytePool, HASH_INIT_SIZE, byteStarts);
}
void reset() {
bytesHash.clear(false);
+ sortedTermIDs = null;
if (nextPerField != null) {
nextPerField.reset();
}
}
- public void initReader(ByteSliceReader reader, int termID, int stream) {
+ final void initReader(ByteSliceReader reader, int termID, int stream) {
assert stream < streamCount;
- int intStart = postingsArray.intStarts[termID];
- final int[] ints = intPool.buffers[intStart >> IntBlockPool.INT_BLOCK_SHIFT];
- final int upto = intStart & IntBlockPool.INT_BLOCK_MASK;
+ int streamStartOffset = postingsArray.addressOffset[termID];
+ final int[] streamAddressBuffer = intPool.buffers[streamStartOffset >> IntBlockPool.INT_BLOCK_SHIFT];
+ final int offsetInAddressBuffer = streamStartOffset & IntBlockPool.INT_BLOCK_MASK;
reader.init(bytePool,
postingsArray.byteStarts[termID]+stream*ByteBlockPool.FIRST_LEVEL_SIZE,
- ints[upto+stream]);
+ streamAddressBuffer[offsetInAddressBuffer+stream]);
}
- int[] sortedTermIDs;
+ private int[] sortedTermIDs;
/** Collapse the hash table and sort in-place; also sets
- * this.sortedTermIDs to the results */
- public int[] sortPostings() {
+ * this.sortedTermIDs to the results
+ * This method must not be called twice unless {@link #reset()}
+ * or {@link #reinitHash()} was called. */
+ final void sortTerms() {
+ assert sortedTermIDs == null;
sortedTermIDs = bytesHash.sort();
+ }
+
+ /**
+ * Returns the sorted term IDs. {@link #sortTerms()} must be called first.
+ */
+ final int[] getSortedTermIDs() {
+ assert sortedTermIDs != null;
return sortedTermIDs;
}
+ final void reinitHash() {
+ sortedTermIDs = null;
+ bytesHash.reinit();
+ }
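To make the new class javadoc concrete: each term owns streamCount append-only streams, all living in one shared pool, with a write address per (term, stream) pair. The sketch below models just that bookkeeping; it uses fixed-size slices and deliberately omits the slice chaining and IntBlockPool indirection that the real ByteBlockPool/TermsHashPerField pair performs:

```java
// toy model of per-term write streams in a shared pool (fixed-size slices,
// no overflow chaining -- the real ByteBlockPool grows a stream by linking slices)
public class StreamPoolSketch {
  private static final int SLICE = 16;          // stand-in for ByteBlockPool.FIRST_LEVEL_SIZE
  private final byte[] pool = new byte[1 << 16];
  private int poolUpto;                          // next free offset in the pool
  private final int[] streamAddress;             // current write pointer per (term, stream)
  private final int streamCount;

  StreamPoolSketch(int maxTerms, int streamCount) {
    this.streamCount = streamCount;
    this.streamAddress = new int[maxTerms * streamCount];
  }

  // like initStreamSlices: reserve one slice per stream for a newly seen term
  void newTerm(int termID) {
    for (int i = 0; i < streamCount; i++) {
      streamAddress[termID * streamCount + i] = poolUpto;
      poolUpto += SLICE;
    }
  }

  // like writeByte: append to one stream of a term, advancing only that stream's pointer
  void writeByte(int termID, int stream, byte b) {
    int addr = streamAddress[termID * streamCount + stream];
    pool[addr] = b;
    streamAddress[termID * streamCount + stream] = addr + 1;
  }

  public static void main(String[] args) {
    StreamPoolSketch p = new StreamPoolSketch(8, 2);
    p.newTerm(0);
    p.writeByte(0, 0, (byte) 1); // doc/freq stream of term 0
    p.writeByte(0, 1, (byte) 2); // prox stream of term 0, in its own slice
  }
}
```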
+
private boolean doNextCall;
// Secondary entry point (for 2nd & subsequent TermsHash),
// because token text has already been "interned" into
// textStart, so we hash by textStart. term vectors use
// this API.
- public void add(int textStart) throws IOException {
+ private void add(int textStart, final int docID) throws IOException {
int termID = bytesHash.addByPoolOffset(textStart);
if (termID >= 0) { // New posting
// First time we are seeing this token since we last
// flushed the hash.
- // Init stream slices
- if (numPostingInt + intPool.intUpto > IntBlockPool.INT_BLOCK_SIZE) {
- intPool.nextBuffer();
- }
-
- if (ByteBlockPool.BYTE_BLOCK_SIZE - bytePool.byteUpto < numPostingInt*ByteBlockPool.FIRST_LEVEL_SIZE) {
- bytePool.nextBuffer();
- }
+ initStreamSlices(termID, docID);
+ } else {
+ positionStreamSlice(termID, docID);
+ }
+ }
- intUptos = intPool.buffer;
- intUptoStart = intPool.intUpto;
- intPool.intUpto += streamCount;
+ private void initStreamSlices(int termID, int docID) throws IOException {
+ // Init stream slices
+ // TODO: figure out why this is 2*streamCount here. streamCount should be enough?
+ if ((2*streamCount) + intPool.intUpto > IntBlockPool.INT_BLOCK_SIZE) {
+ // can we fit all the streams in the current buffer?
+ intPool.nextBuffer();
+ }
- postingsArray.intStarts[termID] = intUptoStart + intPool.intOffset;
+ if (ByteBlockPool.BYTE_BLOCK_SIZE - bytePool.byteUpto < (2*streamCount) * ByteBlockPool.FIRST_LEVEL_SIZE) {
+ // can we fit at least one byte per stream in the current buffer, if not allocate a new one
+ bytePool.nextBuffer();
+ }
- for(int i=0;i<streamCount;i++) {
- final int upto = bytePool.newSlice(ByteBlockPool.FIRST_LEVEL_SIZE);
- intUptos[intUptoStart+i] = upto + bytePool.byteOffset;
- }
- postingsArray.byteStarts[termID] = intUptos[intUptoStart];
- newTerm(termID);
- } else {
- termID = (-termID)-1;
- int intStart = postingsArray.intStarts[termID];
- intUptos = intPool.buffers[intStart >> IntBlockPool.INT_BLOCK_SHIFT];
- intUptoStart = intStart & IntBlockPool.INT_BLOCK_MASK;
- addTerm(termID);
+ termStreamAddressBuffer = intPool.buffer;
+ streamAddressOffset = intPool.intUpto;
+ intPool.intUpto += streamCount; // advance the pool to reserve the N streams for this term
+ postingsArray.addressOffset[termID] = streamAddressOffset + intPool.intOffset;
+ for (int i = 0; i < streamCount; i++) {
+ // initialize each stream with a slice we start with ByteBlockPool.FIRST_LEVEL_SIZE)
+ // and grow as we need more space. see ByteBlockPool.LEVEL_SIZE_ARRAY
+ final int upto = bytePool.newSlice(ByteBlockPool.FIRST_LEVEL_SIZE);
+ termStreamAddressBuffer[streamAddressOffset + i] = upto + bytePool.byteOffset;
}
+ postingsArray.byteStarts[termID] = termStreamAddressBuffer[streamAddressOffset];
+ newTerm(termID, docID);
+ }
+
+ private boolean assertDocId(int docId) {
+ assert docId >= lastDocID : "docID must be >= " + lastDocID + " but was: " + docId;
+ lastDocID = docId;
+ return true;
}
/** Called once per inverted token. This is the primary
* entry point (for first TermsHash); postings use this
* API. */
- void add() throws IOException {
+ void add(BytesRef termBytes, final int docID) throws IOException {
+ assert assertDocId(docID);
// We are first in the chain so we must "intern" the
// term text into textStart address
// Get the text & hash of this term.
- int termID = bytesHash.add(termAtt.getBytesRef());
-
+ int termID = bytesHash.add(termBytes);
//System.out.println("add term=" + termBytesRef.utf8ToString() + " doc=" + docState.docID + " termID=" + termID);
-
- if (termID >= 0) {// New posting
- bytesHash.byteStart(termID);
+ if (termID >= 0) { // New posting
// Init stream slices
- if (numPostingInt + intPool.intUpto > IntBlockPool.INT_BLOCK_SIZE) {
- intPool.nextBuffer();
- }
-
- if (ByteBlockPool.BYTE_BLOCK_SIZE - bytePool.byteUpto < numPostingInt*ByteBlockPool.FIRST_LEVEL_SIZE) {
- bytePool.nextBuffer();
- }
-
- intUptos = intPool.buffer;
- intUptoStart = intPool.intUpto;
- intPool.intUpto += streamCount;
-
- postingsArray.intStarts[termID] = intUptoStart + intPool.intOffset;
-
- for(int i=0;i<streamCount;i++) {
- final int upto = bytePool.newSlice(ByteBlockPool.FIRST_LEVEL_SIZE);
- intUptos[intUptoStart+i] = upto + bytePool.byteOffset;
- }
- postingsArray.byteStarts[termID] = intUptos[intUptoStart];
- newTerm(termID);
+ initStreamSlices(termID, docID);
} else {
- termID = (-termID)-1;
- int intStart = postingsArray.intStarts[termID];
- intUptos = intPool.buffers[intStart >> IntBlockPool.INT_BLOCK_SHIFT];
- intUptoStart = intStart & IntBlockPool.INT_BLOCK_MASK;
- addTerm(termID);
+ termID = positionStreamSlice(termID, docID);
}
-
if (doNextCall) {
- nextPerField.add(postingsArray.textStarts[termID]);
+ nextPerField.add(postingsArray.textStarts[termID], docID);
}
}
- int[] intUptos;
- int intUptoStart;
+ private int positionStreamSlice(int termID, final int docID) throws IOException {
+ termID = (-termID) - 1;
+ int intStart = postingsArray.addressOffset[termID];
+ termStreamAddressBuffer = intPool.buffers[intStart >> IntBlockPool.INT_BLOCK_SHIFT];
+ streamAddressOffset = intStart & IntBlockPool.INT_BLOCK_MASK;
+ addTerm(termID, docID);
+ return termID;
+ }
- void writeByte(int stream, byte b) {
- int upto = intUptos[intUptoStart+stream];
+ final void writeByte(int stream, byte b) {
+ int streamAddress = streamAddressOffset + stream;
+ int upto = termStreamAddressBuffer[streamAddress];
byte[] bytes = bytePool.buffers[upto >> ByteBlockPool.BYTE_BLOCK_SHIFT];
assert bytes != null;
int offset = upto & ByteBlockPool.BYTE_BLOCK_MASK;
@@ -202,20 +206,20 @@ void writeByte(int stream, byte b) {
// End of slice; allocate a new one
offset = bytePool.allocSlice(bytes, offset);
bytes = bytePool.buffer;
- intUptos[intUptoStart+stream] = offset + bytePool.byteOffset;
+ termStreamAddressBuffer[streamAddress] = offset + bytePool.byteOffset;
}
bytes[offset] = b;
- (intUptos[intUptoStart+stream])++;
+ (termStreamAddressBuffer[streamAddress])++;
}
- public void writeBytes(int stream, byte[] b, int offset, int len) {
+ final void writeBytes(int stream, byte[] b, int offset, int len) {
// TODO: optimize
final int end = offset + len;
for(int i=offset;i<end;i++)
writeByte(stream, b[i]);
}
diff --git a/lucene/core/src/java/org/apache/lucene/search/FuzzyAutomatonBuilder.java b/lucene/core/src/java/org/apache/lucene/search/FuzzyAutomatonBuilder.java
new file mode 100644
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/search/FuzzyAutomatonBuilder.java
+/**
+ * Builds a set of CompiledAutomaton for fuzzy matching on a given term,
+ * with specified maximum edit distance, fixed prefix and whether or not
+ * to allow transpositions
+ */
+class FuzzyAutomatonBuilder {
+
+ private final String term;
+ private final int maxEdits;
+ private final LevenshteinAutomata levBuilder;
+ private final String prefix;
+ private final int termLength;
+
+ FuzzyAutomatonBuilder(String term, int maxEdits, int prefixLength, boolean transpositions) {
+ if (maxEdits < 0 || maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) {
+ throw new IllegalArgumentException("max edits must be 0.." + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE + ", inclusive; got: " + maxEdits);
+ }
+ if (prefixLength < 0) {
+ throw new IllegalArgumentException("prefixLength cannot be less than 0");
+ }
+ this.term = term;
+ this.maxEdits = maxEdits;
+ int[] codePoints = stringToUTF32(term);
+ this.termLength = codePoints.length;
+ prefixLength = Math.min(prefixLength, codePoints.length);
+ int[] suffix = new int[codePoints.length - prefixLength];
+ System.arraycopy(codePoints, prefixLength, suffix, 0, suffix.length);
+ this.levBuilder = new LevenshteinAutomata(suffix, Character.MAX_CODE_POINT, transpositions);
+ this.prefix = UnicodeUtil.newString(codePoints, 0, prefixLength);
+ }
+
+ CompiledAutomaton[] buildAutomatonSet() {
+ CompiledAutomaton[] compiled = new CompiledAutomaton[maxEdits + 1];
+ for (int i = 0; i <= maxEdits; i++) {
+ try {
+ compiled[i] = new CompiledAutomaton(levBuilder.toAutomaton(i, prefix), true, false);
+ }
+ catch (TooComplexToDeterminizeException e) {
+ throw new FuzzyTermsEnum.FuzzyTermsException(term, e);
+ }
+ }
+ return compiled;
+ }
+
+ CompiledAutomaton buildMaxEditAutomaton() {
+ try {
+ return new CompiledAutomaton(levBuilder.toAutomaton(maxEdits, prefix), true, false);
+ } catch (TooComplexToDeterminizeException e) {
+ throw new FuzzyTermsEnum.FuzzyTermsException(term, e);
+ }
+ }
+
+ int getTermLength() {
+ return this.termLength;
+ }
+
+ private static int[] stringToUTF32(String text) {
+ int[] termText = new int[text.codePointCount(0, text.length())];
+ for (int cp, i = 0, j = 0; i < text.length(); i += Character.charCount(cp)) {
+ termText[j++] = cp = text.codePointAt(i);
+ }
+ return termText;
+ }
+}
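stringToUTF32 walks the term by Unicode code point rather than by Java char, so a surrogate pair counts as a single position for edit-distance purposes. A quick check that the loop agrees with the Java 8 streams equivalent:

```java
import java.util.Arrays;

public class CodePointSketch {
  public static void main(String[] args) {
    String text = "a\uD83D\uDE00b"; // 'a', U+1F600 (a surrogate pair), 'b'

    // the loop from FuzzyAutomatonBuilder.stringToUTF32, verbatim
    int[] manual = new int[text.codePointCount(0, text.length())];
    for (int cp, i = 0, j = 0; i < text.length(); i += Character.charCount(cp)) {
      manual[j++] = cp = text.codePointAt(i);
    }

    int[] streamed = text.codePoints().toArray(); // Java 8+ equivalent
    System.out.println(Arrays.equals(manual, streamed)); // true
    System.out.println(manual.length); // 3, not 4: the surrogate pair is one code point
  }
}
```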
diff --git a/lucene/core/src/java/org/apache/lucene/search/FuzzyQuery.java b/lucene/core/src/java/org/apache/lucene/search/FuzzyQuery.java
index c4b4d1b6adab..041f0ca180ae 100644
--- a/lucene/core/src/java/org/apache/lucene/search/FuzzyQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/FuzzyQuery.java
@@ -18,14 +18,13 @@
import java.io.IOException;
+import java.util.Objects;
import org.apache.lucene.index.SingleTermsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
-import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.AttributeSource;
-import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.apache.lucene.util.automaton.LevenshteinAutomata;
@@ -53,9 +52,7 @@
* not match an indexed term "ab", and FuzzyQuery on term "a" with maxEdits=2 will not
* match an indexed term "abc".
*/
-public class FuzzyQuery extends MultiTermQuery implements Accountable {
-
- private static final long BASE_RAM_BYTES = RamUsageEstimator.shallowSizeOfInstance(AutomatonQuery.class);
+public class FuzzyQuery extends MultiTermQuery {
public final static int defaultMaxEdits = LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE;
public final static int defaultPrefixLength = 0;
@@ -67,10 +64,6 @@ public class FuzzyQuery extends MultiTermQuery implements Accountable {
private final boolean transpositions;
private final int prefixLength;
private final Term term;
- private final int termLength;
- private final CompiledAutomaton[] automata;
-
- private final long ramBytesUsed;
/**
* Create a new FuzzyQuery that will match terms with an edit distance
@@ -106,22 +99,7 @@ public FuzzyQuery(Term term, int maxEdits, int prefixLength, int maxExpansions,
this.prefixLength = prefixLength;
this.transpositions = transpositions;
this.maxExpansions = maxExpansions;
- int[] codePoints = FuzzyTermsEnum.stringToUTF32(term.text());
- this.termLength = codePoints.length;
- this.automata = FuzzyTermsEnum.buildAutomata(term.text(), codePoints, prefixLength, transpositions, maxEdits);
setRewriteMethod(new MultiTermQuery.TopTermsBlendedFreqScoringRewrite(maxExpansions));
- this.ramBytesUsed = calculateRamBytesUsed(term, this.automata);
- }
-
- private static long calculateRamBytesUsed(Term term, CompiledAutomaton[] automata) {
- long bytes = BASE_RAM_BYTES + term.ramBytesUsed();
- for (CompiledAutomaton a : automata) {
- bytes += a.ramBytesUsed();
- }
- bytes += 4 * Integer.BYTES;
- bytes += Long.BYTES;
- bytes += 1;
- return bytes;
}
/**
@@ -173,8 +151,9 @@ public boolean getTranspositions() {
/**
* Returns the compiled automata used to match terms
*/
- public CompiledAutomaton[] getAutomata() {
- return automata;
+ public CompiledAutomaton getAutomata() {
+ FuzzyAutomatonBuilder builder = new FuzzyAutomatonBuilder(term.text(), maxEdits, prefixLength, transpositions);
+ return builder.buildMaxEditAutomaton();
}
@Override
@@ -183,17 +162,17 @@ public void visit(QueryVisitor visitor) {
if (maxEdits == 0 || prefixLength >= term.text().length()) {
visitor.consumeTerms(this, term);
} else {
- automata[automata.length - 1].visit(visitor, this, field);
+ visitor.consumeTermsMatching(this, term.field(), () -> getAutomata().runAutomaton);
}
}
}
@Override
protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
- if (maxEdits == 0 || prefixLength >= term.text().length()) { // can only match if it's exact
+ if (maxEdits == 0) { // can only match if it's exact
return new SingleTermsEnum(terms.iterator(), term.bytes());
}
- return new FuzzyTermsEnum(terms, atts, getTerm(), termLength, maxEdits, automata);
+ return new FuzzyTermsEnum(terms, atts, getTerm(), maxEdits, prefixLength, transpositions);
}
/**
@@ -237,22 +216,9 @@ public boolean equals(Object obj) {
if (getClass() != obj.getClass())
return false;
FuzzyQuery other = (FuzzyQuery) obj;
- // Note that we don't need to compare termLength or automata because they
- // are entirely determined by the other fields
- if (maxEdits != other.maxEdits)
- return false;
- if (prefixLength != other.prefixLength)
- return false;
- if (maxExpansions != other.maxExpansions)
- return false;
- if (transpositions != other.transpositions)
- return false;
- if (term == null) {
- if (other.term != null)
- return false;
- } else if (!term.equals(other.term))
- return false;
- return true;
+ return Objects.equals(maxEdits, other.maxEdits) && Objects.equals(prefixLength, other.prefixLength)
+ && Objects.equals(maxExpansions, other.maxExpansions) && Objects.equals(transpositions, other.transpositions)
+ && Objects.equals(term, other.term);
}
/**
@@ -274,8 +240,4 @@ public static int floatToEdits(float minimumSimilarity, int termLen) {
}
}
- @Override
- public long ramBytesUsed() {
- return ramBytesUsed;
- }
}
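For orientation, a typical use of the constructor whose internals changed here; the field name is illustrative, and maxEdits is capped at LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE (2):

```java
import org.apache.lucene.index.Term;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.Query;

public class FuzzyQueryExample {
  public static void main(String[] args) {
    // term "lucene", up to 2 edits, 1-char required prefix,
    // at most 50 expanded terms, transpositions count as one edit
    Query q = new FuzzyQuery(new Term("body", "lucene"), 2, 1, 50, true);
    System.out.println(q); // matches e.g. "lucine" (substitution) or "lucnee" (transposition)
  }
}
```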
diff --git a/lucene/core/src/java/org/apache/lucene/search/FuzzyTermsEnum.java b/lucene/core/src/java/org/apache/lucene/search/FuzzyTermsEnum.java
index 91a44d5245c0..4c49d8accd77 100644
--- a/lucene/core/src/java/org/apache/lucene/search/FuzzyTermsEnum.java
+++ b/lucene/core/src/java/org/apache/lucene/search/FuzzyTermsEnum.java
@@ -18,6 +18,7 @@
import java.io.IOException;
+import java.util.function.Supplier;
import org.apache.lucene.index.ImpactsEnum;
import org.apache.lucene.index.PostingsEnum;
@@ -25,14 +26,14 @@
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.util.Attribute;
+import org.apache.lucene.util.AttributeImpl;
+import org.apache.lucene.util.AttributeReflector;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.UnicodeUtil;
-import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.CompiledAutomaton;
-import org.apache.lucene.util.automaton.LevenshteinAutomata;
-import org.apache.lucene.util.automaton.TooComplexToDeterminizeException;
/** Subclass of TermsEnum for enumerating all terms that are similar
* to the specified filter term.
@@ -57,21 +58,21 @@ public final class FuzzyTermsEnum extends TermsEnum {
private final MaxNonCompetitiveBoostAttribute maxBoostAtt;
private final CompiledAutomaton[] automata;
+ private final Terms terms;
+ private final int termLength;
+ private final Term term;
private float bottom;
private BytesRef bottomTerm;
private BytesRef queuedBottom;
- private final int termLength;
// Maximum number of edits we will accept. This is either 2 or 1 (or, degenerately, 0) passed by the user originally,
// but as we collect terms, we can lower this (e.g. from 2 to 1) if we detect that the term queue is full, and all
// collected terms are ed=1:
private int maxEdits;
- private final Terms terms;
- private final Term term;
/**
* Constructor for enumeration of all terms from specified reader which share a prefix of
@@ -88,43 +89,44 @@ public final class FuzzyTermsEnum extends TermsEnum {
* @throws IOException if there is a low-level IO error
*/
public FuzzyTermsEnum(Terms terms, Term term, int maxEdits, int prefixLength, boolean transpositions) throws IOException {
- this(terms, term, stringToUTF32(term.text()), maxEdits, prefixLength, transpositions);
- }
-
- private FuzzyTermsEnum(Terms terms, Term term, int[] codePoints, int maxEdits, int prefixLength, boolean transpositions) throws IOException {
- this(terms, new AttributeSource(), term, codePoints.length, maxEdits,
- buildAutomata(term.text(), codePoints, prefixLength, transpositions, maxEdits));
+ this(terms, new AttributeSource(), term, () -> new FuzzyAutomatonBuilder(term.text(), maxEdits, prefixLength, transpositions));
}
/**
* Constructor for enumeration of all terms from specified reader which share a prefix of
* length prefixLength with term and which have at most {@code maxEdits} edits.
*
- * After calling the constructor the enumeration is already pointing to the first
- * valid term if such a term exists.
- *
+ * After calling the constructor the enumeration is already pointing to the first
+ * valid term if such a term exists.
+ *
* @param terms Delivers terms.
- * @param atts {@link AttributeSource} created by the rewrite method of {@link MultiTermQuery}
- * that contains information about competitive boosts during rewrite
+ * @param atts An AttributeSource used to share automata between segments
* @param term Pattern term.
* @param maxEdits Maximum edit distance.
- * @param automata An array of levenshtein automata to match against terms,
- * see {@link #buildAutomata(String, int[], int, boolean, int)}
+ * @param prefixLength the length of the required common prefix
+ * @param transpositions whether transpositions should count as a single edit
* @throws IOException if there is a low-level IO error
*/
- public FuzzyTermsEnum(Terms terms, AttributeSource atts, Term term, int termLength,
- final int maxEdits, CompiledAutomaton[] automata) throws IOException {
+ FuzzyTermsEnum(Terms terms, AttributeSource atts, Term term, int maxEdits, int prefixLength, boolean transpositions) throws IOException {
+ this(terms, atts, term, () -> new FuzzyAutomatonBuilder(term.text(), maxEdits, prefixLength, transpositions));
+ }
+
+ private FuzzyTermsEnum(Terms terms, AttributeSource atts, Term term, Supplier<FuzzyAutomatonBuilder> automatonBuilder) throws IOException {
- this.maxEdits = maxEdits;
this.terms = terms;
- this.term = term;
this.atts = atts;
- this.termLength = termLength;
+ this.term = term;
this.maxBoostAtt = atts.addAttribute(MaxNonCompetitiveBoostAttribute.class);
this.boostAtt = atts.addAttribute(BoostAttribute.class);
- this.automata = automata;
+ atts.addAttributeImpl(new AutomatonAttributeImpl());
+ AutomatonAttribute aa = atts.addAttribute(AutomatonAttribute.class);
+ aa.init(automatonBuilder);
+
+ this.automata = aa.getAutomata();
+ this.termLength = aa.getTermLength();
+ this.maxEdits = this.automata.length - 1;
bottom = maxBoostAtt.getMaxNonCompetitiveBoost();
bottomTerm = maxBoostAtt.getCompetitiveTerm();
@@ -145,47 +147,6 @@ public void setMaxNonCompetitiveBoost(float boost) {
public float getBoost() {
return boostAtt.getBoost();
}
-
- static CompiledAutomaton[] buildAutomata(String text, int[] termText, int prefixLength, boolean transpositions, int maxEdits) {
- CompiledAutomaton[] compiled = new CompiledAutomaton[maxEdits + 1];
- Automaton[] automata = buildAutomata(termText, prefixLength, transpositions, maxEdits);
- for (int i = 0; i <= maxEdits; i++) {
- try {
- compiled[i] = new CompiledAutomaton(automata[i], true, false);
- }
- catch (TooComplexToDeterminizeException e) {
- throw new FuzzyTermsException(text, e);
- }
- }
- return compiled;
- }
-
- static int[] stringToUTF32(String text) {
- int[] termText = new int[text.codePointCount(0, text.length())];
- for (int cp, i = 0, j = 0; i < text.length(); i += Character.charCount(cp)) {
- termText[j++] = cp = text.codePointAt(i);
- }
- return termText;
- }
-
- private static Automaton[] buildAutomata(int[] termText, int prefixLength, boolean transpositions, int maxEdits) {
- if (maxEdits < 0 || maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) {
- throw new IllegalArgumentException("max edits must be 0.." + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE + ", inclusive; got: " + maxEdits);
- }
- if (prefixLength < 0) {
- throw new IllegalArgumentException("prefixLength cannot be less than 0");
- }
- Automaton[] automata = new Automaton[maxEdits + 1];
- int termLength = termText.length;
- prefixLength = Math.min(prefixLength, termLength);
- String suffix = UnicodeUtil.newString(termText, prefixLength, termText.length - prefixLength);
- LevenshteinAutomata builder = new LevenshteinAutomata(suffix, transpositions);
- String prefix = UnicodeUtil.newString(termText, 0, prefixLength);
- for (int i = 0; i <= maxEdits; i++) {
- automata[i] = builder.toAutomaton(i, prefix);
- }
- return automata;
- }
/**
* return an automata-based enum for matching up to editDistance from
@@ -274,7 +235,7 @@ public BytesRef next() throws IOException {
final float bottom = maxBoostAtt.getMaxNonCompetitiveBoost();
final BytesRef bottomTerm = maxBoostAtt.getCompetitiveTerm();
- if (term != null && (bottom != this.bottom || bottomTerm != this.bottomTerm)) {
+ if (bottom != this.bottom || bottomTerm != this.bottomTerm) {
this.bottom = bottom;
this.bottomTerm = bottomTerm;
// clone the term before potentially doing something with it
@@ -364,4 +325,60 @@ public static class FuzzyTermsException extends RuntimeException {
}
}
+ /**
+ * Used for sharing automata between segments
+ *
+ * Levenshtein automata are large and expensive to build; we don't want to build
+ * them directly on the query because this can blow up caches that use queries
+ * as keys; we also don't want to rebuild them for every segment. This attribute
+ * allows the FuzzyTermsEnum to build the automata once for its first segment
+ * and then share them for subsequent segment calls.
+ */
+ private interface AutomatonAttribute extends Attribute {
+ CompiledAutomaton[] getAutomata();
+ int getTermLength();
+ void init(Supplier<FuzzyAutomatonBuilder> builder);
+ }
+
+ private static class AutomatonAttributeImpl extends AttributeImpl implements AutomatonAttribute {
+
+ private CompiledAutomaton[] automata;
+ private int termLength;
+
+ @Override
+ public CompiledAutomaton[] getAutomata() {
+ return automata;
+ }
+
+ @Override
+ public int getTermLength() {
+ return termLength;
+ }
+
+ @Override
+ public void init(Supplier<FuzzyAutomatonBuilder> supplier) {
+ if (automata != null) {
+ return;
+ }
+ FuzzyAutomatonBuilder builder = supplier.get();
+ this.termLength = builder.getTermLength();
+ this.automata = builder.buildAutomatonSet();
+ }
+
+ @Override
+ public void clear() {
+ this.automata = null;
+ }
+
+ @Override
+ public void reflectWith(AttributeReflector reflector) {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public void copyTo(AttributeImpl target) {
+ throw new UnsupportedOperationException();
+ }
+ }
+
}
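Stripped of the Attribute machinery, what AutomatonAttributeImpl implements is a guarded lazy initializer shared through the AttributeSource: the first segment's enum pays for the automata build, later segments reuse it. A plain-Java sketch of that pattern (not the actual API):

```java
import java.util.function.Supplier;

public class MemoSketch {
  // build-once holder: the first caller pays for the build, later callers reuse the result
  static final class Memo<T> {
    private T value;

    T get(Supplier<T> builder) {
      if (value == null) {
        value = builder.get(); // e.g. compile the per-distance Levenshtein automata
      }
      return value;
    }
  }

  public static void main(String[] args) {
    Memo<int[]> memo = new Memo<>();
    Supplier<int[]> expensive = () -> {
      System.out.println("building once");
      return new int[] {1, 2, 3};
    };
    memo.get(expensive); // prints "building once"
    memo.get(expensive); // silent: the cached value is reused
  }
}
```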
diff --git a/lucene/core/src/java/org/apache/lucene/search/MultiCollector.java b/lucene/core/src/java/org/apache/lucene/search/MultiCollector.java
index 82251e43cfe8..5cb6db8acd48 100644
--- a/lucene/core/src/java/org/apache/lucene/search/MultiCollector.java
+++ b/lucene/core/src/java/org/apache/lucene/search/MultiCollector.java
@@ -117,7 +117,7 @@ public ScoreMode scoreMode() {
@Override
public LeafCollector getLeafCollector(LeafReaderContext context) throws IOException {
- final List<LeafCollector> leafCollectors = new ArrayList<>();
+ final List<LeafCollector> leafCollectors = new ArrayList<>(collectors.length);
for (Collector collector : collectors) {
final LeafCollector leafCollector;
try {
@@ -134,7 +134,7 @@ public LeafCollector getLeafCollector(LeafReaderContext context) throws IOExcept
case 1:
return leafCollectors.get(0);
default:
- return new MultiLeafCollector(leafCollectors, cacheScores);
+ return new MultiLeafCollector(leafCollectors, cacheScores, scoreMode() == ScoreMode.TOP_SCORES);
}
}
@@ -142,12 +142,14 @@ private static class MultiLeafCollector implements LeafCollector {
private final boolean cacheScores;
private final LeafCollector[] collectors;
- private int numCollectors;
+ private final float[] minScores;
+ private final boolean skipNonCompetitiveScores;
- private MultiLeafCollector(List<LeafCollector> collectors, boolean cacheScores) {
+ private MultiLeafCollector(List<LeafCollector> collectors, boolean cacheScores, boolean skipNonCompetitive) {
this.collectors = collectors.toArray(new LeafCollector[collectors.size()]);
this.cacheScores = cacheScores;
- this.numCollectors = this.collectors.length;
+ this.skipNonCompetitiveScores = skipNonCompetitive;
+ this.minScores = this.skipNonCompetitiveScores ? new float[this.collectors.length] : null;
}
@Override
@@ -155,48 +157,89 @@ public void setScorer(Scorable scorer) throws IOException {
if (cacheScores) {
scorer = new ScoreCachingWrappingScorer(scorer);
}
- scorer = new FilterScorable(scorer) {
- @Override
- public void setMinCompetitiveScore(float minScore) {
- // Ignore calls to setMinCompetitiveScore so that if we wrap two
- // collectors and one of them wants to skip low-scoring hits, then
- // the other collector still sees all hits. We could try to reconcile
- // min scores and take the maximum min score across collectors, but
- // this is very unlikely to be helpful in practice.
+ if (skipNonCompetitiveScores) {
+ for (int i = 0; i < collectors.length; ++i) {
+ final LeafCollector c = collectors[i];
+ if (c != null) {
+ c.setScorer(new MinCompetitiveScoreAwareScorable(scorer, i, minScores));
+ }
}
+ } else {
+ scorer = new FilterScorable(scorer) {
+ @Override
+ public void setMinCompetitiveScore(float minScore) throws IOException {
+ // Ignore calls to setMinCompetitiveScore so that if we wrap two
+ // collectors and one of them wants to skip low-scoring hits, then
+ // the other collector still sees all hits.
+ }
- };
- for (int i = 0; i < numCollectors; ++i) {
- final LeafCollector c = collectors[i];
- c.setScorer(scorer);
+ };
+ for (int i = 0; i < collectors.length; ++i) {
+ final LeafCollector c = collectors[i];
+ if (c != null) {
+ c.setScorer(scorer);
+ }
+ }
}
}
- private void removeCollector(int i) {
- System.arraycopy(collectors, i + 1, collectors, i, numCollectors - i - 1);
- --numCollectors;
- collectors[numCollectors] = null;
- }
-
@Override
public void collect(int doc) throws IOException {
- final LeafCollector[] collectors = this.collectors;
- int numCollectors = this.numCollectors;
- for (int i = 0; i < numCollectors; ) {
+ for (int i = 0; i < collectors.length; i++) {
final LeafCollector collector = collectors[i];
- try {
- collector.collect(doc);
- ++i;
- } catch (CollectionTerminatedException e) {
- removeCollector(i);
- numCollectors = this.numCollectors;
- if (numCollectors == 0) {
- throw new CollectionTerminatedException();
+ if (collector != null) {
+ try {
+ collector.collect(doc);
+ } catch (CollectionTerminatedException e) {
+ collectors[i] = null;
+ if (allCollectorsTerminated()) {
+ throw new CollectionTerminatedException();
+ }
}
}
}
}
+ private boolean allCollectorsTerminated() {
+ for (int i = 0; i < collectors.length; i++) {
+ if (collectors[i] != null) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ }
+
+ final static class MinCompetitiveScoreAwareScorable extends FilterScorable {
+
+ private final int idx;
+ private final float[] minScores;
+
+ MinCompetitiveScoreAwareScorable(Scorable in, int idx, float[] minScores) {
+ super(in);
+ this.idx = idx;
+ this.minScores = minScores;
+ }
+
+ @Override
+ public void setMinCompetitiveScore(float minScore) throws IOException {
+ if (minScore > minScores[idx]) {
+ minScores[idx] = minScore;
+ in.setMinCompetitiveScore(minScore());
+ }
+ }
+
+ private float minScore() {
+ float min = Float.MAX_VALUE;
+ for (int i = 0; i < minScores.length; i++) {
+ if (minScores[i] < min) {
+ min = minScores[i];
+ }
+ }
+ return min;
+ }
+
}
}
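The reconciliation rule deserves a worked example: each sub-collector records the highest min-competitive score it has requested, and the shared scorer is only ever told the minimum across collectors, so no collector loses hits that another still wants. Mirroring minScore() above:

```java
public class MinScoreSketch {
  public static void main(String[] args) {
    float[] minScores = new float[2]; // one slot per sub-collector, starts at 0

    // collector 0 asks to skip hits below 0.5, collector 1 below 0.3
    minScores[0] = 0.5f;
    minScores[1] = 0.3f;

    // mirror of MultiCollector.MinCompetitiveScoreAwareScorable.minScore()
    float min = Float.MAX_VALUE;
    for (float s : minScores) {
      min = Math.min(min, s);
    }
    System.out.println(min); // 0.3: the scorer may only skip hits that both collectors ignore
  }
}
```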
diff --git a/lucene/core/src/java/org/apache/lucene/search/MultiTermQuery.java b/lucene/core/src/java/org/apache/lucene/search/MultiTermQuery.java
index 327227afe151..e3c4ff7097f4 100644
--- a/lucene/core/src/java/org/apache/lucene/search/MultiTermQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/MultiTermQuery.java
@@ -286,9 +286,9 @@ public MultiTermQuery(final String field) {
* (should instead return {@link TermsEnum#EMPTY} if no
* terms match). The TermsEnum must already be
* positioned to the first matching term.
- * The given {@link AttributeSource} is passed by the {@link RewriteMethod} to
- * provide attributes, the rewrite method uses to inform about e.g. maximum competitive boosts.
- * This is currently only used by {@link TopTermsRewrite}
+ * The given {@link AttributeSource} is passed by the {@link RewriteMethod} to
+ * share information between segments, for example {@link TopTermsRewrite} uses
+ * it to share maximum competitive boosts
*/
protected abstract TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException;
diff --git a/lucene/core/src/java/org/apache/lucene/search/SortField.java b/lucene/core/src/java/org/apache/lucene/search/SortField.java
index 2cfae46f01d8..7512ec934416 100644
--- a/lucene/core/src/java/org/apache/lucene/search/SortField.java
+++ b/lucene/core/src/java/org/apache/lucene/search/SortField.java
@@ -21,7 +21,13 @@
import java.util.Comparator;
import java.util.Objects;
+import org.apache.lucene.index.DocValues;
+import org.apache.lucene.index.IndexSorter;
+import org.apache.lucene.index.SortFieldProvider;
+import org.apache.lucene.store.DataInput;
+import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.NumericUtils;
/**
* Stores information about how to sort documents by terms in an individual
@@ -120,6 +126,106 @@ public SortField(String field, Type type, boolean reverse) {
this.reverse = reverse;
}
+ /** A SortFieldProvider for field sorts */
+ public static final class Provider extends SortFieldProvider {
+
+ /** The name this Provider is registered under */
+ public static final String NAME = "SortField";
+
+ /** Creates a new Provider */
+ public Provider() {
+ super(NAME);
+ }
+
+ @Override
+ public SortField readSortField(DataInput in) throws IOException {
+ SortField sf = new SortField(in.readString(), readType(in), in.readInt() == 1);
+ if (in.readInt() == 1) {
+ // missing object
+ switch (sf.type) {
+ case STRING:
+ int missingString = in.readInt();
+ if (missingString == 1) {
+ sf.setMissingValue(STRING_FIRST);
+ }
+ else {
+ sf.setMissingValue(STRING_LAST);
+ }
+ break;
+ case INT:
+ sf.setMissingValue(in.readInt());
+ break;
+ case LONG:
+ sf.setMissingValue(in.readLong());
+ break;
+ case FLOAT:
+ sf.setMissingValue(NumericUtils.sortableIntToFloat(in.readInt()));
+ break;
+ case DOUBLE:
+ sf.setMissingValue(NumericUtils.sortableLongToDouble(in.readLong()));
+ break;
+ default:
+ throw new IllegalArgumentException("Cannot deserialize sort of type " + sf.type);
+ }
+ }
+ return sf;
+ }
+
+ @Override
+ public void writeSortField(SortField sf, DataOutput out) throws IOException {
+ sf.serialize(out);
+ }
+ }
+
+ protected static Type readType(DataInput in) throws IOException {
+ String type = in.readString();
+ try {
+ return Type.valueOf(type);
+ }
+ catch (IllegalArgumentException e) {
+ throw new IllegalArgumentException("Can't deserialize SortField - unknown type " + type);
+ }
+ }
+
+ private void serialize(DataOutput out) throws IOException {
+ out.writeString(field);
+ out.writeString(type.toString());
+ out.writeInt(reverse ? 1 : 0);
+ if (missingValue == null) {
+ out.writeInt(0);
+ }
+ else {
+ out.writeInt(1);
+ switch (type) {
+ case STRING:
+ if (missingValue == STRING_LAST) {
+ out.writeInt(0);
+ }
+ else if (missingValue == STRING_FIRST) {
+ out.writeInt(1);
+ }
+ else {
+ throw new IllegalArgumentException("Cannot serialize missing value of " + missingValue + " for type STRING");
+ }
+ break;
+ case INT:
+ out.writeInt((int)missingValue);
+ break;
+ case LONG:
+ out.writeLong((long)missingValue);
+ break;
+ case FLOAT:
+ out.writeInt(NumericUtils.floatToSortableInt((float)missingValue));
+ break;
+ case DOUBLE:
+ out.writeLong(NumericUtils.doubleToSortableLong((double)missingValue));
+ break;
+ default:
+ throw new IllegalArgumentException("Cannot serialize SortField of type " + type);
+ }
+ }
+ }
+
/** Pass this to {@link #setMissingValue} to have missing
* string values sort first. */
public final static Object STRING_FIRST = new Object() {
@@ -392,4 +498,33 @@ public SortField rewrite(IndexSearcher searcher) throws IOException {
public boolean needsScores() {
return type == Type.SCORE;
}
+
+ /**
+ * Returns an {@link IndexSorter} used for sorting index segments by this SortField.
+ *
+ * If the SortField cannot be used for index sorting (for example, if it uses scores or
+ * other query-dependent values) then this method should return {@code null}
+ *
+ * SortFields that implement this method should also implement a companion
+ * {@link SortFieldProvider} to serialize and deserialize the sort in index segment
+ * headers
+ *
+ * @lucene.experimental
+ */
+ public IndexSorter getIndexSorter() {
+ switch (type) {
+ case STRING:
+ return new IndexSorter.StringSorter(Provider.NAME, missingValue, reverse, reader -> DocValues.getSorted(reader, field));
+ case INT:
+ return new IndexSorter.IntSorter(Provider.NAME, (Integer)missingValue, reverse, reader -> DocValues.getNumeric(reader, field));
+ case LONG:
+ return new IndexSorter.LongSorter(Provider.NAME, (Long)missingValue, reverse, reader -> DocValues.getNumeric(reader, field));
+ case DOUBLE:
+ return new IndexSorter.DoubleSorter(Provider.NAME, (Double)missingValue, reverse, reader -> DocValues.getNumeric(reader, field));
+ case FLOAT:
+ return new IndexSorter.FloatSorter(Provider.NAME, (Float)missingValue, reverse, reader -> DocValues.getNumeric(reader, field));
+ default: return null;
+ }
+ }
+
}
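getIndexSorter is what lets a plain SortField drive index-time sorting. A hedged setup sketch, assuming documents carry a NumericDocValuesField for the sort field (the "price" field name is illustrative):

```java
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;

public class IndexSortExample {
  public static void main(String[] args) {
    IndexWriterConfig iwc = new IndexWriterConfig();
    // segments are written in descending-price order; internally this SortField
    // resolves to an IndexSorter.LongSorter through getIndexSorter()
    iwc.setIndexSort(new Sort(new SortField("price", SortField.Type.LONG, true)));
  }
}
```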
diff --git a/lucene/core/src/java/org/apache/lucene/search/SortedNumericSortField.java b/lucene/core/src/java/org/apache/lucene/search/SortedNumericSortField.java
index fff000b96f6a..6c5154a3ee35 100644
--- a/lucene/core/src/java/org/apache/lucene/search/SortedNumericSortField.java
+++ b/lucene/core/src/java/org/apache/lucene/search/SortedNumericSortField.java
@@ -20,9 +20,15 @@
import java.io.IOException;
import org.apache.lucene.index.DocValues;
+import org.apache.lucene.index.IndexSorter;
+import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
+import org.apache.lucene.index.SortFieldProvider;
import org.apache.lucene.index.SortedNumericDocValues;
+import org.apache.lucene.store.DataInput;
+import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.util.NumericUtils;
/**
* SortField for {@link SortedNumericDocValues}.
@@ -83,6 +89,86 @@ public SortedNumericSortField(String field, SortField.Type type, boolean reverse
this.type = type;
}
+ /** A SortFieldProvider for this sort field */
+ public static final class Provider extends SortFieldProvider {
+
+ /** The name this provider is registered under */
+ public static final String NAME = "SortedNumericSortField";
+
+ /** Creates a new Provider */
+ public Provider() {
+ super(NAME);
+ }
+
+ @Override
+ public SortField readSortField(DataInput in) throws IOException {
+ SortedNumericSortField sf = new SortedNumericSortField(in.readString(), readType(in), in.readInt() == 1, readSelectorType(in));
+ if (in.readInt() == 1) {
+ switch (sf.type) {
+ case INT:
+ sf.setMissingValue(in.readInt());
+ break;
+ case LONG:
+ sf.setMissingValue(in.readLong());
+ break;
+ case FLOAT:
+ sf.setMissingValue(NumericUtils.sortableIntToFloat(in.readInt()));
+ break;
+ case DOUBLE:
+ sf.setMissingValue(NumericUtils.sortableLongToDouble(in.readLong()));
+ break;
+ default:
+ throw new AssertionError();
+ }
+ }
+ return sf;
+ }
+
+ @Override
+ public void writeSortField(SortField sf, DataOutput out) throws IOException {
+ assert sf instanceof SortedNumericSortField;
+ ((SortedNumericSortField)sf).serialize(out);
+ }
+ }
+
+ private static SortedNumericSelector.Type readSelectorType(DataInput in) throws IOException {
+ int selectorType = in.readInt();
+ if (selectorType >= SortedNumericSelector.Type.values().length) {
+ throw new IllegalArgumentException("Can't deserialize SortedNumericSortField - unknown selector type " + selectorType);
+ }
+ return SortedNumericSelector.Type.values()[selectorType];
+ }
+
+ private void serialize(DataOutput out) throws IOException {
+ out.writeString(getField());
+ out.writeString(type.toString());
+ out.writeInt(reverse ? 1 : 0);
+ out.writeInt(selector.ordinal());
+ if (missingValue == null) {
+ out.writeInt(0);
+ }
+ else {
+ out.writeInt(1);
+ // oh for switch expressions...
+ switch (type) {
+ case INT:
+ out.writeInt((int)missingValue);
+ break;
+ case LONG:
+ out.writeLong((long)missingValue);
+ break;
+ case FLOAT:
+ out.writeInt(NumericUtils.floatToSortableInt((float)missingValue));
+ break;
+ case DOUBLE:
+ out.writeLong(NumericUtils.doubleToSortableLong((double)missingValue));
+ break;
+ default:
+ throw new AssertionError();
+ }
+ }
+ }
+
/** Returns the numeric type in use for this sort */
public SortField.Type getNumericType() {
return type;
@@ -170,4 +256,24 @@ protected NumericDocValues getNumericDocValues(LeafReaderContext context, String
throw new AssertionError();
}
}
+
+ private NumericDocValues getValue(LeafReader reader) throws IOException {
+ return SortedNumericSelector.wrap(DocValues.getSortedNumeric(reader, getField()), selector, type);
+ }
+
+ @Override
+ public IndexSorter getIndexSorter() {
+ switch(type) {
+ case INT:
+ return new IndexSorter.IntSorter(Provider.NAME, (Integer)missingValue, reverse, this::getValue);
+ case LONG:
+ return new IndexSorter.LongSorter(Provider.NAME, (Long)missingValue, reverse, this::getValue);
+ case DOUBLE:
+ return new IndexSorter.DoubleSorter(Provider.NAME, (Double)missingValue, reverse, this::getValue);
+ case FLOAT:
+ return new IndexSorter.FloatSorter(Provider.NAME, (Float)missingValue, reverse, this::getValue);
+ default:
+ throw new AssertionError();
+ }
+ }
}
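A sketch of the round trip these Provider classes enable; it assumes ByteBuffersDataOutput.toDataInput() as in recent Lucene, and skips the SPI lookup (in a real index the provider's NAME is written first so readers can locate the right SortFieldProvider):

```java
import java.io.IOException;

import org.apache.lucene.search.SortField;
import org.apache.lucene.search.SortedNumericSortField;
import org.apache.lucene.store.ByteBuffersDataOutput;

public class SortFieldRoundTrip {
  public static void main(String[] args) throws IOException {
    SortedNumericSortField sf = new SortedNumericSortField("dates", SortField.Type.LONG, true);
    sf.setMissingValue(0L);

    ByteBuffersDataOutput out = new ByteBuffersDataOutput();
    new SortedNumericSortField.Provider().writeSortField(sf, out);

    SortField back = new SortedNumericSortField.Provider().readSortField(out.toDataInput());
    System.out.println(back.equals(sf)); // field, type, reverse, selector and missing value survive
  }
}
```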
diff --git a/lucene/core/src/java/org/apache/lucene/search/SortedSetSortField.java b/lucene/core/src/java/org/apache/lucene/search/SortedSetSortField.java
index b095c6e88fcd..2321a667bdb9 100644
--- a/lucene/core/src/java/org/apache/lucene/search/SortedSetSortField.java
+++ b/lucene/core/src/java/org/apache/lucene/search/SortedSetSortField.java
@@ -16,13 +16,17 @@
*/
package org.apache.lucene.search;
-
import java.io.IOException;
import org.apache.lucene.index.DocValues;
+import org.apache.lucene.index.IndexSorter;
+import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.SortFieldProvider;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.SortedSetDocValues;
+import org.apache.lucene.store.DataInput;
+import org.apache.lucene.store.DataOutput;
/**
* SortField for {@link SortedSetDocValues}.
@@ -68,6 +72,60 @@ public SortedSetSortField(String field, boolean reverse, SortedSetSelector.Type
}
this.selector = selector;
}
+
+ /** A SortFieldProvider for this sort */
+ public static final class Provider extends SortFieldProvider {
+
+ /** The name this provider is registered under */
+ public static final String NAME = "SortedSetSortField";
+
+ /** Creates a new Provider */
+ public Provider() {
+ super(NAME);
+ }
+
+ @Override
+ public SortField readSortField(DataInput in) throws IOException {
+ SortField sf = new SortedSetSortField(in.readString(), in.readInt() == 1, readSelectorType(in));
+ int missingValue = in.readInt();
+ if (missingValue == 1) {
+ sf.setMissingValue(SortField.STRING_FIRST);
+ }
+ else if (missingValue == 2) {
+ sf.setMissingValue(SortField.STRING_LAST);
+ }
+ return sf;
+ }
+
+ @Override
+ public void writeSortField(SortField sf, DataOutput out) throws IOException {
+ assert sf instanceof SortedSetSortField;
+ ((SortedSetSortField)sf).serialize(out);
+ }
+ }
+
+ private static SortedSetSelector.Type readSelectorType(DataInput in) throws IOException {
+ int type = in.readInt();
+ if (type >= SortedSetSelector.Type.values().length) {
+ throw new IllegalArgumentException("Cannot deserialize SortedSetSortField: unknown selector type " + type);
+ }
+ return SortedSetSelector.Type.values()[type];
+ }
+
+ private void serialize(DataOutput out) throws IOException {
+ out.writeString(getField());
+ out.writeInt(reverse ? 1 : 0);
+ out.writeInt(selector.ordinal());
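+    // string sorts only support the STRING_FIRST / STRING_LAST sentinels as missing
+    // values, so a single int (0 = none, 1 = first, 2 = last) encodes them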
+ if (missingValue == SortField.STRING_FIRST) {
+ out.writeInt(1);
+ }
+ else if (missingValue == SortField.STRING_LAST) {
+ out.writeInt(2);
+ }
+ else {
+ out.writeInt(0);
+ }
+ }
/** Returns the selector in use for this sort */
public SortedSetSelector.Type getSelector() {
@@ -126,4 +184,13 @@ protected SortedDocValues getSortedDocValues(LeafReaderContext context, String f
}
};
}
+
+ private SortedDocValues getValues(LeafReader reader) throws IOException {
+ return SortedSetSelector.wrap(DocValues.getSortedSet(reader, getField()), selector);
+ }
+
+ @Override
+ public IndexSorter getIndexSorter() {
+ return new IndexSorter.StringSorter(Provider.NAME, missingValue, reverse, this::getValues);
+ }
}
diff --git a/lucene/core/src/java/org/apache/lucene/store/BufferedChecksum.java b/lucene/core/src/java/org/apache/lucene/store/BufferedChecksum.java
index 4378c9a7705d..74b97d2d13cc 100644
--- a/lucene/core/src/java/org/apache/lucene/store/BufferedChecksum.java
+++ b/lucene/core/src/java/org/apache/lucene/store/BufferedChecksum.java
@@ -27,8 +27,8 @@ public class BufferedChecksum implements Checksum {
private final Checksum in;
private final byte buffer[];
private int upto;
- /** Default buffer size: 256 */
- public static final int DEFAULT_BUFFERSIZE = 256;
+ /** Default buffer size: 1024 */
+ public static final int DEFAULT_BUFFERSIZE = 1024;
/** Create a new BufferedChecksum with {@link #DEFAULT_BUFFERSIZE} */
public BufferedChecksum(Checksum in) {
diff --git a/lucene/core/src/java/org/apache/lucene/store/ByteBuffersIndexOutput.java b/lucene/core/src/java/org/apache/lucene/store/ByteBuffersIndexOutput.java
index 19dc4004853b..f6785bef79d8 100644
--- a/lucene/core/src/java/org/apache/lucene/store/ByteBuffersIndexOutput.java
+++ b/lucene/core/src/java/org/apache/lucene/store/ByteBuffersIndexOutput.java
@@ -81,24 +81,10 @@ public long getChecksum() throws IOException {
if (lastChecksumPosition != delegate.size()) {
lastChecksumPosition = delegate.size();
checksum.reset();
- byte [] buffer = null;
for (ByteBuffer bb : delegate.toBufferList()) {
- if (bb.hasArray()) {
- checksum.update(bb.array(), bb.arrayOffset() + bb.position(), bb.remaining());
- } else {
- if (buffer == null) buffer = new byte [1024 * 4];
-
- bb = bb.asReadOnlyBuffer();
- int remaining = bb.remaining();
- while (remaining > 0) {
- int len = Math.min(remaining, buffer.length);
- bb.get(buffer, 0, len);
- checksum.update(buffer, 0, len);
- remaining -= len;
- }
- }
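+        // Checksum#update(ByteBuffer) (Java 9+) handles heap and direct buffers alike,
+        // making the manual array extraction and staging-buffer loop above unnecessary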
+ checksum.update(bb);
}
- lastChecksum = checksum.getValue();
+ lastChecksum = checksum.getValue();
}
return lastChecksum;
}
diff --git a/lucene/core/src/java/org/apache/lucene/util/IntBlockPool.java b/lucene/core/src/java/org/apache/lucene/util/IntBlockPool.java
index 46500fcdf073..d09cb83e2cc3 100644
--- a/lucene/core/src/java/org/apache/lucene/util/IntBlockPool.java
+++ b/lucene/core/src/java/org/apache/lucene/util/IntBlockPool.java
@@ -175,7 +175,7 @@ private int newSlice(final int size) {
return upto;
}
- private static final boolean assertSliceBuffer(int[] buffer) {
+ private static boolean assertSliceBuffer(int[] buffer) {
int count = 0;
for (int i = 0; i < buffer.length; i++) {
count += buffer[i]; // for slices the buffer must only have 0 values
diff --git a/lucene/core/src/java/org/apache/lucene/util/Version.java b/lucene/core/src/java/org/apache/lucene/util/Version.java
index 5ed1a959da30..f5dbcc1e8e72 100644
--- a/lucene/core/src/java/org/apache/lucene/util/Version.java
+++ b/lucene/core/src/java/org/apache/lucene/util/Version.java
@@ -102,6 +102,13 @@ public final class Version {
@Deprecated
public static final Version LUCENE_8_5_1 = new Version(8, 5, 1);
+ /**
+ * Match settings and bugs in Lucene's 8.5.2 release.
+ * @deprecated Use latest
+ */
+ @Deprecated
+ public static final Version LUCENE_8_5_2 = new Version(8, 5, 2);
+
/**
* Match settings and bugs in Lucene's 8.6.0 release.
* @deprecated Use latest
diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java b/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java
index e8b37e59a3ff..0874cde71015 100644
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java
@@ -290,6 +290,55 @@
*
(a single non-reserved character)
*
*
 *
 *   | \d (a digit [0-9])
 *
 *   | \D (a non-digit [^0-9])
 *
 *   | \s (whitespace [ \t\n\r])
 *
 *   | \S (non-whitespace [^\s])
 *
 *   | \w (a word character [a-zA-Z_0-9])
 *
 *   | \W (a non-word character [^\w])
 *
*
*
*
|
@@ -316,8 +365,44 @@
*/
public class RegExp {
- enum Kind {
- REGEXP_UNION, REGEXP_CONCATENATION, REGEXP_INTERSECTION, REGEXP_OPTIONAL, REGEXP_REPEAT, REGEXP_REPEAT_MIN, REGEXP_REPEAT_MINMAX, REGEXP_COMPLEMENT, REGEXP_CHAR, REGEXP_CHAR_RANGE, REGEXP_ANYCHAR, REGEXP_EMPTY, REGEXP_STRING, REGEXP_ANYSTRING, REGEXP_AUTOMATON, REGEXP_INTERVAL
+ /**
+ * The type of expression represented by a RegExp node.
+ */
+ public enum Kind {
+ /** The union of two expressions */
+ REGEXP_UNION,
+ /** A sequence of two expressions */
+ REGEXP_CONCATENATION,
+ /** The intersection of two expressions */
+ REGEXP_INTERSECTION,
+ /** An optional expression */
+ REGEXP_OPTIONAL,
+ /** An expression that repeats */
+ REGEXP_REPEAT,
+    /** An expression that repeats a minimum number of times */
+    REGEXP_REPEAT_MIN,
+    /** An expression that repeats a minimum and maximum number of times */
+    REGEXP_REPEAT_MINMAX,
+    /** The complement of an expression */
+    REGEXP_COMPLEMENT,
+    /** A character */
+    REGEXP_CHAR,
+    /** A character range */
+    REGEXP_CHAR_RANGE,
+    /** Any character allowed */
+    REGEXP_ANYCHAR,
+    /** An empty expression */
+    REGEXP_EMPTY,
+    /** A string expression */
+    REGEXP_STRING,
+    /** Any string allowed */
+    REGEXP_ANYSTRING,
+    /** An Automaton expression */
+    REGEXP_AUTOMATON,
+    /** An interval expression */
+    REGEXP_INTERVAL,
+    /** An expression for a pre-defined character class, e.g. \w */
+    REGEXP_PRE_CLASS
}
/**
@@ -361,21 +446,37 @@ enum Kind {
*/
public static final int NONE = 0x0000;
+  // Immutable parsed state
+ /**
+ * The type of expression
+ */
+ public final Kind kind;
+ /**
+ * Child expressions held by a container type expression
+ */
+ public final RegExp exp1, exp2;
+ /**
+ * String expression
+ */
+ public final String s;
+ /**
+ * Character expression
+ */
+ public final int c;
+ /**
+ * Limits for repeatable type expressions
+ */
+ public final int min, max, digits;
+ /**
+ * Extents for range type expressions
+ */
+ public final int from, to;
+
+ // Parser variables
private final String originalString;
- Kind kind;
- RegExp exp1, exp2;
- String s;
- int c;
- int min, max, digits;
- int from, to;
-
int flags;
int pos;
-
- RegExp() {
- this.originalString = null;
- }
-
+
/**
* Constructs new RegExp from a string. Same as
* RegExp(s, ALL).
@@ -418,6 +519,37 @@ public RegExp(String s, int syntax_flags) throws IllegalArgumentException {
from = e.from;
to = e.to;
}
+
+  RegExp(Kind kind, RegExp exp1, RegExp exp2, String s, int c, int min, int max, int digits, int from, int to) {
+ this.originalString = null;
+ this.kind = kind;
+ this.flags = 0;
+ this.exp1 = exp1;
+ this.exp2 = exp2;
+ this.s = s;
+ this.c = c;
+ this.min = min;
+ this.max = max;
+ this.digits = digits;
+ this.from = from;
+ this.to = to;
+ }
+
+ // Simplified construction of container nodes
+ static RegExp newContainerNode(Kind kind, RegExp exp1, RegExp exp2) {
+ return new RegExp(kind, exp1, exp2, null, 0, 0, 0, 0, 0, 0);
+ }
+
+ // Simplified construction of repeating nodes
+ static RegExp newRepeatingNode(Kind kind, RegExp exp, int min, int max) {
+ return new RegExp(kind, exp, null, null, 0, min, max, 0, 0, 0);
+ }
+
+ // Simplified construction of leaf nodes
+ static RegExp newLeafNode(Kind kind, String s, int c, int min, int max, int digits, int from, int to) {
+ return new RegExp(kind, null, null, s, c, min, max, digits, from, to);
+ }
/**
* Constructs new Automaton from this RegExp. Same
@@ -506,6 +638,10 @@ private Automaton toAutomatonInternal(Map<String,Automaton> automata,
    List<Automaton> list;
Automaton a = null;
switch (kind) {
+ case REGEXP_PRE_CLASS:
+ RegExp expanded = expandPredefined();
+ a = expanded.toAutomatonInternal(automata, automaton_provider, maxDeterminizedStates);
+ break;
case REGEXP_UNION:
list = new ArrayList<>();
findLeaves(exp1, Kind.REGEXP_UNION, list, automata, automaton_provider,
@@ -716,6 +852,9 @@ void toStringBuilder(StringBuilder b) {
b.append('0');
b.append(s2).append(">");
break;
+ case REGEXP_PRE_CLASS:
+ b.append("\\").appendCodePoint(from);
+ break;
}
}
@@ -774,6 +913,13 @@ void toStringTree(StringBuilder b, String indent) {
b.appendCodePoint(c);
b.append('\n');
break;
+ case REGEXP_PRE_CLASS:
+ b.append(indent);
+ b.append(kind);
+ b.append(" class=\\");
+ b.appendCodePoint(from);
+ b.append('\n');
+ break;
case REGEXP_CHAR_RANGE:
b.append(indent);
b.append(kind);
@@ -855,34 +1001,29 @@ void getIdentifiers(Set<String> set) {
}
static RegExp makeUnion(RegExp exp1, RegExp exp2) {
- RegExp r = new RegExp();
- r.kind = Kind.REGEXP_UNION;
- r.exp1 = exp1;
- r.exp2 = exp2;
- return r;
+ return newContainerNode(Kind.REGEXP_UNION, exp1, exp2);
}
static RegExp makeConcatenation(RegExp exp1, RegExp exp2) {
if ((exp1.kind == Kind.REGEXP_CHAR || exp1.kind == Kind.REGEXP_STRING)
&& (exp2.kind == Kind.REGEXP_CHAR || exp2.kind == Kind.REGEXP_STRING)) return makeString(
exp1, exp2);
- RegExp r = new RegExp();
- r.kind = Kind.REGEXP_CONCATENATION;
+ RegExp rexp1, rexp2;
if (exp1.kind == Kind.REGEXP_CONCATENATION
&& (exp1.exp2.kind == Kind.REGEXP_CHAR || exp1.exp2.kind == Kind.REGEXP_STRING)
&& (exp2.kind == Kind.REGEXP_CHAR || exp2.kind == Kind.REGEXP_STRING)) {
- r.exp1 = exp1.exp1;
- r.exp2 = makeString(exp1.exp2, exp2);
+ rexp1 = exp1.exp1;
+ rexp2 = makeString(exp1.exp2, exp2);
} else if ((exp1.kind == Kind.REGEXP_CHAR || exp1.kind == Kind.REGEXP_STRING)
&& exp2.kind == Kind.REGEXP_CONCATENATION
&& (exp2.exp1.kind == Kind.REGEXP_CHAR || exp2.exp1.kind == Kind.REGEXP_STRING)) {
- r.exp1 = makeString(exp1, exp2.exp1);
- r.exp2 = exp2.exp2;
+ rexp1 = makeString(exp1, exp2.exp1);
+ rexp2 = exp2.exp2;
} else {
- r.exp1 = exp1;
- r.exp2 = exp2;
+ rexp1 = exp1;
+ rexp2 = exp2;
}
- return r;
+ return newContainerNode(Kind.REGEXP_CONCATENATION, rexp1, rexp2);
}
static private RegExp makeString(RegExp exp1, RegExp exp2) {
@@ -895,107 +1036,61 @@ static private RegExp makeString(RegExp exp1, RegExp exp2) {
}
static RegExp makeIntersection(RegExp exp1, RegExp exp2) {
- RegExp r = new RegExp();
- r.kind = Kind.REGEXP_INTERSECTION;
- r.exp1 = exp1;
- r.exp2 = exp2;
- return r;
+ return newContainerNode(Kind.REGEXP_INTERSECTION, exp1, exp2);
}
static RegExp makeOptional(RegExp exp) {
- RegExp r = new RegExp();
- r.kind = Kind.REGEXP_OPTIONAL;
- r.exp1 = exp;
- return r;
+ return newContainerNode(Kind.REGEXP_OPTIONAL, exp, null);
}
static RegExp makeRepeat(RegExp exp) {
- RegExp r = new RegExp();
- r.kind = Kind.REGEXP_REPEAT;
- r.exp1 = exp;
- return r;
+ return newContainerNode(Kind.REGEXP_REPEAT, exp, null);
}
static RegExp makeRepeat(RegExp exp, int min) {
- RegExp r = new RegExp();
- r.kind = Kind.REGEXP_REPEAT_MIN;
- r.exp1 = exp;
- r.min = min;
- return r;
+ return newRepeatingNode(Kind.REGEXP_REPEAT_MIN, exp, min, 0);
}
static RegExp makeRepeat(RegExp exp, int min, int max) {
- RegExp r = new RegExp();
- r.kind = Kind.REGEXP_REPEAT_MINMAX;
- r.exp1 = exp;
- r.min = min;
- r.max = max;
- return r;
+ return newRepeatingNode(Kind.REGEXP_REPEAT_MINMAX, exp, min, max);
}
static RegExp makeComplement(RegExp exp) {
- RegExp r = new RegExp();
- r.kind = Kind.REGEXP_COMPLEMENT;
- r.exp1 = exp;
- return r;
+ return newContainerNode(Kind.REGEXP_COMPLEMENT, exp, null);
}
static RegExp makeChar(int c) {
- RegExp r = new RegExp();
- r.kind = Kind.REGEXP_CHAR;
- r.c = c;
- return r;
+ return newLeafNode(Kind.REGEXP_CHAR, null, c, 0, 0, 0, 0, 0);
}
static RegExp makeCharRange(int from, int to) {
if (from > to)
throw new IllegalArgumentException("invalid range: from (" + from + ") cannot be > to (" + to + ")");
- RegExp r = new RegExp();
- r.kind = Kind.REGEXP_CHAR_RANGE;
- r.from = from;
- r.to = to;
- return r;
+ return newLeafNode(Kind.REGEXP_CHAR_RANGE, null, 0, 0, 0, 0, from, to);
}
static RegExp makeAnyChar() {
- RegExp r = new RegExp();
- r.kind = Kind.REGEXP_ANYCHAR;
- return r;
+ return newContainerNode(Kind.REGEXP_ANYCHAR, null, null);
}
static RegExp makeEmpty() {
- RegExp r = new RegExp();
- r.kind = Kind.REGEXP_EMPTY;
- return r;
+ return newContainerNode(Kind.REGEXP_EMPTY, null, null);
}
static RegExp makeString(String s) {
- RegExp r = new RegExp();
- r.kind = Kind.REGEXP_STRING;
- r.s = s;
- return r;
+ return newLeafNode(Kind.REGEXP_STRING, s, 0, 0, 0, 0, 0, 0);
}
static RegExp makeAnyString() {
- RegExp r = new RegExp();
- r.kind = Kind.REGEXP_ANYSTRING;
- return r;
+ return newContainerNode(Kind.REGEXP_ANYSTRING, null, null);
}
static RegExp makeAutomaton(String s) {
- RegExp r = new RegExp();
- r.kind = Kind.REGEXP_AUTOMATON;
- r.s = s;
- return r;
+ return newLeafNode(Kind.REGEXP_AUTOMATON, s, 0, 0, 0, 0, 0, 0);
}
static RegExp makeInterval(int min, int max, int digits) {
- RegExp r = new RegExp();
- r.kind = Kind.REGEXP_INTERVAL;
- r.min = min;
- r.max = max;
- r.digits = digits;
- return r;
+ return newLeafNode(Kind.REGEXP_INTERVAL, null, 0, min, max, digits, 0, 0);
}
private boolean peek(String s) {
@@ -1101,10 +1196,60 @@ final RegExp parseCharClasses() throws IllegalArgumentException {
}
final RegExp parseCharClass() throws IllegalArgumentException {
+ RegExp predefinedExp = matchPredefinedCharacterClass();
+ if (predefinedExp != null) {
+ return predefinedExp;
+ }
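+    // a predefined class such as \d is kept as a REGEXP_PRE_CLASS leaf here and only
+    // expanded to its character-range form when the automaton is built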
+
int c = parseCharExp();
if (match('-')) return makeCharRange(c, parseCharExp());
else return makeChar(c);
}
+
+ RegExp expandPredefined() {
+ //See https://docs.oracle.com/javase/tutorial/essential/regex/pre_char_classes.html
+ switch (from) {
+ case 'd':
+ return new RegExp("[0-9]"); // digit
+ case 'D':
+ return new RegExp("[^0-9]"); // non-digit
+ case 's':
+ return new RegExp("[ \t\n\r]"); // whitespace
+ case 'S':
+ return new RegExp("[^\\s]"); // non-whitespace
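+        // note: [^\s] itself contains \s, which is parsed as another REGEXP_PRE_CLASS
+        // and expanded recursively when the automaton is built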
+ case 'w':
+ return new RegExp("[a-zA-Z_0-9]"); // word
+ case 'W':
+ return new RegExp("[^\\w]"); // non-word
+ default:
+ throw new IllegalArgumentException(
+ "invalid character class " + from);
+ }
+ }
+
+ final RegExp matchPredefinedCharacterClass() {
+ //See https://docs.oracle.com/javase/tutorial/essential/regex/pre_char_classes.html
+ if (match('\\')) {
+ if (peek("dDwWsS")) {
+ return newLeafNode(Kind.REGEXP_PRE_CLASS, null, 0, 0, 0, 0, next(), 0);
+ }
+
+ if (peek("\\")) {
+ return makeChar(next());
+ }
+
+ // From https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html#bs
+ // "It is an error to use a backslash prior to any alphabetic character that does not denote an escaped
+ // construct;"
+ if (peek("abcefghijklmnopqrtuvxyz") || peek("ABCEFGHIJKLMNOPQRTUVXYZ")) {
+ throw new IllegalArgumentException("invalid character class \\" + next());
+ }
+ }
+
+ return null;
+ }
+
final RegExp parseSimpleExp() throws IllegalArgumentException {
if (match('.')) return makeAnyChar();
@@ -1158,7 +1303,13 @@ else if (match('"')) {
"interval syntax error at position " + (pos - 1));
}
}
- } else return makeChar(parseCharExp());
+ } else {
+ RegExp predefined = matchPredefinedCharacterClass();
+ if (predefined != null) {
+ return predefined;
+ }
+ return makeChar(parseCharExp());
+ }
}
final int parseCharExp() throws IllegalArgumentException {
diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java
index 490432693efc..1e3702db84c9 100644
--- a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java
+++ b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java
@@ -24,11 +24,7 @@
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.PointValues;
import org.apache.lucene.search.DocIdSetIterator;
-import org.apache.lucene.store.ByteArrayDataInput;
-import org.apache.lucene.store.ByteBufferIndexInput;
-import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.IndexInput;
-import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.MathUtil;
@@ -36,124 +32,7 @@
*
* @lucene.experimental */
-public final class BKDReader extends PointValues implements Accountable {
-
- private static abstract class BKDInput extends DataInput implements Cloneable {
- abstract long getMinLeafBlockFP();
- abstract long ramBytesUsed();
-
- abstract int getPosition();
- abstract void setPosition(int pos) throws IOException;
-
- @Override
- public BKDInput clone() {
- return (BKDInput)super.clone();
- }
- }
-
- private static class BKDOffHeapInput extends BKDInput implements Cloneable {
-
- private final IndexInput packedIndex;
- private final long minLeafBlockFP;
-
- BKDOffHeapInput(IndexInput packedIndex) throws IOException {
- this.packedIndex = packedIndex;
- this.minLeafBlockFP = packedIndex.clone().readVLong();
- }
-
- private BKDOffHeapInput(IndexInput packedIndex, long minLeadBlockFP) {
- this.packedIndex = packedIndex;
- this.minLeafBlockFP = minLeadBlockFP;
- }
-
- @Override
- public BKDOffHeapInput clone() {
- return new BKDOffHeapInput(packedIndex.clone(), minLeafBlockFP);
- }
-
- @Override
- long getMinLeafBlockFP() {
- return minLeafBlockFP;
- }
-
- @Override
- long ramBytesUsed() {
- return 0;
- }
-
- @Override
- int getPosition() {
- return (int)packedIndex.getFilePointer();
- }
-
- @Override
- void setPosition(int pos) throws IOException {
- packedIndex.seek(pos);
- }
-
- @Override
- public byte readByte() throws IOException {
- return packedIndex.readByte();
- }
-
- @Override
- public void readBytes(byte[] b, int offset, int len) throws IOException {
- packedIndex.readBytes(b, offset, len);
- }
- }
-
- private static class BKDOnHeapInput extends BKDInput implements Cloneable {
-
- private final ByteArrayDataInput packedIndex;
- private final long minLeafBlockFP;
-
- BKDOnHeapInput(IndexInput packedIndex, int numBytes) throws IOException {
- byte[] packedBytes = new byte[numBytes];
- packedIndex.readBytes(packedBytes, 0, numBytes);
- this.packedIndex = new ByteArrayDataInput(packedBytes);
- this.minLeafBlockFP = this.packedIndex.clone().readVLong();
- }
-
- private BKDOnHeapInput(ByteArrayDataInput packedIndex, long minLeadBlockFP) {
- this.packedIndex = packedIndex;
- this.minLeafBlockFP = minLeadBlockFP;
- }
-
- @Override
- public BKDOnHeapInput clone() {
- return new BKDOnHeapInput((ByteArrayDataInput)packedIndex.clone(), minLeafBlockFP);
- }
-
- @Override
- long getMinLeafBlockFP() {
- return minLeafBlockFP;
- }
-
- @Override
- long ramBytesUsed() {
- return packedIndex.length();
- }
-
- @Override
- int getPosition() {
- return packedIndex.getPosition();
- }
-
- @Override
- void setPosition(int pos) {
- packedIndex.setPosition(pos);
- }
-
- @Override
- public byte readByte() throws IOException {
- return packedIndex.readByte();
- }
-
- @Override
- public void readBytes(byte[] b, int offset, int len) throws IOException {
- packedIndex.readBytes(b, offset, len);
- }
- }
+public final class BKDReader extends PointValues {
// Packed array of byte[] holding all split values in the full binary tree:
final int leafNodeOffset;
@@ -170,67 +49,64 @@ public void readBytes(byte[] b, int offset, int len) throws IOException {
final int version;
protected final int packedBytesLength;
protected final int packedIndexBytesLength;
+ final long minLeafBlockFP;
- final BKDInput packedIndex;
-
- /** Caller must pre-seek the provided {@link IndexInput} to the index location that {@link BKDWriter#finish} returned */
- public BKDReader(IndexInput in) throws IOException {
- this(in, in instanceof ByteBufferIndexInput);
- }
+ final IndexInput packedIndex;
- /**
- * Caller must pre-seek the provided {@link IndexInput} to the index location that {@link BKDWriter#finish} returned
- * and specify {@code true} to store BKD off-heap ({@code false} otherwise)
- */
- public BKDReader(IndexInput in, boolean offHeap) throws IOException {
- version = CodecUtil.checkHeader(in, BKDWriter.CODEC_NAME, BKDWriter.VERSION_START, BKDWriter.VERSION_CURRENT);
- numDataDims = in.readVInt();
+ /** Caller must pre-seek the provided {@link IndexInput} to the index location that {@link BKDWriter#finish} returned.
+ * BKD tree is always stored off-heap. */
+ public BKDReader(IndexInput metaIn, IndexInput indexIn, IndexInput dataIn) throws IOException {
+ version = CodecUtil.checkHeader(metaIn, BKDWriter.CODEC_NAME, BKDWriter.VERSION_START, BKDWriter.VERSION_CURRENT);
+ numDataDims = metaIn.readVInt();
if (version >= BKDWriter.VERSION_SELECTIVE_INDEXING) {
- numIndexDims = in.readVInt();
+ numIndexDims = metaIn.readVInt();
} else {
numIndexDims = numDataDims;
}
- maxPointsInLeafNode = in.readVInt();
- bytesPerDim = in.readVInt();
+ maxPointsInLeafNode = metaIn.readVInt();
+ bytesPerDim = metaIn.readVInt();
packedBytesLength = numDataDims * bytesPerDim;
packedIndexBytesLength = numIndexDims * bytesPerDim;
// Read index:
- numLeaves = in.readVInt();
+ numLeaves = metaIn.readVInt();
assert numLeaves > 0;
leafNodeOffset = numLeaves;
minPackedValue = new byte[packedIndexBytesLength];
maxPackedValue = new byte[packedIndexBytesLength];
- in.readBytes(minPackedValue, 0, packedIndexBytesLength);
- in.readBytes(maxPackedValue, 0, packedIndexBytesLength);
+ metaIn.readBytes(minPackedValue, 0, packedIndexBytesLength);
+ metaIn.readBytes(maxPackedValue, 0, packedIndexBytesLength);
    for(int dim=0;dim<numIndexDims;dim++) {
      if (Arrays.compareUnsigned(minPackedValue, dim * bytesPerDim, dim * bytesPerDim + bytesPerDim, maxPackedValue, dim * bytesPerDim, dim * bytesPerDim + bytesPerDim) > 0) {
- throw new CorruptIndexException("minPackedValue " + new BytesRef(minPackedValue) + " is > maxPackedValue " + new BytesRef(maxPackedValue) + " for dim=" + dim, in);
+ throw new CorruptIndexException("minPackedValue " + new BytesRef(minPackedValue) + " is > maxPackedValue " + new BytesRef(maxPackedValue) + " for dim=" + dim, metaIn);
}
}
- pointCount = in.readVLong();
- docCount = in.readVInt();
-
- int numBytes = in.readVInt();
- IndexInput slice = in.slice("packedIndex", in.getFilePointer(), numBytes);
- if (offHeap) {
- packedIndex = new BKDOffHeapInput(slice);
+ pointCount = metaIn.readVLong();
+ docCount = metaIn.readVInt();
+
+ int numIndexBytes = metaIn.readVInt();
+ long indexStartPointer;
+ if (version >= BKDWriter.VERSION_META_FILE) {
+ minLeafBlockFP = metaIn.readLong();
+ indexStartPointer = metaIn.readLong();
} else {
- packedIndex = new BKDOnHeapInput(slice, numBytes);
+ indexStartPointer = indexIn.getFilePointer();
+ minLeafBlockFP = indexIn.readVLong();
+ indexIn.seek(indexStartPointer);
}
-
- this.in = in;
+ this.packedIndex = indexIn.slice("packedIndex", indexStartPointer, numIndexBytes);
+ this.in = dataIn;
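+    // the tree is now split across three inputs: metaIn carries the header and global
+    // stats, indexIn the packed inner-node index (sliced above), and dataIn the leaf blocks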
}
long getMinLeafBlockFP() {
- return packedIndex.getMinLeafBlockFP();
+ return minLeafBlockFP;
}
- /** Used to walk the in-heap index. The format takes advantage of the limited
+ /** Used to walk the off-heap index. The format takes advantage of the limited
* access pattern to the BKD tree at search time, i.e. starting at the root
* node and recursing downwards one child at a time.
* @lucene.internal */
@@ -240,13 +116,11 @@ public class IndexTree implements Cloneable {
private int level;
private int splitDim;
private final byte[][] splitPackedValueStack;
- // used to read the packed byte[]
- private final BKDInput in;
+ // used to read the packed tree off-heap
+ private final IndexInput in;
// holds the minimum (left most) leaf block file pointer for each level we've recursed to:
private final long[] leafBlockFPStack;
- // holds the address, in the packed byte[] index, of the left-node of each level:
- private final int[] leftNodePositions;
- // holds the address, in the packed byte[] index, of the right-node of each level:
+ // holds the address, in the off-heap index, of the right-node of each level:
private final int[] rightNodePositions;
// holds the splitDim for each level:
private final int[] splitDims;
@@ -260,52 +134,41 @@ public class IndexTree implements Cloneable {
private final BytesRef scratch;
IndexTree() {
+ this(packedIndex.clone(), 1, 1);
+ // read root node
+ readNodeData(false);
+ }
+
+ private IndexTree(IndexInput in, int nodeID, int level) {
int treeDepth = getTreeDepth();
splitPackedValueStack = new byte[treeDepth+1][];
- nodeID = 1;
- level = 1;
+ this.nodeID = nodeID;
+ this.level = level;
splitPackedValueStack[level] = new byte[packedIndexBytesLength];
leafBlockFPStack = new long[treeDepth+1];
- leftNodePositions = new int[treeDepth+1];
rightNodePositions = new int[treeDepth+1];
splitValuesStack = new byte[treeDepth+1][];
splitDims = new int[treeDepth+1];
negativeDeltas = new boolean[numIndexDims*(treeDepth+1)];
-
- in = packedIndex.clone();
+ this.in = in;
splitValuesStack[0] = new byte[packedIndexBytesLength];
- readNodeData(false);
scratch = new BytesRef();
scratch.length = bytesPerDim;
}
public void pushLeft() {
- int nodePosition = leftNodePositions[level];
nodeID *= 2;
level++;
- if (splitPackedValueStack[level] == null) {
- splitPackedValueStack[level] = new byte[packedIndexBytesLength];
- }
- System.arraycopy(negativeDeltas, (level-1)*numIndexDims, negativeDeltas, level*numIndexDims, numIndexDims);
- assert splitDim != -1;
- negativeDeltas[level*numIndexDims+splitDim] = true;
- try {
- in.setPosition(nodePosition);
- } catch (IOException e) {
- throw new UncheckedIOException(e);
- }
readNodeData(true);
}
/** Clone, but you are not allowed to pop up past the point where the clone happened. */
@Override
public IndexTree clone() {
- IndexTree index = new IndexTree();
- index.nodeID = nodeID;
- index.level = level;
+ IndexTree index = new IndexTree(in.clone(), nodeID, level);
+ // copy node data
index.splitDim = splitDim;
index.leafBlockFPStack[level] = leafBlockFPStack[level];
- index.leftNodePositions[level] = leftNodePositions[level];
index.rightNodePositions[level] = rightNodePositions[level];
index.splitValuesStack[index.level] = splitValuesStack[index.level].clone();
System.arraycopy(negativeDeltas, level*numIndexDims, index.negativeDeltas, level*numIndexDims, numIndexDims);
@@ -314,17 +177,12 @@ public IndexTree clone() {
}
public void pushRight() {
- int nodePosition = rightNodePositions[level];
+ final int nodePosition = rightNodePositions[level];
+ assert nodePosition >= in.getFilePointer() : "nodePosition = " + nodePosition + " < currentPosition=" + in.getFilePointer();
nodeID = nodeID * 2 + 1;
level++;
- if (splitPackedValueStack[level] == null) {
- splitPackedValueStack[level] = new byte[packedIndexBytesLength];
- }
- System.arraycopy(negativeDeltas, (level-1)*numIndexDims, negativeDeltas, level*numIndexDims, numIndexDims);
- assert splitDim != -1;
- negativeDeltas[level*numIndexDims+splitDim] = false;
try {
- in.setPosition(nodePosition);
+ in.seek(nodePosition);
} catch (IOException e) {
throw new UncheckedIOException(e);
}
@@ -412,6 +270,13 @@ private int getNumLeavesSlow(int node) {
}
private void readNodeData(boolean isLeft) {
+ if (splitPackedValueStack[level] == null) {
+ splitPackedValueStack[level] = new byte[packedIndexBytesLength];
+ }
+ System.arraycopy(negativeDeltas, (level-1)*numIndexDims, negativeDeltas, level*numIndexDims, numIndexDims);
+ assert splitDim != -1;
+ negativeDeltas[level*numIndexDims+splitDim] = isLeft;
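+    // the per-level bookkeeping above previously lived in pushLeft()/pushRight();
+    // readNodeData now handles both directions, driven by the isLeft flag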
+
try {
leafBlockFPStack[level] = leafBlockFPStack[level - 1];
@@ -454,9 +319,7 @@ private void readNodeData(boolean isLeft) {
} else {
leftNumBytes = 0;
}
-
- leftNodePositions[level] = in.getPosition();
- rightNodePositions[level] = leftNodePositions[level] + leftNumBytes;
+ rightNodePositions[level] = Math.toIntExact(in.getFilePointer()) + leftNumBytes;
}
} catch (IOException e) {
throw new UncheckedIOException(e);
@@ -880,11 +743,6 @@ private long estimatePointCount(IntersectState state, byte[] cellMinPacked, byte
}
}
- @Override
- public long ramBytesUsed() {
- return packedIndex.ramBytesUsed();
- }
-
@Override
public byte[] getMinPackedValue() {
return minPackedValue.clone();
diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
index 727b824f7214..de71941ec52e 100644
--- a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
@@ -18,6 +18,7 @@
import java.io.Closeable;
import java.io.IOException;
+import java.io.UncheckedIOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
@@ -57,9 +58,10 @@
* Recursively builds a block KD-tree to assign all incoming points in N-dim space to smaller
* and smaller N-dim rectangles (cells) until the number of points in a given
* rectangle is <= maxPointsInLeafNode. The tree is
- * fully balanced, which means the leaf nodes will have between 50% and 100% of
- * the requested maxPointsInLeafNode. Values that fall exactly
- * on a cell boundary may be in either cell.
+ * partially balanced, which means every leaf node holds
+ * the requested maxPointsInLeafNode values, except possibly one that may hold fewer.
+ * Leaf nodes may straddle the two bottom levels of the binary tree.
+ * Values that fall exactly on a cell boundary may be in either cell.
*
*
The number of dimensions can be 1 to 8, but every byte[] value is fixed length.
*
@@ -68,7 +70,7 @@
* {@code maxMBSortInHeap} heap space for writing.
*
*
- * NOTE: This can write at most Integer.MAX_VALUE * maxPointsInLeafNode / (1+bytesPerDim)
+ * NOTE: This can write at most Integer.MAX_VALUE * maxPointsInLeafNode / bytesPerDim
* total points.
*
* @lucene.experimental */
@@ -81,13 +83,14 @@ public class BKDWriter implements Closeable {
public static final int VERSION_LEAF_STORES_BOUNDS = 5;
public static final int VERSION_SELECTIVE_INDEXING = 6;
public static final int VERSION_LOW_CARDINALITY_LEAVES = 7;
- public static final int VERSION_CURRENT = VERSION_LOW_CARDINALITY_LEAVES;
+ public static final int VERSION_META_FILE = 9;
+ public static final int VERSION_CURRENT = VERSION_META_FILE;
   /** How many bytes each doc takes in the fixed-width offline format */
private final int bytesPerDoc;
/** Default maximum number of point in each leaf block */
- public static final int DEFAULT_MAX_POINTS_IN_LEAF_NODE = 1024;
+ public static final int DEFAULT_MAX_POINTS_IN_LEAF_NODE = 512;
/** Default maximum heap to use, before spilling to (slower) disk */
public static final float DEFAULT_MAX_MB_SORT_IN_HEAP = 16.0f;
@@ -252,11 +255,6 @@ public void add(byte[] packedValue, int docID) throws IOException {
docsSeen.set(docID);
}
- /** How many points have been added so far */
- public long getPointCount() {
- return pointCount;
- }
-
private static class MergeReader {
final BKDReader bkd;
final BKDReader.IntersectState state;
@@ -371,16 +369,32 @@ public boolean lessThan(MergeReader a, MergeReader b) {
}
}
+ /** flat representation of a kd-tree */
+ private interface BKDTreeLeafNodes {
+ /** number of leaf nodes */
+ int numLeaves();
+    /** pointer to the leaf node previously written. Leaves are ordered from
+     * left to right, so the leaf at {@code index} 0 is the leftmost leaf and
+     * the leaf at {@code numLeaves() - 1} is the rightmost leaf */
+    long getLeafLP(int index);
+    /** split value between two leaves. The split value at position n corresponds to the
+     * leaves at (n - 1) and n. */
+    BytesRef getSplitValue(int index);
+    /** split dimension between two leaves. The split dimension at position n corresponds to the
+     * leaves at (n - 1) and n. */
+    int getSplitDimension(int index);
+ }
+
/** Write a field from a {@link MutablePointValues}. This way of writing
* points is faster than regular writes with {@link BKDWriter#add} since
* there is opportunity for reordering points before writing them to
* disk. This method does not use transient disk in order to reorder points.
*/
- public long writeField(IndexOutput out, String fieldName, MutablePointValues reader) throws IOException {
+ public Runnable writeField(IndexOutput metaOut, IndexOutput indexOut, IndexOutput dataOut, String fieldName, MutablePointValues reader) throws IOException {
if (numDataDims == 1) {
- return writeField1Dim(out, fieldName, reader);
+ return writeField1Dim(metaOut, indexOut, dataOut, fieldName, reader);
} else {
- return writeFieldNDims(out, fieldName, reader);
+ return writeFieldNDims(metaOut, indexOut, dataOut, fieldName, reader);
}
}
@@ -407,7 +421,7 @@ private void computePackedValueBounds(MutablePointValues values, int from, int t
/* In the 2+D case, we recursively pick the split dimension, compute the
* median value and partition other values around it. */
- private long writeFieldNDims(IndexOutput out, String fieldName, MutablePointValues values) throws IOException {
+ private Runnable writeFieldNDims(IndexOutput metaOut, IndexOutput indexOut, IndexOutput dataOut, String fieldName, MutablePointValues values) throws IOException {
if (pointCount != 0) {
throw new IllegalStateException("cannot mix add and writeField");
}
@@ -420,19 +434,15 @@ private long writeFieldNDims(IndexOutput out, String fieldName, MutablePointValu
// Mark that we already finished:
finished = true;
- long countPerLeaf = pointCount = values.size();
- long innerNodeCount = 1;
-
- while (countPerLeaf > maxPointsInLeafNode) {
- countPerLeaf = (countPerLeaf+1)/2;
- innerNodeCount *= 2;
- }
+ pointCount = values.size();
- int numLeaves = Math.toIntExact(innerNodeCount);
+ final int numLeaves = Math.toIntExact((pointCount + maxPointsInLeafNode - 1) / maxPointsInLeafNode);
+ final int numSplits = numLeaves - 1;
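+    // ceil(pointCount / maxPointsInLeafNode): every leaf is packed full except possibly
+    // the last one, and a tree with numLeaves leaves needs numLeaves - 1 split values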
checkMaxLeafNodeCount(numLeaves);
- final byte[] splitPackedValues = new byte[numLeaves * (bytesPerDim + 1)];
+ final byte[] splitPackedValues = new byte[numSplits * bytesPerDim];
+ final byte[] splitDimensionValues = new byte[numSplits];
final long[] leafBlockFPs = new long[numLeaves];
// compute the min/max for this slice
@@ -441,24 +451,55 @@ private long writeFieldNDims(IndexOutput out, String fieldName, MutablePointValu
docsSeen.set(values.getDocID(i));
}
+ final long dataStartFP = dataOut.getFilePointer();
final int[] parentSplits = new int[numIndexDims];
- build(1, numLeaves, values, 0, Math.toIntExact(pointCount), out,
+ build(0, numLeaves, values, 0, Math.toIntExact(pointCount), dataOut,
minPackedValue.clone(), maxPackedValue.clone(), parentSplits,
- splitPackedValues, leafBlockFPs,
+ splitPackedValues, splitDimensionValues, leafBlockFPs,
new int[maxPointsInLeafNode]);
assert Arrays.equals(parentSplits, new int[numIndexDims]);
- long indexFP = out.getFilePointer();
- writeIndex(out, Math.toIntExact(countPerLeaf), leafBlockFPs, splitPackedValues);
- return indexFP;
+ scratchBytesRef1.length = bytesPerDim;
+ scratchBytesRef1.bytes = splitPackedValues;
+
+ BKDTreeLeafNodes leafNodes = new BKDTreeLeafNodes() {
+ @Override
+ public long getLeafLP(int index) {
+ return leafBlockFPs[index];
+ }
+
+ @Override
+ public BytesRef getSplitValue(int index) {
+ scratchBytesRef1.offset = index * bytesPerDim;
+ return scratchBytesRef1;
+ }
+
+ @Override
+ public int getSplitDimension(int index) {
+ return splitDimensionValues[index] & 0xff;
+ }
+
+ @Override
+ public int numLeaves() {
+ return leafBlockFPs.length;
+ }
+ };
+
+ return () -> {
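+    // index writing is deferred to a Runnable so the caller can finish writing the
+    // leaf data for all fields first and emit the metadata/index entries afterwards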
+ try {
+ writeIndex(metaOut, indexOut, maxPointsInLeafNode, leafNodes, dataStartFP);
+ } catch (IOException e) {
+ throw new UncheckedIOException(e);
+ }
+ };
}
/* In the 1D case, we can simply sort points in ascending order and use the
* same writing logic as we use at merge time. */
- private long writeField1Dim(IndexOutput out, String fieldName, MutablePointValues reader) throws IOException {
+ private Runnable writeField1Dim(IndexOutput metaOut, IndexOutput indexOut, IndexOutput dataOut, String fieldName, MutablePointValues reader) throws IOException {
MutablePointsReaderUtils.sort(maxDoc, packedIndexBytesLength, reader, 0, Math.toIntExact(reader.size()));
- final OneDimensionBKDWriter oneDimWriter = new OneDimensionBKDWriter(out);
+ final OneDimensionBKDWriter oneDimWriter = new OneDimensionBKDWriter(metaOut, indexOut, dataOut);
reader.intersect(new IntersectVisitor() {
@@ -484,7 +525,7 @@ public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
/** More efficient bulk-add for incoming {@link BKDReader}s. This does a merge sort of the already
* sorted values and currently only works when numDims==1. This returns -1 if all documents containing
* dimensional values were deleted. */
-  public long merge(IndexOutput out, List<MergeState.DocMap> docMaps, List<BKDReader> readers) throws IOException {
+  public Runnable merge(IndexOutput metaOut, IndexOutput indexOut, IndexOutput dataOut, List<MergeState.DocMap> docMaps, List<BKDReader> readers) throws IOException {
assert docMaps == null || readers.size() == docMaps.size();
BKDMergeQueue queue = new BKDMergeQueue(bytesPerDim, readers.size());
@@ -503,7 +544,7 @@ public long merge(IndexOutput out, List<MergeState.DocMap> docMaps, List<BKDReader> readers) throws IOException {
-    final IndexOutput out;
+    final IndexOutput metaOut, indexOut, dataOut;
+    final long dataStartFP;
    final List<Long> leafBlockFPs = new ArrayList<>();
    final List<byte[]> leafBlockStartValues = new ArrayList<>();
final byte[] leafValues = new byte[maxPointsInLeafNode * packedBytesLength];
@@ -536,7 +578,7 @@ private class OneDimensionBKDWriter {
private int leafCount;
private int leafCardinality;
- OneDimensionBKDWriter(IndexOutput out) {
+ OneDimensionBKDWriter(IndexOutput metaOut, IndexOutput indexOut, IndexOutput dataOut) {
if (numIndexDims != 1) {
throw new UnsupportedOperationException("numIndexDims must be 1 but got " + numIndexDims);
}
@@ -552,7 +594,10 @@ private class OneDimensionBKDWriter {
// Mark that we already finished:
finished = true;
- this.out = out;
+ this.metaOut = metaOut;
+ this.indexOut = indexOut;
+ this.dataOut = dataOut;
+ this.dataStartFP = dataOut.getFilePointer();
lastPackedValue = new byte[packedBytesLength];
}
@@ -588,7 +633,7 @@ assert valueInOrder(valueCount + leafCount,
assert (lastDocID = docID) >= 0; // only assign when asserts are enabled
}
- public long finish() throws IOException {
+ public Runnable finish() throws IOException {
if (leafCount > 0) {
writeLeafBlock(leafCardinality);
leafCardinality = 0;
@@ -596,25 +641,43 @@ public long finish() throws IOException {
}
if (valueCount == 0) {
- return -1;
+ return null;
}
pointCount = valueCount;
- long indexFP = out.getFilePointer();
+ scratchBytesRef1.length = bytesPerDim;
+ scratchBytesRef1.offset = 0;
+ assert leafBlockStartValues.size() + 1 == leafBlockFPs.size();
+ BKDTreeLeafNodes leafNodes = new BKDTreeLeafNodes() {
+ @Override
+ public long getLeafLP(int index) {
+ return leafBlockFPs.get(index);
+ }
- int numInnerNodes = leafBlockStartValues.size();
+ @Override
+ public BytesRef getSplitValue(int index) {
+ scratchBytesRef1.bytes = leafBlockStartValues.get(index);
+ return scratchBytesRef1;
+ }
- //System.out.println("BKDW: now rotate numInnerNodes=" + numInnerNodes + " leafBlockStarts=" + leafBlockStartValues.size());
+ @Override
+ public int getSplitDimension(int index) {
+ return 0;
+ }
- byte[] index = new byte[(1+numInnerNodes) * (1+bytesPerDim)];
- rotateToTree(1, 0, numInnerNodes, index, leafBlockStartValues);
- long[] arr = new long[leafBlockFPs.size()];
-    for(int i=0;i<leafBlockFPs.size();i++) {
-      arr[i] = leafBlockFPs.get(i);
-    }
-    writeIndex(out, maxPointsInLeafNode, arr, index);
-    return indexFP;
+    return () -> {
+ try {
+ writeIndex(metaOut, indexOut, maxPointsInLeafNode, leafNodes, dataStartFP);
+ } catch (IOException e) {
+ throw new UncheckedIOException(e);
+ }
+ };
}
private void writeLeafBlock(int leafCardinality) throws IOException {
@@ -630,7 +693,7 @@ private void writeLeafBlock(int leafCardinality) throws IOException {
// Save the first (minimum) value in each leaf block except the first, to build the split value index in the end:
leafBlockStartValues.add(ArrayUtil.copyOfSubArray(leafValues, 0, packedBytesLength));
}
- leafBlockFPs.add(out.getFilePointer());
+ leafBlockFPs.add(dataOut.getFilePointer());
checkMaxLeafNodeCount(leafBlockFPs.size());
// Find per-dim common prefix:
@@ -660,58 +723,26 @@ assert valuesInOrderAndBounds(leafCount, 0, ArrayUtil.copyOfSubArray(leafValues,
ArrayUtil.copyOfSubArray(leafValues, (leafCount - 1) * packedBytesLength, leafCount * packedBytesLength),
packedValues, leafDocs, 0);
writeLeafBlockPackedValues(scratchOut, commonPrefixLengths, leafCount, 0, packedValues, leafCardinality);
- scratchOut.copyTo(out);
+ scratchOut.copyTo(dataOut);
scratchOut.reset();
}
}
- // TODO: there must be a simpler way?
-  private void rotateToTree(int nodeID, int offset, int count, byte[] index, List<byte[]> leafBlockStartValues) {
- //System.out.println("ROTATE: nodeID=" + nodeID + " offset=" + offset + " count=" + count + " bpd=" + bytesPerDim + " index.length=" + index.length);
- if (count == 1) {
- // Leaf index node
- //System.out.println(" leaf index node");
- //System.out.println(" index[" + nodeID + "] = blockStartValues[" + offset + "]");
- System.arraycopy(leafBlockStartValues.get(offset), 0, index, nodeID*(1+bytesPerDim)+1, bytesPerDim);
- } else if (count > 1) {
- // Internal index node: binary partition of count
- int countAtLevel = 1;
- int totalCount = 0;
- while (true) {
- int countLeft = count - totalCount;
- //System.out.println(" cycle countLeft=" + countLeft + " coutAtLevel=" + countAtLevel);
- if (countLeft <= countAtLevel) {
- // This is the last level, possibly partially filled:
- int lastLeftCount = Math.min(countAtLevel/2, countLeft);
- assert lastLeftCount >= 0;
- int leftHalf = (totalCount-1)/2 + lastLeftCount;
-
- int rootOffset = offset + leftHalf;
- /*
- System.out.println(" last left count " + lastLeftCount);
- System.out.println(" leftHalf " + leftHalf + " rightHalf=" + (count-leftHalf-1));
- System.out.println(" rootOffset=" + rootOffset);
- */
-
- System.arraycopy(leafBlockStartValues.get(rootOffset), 0, index, nodeID*(1+bytesPerDim)+1, bytesPerDim);
- //System.out.println(" index[" + nodeID + "] = blockStartValues[" + rootOffset + "]");
-
- // TODO: we could optimize/specialize, when we know it's simply fully balanced binary tree
- // under here, to save this while loop on each recursion
-
- // Recurse left
- rotateToTree(2*nodeID, offset, leftHalf, index, leafBlockStartValues);
-
- // Recurse right
- rotateToTree(2*nodeID+1, rootOffset+1, count-leftHalf-1, index, leafBlockStartValues);
- return;
- }
- totalCount += countAtLevel;
- countAtLevel *= 2;
- }
- } else {
- assert count == 0;
- }
+ private int getNumLeftLeafNodes(int numLeaves) {
+    assert numLeaves > 1: "getNumLeftLeafNodes() called with " + numLeaves;
+    // return the level that can be filled with this number of leaves
+    int lastFullLevel = 31 - Integer.numberOfLeadingZeros(numLeaves);
+    // how many leaf nodes are in the full level
+    int leavesFullLevel = 1 << lastFullLevel;
+    // half of the leaf nodes from the full level go to the left
+ int numLeftLeafNodes = leavesFullLevel / 2;
+ // leaf nodes that do not fit in the full level
+ int unbalancedLeafNodes = numLeaves - leavesFullLevel;
+ // distribute unbalanced leaf nodes
+ numLeftLeafNodes += Math.min(unbalancedLeafNodes, numLeftLeafNodes);
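+    // e.g. numLeaves = 5: lastFullLevel = 2, leavesFullLevel = 4, so
+    // numLeftLeafNodes = 2 + min(1, 2) = 3, leaving 2 leaves on the right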
+ // we should always place unbalanced leaf nodes on the left
+ assert numLeftLeafNodes >= numLeaves - numLeftLeafNodes && numLeftLeafNodes <= 2L * (numLeaves - numLeftLeafNodes);
+ return numLeftLeafNodes;
}
// TODO: if we fixed each partition step to just record the file offset at the "split point", we could probably handle variable length
@@ -736,13 +767,14 @@ private void printPathSlice(String desc, PathSlice slice, int dim) throws IOExce
*/
private void checkMaxLeafNodeCount(int numLeaves) {
- if ((1+bytesPerDim) * (long) numLeaves > ArrayUtil.MAX_ARRAY_LENGTH) {
+ if (bytesPerDim * (long) numLeaves > ArrayUtil.MAX_ARRAY_LENGTH) {
throw new IllegalStateException("too many nodes; increase maxPointsInLeafNode (currently " + maxPointsInLeafNode + ") and reindex");
}
}
- /** Writes the BKD tree to the provided {@link IndexOutput} and returns the file offset where index was written. */
- public long finish(IndexOutput out) throws IOException {
+ /** Writes the BKD tree to the provided {@link IndexOutput}s and returns a {@link Runnable} that
+ * writes the index of the tree if at least one point has been added, or {@code null} otherwise. */
+ public Runnable finish(IndexOutput metaOut, IndexOutput indexOut, IndexOutput dataOut) throws IOException {
// System.out.println("\nBKDTreeWriter.finish pointCount=" + pointCount + " out=" + out + " heapWriter=" + heapPointWriter);
// TODO: specialize the 1D case? it's much faster at indexing time (no partitioning on recurse...)
@@ -753,7 +785,7 @@ public long finish(IndexOutput out) throws IOException {
}
if (pointCount == 0) {
- throw new IllegalStateException("must index at least one point");
+ return null;
}
//mark as finished
@@ -765,16 +797,8 @@ public long finish(IndexOutput out) throws IOException {
tempInput = null;
pointWriter = null;
-
- long countPerLeaf = pointCount;
- long innerNodeCount = 1;
-
- while (countPerLeaf > maxPointsInLeafNode) {
- countPerLeaf = (countPerLeaf+1)/2;
- innerNodeCount *= 2;
- }
-
- int numLeaves = (int) innerNodeCount;
+ final int numLeaves = Math.toIntExact((pointCount + maxPointsInLeafNode - 1) / maxPointsInLeafNode);
+ final int numSplits = numLeaves - 1;
checkMaxLeafNodeCount(numLeaves);
@@ -782,7 +806,8 @@ public long finish(IndexOutput out) throws IOException {
// step of the recursion to recompute the split dim:
// Indexed by nodeID, but first (root) nodeID is 1. We do 1+ because the lead byte at each recursion says which dim we split on.
- byte[] splitPackedValues = new byte[Math.toIntExact(numLeaves*(1+bytesPerDim))];
+ byte[] splitPackedValues = new byte[Math.toIntExact(numSplits*bytesPerDim)];
+ byte[] splitDimensionValues = new byte[numSplits];
// +1 because leaf count is power of 2 (e.g. 8), and innerNodeCount is power of 2 minus 1 (e.g. 7)
long[] leafBlockFPs = new long[numLeaves];
@@ -793,15 +818,17 @@ public long finish(IndexOutput out) throws IOException {
//We re-use the selector so we do not need to create an object every time.
BKDRadixSelector radixSelector = new BKDRadixSelector(numDataDims, numIndexDims, bytesPerDim, maxPointsSortInHeap, tempDir, tempFileNamePrefix);
+ final long dataStartFP = dataOut.getFilePointer();
boolean success = false;
try {
final int[] parentSplits = new int[numIndexDims];
- build(1, numLeaves, points,
- out, radixSelector,
+ build(0, numLeaves, points,
+ dataOut, radixSelector,
minPackedValue.clone(), maxPackedValue.clone(),
parentSplits,
splitPackedValues,
+ splitDimensionValues,
leafBlockFPs,
new int[maxPointsInLeafNode]);
assert Arrays.equals(parentSplits, new int[numIndexDims]);
@@ -818,43 +845,43 @@ public long finish(IndexOutput out) throws IOException {
}
}
- //System.out.println("Total nodes: " + innerNodeCount);
-
- // Write index:
- long indexFP = out.getFilePointer();
- writeIndex(out, Math.toIntExact(countPerLeaf), leafBlockFPs, splitPackedValues);
- return indexFP;
- }
+ scratchBytesRef1.bytes = splitPackedValues;
+ scratchBytesRef1.length = bytesPerDim;
+ BKDTreeLeafNodes leafNodes = new BKDTreeLeafNodes() {
+ @Override
+ public long getLeafLP(int index) {
+ return leafBlockFPs[index];
+ }
- /** Packs the two arrays, representing a balanced binary tree, into a compact byte[] structure. */
- private byte[] packIndex(long[] leafBlockFPs, byte[] splitPackedValues) throws IOException {
+ @Override
+ public BytesRef getSplitValue(int index) {
+ scratchBytesRef1.offset = index * bytesPerDim;
+ return scratchBytesRef1;
+ }
- int numLeaves = leafBlockFPs.length;
+ @Override
+ public int getSplitDimension(int index) {
+ return splitDimensionValues[index] & 0xff;
+ }
- // Possibly rotate the leaf block FPs, if the index not fully balanced binary tree (only happens
- // if it was created by OneDimensionBKDWriter). In this case the leaf nodes may straddle the two bottom
- // levels of the binary tree:
- if (numIndexDims == 1 && numLeaves > 1) {
- int levelCount = 2;
- while (true) {
- if (numLeaves >= levelCount && numLeaves <= 2*levelCount) {
- int lastLevel = 2*(numLeaves - levelCount);
- assert lastLevel >= 0;
- if (lastLevel != 0) {
- // Last level is partially filled, so we must rotate the leaf FPs to match. We do this here, after loading
- // at read-time, so that we can still delta code them on disk at write:
- long[] newLeafBlockFPs = new long[numLeaves];
- System.arraycopy(leafBlockFPs, lastLevel, newLeafBlockFPs, 0, leafBlockFPs.length - lastLevel);
- System.arraycopy(leafBlockFPs, 0, newLeafBlockFPs, leafBlockFPs.length - lastLevel, lastLevel);
- leafBlockFPs = newLeafBlockFPs;
- }
- break;
- }
+ @Override
+ public int numLeaves() {
+ return leafBlockFPs.length;
+ }
+ };
- levelCount *= 2;
+ return () -> {
+ // Write index:
+ try {
+ writeIndex(metaOut, indexOut, maxPointsInLeafNode, leafNodes, dataStartFP);
+ } catch (IOException e) {
+ throw new UncheckedIOException(e);
}
- }
+ };
+ }
+  /** Packs the split values and leaf-block file pointers, representing a semi-balanced binary tree, into a compact byte[] structure. */
+ private byte[] packIndex(BKDTreeLeafNodes leafNodes) throws IOException {
/** Reused while packing the index */
ByteBuffersDataOutput writeBuffer = ByteBuffersDataOutput.newResettableInstance();
@@ -862,7 +889,8 @@ private byte[] packIndex(long[] leafBlockFPs, byte[] splitPackedValues) throws I
  List<byte[]> blocks = new ArrayList<>();
byte[] lastSplitValues = new byte[bytesPerDim * numIndexDims];
//System.out.println("\npack index");
- int totalSize = recursePackIndex(writeBuffer, leafBlockFPs, splitPackedValues, 0l, blocks, 1, lastSplitValues, new boolean[numIndexDims], false);
+ int totalSize = recursePackIndex(writeBuffer, leafNodes, 0l, blocks, lastSplitValues, new boolean[numIndexDims], false,
+ 0, leafNodes.numLeaves());
// Compact the byte[] blocks into single byte index:
byte[] index = new byte[totalSize];
@@ -887,45 +915,43 @@ private int appendBlock(ByteBuffersDataOutput writeBuffer, List<byte[]> blocks)
/**
* lastSplitValues is per-dimension split value previously seen; we use this to prefix-code the split byte[] on each inner node
*/
-  private int recursePackIndex(ByteBuffersDataOutput writeBuffer, long[] leafBlockFPs, byte[] splitPackedValues, long minBlockFP, List<byte[]> blocks,
- int nodeID, byte[] lastSplitValues, boolean[] negativeDeltas, boolean isLeft) throws IOException {
- if (nodeID >= leafBlockFPs.length) {
- int leafID = nodeID - leafBlockFPs.length;
- //System.out.println("recursePack leaf nodeID=" + nodeID);
-
- // In the unbalanced case it's possible the left most node only has one child:
- if (leafID < leafBlockFPs.length) {
- long delta = leafBlockFPs[leafID] - minBlockFP;
- if (isLeft) {
- assert delta == 0;
- return 0;
- } else {
- assert nodeID == 1 || delta > 0: "nodeID=" + nodeID;
- writeBuffer.writeVLong(delta);
- return appendBlock(writeBuffer, blocks);
- }
- } else {
+  private int recursePackIndex(ByteBuffersDataOutput writeBuffer, BKDTreeLeafNodes leafNodes, long minBlockFP, List<byte[]> blocks,
+ byte[] lastSplitValues, boolean[] negativeDeltas, boolean isLeft, int leavesOffset, int numLeaves) throws IOException {
+ if (numLeaves == 1) {
+ if (isLeft) {
+ assert leafNodes.getLeafLP(leavesOffset) - minBlockFP == 0;
return 0;
+ } else {
+ long delta = leafNodes.getLeafLP(leavesOffset) - minBlockFP;
+ assert leafNodes.numLeaves() == numLeaves || delta > 0 : "expected delta > 0; got numLeaves =" + numLeaves + " and delta=" + delta;
+ writeBuffer.writeVLong(delta);
+ return appendBlock(writeBuffer, blocks);
}
} else {
long leftBlockFP;
- if (isLeft == false) {
- leftBlockFP = getLeftMostLeafBlockFP(leafBlockFPs, nodeID);
- long delta = leftBlockFP - minBlockFP;
- assert nodeID == 1 || delta > 0 : "expected nodeID=1 or delta > 0; got nodeID=" + nodeID + " and delta=" + delta;
- writeBuffer.writeVLong(delta);
- } else {
+ if (isLeft) {
// The left tree's left most leaf block FP is always the minimal FP:
+ assert leafNodes.getLeafLP(leavesOffset) == minBlockFP;
leftBlockFP = minBlockFP;
+ } else {
+ leftBlockFP = leafNodes.getLeafLP(leavesOffset);
+ long delta = leftBlockFP - minBlockFP;
+ assert leafNodes.numLeaves() == numLeaves || delta > 0 : "expected delta > 0; got numLeaves =" + numLeaves + " and delta=" + delta;
+ writeBuffer.writeVLong(delta);
}
- int address = nodeID * (1+bytesPerDim);
- int splitDim = splitPackedValues[address++] & 0xff;
+ int numLeftLeafNodes = getNumLeftLeafNodes(numLeaves);
+ final int rightOffset = leavesOffset + numLeftLeafNodes;
+ final int splitOffset = rightOffset - 1;
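+      // the split separating the two subtrees sits immediately before the first
+      // right-hand leaf, hence splitOffset = rightOffset - 1 into the flat split arrays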
+
+ int splitDim = leafNodes.getSplitDimension(splitOffset);
+ BytesRef splitValue = leafNodes.getSplitValue(splitOffset);
+ int address = splitValue.offset;
//System.out.println("recursePack inner nodeID=" + nodeID + " splitDim=" + splitDim + " splitValue=" + new BytesRef(splitPackedValues, address, bytesPerDim));
// find common prefix with last split value in this dim:
- int prefix = Arrays.mismatch(splitPackedValues, address, address + bytesPerDim, lastSplitValues,
+ int prefix = Arrays.mismatch(splitValue.bytes, address, address + bytesPerDim, lastSplitValues,
splitDim * bytesPerDim, splitDim * bytesPerDim + bytesPerDim);
if (prefix == -1) {
prefix = bytesPerDim;
@@ -936,7 +962,7 @@ private int recursePackIndex(ByteBuffersDataOutput writeBuffer, long[] leafBlock
int firstDiffByteDelta;
if (prefix < bytesPerDim) {
//System.out.println(" delta byte cur=" + Integer.toHexString(splitPackedValues[address+prefix]&0xFF) + " prev=" + Integer.toHexString(lastSplitValues[splitDim * bytesPerDim + prefix]&0xFF) + " negated?=" + negativeDeltas[splitDim]);
- firstDiffByteDelta = (splitPackedValues[address+prefix]&0xFF) - (lastSplitValues[splitDim * bytesPerDim + prefix]&0xFF);
+ firstDiffByteDelta = (splitValue.bytes[address+prefix]&0xFF) - (lastSplitValues[splitDim * bytesPerDim + prefix]&0xFF);
if (negativeDeltas[splitDim]) {
firstDiffByteDelta = -firstDiffByteDelta;
}
@@ -958,7 +984,7 @@ private int recursePackIndex(ByteBuffersDataOutput writeBuffer, long[] leafBlock
int suffix = bytesPerDim - prefix;
byte[] savSplitValue = new byte[suffix];
if (suffix > 1) {
- writeBuffer.writeBytes(splitPackedValues, address+prefix+1, suffix-1);
+ writeBuffer.writeBytes(splitValue.bytes, address+prefix+1, suffix-1);
}
byte[] cmp = lastSplitValues.clone();
@@ -966,7 +992,7 @@ private int recursePackIndex(ByteBuffersDataOutput writeBuffer, long[] leafBlock
System.arraycopy(lastSplitValues, splitDim * bytesPerDim + prefix, savSplitValue, 0, suffix);
// copy our split value into lastSplitValues for our children to prefix-code against
- System.arraycopy(splitPackedValues, address+prefix, lastSplitValues, splitDim * bytesPerDim + prefix, suffix);
+ System.arraycopy(splitValue.bytes, address+prefix, lastSplitValues, splitDim * bytesPerDim + prefix, suffix);
int numBytes = appendBlock(writeBuffer, blocks);
@@ -978,9 +1004,11 @@ private int recursePackIndex(ByteBuffersDataOutput writeBuffer, long[] leafBlock
boolean savNegativeDelta = negativeDeltas[splitDim];
negativeDeltas[splitDim] = true;
- int leftNumBytes = recursePackIndex(writeBuffer, leafBlockFPs, splitPackedValues, leftBlockFP, blocks, 2*nodeID, lastSplitValues, negativeDeltas, true);
- if (nodeID * 2 < leafBlockFPs.length) {
+ int leftNumBytes = recursePackIndex(writeBuffer, leafNodes, leftBlockFP, blocks, lastSplitValues, negativeDeltas, true,
+ leavesOffset, numLeftLeafNodes);
+
+ if (numLeftLeafNodes != 1) {
writeBuffer.writeVInt(leftNumBytes);
} else {
assert leftNumBytes == 0: "leftNumBytes=" + leftNumBytes;
@@ -992,7 +1020,8 @@ private int recursePackIndex(ByteBuffersDataOutput writeBuffer, long[] leafBlock
blocks.set(idxSav, bytes2);
negativeDeltas[splitDim] = false;
- int rightNumBytes = recursePackIndex(writeBuffer, leafBlockFPs, splitPackedValues, leftBlockFP, blocks, 2*nodeID+1, lastSplitValues, negativeDeltas, false);
+ int rightNumBytes = recursePackIndex(writeBuffer, leafNodes, leftBlockFP, blocks, lastSplitValues, negativeDeltas, false,
+ rightOffset, numLeaves - numLeftLeafNodes);
negativeDeltas[splitDim] = savNegativeDelta;
@@ -1005,46 +1034,32 @@ private int recursePackIndex(ByteBuffersDataOutput writeBuffer, long[] leafBlock
}
}
- private long getLeftMostLeafBlockFP(long[] leafBlockFPs, int nodeID) {
- // TODO: can we do this cheaper, e.g. a closed form solution instead of while loop? Or
- // change the recursion while packing the index to return this left-most leaf block FP
- // from each recursion instead?
- //
- // Still, the overall cost here is minor: this method's cost is O(log(N)), and while writing
- // we call it O(N) times (N = number of leaf blocks)
- while (nodeID < leafBlockFPs.length) {
- nodeID *= 2;
- }
- int leafID = nodeID - leafBlockFPs.length;
- long result = leafBlockFPs[leafID];
- if (result < 0) {
- throw new AssertionError(result + " for leaf " + leafID);
- }
- return result;
- }
-
- private void writeIndex(IndexOutput out, int countPerLeaf, long[] leafBlockFPs, byte[] splitPackedValues) throws IOException {
- byte[] packedIndex = packIndex(leafBlockFPs, splitPackedValues);
- writeIndex(out, countPerLeaf, leafBlockFPs.length, packedIndex);
+ private void writeIndex(IndexOutput metaOut, IndexOutput indexOut, int countPerLeaf, BKDTreeLeafNodes leafNodes, long dataStartFP) throws IOException {
+ byte[] packedIndex = packIndex(leafNodes);
+ writeIndex(metaOut, indexOut, countPerLeaf, leafNodes.numLeaves(), packedIndex, dataStartFP);
}
- private void writeIndex(IndexOutput out, int countPerLeaf, int numLeaves, byte[] packedIndex) throws IOException {
-
- CodecUtil.writeHeader(out, CODEC_NAME, VERSION_CURRENT);
- out.writeVInt(numDataDims);
- out.writeVInt(numIndexDims);
- out.writeVInt(countPerLeaf);
- out.writeVInt(bytesPerDim);
+ private void writeIndex(IndexOutput metaOut, IndexOutput indexOut, int countPerLeaf, int numLeaves, byte[] packedIndex, long dataStartFP) throws IOException {
+ CodecUtil.writeHeader(metaOut, CODEC_NAME, VERSION_CURRENT);
+ metaOut.writeVInt(numDataDims);
+ metaOut.writeVInt(numIndexDims);
+ metaOut.writeVInt(countPerLeaf);
+ metaOut.writeVInt(bytesPerDim);
assert numLeaves > 0;
- out.writeVInt(numLeaves);
- out.writeBytes(minPackedValue, 0, packedIndexBytesLength);
- out.writeBytes(maxPackedValue, 0, packedIndexBytesLength);
-
- out.writeVLong(pointCount);
- out.writeVInt(docsSeen.cardinality());
- out.writeVInt(packedIndex.length);
- out.writeBytes(packedIndex, 0, packedIndex.length);
+ metaOut.writeVInt(numLeaves);
+ metaOut.writeBytes(minPackedValue, 0, packedIndexBytesLength);
+ metaOut.writeBytes(maxPackedValue, 0, packedIndexBytesLength);
+
+ metaOut.writeVLong(pointCount);
+ metaOut.writeVInt(docsSeen.cardinality());
+ metaOut.writeVInt(packedIndex.length);
+ metaOut.writeLong(dataStartFP);
+ // If metaOut and indexOut are the same file, we account for the fact that
+ // writing a long makes the index start 8 bytes later.
+ metaOut.writeLong(indexOut.getFilePointer() + (metaOut == indexOut ? Long.BYTES : 0));
+
+ indexOut.writeBytes(packedIndex, 0, packedIndex.length);
}
private void writeLeafBlockDocs(DataOutput out, int[] docIDs, int start, int count) throws IOException {
@@ -1319,16 +1334,17 @@ private HeapPointWriter switchToHeap(PointWriter source) throws IOException {
/* Recursively reorders the provided reader and writes the bkd-tree on the fly; this method is used
* when we are writing a new segment directly from IndexWriter's indexing buffer (MutablePointsReader). */
- private void build(int nodeID, int leafNodeOffset,
+ private void build(int leavesOffset, int numLeaves,
MutablePointValues reader, int from, int to,
IndexOutput out,
byte[] minPackedValue, byte[] maxPackedValue,
int[] parentSplits,
byte[] splitPackedValues,
+ byte[] splitDimensionValues,
long[] leafBlockFPs,
int[] spareDocIds) throws IOException {
- if (nodeID >= leafNodeOffset) {
+ if (numLeaves == 1) {
// leaf node
final int count = to - from;
assert count <= maxPointsInLeafNode;
@@ -1402,7 +1418,7 @@ private void build(int nodeID, int leafNodeOffset,
}
}
// Save the block file pointer:
- leafBlockFPs[nodeID - leafNodeOffset] = out.getFilePointer();
+ leafBlockFPs[leavesOffset] = out.getFilePointer();
assert scratchOut.size() == 0;
@@ -1443,13 +1459,16 @@ assert valuesInOrderAndBounds(count, sortedDim, minPackedValue, maxPackedValue,
// for dimensions > 2 we recompute the bounds for the current inner node to help the algorithm choose best
// split dimensions. Because it is an expensive operation, the frequency we recompute the bounds is given
// by SPLITS_BEFORE_EXACT_BOUNDS.
- if (nodeID > 1 && numIndexDims > 2 && Arrays.stream(parentSplits).sum() % SPLITS_BEFORE_EXACT_BOUNDS == 0) {
+ if (numLeaves != leafBlockFPs.length && numIndexDims > 2 && Arrays.stream(parentSplits).sum() % SPLITS_BEFORE_EXACT_BOUNDS == 0) {
computePackedValueBounds(reader, from, to, minPackedValue, maxPackedValue, scratchBytesRef1);
}
splitDim = split(minPackedValue, maxPackedValue, parentSplits);
}
- final int mid = (from + to + 1) >>> 1;
+ // How many leaves will be in the left tree:
+ int numLeftLeafNodes = getNumLeftLeafNodes(numLeaves);
+ // How many points will be in the left tree:
+ final int mid = from + numLeftLeafNodes * maxPointsInLeafNode;
int commonPrefixLen = Arrays.mismatch(minPackedValue, splitDim * bytesPerDim,
splitDim * bytesPerDim + bytesPerDim, maxPackedValue, splitDim * bytesPerDim,
@@ -1461,11 +1480,13 @@ assert valuesInOrderAndBounds(count, sortedDim, minPackedValue, maxPackedValue,
MutablePointsReaderUtils.partition(numDataDims, numIndexDims, maxDoc, splitDim, bytesPerDim, commonPrefixLen,
reader, from, to, mid, scratchBytesRef1, scratchBytesRef2);
+ final int rightOffset = leavesOffset + numLeftLeafNodes;
+ final int splitOffset = rightOffset - 1;
// set the split value
- final int address = nodeID * (1+bytesPerDim);
- splitPackedValues[address] = (byte) splitDim;
+ final int address = splitOffset * bytesPerDim;
+ splitDimensionValues[splitOffset] = (byte) splitDim;
reader.getValue(mid, scratchBytesRef1);
- System.arraycopy(scratchBytesRef1.bytes, scratchBytesRef1.offset + splitDim * bytesPerDim, splitPackedValues, address + 1, bytesPerDim);
+ System.arraycopy(scratchBytesRef1.bytes, scratchBytesRef1.offset + splitDim * bytesPerDim, splitPackedValues, address, bytesPerDim);
byte[] minSplitPackedValue = ArrayUtil.copyOfSubArray(minPackedValue, 0, packedIndexBytesLength);
byte[] maxSplitPackedValue = ArrayUtil.copyOfSubArray(maxPackedValue, 0, packedIndexBytesLength);
@@ -1476,12 +1497,12 @@ assert valuesInOrderAndBounds(count, sortedDim, minPackedValue, maxPackedValue,
// recurse
parentSplits[splitDim]++;
- build(nodeID * 2, leafNodeOffset, reader, from, mid, out,
+ build(leavesOffset, numLeftLeafNodes, reader, from, mid, out,
minPackedValue, maxSplitPackedValue, parentSplits,
- splitPackedValues, leafBlockFPs, spareDocIds);
- build(nodeID * 2 + 1, leafNodeOffset, reader, mid, to, out,
+ splitPackedValues, splitDimensionValues, leafBlockFPs, spareDocIds);
+ build(rightOffset, numLeaves - numLeftLeafNodes, reader, mid, to, out,
minSplitPackedValue, maxPackedValue, parentSplits,
- splitPackedValues, leafBlockFPs, spareDocIds);
+ splitPackedValues, splitDimensionValues, leafBlockFPs, spareDocIds);
parentSplits[splitDim]--;
}
}
@@ -1512,17 +1533,18 @@ private void computePackedValueBounds(BKDRadixSelector.PathSlice slice, byte[] m
  /** The point writer contains the data that is going to be split using radix selection.
   *  This method is used when we are merging previously written segments, in the numDims > 1 case. */
- private void build(int nodeID, int leafNodeOffset,
+ private void build(int leavesOffset, int numLeaves,
BKDRadixSelector.PathSlice points,
IndexOutput out,
BKDRadixSelector radixSelector,
byte[] minPackedValue, byte[] maxPackedValue,
int[] parentSplits,
byte[] splitPackedValues,
+ byte[] splitDimensionValues,
long[] leafBlockFPs,
int[] spareDocIds) throws IOException {
- if (nodeID >= leafNodeOffset) {
+ if (numLeaves == 1) {
// Leaf node: write block
// We can write the block in any order so by default we write it sorted by the dimension that has the
@@ -1573,13 +1595,13 @@ private void build(int nodeID, int leafNodeOffset,
      int leafCardinality = heapSource.computeCardinality(from, to, numDataDims, bytesPerDim, commonPrefixLengths);
// Save the block file pointer:
- leafBlockFPs[nodeID - leafNodeOffset] = out.getFilePointer();
+ leafBlockFPs[leavesOffset] = out.getFilePointer();
//System.out.println(" write leaf block @ fp=" + out.getFilePointer());
// Write docIDs first, as their own chunk, so that at intersect time we can add all docIDs w/o
// loading the values:
int count = to - from;
- assert count > 0: "nodeID=" + nodeID + " leafNodeOffset=" + leafNodeOffset;
+ assert count > 0: "numLeaves=" + numLeaves + " leavesOffset=" + leavesOffset;
assert count <= spareDocIds.length : "count=" + count + " > length=" + spareDocIds.length;
// Write doc IDs
int[] docIDs = spareDocIds;
@@ -1622,17 +1644,18 @@ assert valuesInOrderAndBounds(count, sortedDim, minPackedValue, maxPackedValue,
// for dimensions > 2 we recompute the bounds for the current inner node to help the algorithm choose best
// split dimensions. Because it is an expensive operation, the frequency we recompute the bounds is given
// by SPLITS_BEFORE_EXACT_BOUNDS.
- if (nodeID > 1 && numIndexDims > 2 && Arrays.stream(parentSplits).sum() % SPLITS_BEFORE_EXACT_BOUNDS == 0) {
+ if (numLeaves != leafBlockFPs.length && numIndexDims > 2 && Arrays.stream(parentSplits).sum() % SPLITS_BEFORE_EXACT_BOUNDS == 0) {
computePackedValueBounds(points, minPackedValue, maxPackedValue);
}
splitDim = split(minPackedValue, maxPackedValue, parentSplits);
}
- assert nodeID < splitPackedValues.length : "nodeID=" + nodeID + " splitValues.length=" + splitPackedValues.length;
+ assert numLeaves <= leafBlockFPs.length : "numLeaves=" + numLeaves + " leafBlockFPs.length=" + leafBlockFPs.length;
+ // How many leaves will be in the left tree:
+ final int numLeftLeafNodes = getNumLeftLeafNodes(numLeaves);
// How many points will be in the left tree:
- long rightCount = points.count / 2;
- long leftCount = points.count - rightCount;
+      final long leftCount = (long) numLeftLeafNodes * maxPointsInLeafNode;
BKDRadixSelector.PathSlice[] slices = new BKDRadixSelector.PathSlice[2];
@@ -1645,9 +1668,12 @@ assert valuesInOrderAndBounds(count, sortedDim, minPackedValue, maxPackedValue,
byte[] splitValue = radixSelector.select(points, slices, points.start, points.start + points.count, points.start + leftCount, splitDim, commonPrefixLen);
- int address = nodeID * (1 + bytesPerDim);
- splitPackedValues[address] = (byte) splitDim;
- System.arraycopy(splitValue, 0, splitPackedValues, address + 1, bytesPerDim);
+ final int rightOffset = leavesOffset + numLeftLeafNodes;
+ final int splitValueOffset = rightOffset - 1;
+
+ splitDimensionValues[splitValueOffset] = (byte) splitDim;
+ int address = splitValueOffset * bytesPerDim;
+ System.arraycopy(splitValue, 0, splitPackedValues, address, bytesPerDim);
byte[] minSplitPackedValue = new byte[packedIndexBytesLength];
System.arraycopy(minPackedValue, 0, minSplitPackedValue, 0, packedIndexBytesLength);
@@ -1660,14 +1686,14 @@ assert valuesInOrderAndBounds(count, sortedDim, minPackedValue, maxPackedValue,
parentSplits[splitDim]++;
// Recurse on left tree:
- build(2 * nodeID, leafNodeOffset, slices[0],
+ build(leavesOffset, numLeftLeafNodes, slices[0],
out, radixSelector, minPackedValue, maxSplitPackedValue,
- parentSplits, splitPackedValues, leafBlockFPs, spareDocIds);
+ parentSplits, splitPackedValues, splitDimensionValues, leafBlockFPs, spareDocIds);
// Recurse on right tree:
- build(2 * nodeID + 1, leafNodeOffset, slices[1],
- out, radixSelector, minSplitPackedValue, maxPackedValue
- , parentSplits, splitPackedValues, leafBlockFPs, spareDocIds);
+ build(rightOffset, numLeaves - numLeftLeafNodes, slices[1],
+ out, radixSelector, minSplitPackedValue, maxPackedValue,
+ parentSplits, splitPackedValues, splitDimensionValues, leafBlockFPs, spareDocIds);
parentSplits[splitDim]--;
}
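
Editor's note on the hunks above: they replace the old implicit heap addressing (nodeID, with children at 2*nodeID and 2*nodeID+1) with explicit (leavesOffset, numLeaves) ranges, so the writer must know how many leaves the left subtree receives. The patch calls getNumLeftLeafNodes(numLeaves) but does not include its body; the following is a hedged sketch of what such a method has to compute, derived from the invariants the callers rely on (every left-subtree leaf is full, hence mid = from + numLeftLeafNodes * maxPointsInLeafNode, and leaves beyond the last full level land on the left). It is an illustration, not necessarily the committed implementation.

// Sketch only: sizes the left subtree of an almost-fully-balanced tree where
// partial-level ("unbalanced") leaves are always placed on the left.
private int getNumLeftLeafNodes(int numLeaves) {
  assert numLeaves > 1 : "getNumLeftLeafNodes() called with " + numLeaves;
  // Deepest level of the tree that is completely filled:
  int lastFullLevel = 31 - Integer.numberOfLeadingZeros(numLeaves);
  int leavesFullLevel = 1 << lastFullLevel;
  // Half of the leaves of the full level go to the left...
  int numLeftLeafNodes = leavesFullLevel / 2;
  // ...plus as many of the leftover leaves as fit under the left subtree:
  int unbalancedLeafNodes = numLeaves - leavesFullLevel;
  numLeftLeafNodes += Math.min(unbalancedLeafNodes, numLeftLeafNodes);
  // The left side is never smaller than the right and at most twice as large:
  assert numLeftLeafNodes >= numLeaves - numLeftLeafNodes
      && numLeftLeafNodes <= 2 * (numLeaves - numLeftLeafNodes);
  return numLeftLeafNodes;
}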
diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/FST.java b/lucene/core/src/java/org/apache/lucene/util/fst/FST.java
index 260e9dab1aeb..c3cf67bfbdd5 100644
--- a/lucene/core/src/java/org/apache/lucene/util/fst/FST.java
+++ b/lucene/core/src/java/org/apache/lucene/util/fst/FST.java
@@ -26,6 +26,7 @@
import java.nio.file.Path;
import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.store.ByteBuffersDataOutput;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
@@ -407,26 +408,26 @@ private static boolean flag(int flags, int bit) {
private static final int DEFAULT_MAX_BLOCK_BITS = Constants.JRE_IS_64BIT ? 30 : 28;
/** Load a previously saved FST. */
- public FST(DataInput in, Outputs outputs) throws IOException {
- this(in, outputs, new OnHeapFSTStore(DEFAULT_MAX_BLOCK_BITS));
+ public FST(DataInput metaIn, DataInput in, Outputs outputs) throws IOException {
+ this(metaIn, in, outputs, new OnHeapFSTStore(DEFAULT_MAX_BLOCK_BITS));
}
/** Load a previously saved FST; maxBlockBits allows you to
* control the size of the byte[] pages used to hold the FST bytes. */
- public FST(DataInput in, Outputs outputs, FSTStore fstStore) throws IOException {
+ public FST(DataInput metaIn, DataInput in, Outputs outputs, FSTStore fstStore) throws IOException {
bytes = null;
this.fstStore = fstStore;
this.outputs = outputs;
// NOTE: only reads formats VERSION_START up to VERSION_CURRENT; we don't have
// back-compat promise for FSTs (they are experimental), but we are sometimes able to offer it
- CodecUtil.checkHeader(in, FILE_FORMAT_NAME, VERSION_START, VERSION_CURRENT);
- if (in.readByte() == 1) {
+ CodecUtil.checkHeader(metaIn, FILE_FORMAT_NAME, VERSION_START, VERSION_CURRENT);
+ if (metaIn.readByte() == 1) {
// accepts empty string
// 1 KB blocks:
BytesStore emptyBytes = new BytesStore(10);
- int numBytes = in.readVInt();
- emptyBytes.copyBytes(in, numBytes);
+ int numBytes = metaIn.readVInt();
+ emptyBytes.copyBytes(metaIn, numBytes);
// De-serialize empty-string output:
BytesReader reader = emptyBytes.getReverseReader();
@@ -440,7 +441,7 @@ public FST(DataInput in, Outputs outputs, FSTStore fstStore) throws IOExcepti
} else {
emptyOutput = null;
}
- final byte t = in.readByte();
+ final byte t = metaIn.readByte();
switch(t) {
case 0:
inputType = INPUT_TYPE.BYTE1;
@@ -452,11 +453,11 @@ public FST(DataInput in, Outputs outputs, FSTStore fstStore) throws IOExcepti
inputType = INPUT_TYPE.BYTE4;
break;
default:
- throw new IllegalStateException("invalid input type " + t);
+ throw new CorruptIndexException("invalid input type " + t, in);
}
- startNode = in.readVLong();
+ startNode = metaIn.readVLong();
- long numBytes = in.readVLong();
+ long numBytes = metaIn.readVLong();
this.fstStore.init(in, numBytes);
}
@@ -501,16 +502,16 @@ void setEmptyOutput(T v) {
}
}
- public void save(DataOutput out) throws IOException {
+ public void save(DataOutput metaOut, DataOutput out) throws IOException {
if (startNode == -1) {
throw new IllegalStateException("call finish first");
}
- CodecUtil.writeHeader(out, FILE_FORMAT_NAME, VERSION_CURRENT);
+ CodecUtil.writeHeader(metaOut, FILE_FORMAT_NAME, VERSION_CURRENT);
// TODO: really we should encode this as an arc, arriving
// to the root node, instead of special casing here:
if (emptyOutput != null) {
// Accepts empty string
- out.writeByte((byte) 1);
+ metaOut.writeByte((byte) 1);
// Serialize empty-string output:
ByteBuffersDataOutput ros = new ByteBuffersDataOutput();
@@ -527,10 +528,10 @@ public void save(DataOutput out) throws IOException {
emptyOutputBytes[emptyLen - upto - 1] = b;
upto++;
}
- out.writeVInt(emptyLen);
- out.writeBytes(emptyOutputBytes, 0, emptyLen);
+ metaOut.writeVInt(emptyLen);
+ metaOut.writeBytes(emptyOutputBytes, 0, emptyLen);
} else {
- out.writeByte((byte) 0);
+ metaOut.writeByte((byte) 0);
}
final byte t;
if (inputType == INPUT_TYPE.BYTE1) {
@@ -540,11 +541,11 @@ public void save(DataOutput out) throws IOException {
} else {
t = 2;
}
- out.writeByte(t);
- out.writeVLong(startNode);
+ metaOut.writeByte(t);
+ metaOut.writeVLong(startNode);
if (bytes != null) {
long numBytes = bytes.getPosition();
- out.writeVLong(numBytes);
+ metaOut.writeVLong(numBytes);
bytes.writeTo(out);
} else {
assert fstStore != null;
@@ -557,7 +558,8 @@ public void save(DataOutput out) throws IOException {
*/
public void save(final Path path) throws IOException {
try (OutputStream os = new BufferedOutputStream(Files.newOutputStream(path))) {
- save(new OutputStreamDataOutput(os));
+ DataOutput out = new OutputStreamDataOutput(os);
+ save(out, out);
}
}
@@ -566,7 +568,8 @@ public void save(final Path path) throws IOException {
*/
public static FST read(Path path, Outputs outputs) throws IOException {
try (InputStream is = Files.newInputStream(path)) {
- return new FST<>(new InputStreamDataInput(new BufferedInputStream(is)), outputs);
+ DataInput in = new InputStreamDataInput(new BufferedInputStream(is));
+ return new FST<>(in, in, outputs);
}
}
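
The FST constructor and save() now take separate metadata and data streams; when both live in one stream, the same input or output is simply passed twice, as the patched save(Path) and read(Path) show. Below is a minimal sketch of the two-file case, assuming a Lucene Directory and hypothetical file names; the constructor signature is the one added in this patch.

import java.io.IOException;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs;

// "terms.meta" / "terms.data" are illustrative names, not real index files.
static FST<Long> loadSplitFst(Directory dir) throws IOException {
  try (IndexInput metaIn = dir.openInput("terms.meta", IOContext.READ);
       IndexInput dataIn = dir.openInput("terms.data", IOContext.READ)) {
    // Header, empty-string output, input type, start node and numBytes are
    // read from metaIn; the FST body itself is loaded from dataIn.
    return new FST<>(metaIn, dataIn, PositiveIntOutputs.getSingleton());
  }
}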
diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/PositiveIntOutputs.java b/lucene/core/src/java/org/apache/lucene/util/fst/PositiveIntOutputs.java
index bc9806dcdddc..c6ea6f263f44 100644
--- a/lucene/core/src/java/org/apache/lucene/util/fst/PositiveIntOutputs.java
+++ b/lucene/core/src/java/org/apache/lucene/util/fst/PositiveIntOutputs.java
@@ -33,6 +33,7 @@
@SuppressForbidden(reason = "Uses a Long instance as a marker")
public final class PositiveIntOutputs extends Outputs {
+ @SuppressWarnings("deprecation")
private final static Long NO_OUTPUT = new Long(0);
private final static PositiveIntOutputs singleton = new PositiveIntOutputs();
diff --git a/lucene/core/src/java/org/apache/lucene/util/packed/Packed64SingleBlock.java b/lucene/core/src/java/org/apache/lucene/util/packed/Packed64SingleBlock.java
index 85e7ea8e0780..dab9a4b9a234 100644
--- a/lucene/core/src/java/org/apache/lucene/util/packed/Packed64SingleBlock.java
+++ b/lucene/core/src/java/org/apache/lucene/util/packed/Packed64SingleBlock.java
@@ -90,6 +90,7 @@ public int get(int index, long[] arr, int off, int len) {
// bulk get
assert index % valuesPerBlock == 0;
+ @SuppressWarnings("deprecation")
final PackedInts.Decoder decoder = BulkOperation.of(PackedInts.Format.PACKED_SINGLE_BLOCK, bitsPerValue);
assert decoder.longBlockCount() == 1;
assert decoder.longValueCount() == valuesPerBlock;
@@ -134,6 +135,7 @@ public int set(int index, long[] arr, int off, int len) {
// bulk set
assert index % valuesPerBlock == 0;
+ @SuppressWarnings("deprecation")
final BulkOperation op = BulkOperation.of(PackedInts.Format.PACKED_SINGLE_BLOCK, bitsPerValue);
assert op.longBlockCount() == 1;
assert op.longValueCount() == valuesPerBlock;
@@ -195,6 +197,7 @@ public void fill(int fromIndex, int toIndex, long val) {
}
@Override
+ @SuppressWarnings("deprecation")
protected PackedInts.Format getFormat() {
return PackedInts.Format.PACKED_SINGLE_BLOCK;
}
diff --git a/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.Codec b/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.Codec
index 611e237384c3..2897a8ac7c1a 100644
--- a/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.Codec
+++ b/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.Codec
@@ -13,4 +13,4 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-org.apache.lucene.codecs.lucene84.Lucene84Codec
+org.apache.lucene.codecs.lucene86.Lucene86Codec
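
With the service file now pointing at Lucene86Codec, SPI lookup by name resolves to the new default. A quick sanity check using the existing Codec.forName API:

import org.apache.lucene.codecs.Codec;

public class CodecLookup {
  public static void main(String[] args) {
    // Resolved through the META-INF/services entry above.
    Codec codec = Codec.forName("Lucene86");
    System.out.println(codec.getName()); // prints: Lucene86
  }
}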
diff --git a/lucene/core/src/resources/META-INF/services/org.apache.lucene.index.SortFieldProvider b/lucene/core/src/resources/META-INF/services/org.apache.lucene.index.SortFieldProvider
new file mode 100644
index 000000000000..a96a47b7b810
--- /dev/null
+++ b/lucene/core/src/resources/META-INF/services/org.apache.lucene.index.SortFieldProvider
@@ -0,0 +1,20 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+org.apache.lucene.search.SortField$Provider
+org.apache.lucene.search.SortedNumericSortField$Provider
+org.apache.lucene.search.SortedSetSortField$Provider
\ No newline at end of file
diff --git a/lucene/core/src/test/org/apache/lucene/analysis/TestCharArraySet.java b/lucene/core/src/test/org/apache/lucene/analysis/TestCharArraySet.java
index 8430eb82def1..1a32a800bbba 100644
--- a/lucene/core/src/test/org/apache/lucene/analysis/TestCharArraySet.java
+++ b/lucene/core/src/test/org/apache/lucene/analysis/TestCharArraySet.java
@@ -61,15 +61,19 @@ public void testNonZeroOffset() {
public void testObjectContains() {
CharArraySet set = new CharArraySet(10, true);
Integer val = Integer.valueOf(1);
+ @SuppressWarnings("deprecation")
+ Integer val1 = new Integer(1);
+    // Explicitly verify that the two Integer instances are distinct
+ assertNotSame(val, val1);
set.add(val);
assertTrue(set.contains(val));
- assertTrue(set.contains(new Integer(1))); // another integer
+ assertTrue(set.contains(val1)); // another integer
assertTrue(set.contains("1"));
assertTrue(set.contains(new char[]{'1'}));
// test unmodifiable
set = CharArraySet.unmodifiableSet(set);
assertTrue(set.contains(val));
- assertTrue(set.contains(new Integer(1))); // another integer
+ assertTrue(set.contains(val1)); // another integer
assertTrue(set.contains("1"));
assertTrue(set.contains(new char[]{'1'}));
}
diff --git a/lucene/core/src/test/org/apache/lucene/codecs/TestCodecUtil.java b/lucene/core/src/test/org/apache/lucene/codecs/TestCodecUtil.java
index 0a11a9b5b495..ea0972d5a2af 100644
--- a/lucene/core/src/test/org/apache/lucene/codecs/TestCodecUtil.java
+++ b/lucene/core/src/test/org/apache/lucene/codecs/TestCodecUtil.java
@@ -26,6 +26,8 @@
import org.apache.lucene.store.ByteBuffersIndexInput;
import org.apache.lucene.store.ByteBuffersIndexOutput;
import org.apache.lucene.store.ChecksumIndexInput;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.LuceneTestCase;
@@ -148,13 +150,13 @@ public void testCheckFooterValidPastFooter() throws Exception {
// bogusly read a byte too far (can happen)
input.readByte();
Exception mine = new RuntimeException("fake exception");
- RuntimeException expected = expectThrows(RuntimeException.class, () -> {
+ CorruptIndexException expected = expectThrows(CorruptIndexException.class, () -> {
CodecUtil.checkFooter(input, mine);
});
- assertEquals("fake exception", expected.getMessage());
+ assertTrue(expected.getMessage().contains("checksum status indeterminate"));
Throwable suppressed[] = expected.getSuppressed();
assertEquals(1, suppressed.length);
- assertTrue(suppressed[0].getMessage().contains("checksum status indeterminate"));
+ assertEquals("fake exception", suppressed[0].getMessage());
input.close();
}
@@ -172,13 +174,13 @@ public void testCheckFooterInvalid() throws Exception {
CodecUtil.checkHeader(input, "FooBar", 5, 5);
assertEquals("this is the data", input.readString());
Exception mine = new RuntimeException("fake exception");
- RuntimeException expected = expectThrows(RuntimeException.class, () -> {
+ CorruptIndexException expected = expectThrows(CorruptIndexException.class, () -> {
CodecUtil.checkFooter(input, mine);
});
- assertEquals("fake exception", expected.getMessage());
+ assertTrue(expected.getMessage().contains("checksum failed"));
Throwable suppressed[] = expected.getSuppressed();
assertEquals(1, suppressed.length);
- assertTrue(suppressed[0].getMessage().contains("checksum failed"));
+ assertEquals("fake exception", suppressed[0].getMessage());
input.close();
}
@@ -319,4 +321,47 @@ public void testTruncatedFileThrowsCorruptIndexException() throws IOException {
() -> CodecUtil.retrieveChecksum(input));
assertTrue(e.getMessage(), e.getMessage().contains("misplaced codec footer (file truncated?): length=0 but footerLength==16 (resource"));
}
+
+ public void testRetrieveChecksum() throws IOException {
+ Directory dir = newDirectory();
+ try (IndexOutput out = dir.createOutput("foo", IOContext.DEFAULT)) {
+ out.writeByte((byte) 42);
+ CodecUtil.writeFooter(out);
+ }
+ try (IndexInput in = dir.openInput("foo", IOContext.DEFAULT)) {
+ CodecUtil.retrieveChecksum(in, in.length()); // no exception
+
+ CorruptIndexException exception = expectThrows(CorruptIndexException.class,
+ () -> CodecUtil.retrieveChecksum(in, in.length() - 1));
+ assertTrue(exception.getMessage().contains("too long"));
+ assertArrayEquals(new Throwable[0], exception.getSuppressed());
+
+ exception = expectThrows(CorruptIndexException.class,
+ () -> CodecUtil.retrieveChecksum(in, in.length() + 1));
+ assertTrue(exception.getMessage().contains("truncated"));
+ assertArrayEquals(new Throwable[0], exception.getSuppressed());
+ }
+
+ try (IndexOutput out = dir.createOutput("bar", IOContext.DEFAULT)) {
+ for (int i = 0; i <= CodecUtil.footerLength(); ++i) {
+ out.writeByte((byte) i);
+ }
+ }
+ try (IndexInput in = dir.openInput("bar", IOContext.DEFAULT)) {
+ CorruptIndexException exception = expectThrows(CorruptIndexException.class,
+ () -> CodecUtil.retrieveChecksum(in, in.length()));
+ assertTrue(exception.getMessage().contains("codec footer mismatch"));
+ assertArrayEquals(new Throwable[0], exception.getSuppressed());
+
+ exception = expectThrows(CorruptIndexException.class,
+ () -> CodecUtil.retrieveChecksum(in, in.length() - 1));
+ assertTrue(exception.getMessage().contains("too long"));
+
+ exception = expectThrows(CorruptIndexException.class,
+ () -> CodecUtil.retrieveChecksum(in, in.length() + 1));
+ assertTrue(exception.getMessage().contains("truncated"));
+ }
+
+ dir.close();
+ }
}
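
These two tests flip which exception wins: CodecUtil.checkFooter now raises the checksum problem as the primary CorruptIndexException and attaches the caller's exception as suppressed. A hedged sketch of the read pattern the tests exercise follows; the file name and header values are illustrative, the CodecUtil calls are the standard API.

import java.io.IOException;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;

static void readWithFooterCheck(Directory dir) throws IOException {
  try (ChecksumIndexInput in = dir.openChecksumInput("example.dat", IOContext.READ)) {
    Throwable prior = null;
    try {
      CodecUtil.checkHeader(in, "FooBar", 5, 5);
      in.readString(); // file body
    } catch (Throwable t) {
      prior = t;
    } finally {
      // On checksum failure this throws CorruptIndexException with `prior`
      // attached as a suppressed exception, per the assertions above.
      CodecUtil.checkFooter(in, prior);
    }
  }
}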
diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestLucene50StoredFieldsFormatHighCompression.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestLucene50StoredFieldsFormatHighCompression.java
index 6a3ce93a0f03..cccee736d46b 100644
--- a/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestLucene50StoredFieldsFormatHighCompression.java
+++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestLucene50StoredFieldsFormatHighCompression.java
@@ -17,9 +17,10 @@
package org.apache.lucene.codecs.lucene50;
+import com.carrotsearch.randomizedtesting.generators.RandomPicks;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat.Mode;
-import org.apache.lucene.codecs.lucene84.Lucene84Codec;
+import org.apache.lucene.codecs.lucene86.Lucene86Codec;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.index.BaseStoredFieldsFormatTestCase;
@@ -28,12 +29,10 @@
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
-import com.carrotsearch.randomizedtesting.generators.RandomPicks;
-
public class TestLucene50StoredFieldsFormatHighCompression extends BaseStoredFieldsFormatTestCase {
@Override
protected Codec getCodec() {
- return new Lucene84Codec(Mode.BEST_COMPRESSION);
+ return new Lucene86Codec(Mode.BEST_COMPRESSION);
}
/**
@@ -44,7 +43,7 @@ public void testMixedCompressions() throws Exception {
Directory dir = newDirectory();
for (int i = 0; i < 10; i++) {
IndexWriterConfig iwc = newIndexWriterConfig();
- iwc.setCodec(new Lucene84Codec(RandomPicks.randomFrom(random(), Mode.values())));
+ iwc.setCodec(new Lucene86Codec(RandomPicks.randomFrom(random(), Mode.values())));
      IndexWriter iw = new IndexWriter(dir, iwc);
Document doc = new Document();
doc.add(new StoredField("field1", "value1"));
@@ -71,7 +70,7 @@ public void testMixedCompressions() throws Exception {
public void testInvalidOptions() {
expectThrows(NullPointerException.class, () -> {
- new Lucene84Codec(null);
+ new Lucene86Codec(null);
});
expectThrows(NullPointerException.class, () -> {
diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene80/TestLucene80NormsFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene80/TestLucene80NormsFormat.java
index 4eadf05ef5a2..b6e7268d67f1 100644
--- a/lucene/core/src/test/org/apache/lucene/codecs/lucene80/TestLucene80NormsFormat.java
+++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene80/TestLucene80NormsFormat.java
@@ -18,14 +18,14 @@
import org.apache.lucene.codecs.Codec;
-import org.apache.lucene.codecs.lucene84.Lucene84Codec;
+import org.apache.lucene.codecs.lucene86.Lucene86Codec;
import org.apache.lucene.index.BaseNormsFormatTestCase;
/**
* Tests Lucene80NormsFormat
*/
public class TestLucene80NormsFormat extends BaseNormsFormatTestCase {
- private final Codec codec = new Lucene84Codec();
+ private final Codec codec = new Lucene86Codec();
@Override
protected Codec getCodec() {
diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene86/TestLucene86PointsFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene86/TestLucene86PointsFormat.java
new file mode 100644
index 000000000000..8d5ce08f346d
--- /dev/null
+++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene86/TestLucene86PointsFormat.java
@@ -0,0 +1,393 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.codecs.lucene86;
+
+import java.io.IOException;
+import java.util.Arrays;
+
+import org.apache.lucene.codecs.Codec;
+import org.apache.lucene.codecs.FilterCodec;
+import org.apache.lucene.codecs.PointsFormat;
+import org.apache.lucene.codecs.PointsReader;
+import org.apache.lucene.codecs.PointsWriter;
+import org.apache.lucene.document.BinaryPoint;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.index.BasePointsFormatTestCase;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.LeafReader;
+import org.apache.lucene.index.MockRandomMergePolicy;
+import org.apache.lucene.index.PointValues;
+import org.apache.lucene.index.PointValues.IntersectVisitor;
+import org.apache.lucene.index.PointValues.Relation;
+import org.apache.lucene.index.SegmentReadState;
+import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.TestUtil;
+import org.apache.lucene.util.bkd.BKDWriter;
+
+public class TestLucene86PointsFormat extends BasePointsFormatTestCase {
+
+ private final Codec codec;
+ private final int maxPointsInLeafNode;
+
+ public TestLucene86PointsFormat() {
+ // standard issue
+ Codec defaultCodec = new Lucene86Codec();
+ if (random().nextBoolean()) {
+ // randomize parameters
+ maxPointsInLeafNode = TestUtil.nextInt(random(), 50, 500);
+ double maxMBSortInHeap = 3.0 + (3*random().nextDouble());
+ if (VERBOSE) {
+        System.out.println("TEST: using Lucene86PointsFormat with maxPointsInLeafNode=" + maxPointsInLeafNode + " and maxMBSortInHeap=" + maxMBSortInHeap);
+ }
+
+ // sneaky impersonation!
+ codec = new FilterCodec(defaultCodec.getName(), defaultCodec) {
+ @Override
+ public PointsFormat pointsFormat() {
+ return new PointsFormat() {
+ @Override
+ public PointsWriter fieldsWriter(SegmentWriteState writeState) throws IOException {
+ return new Lucene86PointsWriter(writeState, maxPointsInLeafNode, maxMBSortInHeap);
+ }
+
+ @Override
+ public PointsReader fieldsReader(SegmentReadState readState) throws IOException {
+ return new Lucene86PointsReader(readState);
+ }
+ };
+ }
+ };
+ } else {
+ // standard issue
+ codec = defaultCodec;
+ maxPointsInLeafNode = BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE;
+ }
+ }
+
+ @Override
+ protected Codec getCodec() {
+ return codec;
+ }
+
+ @Override
+ public void testMergeStability() throws Exception {
+ assumeFalse("TODO: mess with the parameters and test gets angry!", codec instanceof FilterCodec);
+ super.testMergeStability();
+ }
+
+ public void testEstimatePointCount() throws IOException {
+ Directory dir = newDirectory();
+ IndexWriterConfig iwc = newIndexWriterConfig();
+ // Avoid mockRandomMP since it may cause non-optimal merges that make the
+ // number of points per leaf hard to predict
+ while (iwc.getMergePolicy() instanceof MockRandomMergePolicy) {
+ iwc.setMergePolicy(newMergePolicy());
+ }
+ IndexWriter w = new IndexWriter(dir, iwc);
+ byte[] pointValue = new byte[3];
+ byte[] uniquePointValue = new byte[3];
+ random().nextBytes(uniquePointValue);
+    final int numDocs = TEST_NIGHTLY ? atLeast(10000) : atLeast(500); // in nightly runs, make sure we have several leaves
+ final boolean multiValues = random().nextBoolean();
+ for (int i = 0; i < numDocs; ++i) {
+ Document doc = new Document();
+ if (i == numDocs / 2) {
+ doc.add(new BinaryPoint("f", uniquePointValue));
+ } else {
+ final int numValues = (multiValues) ? TestUtil.nextInt(random(), 2, 100) : 1;
+ for (int j = 0; j < numValues; j ++) {
+ do {
+ random().nextBytes(pointValue);
+ } while (Arrays.equals(pointValue, uniquePointValue));
+ doc.add(new BinaryPoint("f", pointValue));
+ }
+ }
+ w.addDocument(doc);
+ }
+ w.forceMerge(1);
+ final IndexReader r = DirectoryReader.open(w);
+ w.close();
+ final LeafReader lr = getOnlyLeafReader(r);
+ PointValues points = lr.getPointValues("f");
+
+ // If all points match, then the point count is numLeaves * maxPointsInLeafNode
+ final int numLeaves = (int) Math.ceil((double) points.size() / maxPointsInLeafNode);
+
+ IntersectVisitor allPointsVisitor = new IntersectVisitor() {
+ @Override
+ public void visit(int docID, byte[] packedValue) throws IOException {}
+
+ @Override
+ public void visit(int docID) throws IOException {}
+
+ @Override
+ public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
+ return Relation.CELL_INSIDE_QUERY;
+ }
+ };
+
+ assertEquals(numLeaves * maxPointsInLeafNode, points.estimatePointCount(allPointsVisitor));
+ assertEquals(numDocs, points.estimateDocCount(allPointsVisitor));
+
+ IntersectVisitor noPointsVisitor = new IntersectVisitor() {
+ @Override
+ public void visit(int docID, byte[] packedValue) throws IOException {}
+
+ @Override
+ public void visit(int docID) throws IOException {}
+
+ @Override
+ public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
+ return Relation.CELL_OUTSIDE_QUERY;
+ }
+ };
+
+ // Return 0 if no points match
+ assertEquals(0, points.estimatePointCount(noPointsVisitor));
+ assertEquals(0, points.estimateDocCount(noPointsVisitor));
+
+ IntersectVisitor onePointMatchVisitor = new IntersectVisitor() {
+ @Override
+ public void visit(int docID, byte[] packedValue) throws IOException {}
+
+ @Override
+ public void visit(int docID) throws IOException {}
+
+ @Override
+ public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
+ if (Arrays.compareUnsigned(uniquePointValue, 0, 3, maxPackedValue, 0, 3) > 0 ||
+ Arrays.compareUnsigned(uniquePointValue, 0, 3, minPackedValue, 0, 3) < 0) {
+ return Relation.CELL_OUTSIDE_QUERY;
+ }
+ return Relation.CELL_CROSSES_QUERY;
+ }
+ };
+
+ // If only one point matches, then the point count is (maxPointsInLeafNode + 1) / 2
+ // in general, or maybe 2x that if the point is a split value
+ final long pointCount = points.estimatePointCount(onePointMatchVisitor);
+ assertTrue(""+pointCount,
+ pointCount == (maxPointsInLeafNode + 1) / 2 || // common case
+ pointCount == 2*((maxPointsInLeafNode + 1) / 2)); // if the point is a split value
+
+ final long docCount = points.estimateDocCount(onePointMatchVisitor);
+
+ if (multiValues) {
+ assertEquals(docCount, (long) (docCount * (1d - Math.pow( (numDocs - pointCount) / points.size() , points.size() / docCount))));
+ } else {
+ assertEquals(Math.min(pointCount, numDocs), docCount);
+ }
+ r.close();
+ dir.close();
+ }
+
+ // The tree is always balanced in the N dims case, and leaves are
+ // not all full so things are a bit different
+ public void testEstimatePointCount2Dims() throws IOException {
+ Directory dir = newDirectory();
+ IndexWriter w = new IndexWriter(dir, newIndexWriterConfig());
+ byte[][] pointValue = new byte[2][];
+ pointValue[0] = new byte[3];
+ pointValue[1] = new byte[3];
+ byte[][] uniquePointValue = new byte[2][];
+ uniquePointValue[0] = new byte[3];
+ uniquePointValue[1] = new byte[3];
+ random().nextBytes(uniquePointValue[0]);
+ random().nextBytes(uniquePointValue[1]);
+    final int numDocs = TEST_NIGHTLY ? atLeast(10000) : atLeast(1000); // in nightly runs, make sure we have several leaves
+ final boolean multiValues = random().nextBoolean();
+ for (int i = 0; i < numDocs; ++i) {
+ Document doc = new Document();
+ if (i == numDocs / 2) {
+ doc.add(new BinaryPoint("f", uniquePointValue));
+ } else {
+ final int numValues = (multiValues) ? TestUtil.nextInt(random(), 2, 100) : 1;
+ for (int j = 0; j < numValues; j ++) {
+ do {
+ random().nextBytes(pointValue[0]);
+ random().nextBytes(pointValue[1]);
+ } while (Arrays.equals(pointValue[0], uniquePointValue[0]) || Arrays.equals(pointValue[1], uniquePointValue[1]));
+ doc.add(new BinaryPoint("f", pointValue));
+ }
+ }
+ w.addDocument(doc);
+ }
+ w.forceMerge(1);
+ final IndexReader r = DirectoryReader.open(w);
+ w.close();
+ final LeafReader lr = getOnlyLeafReader(r);
+ PointValues points = lr.getPointValues("f");
+
+ IntersectVisitor allPointsVisitor = new IntersectVisitor() {
+ @Override
+ public void visit(int docID, byte[] packedValue) throws IOException {}
+
+ @Override
+ public void visit(int docID) throws IOException {}
+
+ @Override
+ public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
+ return Relation.CELL_INSIDE_QUERY;
+ }
+ };
+
+ // If all points match, then the point count is numLeaves * maxPointsInLeafNode
+ final int numLeaves = (int) Math.ceil((double) points.size() / maxPointsInLeafNode);
+
+ assertEquals(numLeaves * maxPointsInLeafNode, points.estimatePointCount(allPointsVisitor));
+ assertEquals(numDocs, points.estimateDocCount(allPointsVisitor));
+
+ IntersectVisitor noPointsVisitor = new IntersectVisitor() {
+ @Override
+ public void visit(int docID, byte[] packedValue) throws IOException {}
+
+ @Override
+ public void visit(int docID) throws IOException {}
+
+ @Override
+ public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
+ return Relation.CELL_OUTSIDE_QUERY;
+ }
+ };
+
+ // Return 0 if no points match
+ assertEquals(0, points.estimatePointCount(noPointsVisitor));
+ assertEquals(0, points.estimateDocCount(noPointsVisitor));
+
+ IntersectVisitor onePointMatchVisitor = new IntersectVisitor() {
+ @Override
+ public void visit(int docID, byte[] packedValue) throws IOException {}
+
+ @Override
+ public void visit(int docID) throws IOException {}
+
+ @Override
+ public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
+ for (int dim = 0; dim < 2; ++dim) {
+ if (Arrays.compareUnsigned(uniquePointValue[dim], 0, 3, maxPackedValue, dim * 3, dim * 3 + 3) > 0 ||
+ Arrays.compareUnsigned(uniquePointValue[dim], 0, 3, minPackedValue, dim * 3, dim * 3 + 3) < 0) {
+ return Relation.CELL_OUTSIDE_QUERY;
+ }
+ }
+ return Relation.CELL_CROSSES_QUERY;
+ }
+ };
+
+ final long pointCount = points.estimatePointCount(onePointMatchVisitor);
+ // The number of matches needs to be multiple of count per leaf
+ final long countPerLeaf = (maxPointsInLeafNode + 1) / 2;
+ assertTrue(""+pointCount, pointCount % countPerLeaf == 0);
+    // in extreme cases, a point can be shared by 4 leaves
+ assertTrue(""+pointCount, pointCount / countPerLeaf <= 4 && pointCount / countPerLeaf >= 1);
+
+ final long docCount = points.estimateDocCount(onePointMatchVisitor);
+ if (multiValues) {
+ assertEquals(docCount, (long) (docCount * (1d - Math.pow( (numDocs - pointCount) / points.size() , points.size() / docCount))));
+ } else {
+ assertEquals(Math.min(pointCount, numDocs), docCount);
+ }
+ r.close();
+ dir.close();
+ }
+
+ public void testDocCountEdgeCases() {
+ PointValues values = getPointValues(Long.MAX_VALUE, 1, Long.MAX_VALUE);
+ long docs = values.estimateDocCount(null);
+ assertEquals(1, docs);
+ values = getPointValues(Long.MAX_VALUE, 1, 1);
+ docs = values.estimateDocCount(null);
+ assertEquals(1, docs);
+ values = getPointValues(Long.MAX_VALUE, Integer.MAX_VALUE, Long.MAX_VALUE);
+ docs = values.estimateDocCount(null);
+ assertEquals(Integer.MAX_VALUE, docs);
+ values = getPointValues(Long.MAX_VALUE, Integer.MAX_VALUE, Long.MAX_VALUE / 2);
+ docs = values.estimateDocCount(null);
+ assertEquals(Integer.MAX_VALUE, docs);
+ values = getPointValues(Long.MAX_VALUE, Integer.MAX_VALUE, 1);
+ docs = values.estimateDocCount(null);
+ assertEquals(1, docs);
+ }
+
+ public void testRandomDocCount() {
+ for (int i = 0; i < 100; i++) {
+ long size = TestUtil.nextLong(random(), 1, Long.MAX_VALUE);
+ int maxDoc = (size > Integer.MAX_VALUE) ? Integer.MAX_VALUE : Math.toIntExact(size);
+ int docCount = TestUtil.nextInt(random(), 1, maxDoc);
+ long estimatedPointCount = TestUtil.nextLong(random(), 0, size);
+ PointValues values = getPointValues(size, docCount, estimatedPointCount);
+ long docs = values.estimateDocCount(null);
+ assertTrue(docs <= estimatedPointCount);
+ assertTrue(docs <= maxDoc);
+ assertTrue(docs >= estimatedPointCount / (size/docCount));
+ }
+ }
+
+
+ private PointValues getPointValues(long size, int docCount, long estimatedPointCount) {
+ return new PointValues() {
+ @Override
+ public void intersect(IntersectVisitor visitor) {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public long estimatePointCount(IntersectVisitor visitor) {
+ return estimatedPointCount;
+ }
+
+ @Override
+ public byte[] getMinPackedValue() throws IOException {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public byte[] getMaxPackedValue() throws IOException {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public int getNumDimensions() throws IOException {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public int getNumIndexDimensions() throws IOException {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public int getBytesPerDimension() throws IOException {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public long size() {
+ return size;
+ }
+
+ @Override
+ public int getDocCount() {
+ return docCount;
+ }
+ };
+ }
+
+}
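
The multiValues branches above assert against a probabilistic doc-count estimate: with `size` points spread over `docCount` docs, a doc misses a query only if all of its (on average size/docCount) points miss. A hedged standalone sketch of that formula, using floating-point division throughout (the inlined assertions embed their own variant of the same expression); the helper name is illustrative, not library code:

// Expected number of matching docs when a query matches `pointCount` of
// `size` total points spread evenly over `docCount` documents.
static long expectedDocCount(long pointCount, long size, int docCount) {
  double missPerPoint = (double) (size - pointCount) / size; // P(a single point misses)
  double pointsPerDoc = (double) size / docCount;            // average points per doc
  return (long) (docCount * (1d - Math.pow(missPerPoint, pointsPerDoc)));
}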
diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene86/TestLucene86SegmentInfoFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene86/TestLucene86SegmentInfoFormat.java
new file mode 100644
index 000000000000..e462d3f4ac45
--- /dev/null
+++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene86/TestLucene86SegmentInfoFormat.java
@@ -0,0 +1,37 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.codecs.lucene86;
+
+import org.apache.lucene.codecs.Codec;
+import org.apache.lucene.index.BaseSegmentInfoFormatTestCase;
+import org.apache.lucene.util.TestUtil;
+import org.apache.lucene.util.Version;
+
+public class TestLucene86SegmentInfoFormat extends BaseSegmentInfoFormatTestCase {
+
+ @Override
+ protected Version[] getVersions() {
+ return new Version[] { Version.LATEST };
+ }
+
+ @Override
+ protected Codec getCodec() {
+ return TestUtil.getDefaultCodec();
+ }
+
+}
diff --git a/lucene/core/src/test/org/apache/lucene/document/TestDocument.java b/lucene/core/src/test/org/apache/lucene/document/TestDocument.java
index 50c1ed0a919a..ac5c43452b54 100644
--- a/lucene/core/src/test/org/apache/lucene/document/TestDocument.java
+++ b/lucene/core/src/test/org/apache/lucene/document/TestDocument.java
@@ -214,14 +214,10 @@ public void testGetValuesForIndexedDocument() throws Exception {
public void testGetValues() {
Document doc = makeDocumentWithFields();
- assertEquals(new String[] {"test1", "test2"},
- doc.getValues("keyword"));
- assertEquals(new String[] {"test1", "test2"},
- doc.getValues("text"));
- assertEquals(new String[] {"test1", "test2"},
- doc.getValues("unindexed"));
- assertEquals(new String[0],
- doc.getValues("nope"));
+ assertArrayEquals(new String[] {"test1", "test2"}, doc.getValues("keyword"));
+ assertArrayEquals(new String[] {"test1", "test2"}, doc.getValues("text"));
+ assertArrayEquals(new String[] {"test1", "test2"}, doc.getValues("unindexed"));
+ assertArrayEquals(new String[0], doc.getValues("nope"));
}
public void testPositionIncrementMultiFields() throws Exception {
diff --git a/lucene/core/src/test/org/apache/lucene/geo/TestTessellator.java b/lucene/core/src/test/org/apache/lucene/geo/TestTessellator.java
index 0d1f270776e5..2410ba13a5db 100644
--- a/lucene/core/src/test/org/apache/lucene/geo/TestTessellator.java
+++ b/lucene/core/src/test/org/apache/lucene/geo/TestTessellator.java
@@ -573,6 +573,14 @@ public void testComplexPolygon40() throws Exception {
}
}
+ public void testComplexPolygon41() throws Exception {
+ String wkt = "POLYGON((-1.569137181294115 54.4855283059375, -1.5692505240440333 54.48535373128068, -1.5684753656387294 54.48534438253056, -1.568606793880459 54.485674703738624, -1.5694141387939453 54.48611720532629, -1.569137181294115 54.4855283059375)," +
+ "(-1.569137181294115 54.4855283059375, -1.5690783030431206 54.48545352137167, -1.5689449291711688 54.48547663706703, -1.569137181294115 54.4855283059375)," +
+ "(-1.5689449291711688 54.48547663706703, -1.5689437289004642 54.48535482680399, -1.5687730514221028 54.48538045082698, -1.5689449291711688 54.48547663706703)," +
+ "(-1.5689449291711688 54.48547663706703, -1.5689879483854345 54.485580118416785, -1.5687756358893499 54.485612860811244, -1.568765285875931 54.485496217554285, -1.5689449291711688 54.48547663706703))";
+ checkPolygon(wkt);
+ }
+
private void checkPolygon(String wkt) throws Exception {
Polygon polygon = (Polygon) SimpleWKTShapeParser.parse(wkt);
List tessellation = Tessellator.tessellate(polygon);
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestAllFilesDetectBitFlips.java b/lucene/core/src/test/org/apache/lucene/index/TestAllFilesDetectBitFlips.java
new file mode 100644
index 000000000000..f63387b74d03
--- /dev/null
+++ b/lucene/core/src/test/org/apache/lucene/index/TestAllFilesDetectBitFlips.java
@@ -0,0 +1,141 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.index;
+
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Collections;
+
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.store.BaseDirectoryWrapper;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IOContext;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.util.LineFileDocs;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.LuceneTestCase.AwaitsFix;
+import org.apache.lucene.util.LuceneTestCase.SuppressFileSystems;
+import org.apache.lucene.util.TestUtil;
+
+/**
+ * Test that the default codec detects bit flips at open or checkIntegrity time.
+ */
+@SuppressFileSystems("ExtrasFS")
+@AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-9356")
+public class TestAllFilesDetectBitFlips extends LuceneTestCase {
+
+ public void test() throws Exception {
+ doTest(false);
+ }
+
+ public void testCFS() throws Exception {
+ doTest(true);
+ }
+
+ public void doTest(boolean cfs) throws Exception {
+ Directory dir = newDirectory();
+
+ IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
+ conf.setCodec(TestUtil.getDefaultCodec());
+
+ if (cfs == false) {
+ conf.setUseCompoundFile(false);
+ conf.getMergePolicy().setNoCFSRatio(0.0);
+ }
+
+ RandomIndexWriter riw = new RandomIndexWriter(random(), dir, conf);
+ // Use LineFileDocs so we (hopefully) get most Lucene features
+ // tested, e.g. IntPoint was recently added to it:
+ LineFileDocs docs = new LineFileDocs(random());
+ for (int i = 0; i < 100; i++) {
+ riw.addDocument(docs.nextDoc());
+ if (random().nextInt(7) == 0) {
+ riw.commit();
+ }
+ if (random().nextInt(20) == 0) {
+ riw.deleteDocuments(new Term("docid", Integer.toString(i)));
+ }
+ if (random().nextInt(15) == 0) {
+ riw.updateNumericDocValue(new Term("docid", Integer.toString(i)), "docid_intDV", Long.valueOf(i));
+ }
+ }
+ if (TEST_NIGHTLY == false) {
+ riw.forceMerge(1);
+ }
+ riw.close();
+ checkBitFlips(dir);
+ dir.close();
+ }
+
+ private void checkBitFlips(Directory dir) throws IOException {
+ for(String name : dir.listAll()) {
+ if (name.equals(IndexWriter.WRITE_LOCK_NAME) == false) {
+ corruptFile(dir, name);
+ }
+ }
+ }
+
+ private void corruptFile(Directory dir, String victim) throws IOException {
+ try (BaseDirectoryWrapper dirCopy = newDirectory()) {
+ dirCopy.setCheckIndexOnClose(false);
+
+ long victimLength = dir.fileLength(victim);
+ long flipOffset = TestUtil.nextLong(random(), 0, victimLength - 1);
+
+ if (VERBOSE) {
+        System.out.println("TEST: now corrupt file " + victim + " by changing byte at offset " + flipOffset + " (length=" + victimLength + ")");
+ }
+
+ for(String name : dir.listAll()) {
+ if (name.equals(victim) == false) {
+ dirCopy.copyFrom(dir, name, name, IOContext.DEFAULT);
+ } else {
+ try (IndexOutput out = dirCopy.createOutput(name, IOContext.DEFAULT);
+ IndexInput in = dir.openInput(name, IOContext.DEFAULT)) {
+ out.copyBytes(in, flipOffset);
+ out.writeByte((byte) (in.readByte() + TestUtil.nextInt(random(), 0x01, 0xFF)));
+ out.copyBytes(in, victimLength - flipOffset - 1);
+ }
+ try (IndexInput in = dirCopy.openInput(name, IOContext.DEFAULT)) {
+ try {
+ CodecUtil.checksumEntireFile(in);
+              System.out.println("TEST: changing a byte in " + victim + " did not update the checksum");
+ return;
+ } catch (CorruptIndexException e) {
+ // ok
+ }
+ }
+ }
+ dirCopy.sync(Collections.singleton(name));
+ }
+
+ // corruption must be detected
+ expectThrowsAnyOf(Arrays.asList(CorruptIndexException.class, IndexFormatTooOldException.class, IndexFormatTooNewException.class),
+ () -> {
+ try (IndexReader reader = DirectoryReader.open(dirCopy)) {
+ for (LeafReaderContext context : reader.leaves()) {
+ context.reader().checkIntegrity();
+ }
+ }
+ }
+ );
+ }
+ }
+}
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestConcurrentMergeScheduler.java b/lucene/core/src/test/org/apache/lucene/index/TestConcurrentMergeScheduler.java
index 050aed7864a9..a31194caf7ac 100644
--- a/lucene/core/src/test/org/apache/lucene/index/TestConcurrentMergeScheduler.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestConcurrentMergeScheduler.java
@@ -113,9 +113,13 @@ protected boolean isOK(Throwable th) {
ioe.printStackTrace(System.out);
}
failure.clearDoFail();
- assertTrue(writer.isClosed());
+      // Make sure we are closed or closing - if we are unlucky, a merge does
+      // the actual closing for us. This is rare but might happen since the
+      // tragic event is checked by IFD and that might throw during a merge.
+ expectThrows(AlreadyClosedException.class, writer::ensureOpen);
// Abort should have closed the deleter:
assertTrue(writer.isDeleterClosed());
+ writer.close(); // now wait for the close to actually happen if a merge thread did the close.
break outer;
}
}
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestDemoParallelLeafReader.java b/lucene/core/src/test/org/apache/lucene/index/TestDemoParallelLeafReader.java
index 4b82800fd188..7fdad3ba2de9 100644
--- a/lucene/core/src/test/org/apache/lucene/index/TestDemoParallelLeafReader.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestDemoParallelLeafReader.java
@@ -538,7 +538,7 @@ public CodecReader wrapForMerge(CodecReader reader) throws IOException {
}
@Override
- public void mergeFinished() throws IOException {
+ public void mergeFinished(boolean success) throws IOException {
Throwable th = null;
for (ParallelLeafReader r : parallelReaders) {
try {
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java
index ae944de1bc32..042e2a80ff58 100644
--- a/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java
@@ -2146,7 +2146,7 @@ public void testBadSort() throws Exception {
IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
iwc.setIndexSort(Sort.RELEVANCE);
});
- assertEquals("invalid SortField type: must be one of [STRING, INT, FLOAT, LONG, DOUBLE] but got: ", expected.getMessage());
+ assertEquals("Cannot sort index with sort field ", expected.getMessage());
}
// you can't change the index sort on an existing index:
@@ -2498,6 +2498,7 @@ public void testRandom3() throws Exception {
System.out.println(" float=" + docValues.floatValue);
System.out.println(" double=" + docValues.doubleValue);
System.out.println(" bytes=" + new BytesRef(docValues.bytesValue));
+ System.out.println(" mvf=" + Arrays.toString(docValues.floatValues));
}
Document doc = new Document();
@@ -2741,7 +2742,7 @@ public void testWrongSortFieldType() throws Exception {
Document doc = new Document();
doc.add(dvs.get(j));
IllegalArgumentException exc = expectThrows(IllegalArgumentException.class, () -> w.addDocument(doc));
- assertThat(exc.getMessage(), containsString("invalid doc value type"));
+ assertThat(exc.getMessage(), containsString("expected field [field] to be "));
doc.clear();
doc.add(dvs.get(i));
w.addDocument(doc);
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java
index 8fb1ce5d75f7..5c6164a254ca 100644
--- a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java
@@ -344,7 +344,7 @@ private static int getSegmentCount(Directory dir) throws IOException {
// Make sure it's OK to change RAM buffer size and
// maxBufferedDocs in a write session
public void testChangingRAMBuffer() throws IOException {
- Directory dir = newDirectory();
+ Directory dir = newDirectory();
IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random())));
writer.getConfig().setMaxBufferedDocs(10);
writer.getConfig().setRAMBufferSizeMB(IndexWriterConfig.DISABLE_AUTO_FLUSH);
@@ -607,7 +607,7 @@ public void testVariableSchema() throws Exception {
doc.add(newField("content4", contents, customType));
type = customType;
} else
- type = TextField.TYPE_NOT_STORED;
+ type = TextField.TYPE_NOT_STORED;
doc.add(newTextField("content1", contents, Field.Store.NO));
doc.add(newField("content3", "", customType));
doc.add(newField("content5", "", type));
@@ -663,13 +663,13 @@ public void testEmptyFieldName() throws IOException {
writer.close();
dir.close();
}
-
+
public void testEmptyFieldNameTerms() throws IOException {
Directory dir = newDirectory();
IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random())));
Document doc = new Document();
doc.add(newTextField("", "a b c", Field.Store.NO));
- writer.addDocument(doc);
+ writer.addDocument(doc);
writer.close();
DirectoryReader reader = DirectoryReader.open(dir);
LeafReader subreader = getOnlyLeafReader(reader);
@@ -681,7 +681,7 @@ public void testEmptyFieldNameTerms() throws IOException {
reader.close();
dir.close();
}
-
+
public void testEmptyFieldNameWithEmptyTerm() throws IOException {
Directory dir = newDirectory();
IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random())));
@@ -690,7 +690,7 @@ public void testEmptyFieldNameWithEmptyTerm() throws IOException {
doc.add(newStringField("", "a", Field.Store.NO));
doc.add(newStringField("", "b", Field.Store.NO));
doc.add(newStringField("", "c", Field.Store.NO));
- writer.addDocument(doc);
+ writer.addDocument(doc);
writer.close();
DirectoryReader reader = DirectoryReader.open(dir);
LeafReader subreader = getOnlyLeafReader(reader);
@@ -834,7 +834,7 @@ public void testDeadlock() throws Exception {
customType.setStoreTermVectors(true);
customType.setStoreTermVectorPositions(true);
customType.setStoreTermVectorOffsets(true);
-
+
doc.add(newField("content", "aaa bbb ccc ddd eee fff ggg hhh iii", customType));
writer.addDocument(doc);
writer.addDocument(doc);
@@ -922,7 +922,7 @@ public void run() {
// open/close slowly sometimes
dir.setUseSlowOpenClosers(true);
-
+
// throttle a little
dir.setThrottling(MockDirectoryWrapper.Throttling.SOMETIMES);
@@ -1148,7 +1148,7 @@ public void testIndexStoreCombos() throws Exception {
FieldType customType = new FieldType(StoredField.TYPE);
customType.setTokenized(true);
-
+
Field f = new Field("binary", b, 10, 17, customType);
// TODO: this is evil, changing the type after creating the field:
customType.setIndexOptions(IndexOptions.DOCS);
@@ -1157,7 +1157,7 @@ public void testIndexStoreCombos() throws Exception {
f.setTokenStream(doc1field1);
FieldType customType2 = new FieldType(TextField.TYPE_STORED);
-
+
Field f2 = newField("string", "value", customType2);
final MockTokenizer doc1field2 = new MockTokenizer(MockTokenizer.WHITESPACE, false);
doc1field2.setReader(new StringReader("doc1field2"));
@@ -1233,7 +1233,7 @@ public void testNoDocsIndex() throws Throwable {
public void testDeleteUnusedFiles() throws Exception {
assumeFalse("test relies on exact filenames", Codec.getDefault() instanceof SimpleTextCodec);
assumeWorkingMMapOnWindows();
-
+
for(int iter=0;iter<2;iter++) {
// relies on windows semantics
Path path = createTempDir();
@@ -1250,7 +1250,7 @@ public void testDeleteUnusedFiles() throws Exception {
}
MergePolicy mergePolicy = newLogMergePolicy(true);
-
+
// This test expects all of its segments to be in CFS
mergePolicy.setNoCFSRatio(1.0);
mergePolicy.setMaxCFSSegmentSizeMB(Double.POSITIVE_INFINITY);
@@ -1338,7 +1338,7 @@ public void testDeleteUnusedFiles2() throws Exception {
customType.setStoreTermVectors(true);
customType.setStoreTermVectorPositions(true);
customType.setStoreTermVectorOffsets(true);
-
+
doc.add(newField("c", "val", customType));
writer.addDocument(doc);
writer.commit();
@@ -1379,7 +1379,7 @@ public void testEmptyDirRollback() throws Exception {
// indexed, flushed (but not committed) and then IW rolls back, then no
// files are left in the Directory.
Directory dir = newDirectory();
-
+
String[] origFiles = dir.listAll();
IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random()))
.setMaxBufferedDocs(2)
@@ -1409,8 +1409,8 @@ public void testEmptyDirRollback() throws Exception {
// Adding just one document does not call flush yet.
int computedExtraFileCount = 0;
for (String file : dir.listAll()) {
- if (IndexWriter.WRITE_LOCK_NAME.equals(file) ||
- file.startsWith(IndexFileNames.SEGMENTS) ||
+ if (IndexWriter.WRITE_LOCK_NAME.equals(file) ||
+ file.startsWith(IndexFileNames.SEGMENTS) ||
IndexFileNames.CODEC_FILE_PATTERN.matcher(file).matches()) {
if (file.lastIndexOf('.') < 0
// don't count stored fields and term vectors in, or any temporary files they might
@@ -1458,7 +1458,7 @@ public void testNoUnwantedTVFiles() throws Exception {
FieldType customType3 = new FieldType(TextField.TYPE_STORED);
customType3.setTokenized(false);
customType3.setOmitNorms(true);
-
+
for (int i=0; i<2; i++) {
Document doc = new Document();
doc.add(new Field("id", Integer.toString(i)+BIG, customType3));
@@ -1478,7 +1478,7 @@ public void testNoUnwantedTVFiles() throws Exception {
SegmentReader sr = (SegmentReader) ctx.reader();
assertFalse(sr.getFieldInfos().hasVectors());
}
-
+
r0.close();
dir.close();
}
@@ -1501,7 +1501,7 @@ public StringSplitTokenizer() {
@Override
public final boolean incrementToken() {
- clearAttributes();
+ clearAttributes();
if (upto < tokens.length) {
termAtt.setEmpty();
termAtt.append(tokens[upto]);
@@ -1724,7 +1724,7 @@ public void testPrepareCommitThenRollback2() throws Exception {
r.close();
dir.close();
}
-
+
public void testDontInvokeAnalyzerForUnAnalyzedFields() throws Exception {
Analyzer analyzer = new Analyzer() {
@Override
@@ -1759,13 +1759,13 @@ public int getOffsetGap(String fieldName) {
w.close();
dir.close();
}
-
+
//LUCENE-1468 -- make sure opening an IndexWriter with
// create=true does not remove non-index files
-
+
public void testOtherFiles() throws Throwable {
Directory dir = newDirectory();
- IndexWriter iw = new IndexWriter(dir,
+ IndexWriter iw = new IndexWriter(dir,
newIndexWriterConfig(new MockAnalyzer(random())));
iw.addDocument(new Document());
iw.close();
@@ -1774,15 +1774,15 @@ public void testOtherFiles() throws Throwable {
IndexOutput out = dir.createOutput("myrandomfile", newIOContext(random()));
out.writeByte((byte) 42);
out.close();
-
+
new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random()))).close();
-
+
assertTrue(slowFileExists(dir, "myrandomfile"));
} finally {
dir.close();
}
}
-
+
// LUCENE-3849
public void testStopwordsPosIncHole() throws Exception {
Directory dir = newDirectory();
@@ -1811,7 +1811,7 @@ protected TokenStreamComponents createComponents(String fieldName) {
ir.close();
dir.close();
}
-
+
// LUCENE-3849
public void testStopwordsPosIncHole2() throws Exception {
// use two stopfilters for testing here
@@ -1843,23 +1843,23 @@ protected TokenStreamComponents createComponents(String fieldName) {
ir.close();
dir.close();
}
-
+
// LUCENE-4575
public void testCommitWithUserDataOnly() throws Exception {
Directory dir = newDirectory();
IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(null));
writer.commit(); // first commit to complete IW create transaction.
-
+
// this should store the commit data, even though no other changes were made
writer.setLiveCommitData(new HashMap<String, String>() {{
put("key", "value");
}}.entrySet());
writer.commit();
-
+
DirectoryReader r = DirectoryReader.open(dir);
assertEquals("value", r.getIndexCommit().getUserData().get("key"));
r.close();
-
+
// now check setCommitData and prepareCommit/commit sequence
writer.setLiveCommitData(new HashMap<String, String>() {{
put("key", "value1");
@@ -1873,7 +1873,7 @@ public void testCommitWithUserDataOnly() throws Exception {
r = DirectoryReader.open(dir);
assertEquals("value1", r.getIndexCommit().getUserData().get("key"));
r.close();
-
+
// now should commit the second commitData - there was a bug where
// IndexWriter.finishCommit overrode the second commitData
writer.commit();
@@ -1881,7 +1881,7 @@ public void testCommitWithUserDataOnly() throws Exception {
assertEquals("IndexWriter.finishCommit may have overridden the second commitData",
"value2", r.getIndexCommit().getUserData().get("key"));
r.close();
-
+
writer.close();
dir.close();
}
private Map<String, String> getLiveCommitData(IndexWriter writer) {
}
return data;
}
-
+
@Test
public void testGetCommitData() throws Exception {
Directory dir = newDirectory();
@@ -1906,16 +1906,16 @@ public void testGetCommitData() throws Exception {
}}.entrySet());
assertEquals("value", getLiveCommitData(writer).get("key"));
writer.close();
-
+
// validate that it's also visible when opening a new IndexWriter
writer = new IndexWriter(dir, newIndexWriterConfig(null)
.setOpenMode(OpenMode.APPEND));
assertEquals("value", getLiveCommitData(writer).get("key"));
writer.close();
-
+
dir.close();
}
-
+
public void testNullAnalyzer() throws IOException {
Directory dir = newDirectory();
IndexWriterConfig iwConf = newIndexWriterConfig(null);
@@ -1942,7 +1942,7 @@ public void testNullAnalyzer() throws IOException {
iw.close();
dir.close();
}
-
+
public void testNullDocument() throws IOException {
Directory dir = newDirectory();
RandomIndexWriter iw = new RandomIndexWriter(random(), dir);
@@ -1967,7 +1967,7 @@ public void testNullDocument() throws IOException {
iw.close();
dir.close();
}
-
+
public void testNullDocuments() throws IOException {
Directory dir = newDirectory();
RandomIndexWriter iw = new RandomIndexWriter(random(), dir);
@@ -1992,7 +1992,7 @@ public void testNullDocuments() throws IOException {
iw.close();
dir.close();
}
-
+
public void testIterableFieldThrowsException() throws IOException {
Directory dir = newDirectory();
IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random())));
@@ -2000,7 +2000,7 @@ public void testIterableFieldThrowsException() throws IOException {
int docCount = 0;
int docId = 0;
Set<String> liveIds = new HashSet<>();
- for (int i = 0; i < iters; i++) {
+ for (int i = 0; i < iters; i++) {
int numDocs = atLeast(4);
for (int j = 0; j < numDocs; j++) {
String id = Integer.toString(docId++);
@@ -2008,7 +2008,7 @@ public void testIterableFieldThrowsException() throws IOException {
fields.add(new StringField("id", id, Field.Store.YES));
fields.add(new StringField("foo", TestUtil.randomSimpleString(random()), Field.Store.NO));
docId++;
-
+
boolean success = false;
try {
w.addDocument(new RandomFailingIterable(fields, random()));
@@ -2040,7 +2040,7 @@ public void testIterableFieldThrowsException() throws IOException {
w.close();
IOUtils.close(reader, dir);
}
-
+
public void testIterableThrowsException() throws IOException {
Directory dir = newDirectory();
IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random())));
@@ -2088,7 +2088,7 @@ public void testIterableThrowsException() throws IOException {
w.close();
IOUtils.close(reader, dir);
}
-
+
public void testIterableThrowsException2() throws IOException {
Directory dir = newDirectory();
IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random())));
@@ -2128,7 +2128,7 @@ public RandomFailingIterable(Iterable<? extends T> list, Random random) {
this.list = list;
this.failOn = random.nextInt(5);
}
-
+
@Override
public Iterator<T> iterator() {
final Iterator<? extends T> docIter = list.iterator();
@@ -2254,7 +2254,7 @@ public void testHasUncommittedChanges() throws IOException {
writer.close();
dir.close();
}
-
+
public void testMergeAllDeleted() throws IOException {
Directory dir = newDirectory();
IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random()));
@@ -2477,12 +2477,12 @@ public void testIds() throws Exception {
IndexWriter w = new IndexWriter(d, newIndexWriterConfig(new MockAnalyzer(random())));
w.addDocument(new Document());
w.close();
-
+
SegmentInfos sis = SegmentInfos.readLatestCommit(d);
byte[] id1 = sis.getId();
assertNotNull(id1);
assertEquals(StringHelper.ID_LENGTH, id1.length);
-
+
byte[] id2 = sis.info(0).info.getId();
byte[] sciId2 = sis.info(0).getId();
assertNotNull(id2);
@@ -2514,7 +2514,7 @@ public void testIds() throws Exception {
ids.add(id);
}
}
-
+
public void testEmptyNorm() throws Exception {
Directory d = newDirectory();
IndexWriter w = new IndexWriter(d, newIndexWriterConfig(new MockAnalyzer(random())));
@@ -2579,7 +2579,7 @@ public void testNRTSegmentsFile() throws Exception {
assertEquals(1, r2.getIndexCommit().getGeneration());
assertEquals("segments_1", r2.getIndexCommit().getSegmentsFileName());
r2.close();
-
+
// make a change and another commit
w.addDocument(new Document());
w.commit();
@@ -2866,7 +2866,7 @@ public void testLeftoverTempFiles() throws Exception {
IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
IndexWriter w = new IndexWriter(dir, iwc);
w.close();
-
+
IndexOutput out = dir.createTempOutput("_0", "bkd", IOContext.DEFAULT);
String tempName = out.getName();
out.close();
@@ -3151,7 +3151,7 @@ public void testSoftUpdateDocuments() throws IOException {
expectThrows(IllegalArgumentException.class, () -> {
writer.softUpdateDocument(null, new Document(), new NumericDocValuesField("soft_delete", 1));
});
-
+
expectThrows(IllegalArgumentException.class, () -> {
writer.softUpdateDocument(new Term("id", "1"), new Document());
});
@@ -4167,4 +4167,76 @@ public void testSegmentCommitInfoId() throws IOException {
}
}
}
+
+ public void testMergeZeroDocsMergeIsClosedOnce() throws IOException {
+ LogDocMergePolicy keepAllSegments = new LogDocMergePolicy() {
+ @Override
+ public boolean keepFullyDeletedSegment(IOSupplier<CodecReader> readerIOSupplier) {
+ return true;
+ }
+ };
+ try (Directory dir = newDirectory()) {
+ try (IndexWriter writer = new IndexWriter(dir,
+ new IndexWriterConfig().setMergePolicy(new OneMergeWrappingMergePolicy(keepAllSegments, merge -> {
+ SetOnce<Boolean> onlyFinishOnce = new SetOnce<>();
+ return new MergePolicy.OneMerge(merge.segments) {
+ @Override
+ public void mergeFinished(boolean success) {
+ onlyFinishOnce.set(true);
+ }
+ };
+ })))) {
+ Document doc = new Document();
+ doc.add(new StringField("id", "1", Field.Store.NO));
+ writer.addDocument(doc);
+ writer.flush();
+ writer.addDocument(doc);
+ writer.flush();
+ writer.deleteDocuments(new Term("id", "1"));
+ writer.flush();
+ assertEquals(2, writer.getSegmentCount());
+ assertEquals(0, writer.getDocStats().numDocs);
+ assertEquals(2, writer.getDocStats().maxDoc);
+ writer.forceMerge(1);
+ }
+ }
+ }
+
+ public void testMergeOnCommitKeepFullyDeletedSegments() throws Exception {
+ Directory dir = newDirectory();
+ IndexWriterConfig iwc = newIndexWriterConfig();
+ iwc.setMaxCommitMergeWaitSeconds(30);
+ iwc.mergePolicy = new FilterMergePolicy(newMergePolicy()) {
+ @Override
+ public boolean keepFullyDeletedSegment(IOSupplier<CodecReader> readerIOSupplier) {
+ return true;
+ }
+
+ @Override
+ public MergeSpecification findFullFlushMerges(MergeTrigger mergeTrigger,
+ SegmentInfos segmentInfos,
+ MergeContext mergeContext) {
+ List<SegmentCommitInfo> fullyDeletedSegments = segmentInfos.asList().stream()
+ .filter(s -> s.info.maxDoc() - s.getDelCount() == 0)
+ .collect(Collectors.toList());
+ if (fullyDeletedSegments.isEmpty()) {
+ return null;
+ }
+ MergeSpecification spec = new MergeSpecification();
+ spec.add(new OneMerge(fullyDeletedSegments));
+ return spec;
+ }
+ };
+ IndexWriter w = new IndexWriter(dir, iwc);
+ Document d = new Document();
+ d.add(new StringField("id", "1", Field.Store.YES));
+ w.addDocument(d);
+ w.commit();
+ w.updateDocument(new Term("id", "1"), d);
+ w.commit();
+ try (DirectoryReader reader = w.getReader()) {
+ assertEquals(1, reader.numDocs());
+ }
+ IOUtils.close(w, dir);
+ }
}
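testMergeZeroDocsMergeIsClosedOnce leans on SetOnce to turn a duplicate mergeFinished notification into a failure. A minimal sketch of that contract (SetOnce is org.apache.lucene.util.SetOnce; the second set throws):

```java
import org.apache.lucene.util.SetOnce;

SetOnce<Boolean> onlyFinishOnce = new SetOnce<>();
onlyFinishOnce.set(true);     // first mergeFinished notification: accepted
// onlyFinishOnce.set(true);  // a second call would throw
//                            // SetOnce.AlreadySetException and fail the test
```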
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterDelete.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterDelete.java
index bb16884269cc..e7dbfd294da4 100644
--- a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterDelete.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterDelete.java
@@ -702,8 +702,8 @@ private void doTestOperationsOnDiskFull(boolean updates) throws IOException {
}
dir.close();
- // Try again with 10 more bytes of free space:
- diskFree += 10;
+ // Try again with more bytes of free space:
+ diskFree += Math.max(10, diskFree >>> 3);
}
startDir.close();
}
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterMergePolicy.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterMergePolicy.java
index ce591a280c6e..2577f6b28ecb 100644
--- a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterMergePolicy.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterMergePolicy.java
@@ -18,17 +18,42 @@
import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
public class TestIndexWriterMergePolicy extends LuceneTestCase {
-
+
+ private static final MergePolicy MERGE_ON_COMMIT_POLICY = new LogDocMergePolicy() {
+ @Override
+ public MergeSpecification findFullFlushMerges(MergeTrigger mergeTrigger, SegmentInfos segmentInfos, MergeContext mergeContext) {
+ // Optimize down to a single segment on commit
+ if (mergeTrigger == MergeTrigger.COMMIT && segmentInfos.size() > 1) {
+ List<SegmentCommitInfo> nonMergingSegments = new ArrayList<>();
+ for (SegmentCommitInfo sci : segmentInfos) {
+ if (mergeContext.getMergingSegments().contains(sci) == false) {
+ nonMergingSegments.add(sci);
+ }
+ }
+ if (nonMergingSegments.size() > 1) {
+ MergeSpecification mergeSpecification = new MergeSpecification();
+ mergeSpecification.add(new OneMerge(nonMergingSegments));
+ return mergeSpecification;
+ }
+ }
+ return null;
+ }
+ };
+
// Test the normal case
public void testNormalCase() throws IOException {
Directory dir = newDirectory();
@@ -278,6 +303,50 @@ public void testSetters() {
assertSetters(new LogDocMergePolicy());
}
+ // Test basic semantics of merge on commit
+ public void testMergeOnCommit() throws IOException {
+ Directory dir = newDirectory();
+
+ IndexWriter firstWriter = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random()))
+ .setMergePolicy(NoMergePolicy.INSTANCE));
+ for (int i = 0; i < 5; i++) {
+ TestIndexWriter.addDoc(firstWriter);
+ firstWriter.flush();
+ }
+ DirectoryReader firstReader = DirectoryReader.open(firstWriter);
+ assertEquals(5, firstReader.leaves().size());
+ firstReader.close();
+ firstWriter.close(); // When this writer closes, it does not merge on commit.
+
+ IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random()))
+ .setMergePolicy(MERGE_ON_COMMIT_POLICY).setMaxCommitMergeWaitSeconds(30);
+
+ IndexWriter writerWithMergePolicy = new IndexWriter(dir, iwc);
+ writerWithMergePolicy.commit(); // No changes. Commit doesn't trigger a merge.
+
+ DirectoryReader unmergedReader = DirectoryReader.open(writerWithMergePolicy);
+ assertEquals(5, unmergedReader.leaves().size());
+ unmergedReader.close();
+
+ TestIndexWriter.addDoc(writerWithMergePolicy);
+ writerWithMergePolicy.commit(); // Doc added, do merge on commit.
+ assertEquals(1, writerWithMergePolicy.getSegmentCount()); // the commit merged down to one segment
+
+ DirectoryReader mergedReader = DirectoryReader.open(writerWithMergePolicy);
+ assertEquals(1, mergedReader.leaves().size());
+ mergedReader.close();
+
+ try (IndexReader reader = writerWithMergePolicy.getReader()) {
+ IndexSearcher searcher = new IndexSearcher(reader);
+ assertEquals(6, reader.numDocs());
+ assertEquals(6, searcher.count(new MatchAllDocsQuery()));
+ }
+
+ writerWithMergePolicy.close();
+ dir.close();
+ }
+
private void assertSetters(MergePolicy lmp) {
lmp.setMaxCFSSegmentSizeMB(2.0);
assertEquals(2.0, lmp.getMaxCFSSegmentSizeMB(), EPSILON);
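For context on the merge-on-commit path exercised here: a merge policy opts in by returning a MergeSpecification from findFullFlushMerges when the trigger is MergeTrigger.COMMIT, and commit() then waits up to the configured bound for those merges to finish. A hedged sketch of wiring it up in application code, reusing the MERGE_ON_COMMIT_POLICY defined above (analyzer, dir and doc are assumed to exist):

```java
IndexWriterConfig iwc = new IndexWriterConfig(analyzer)
    .setMergePolicy(MERGE_ON_COMMIT_POLICY)  // implements findFullFlushMerges
    .setMaxCommitMergeWaitSeconds(30);       // commit() waits up to 30s for them
try (IndexWriter writer = new IndexWriter(dir, iwc)) {
  writer.addDocument(doc);
  writer.commit(); // the resulting commit point may already be a single segment
}
```

If the merges do not complete within the bound, the commit still goes ahead and publishes the unmerged segments; the wait is best-effort.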
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterReader.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterReader.java
index 5f8650fe0aaa..228c34366db5 100644
--- a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterReader.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterReader.java
@@ -809,11 +809,16 @@ private Directory getAssertNoDeletesDirectory(Directory directory) {
// Stress test reopen during add/delete
public void testDuringAddDelete() throws Exception {
Directory dir1 = newDirectory();
+ IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random()))
+ .setMergePolicy(newLogMergePolicy(2));
+ if (TEST_NIGHTLY) {
+ // with a ton of iterations we need to make sure we don't do unnecessary
+ // extra flushing, otherwise the nightly run will time out
+ iwc.setRAMBufferSizeMB(IndexWriterConfig.DEFAULT_RAM_BUFFER_SIZE_MB);
+ iwc.setMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH);
+ }
final IndexWriter writer = new IndexWriter(
- dir1,
- newIndexWriterConfig(new MockAnalyzer(random()))
- .setMergePolicy(newLogMergePolicy(2))
- );
+ dir1, iwc);
// create the index
createIndexNoClose(false, "test", writer);
@@ -822,7 +827,7 @@ public void testDuringAddDelete() throws Exception {
DirectoryReader r = writer.getReader();
final int iters = TEST_NIGHTLY ? 1000 : 10;
- final List<Throwable> excs = Collections.synchronizedList(new ArrayList<Throwable>());
+ final List<Throwable> excs = Collections.synchronizedList(new ArrayList<>());
final Thread[] threads = new Thread[numThreads];
final AtomicInteger remainingThreads = new AtomicInteger(numThreads);
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestMergePolicy.java b/lucene/core/src/test/org/apache/lucene/index/TestMergePolicy.java
new file mode 100644
index 000000000000..e5f5635e1f19
--- /dev/null
+++ b/lucene/core/src/test/org/apache/lucene/index/TestMergePolicy.java
@@ -0,0 +1,159 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.index;
+
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.util.Collections;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.StringHelper;
+import org.apache.lucene.util.TestUtil;
+import org.apache.lucene.util.Version;
+
+public class TestMergePolicy extends LuceneTestCase {
+
+ public void testWaitForOneMerge() throws IOException, InterruptedException {
+ try (Directory dir = newDirectory()) {
+ MergePolicy.MergeSpecification ms = createRandomMergeSpecification(dir, 1 + random().nextInt(10));
+ for (MergePolicy.OneMerge m : ms.merges) {
+ assertFalse(m.hasCompletedSuccessfully().isPresent());
+ }
+ Thread t = new Thread(() -> {
+ try {
+ for (MergePolicy.OneMerge m : ms.merges) {
+ m.mergeFinished(true);
+ }
+ } catch (IOException e) {
+ throw new AssertionError(e);
+ }
+ });
+ t.start();
+ assertTrue(ms.await(100, TimeUnit.HOURS));
+ for (MergePolicy.OneMerge m : ms.merges) {
+ assertTrue(m.hasCompletedSuccessfully().get());
+ }
+ t.join();
+ }
+ }
+
+ public void testTimeout() throws IOException, InterruptedException {
+ try (Directory dir = newDirectory()) {
+ MergePolicy.MergeSpecification ms = createRandomMergeSpecification(dir, 3);
+ for (MergePolicy.OneMerge m : ms.merges) {
+ assertFalse(m.hasCompletedSuccessfully().isPresent());
+ }
+ Thread t = new Thread(() -> {
+ try {
+ ms.merges.get(0).mergeFinished(true);
+ } catch (IOException e) {
+ throw new AssertionError(e);
+ }
+ });
+ t.start();
+ assertFalse(ms.await(10, TimeUnit.MILLISECONDS));
+ assertFalse(ms.merges.get(1).hasCompletedSuccessfully().isPresent());
+ t.join();
+ }
+ }
+
+ public void testTimeoutLargeNumberOfMerges() throws IOException, InterruptedException {
+ try (Directory dir = newDirectory()) {
+ MergePolicy.MergeSpecification ms = createRandomMergeSpecification(dir, 10000);
+ for (MergePolicy.OneMerge m : ms.merges) {
+ assertFalse(m.hasCompletedSuccessfully().isPresent());
+ }
+ AtomicInteger i = new AtomicInteger(0);
+ AtomicBoolean stop = new AtomicBoolean(false);
+ Thread t = new Thread(() -> {
+ while (stop.get() == false) {
+ try {
+ ms.merges.get(i.getAndIncrement()).mergeFinished(true);
+ Thread.sleep(1);
+ } catch (IOException | InterruptedException e) {
+ throw new AssertionError(e);
+ }
+ }
+ });
+ t.start();
+ assertFalse(ms.await(10, TimeUnit.MILLISECONDS));
+ stop.set(true);
+ t.join();
+ for (int j = 0; j < ms.merges.size(); j++) {
+ if (j < i.get()) {
+ assertTrue(ms.merges.get(j).hasCompletedSuccessfully().get());
+ } else {
+ assertFalse(ms.merges.get(j).hasCompletedSuccessfully().isPresent());
+ }
+ }
+ }
+ }
+
+ @AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-9408")
+ public void testFinishTwice() throws IOException {
+ try (Directory dir = newDirectory()) {
+ MergePolicy.MergeSpecification spec = createRandomMergeSpecification(dir, 1);
+ MergePolicy.OneMerge oneMerge = spec.merges.get(0);
+ oneMerge.mergeFinished(true);
+ expectThrows(IllegalStateException.class, () -> oneMerge.mergeFinished(false));
+ }
+ }
+
+ public void testTotalMaxDoc() throws IOException {
+ try (Directory dir = newDirectory()) {
+ MergePolicy.MergeSpecification spec = createRandomMergeSpecification(dir, 1);
+ int docs = 0;
+ MergePolicy.OneMerge oneMerge = spec.merges.get(0);
+ for (SegmentCommitInfo info : oneMerge.segments) {
+ docs += info.info.maxDoc();
+ }
+ assertEquals(docs, oneMerge.totalMaxDoc);
+ }
+ }
+
+ private static MergePolicy.MergeSpecification createRandomMergeSpecification(Directory dir, int numMerges) {
+ MergePolicy.MergeSpecification ms = new MergePolicy.MergeSpecification();
+ for (int ii = 0; ii < numMerges; ++ii) {
+ final SegmentInfo si = new SegmentInfo(
+ dir, // dir
+ Version.LATEST, // version
+ Version.LATEST, // min version
+ TestUtil.randomSimpleString(random()), // name
+ random().nextInt(1000), // maxDoc
+ random().nextBoolean(), // isCompoundFile
+ null, // codec
+ Collections.emptyMap(), // diagnostics
+ TestUtil.randomSimpleString(// id
+ random(),
+ StringHelper.ID_LENGTH,
+ StringHelper.ID_LENGTH).getBytes(StandardCharsets.US_ASCII),
+ Collections.emptyMap(), // attributes
+ null /* indexSort */);
+ final List<SegmentCommitInfo> segments = new LinkedList<>();
+ segments.add(new SegmentCommitInfo(si, 0, 0, 0, 0, 0, StringHelper.randomId()));
+ ms.add(new MergePolicy.OneMerge(segments));
+ }
+ return ms;
+ }
+}
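The new TestMergePolicy pins down the OneMerge lifecycle added in this patch: hasCompletedSuccessfully() stays an empty Optional until mergeFinished(boolean) runs, and MergeSpecification.await(timeout, unit) only returns true once every merge in the spec has finished. A small usage sketch, assuming spec is a populated MergeSpecification whose merges are being finished on another thread:

```java
for (MergePolicy.OneMerge merge : spec.merges) {
  assert merge.hasCompletedSuccessfully().isPresent() == false; // still running
}
boolean allDone = spec.await(30, TimeUnit.SECONDS);
if (allDone == false) {
  // at least one OneMerge never received mergeFinished(true/false) in time
}
```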
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestPointValues.java b/lucene/core/src/test/org/apache/lucene/index/TestPointValues.java
index ee778ed90027..d982953a2f62 100644
--- a/lucene/core/src/test/org/apache/lucene/index/TestPointValues.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestPointValues.java
@@ -396,7 +396,7 @@ public void testIllegalTooManyDimensions() throws Exception {
public void testDifferentCodecs1() throws Exception {
Directory dir = newDirectory();
IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
- iwc.setCodec(Codec.forName("Lucene84"));
+ iwc.setCodec(Codec.forName("Lucene86"));
IndexWriter w = new IndexWriter(dir, iwc);
Document doc = new Document();
doc.add(new IntPoint("int", 1));
@@ -427,7 +427,7 @@ public void testDifferentCodecs2() throws Exception {
w.close();
iwc = new IndexWriterConfig(new MockAnalyzer(random()));
- iwc.setCodec(Codec.forName("Lucene84"));
+ iwc.setCodec(Codec.forName("Lucene86"));
w = new IndexWriter(dir, iwc);
doc = new Document();
doc.add(new IntPoint("int", 1));
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestSegmentInfos.java b/lucene/core/src/test/org/apache/lucene/index/TestSegmentInfos.java
index 19d821481e00..4570f0906001 100644
--- a/lucene/core/src/test/org/apache/lucene/index/TestSegmentInfos.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestSegmentInfos.java
@@ -18,15 +18,21 @@
import org.apache.lucene.codecs.Codec;
+import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.mockfile.ExtrasFS;
import org.apache.lucene.search.Sort;
import org.apache.lucene.store.BaseDirectoryWrapper;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.StringHelper;
+import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.Version;
import java.io.IOException;
+import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
@@ -178,5 +184,64 @@ public void testIDChangesOnAdvance() throws IOException {
assertEquals("clone changed but shouldn't", StringHelper.idToString(id), StringHelper.idToString(clone.getId()));
}
}
+
+ public void testBitFlippedTriggersCorruptIndexException() throws IOException {
+ BaseDirectoryWrapper dir = newDirectory();
+ dir.setCheckIndexOnClose(false);
+ byte id[] = StringHelper.randomId();
+ Codec codec = Codec.getDefault();
+
+ SegmentInfos sis = new SegmentInfos(Version.LATEST.major);
+ SegmentInfo info = new SegmentInfo(dir, Version.LATEST, Version.LATEST, "_0", 1, false, Codec.getDefault(),
+ Collections.emptyMap(), id, Collections.emptyMap(), null);
+ info.setFiles(Collections.emptySet());
+ codec.segmentInfoFormat().write(dir, info, IOContext.DEFAULT);
+ SegmentCommitInfo commitInfo = new SegmentCommitInfo(info, 0, 0, -1, -1, -1, StringHelper.randomId());
+ sis.add(commitInfo);
+
+ info = new SegmentInfo(dir, Version.LATEST, Version.LATEST, "_1", 1, false, Codec.getDefault(),
+ Collections.emptyMap(), id, Collections.emptyMap(), null);
+ info.setFiles(Collections.emptySet());
+ codec.segmentInfoFormat().write(dir, info, IOContext.DEFAULT);
+ commitInfo = new SegmentCommitInfo(info, 0, 0,-1, -1, -1, StringHelper.randomId());
+ sis.add(commitInfo);
+
+ sis.commit(dir);
+
+ BaseDirectoryWrapper corruptDir = newDirectory();
+ corruptDir.setCheckIndexOnClose(false);
+ boolean corrupt = false;
+ for (String file : dir.listAll()) {
+ if (file.startsWith(IndexFileNames.SEGMENTS)) {
+ try (IndexInput in = dir.openInput(file, IOContext.DEFAULT);
+ IndexOutput out = corruptDir.createOutput(file, IOContext.DEFAULT)) {
+ final long corruptIndex = TestUtil.nextLong(random(), 0, in.length() - 1);
+ out.copyBytes(in, corruptIndex);
+ final int b = Byte.toUnsignedInt(in.readByte()) + TestUtil.nextInt(random(), 0x01, 0xff);
+ out.writeByte((byte) b);
+ out.copyBytes(in, in.length() - in.getFilePointer());
+ }
+ try (IndexInput in = corruptDir.openInput(file, IOContext.DEFAULT)) {
+ CodecUtil.checksumEntireFile(in);
+ if (VERBOSE) {
+ System.out.println("TEST: Altering the file did not update the checksum, aborting...");
+ }
+ return;
+ } catch (CorruptIndexException e) {
+ // ok
+ }
+ corrupt = true;
+ } else if (ExtrasFS.isExtra(file) == false) {
+ corruptDir.copyFrom(dir, file, file, IOContext.DEFAULT);
+ }
+ }
+ assertTrue("No segments file found", corrupt);
+
+ expectThrowsAnyOf(
+ Arrays.asList(CorruptIndexException.class, IndexFormatTooOldException.class, IndexFormatTooNewException.class),
+ () -> SegmentInfos.readLatestCommit(corruptDir));
+ dir.close();
+ corruptDir.close();
+ }
}
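A subtlety in testBitFlippedTriggersCorruptIndexException: a single flipped byte is only guaranteed to be caught if it invalidates the stored checksum, so the test first re-verifies the corrupted copy and bails out when the flip happens to go undetected. The guard reduces to this sketch (segmentsFile is hypothetical):

```java
try (IndexInput in = corruptDir.openInput(segmentsFile, IOContext.DEFAULT)) {
  CodecUtil.checksumEntireFile(in); // throws CorruptIndexException on mismatch
  return; // checksum still valid: the flip is undetectable, nothing to assert
} catch (CorruptIndexException expected) {
  // detectable corruption: go on and assert that reading the commit fails too
}
```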
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestTermsHashPerField.java b/lucene/core/src/test/org/apache/lucene/index/TestTermsHashPerField.java
new file mode 100644
index 000000000000..a8a879bf5c10
--- /dev/null
+++ b/lucene/core/src/test/org/apache/lucene/index/TestTermsHashPerField.java
@@ -0,0 +1,209 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.index;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.TreeMap;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.stream.Collectors;
+
+import com.carrotsearch.randomizedtesting.generators.RandomPicks;
+import com.carrotsearch.randomizedtesting.generators.RandomStrings;
+import org.apache.lucene.util.ByteBlockPool;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.Counter;
+import org.apache.lucene.util.IntBlockPool;
+import org.apache.lucene.util.LuceneTestCase;
+
+public class TestTermsHashPerField extends LuceneTestCase {
+
+ private static TermsHashPerField createNewHash(AtomicInteger newCalled, AtomicInteger addCalled) {
+ IntBlockPool intBlockPool = new IntBlockPool();
+ ByteBlockPool byteBlockPool = new ByteBlockPool(new ByteBlockPool.DirectAllocator());
+ ByteBlockPool termBlockPool = new ByteBlockPool(new ByteBlockPool.DirectAllocator());
+
+ TermsHashPerField hash = new TermsHashPerField(1, intBlockPool, byteBlockPool, termBlockPool, Counter.newCounter(),
+ null, "testfield", IndexOptions.DOCS_AND_FREQS) {
+
+ private FreqProxTermsWriterPerField.FreqProxPostingsArray freqProxPostingsArray;
+
+ @Override
+ void newTerm(int termID, int docID) {
+ newCalled.incrementAndGet();
+ FreqProxTermsWriterPerField.FreqProxPostingsArray postings = freqProxPostingsArray;
+ postings.lastDocIDs[termID] = docID;
+ postings.lastDocCodes[termID] = docID << 1;
+ postings.termFreqs[termID] = 1;
+ }
+
+ @Override
+ void addTerm(int termID, int docID) {
+ addCalled.incrementAndGet();
+ FreqProxTermsWriterPerField.FreqProxPostingsArray postings = freqProxPostingsArray;
+ if (docID != postings.lastDocIDs[termID]) {
+ if (1 == postings.termFreqs[termID]) {
+ writeVInt(0, postings.lastDocCodes[termID]|1);
+ } else {
+ writeVInt(0, postings.lastDocCodes[termID]);
+ writeVInt(0, postings.termFreqs[termID]);
+ }
+ postings.termFreqs[termID] = 1;
+ postings.lastDocCodes[termID] = (docID - postings.lastDocIDs[termID]) << 1;
+ postings.lastDocIDs[termID] = docID;
+ } else {
+ postings.termFreqs[termID] = Math.addExact(postings.termFreqs[termID], 1);
+ }
+ }
+
+ @Override
+ void newPostingsArray() {
+ freqProxPostingsArray = (FreqProxTermsWriterPerField.FreqProxPostingsArray) postingsArray;
+ }
+
+ @Override
+ ParallelPostingsArray createPostingsArray(int size) {
+ return new FreqProxTermsWriterPerField.FreqProxPostingsArray(size, true, false, false);
+ }
+ };
+ return hash;
+ }
+
+ boolean assertDocAndFreq(ByteSliceReader reader, FreqProxTermsWriterPerField.FreqProxPostingsArray postingsArray, int prevDoc, int termId, int doc, int frequency) throws IOException {
+ int docId = prevDoc;
+ int freq;
+ boolean eof = reader.eof();
+ if (eof) {
+ docId = postingsArray.lastDocIDs[termId];
+ freq = postingsArray.termFreqs[termId];
+ } else {
+ int code = reader.readVInt();
+ docId += code >>> 1;
+ if ((code & 1) != 0) {
+ freq = 1;
+ } else {
+ freq = reader.readVInt();
+ }
+ }
+ assertEquals("docID mismatch eof: " + eof, doc, docId);
+ assertEquals("freq mismatch eof: " + eof, frequency, freq);
+ return eof;
+ }
+
+ public void testAddAndUpdateTerm() throws IOException {
+ AtomicInteger newCalled = new AtomicInteger(0);
+ AtomicInteger addCalled = new AtomicInteger(0);
+ TermsHashPerField hash = createNewHash(newCalled, addCalled);
+ hash.start(null, true);
+
+ hash.add(new BytesRef("start"), 0); // tid = 0;
+ hash.add(new BytesRef("foo"), 0); // tid = 1;
+ hash.add(new BytesRef("bar"), 0); // tid = 2;
+ hash.finish();
+ hash.add(new BytesRef("bar"), 1);
+ hash.add(new BytesRef("foobar"), 1); // tid = 3;
+ hash.add(new BytesRef("bar"), 1);
+ hash.add(new BytesRef("bar"), 1);
+ hash.add(new BytesRef("foobar"), 1);
+ hash.add(new BytesRef("verylongfoobarbaz"), 1); // tid = 4;
+ hash.finish();
+ hash.add(new BytesRef("verylongfoobarbaz"), 2);
+ hash.add(new BytesRef("boom"), 2); // tid = 5;
+ hash.finish();
+ hash.add(new BytesRef("verylongfoobarbaz"), 3);
+ hash.add(new BytesRef("end"), 3); // tid = 6;
+ hash.finish();
+
+ assertEquals(7, newCalled.get());
+ assertEquals(6, addCalled.get());
+ final ByteSliceReader reader = new ByteSliceReader();
+ hash.initReader(reader, 0, 0);
+ assertTrue(assertDocAndFreq(reader, (FreqProxTermsWriterPerField.FreqProxPostingsArray) hash.postingsArray, 0, 0, 0, 1));
+ hash.initReader(reader, 1, 0);
+ assertTrue(assertDocAndFreq(reader, (FreqProxTermsWriterPerField.FreqProxPostingsArray) hash.postingsArray, 0, 1, 0, 1));
+ hash.initReader(reader, 2, 0);
+ assertFalse(assertDocAndFreq(reader, (FreqProxTermsWriterPerField.FreqProxPostingsArray) hash.postingsArray, 0, 2, 0, 1));
+ assertTrue(assertDocAndFreq(reader, (FreqProxTermsWriterPerField.FreqProxPostingsArray) hash.postingsArray, 2, 2, 1, 3));
+ hash.initReader(reader, 3, 0);
+ assertTrue(assertDocAndFreq(reader, (FreqProxTermsWriterPerField.FreqProxPostingsArray) hash.postingsArray, 0, 3, 1, 2));
+ hash.initReader(reader, 4, 0);
+ assertFalse(assertDocAndFreq(reader, (FreqProxTermsWriterPerField.FreqProxPostingsArray) hash.postingsArray, 0, 4, 1, 1));
+ assertFalse(assertDocAndFreq(reader, (FreqProxTermsWriterPerField.FreqProxPostingsArray) hash.postingsArray, 1, 4, 2, 1));
+ assertTrue(assertDocAndFreq(reader, (FreqProxTermsWriterPerField.FreqProxPostingsArray) hash.postingsArray, 2, 4, 3, 1));
+ hash.initReader(reader, 5, 0);
+ assertTrue(assertDocAndFreq(reader, (FreqProxTermsWriterPerField.FreqProxPostingsArray) hash.postingsArray, 0, 5, 2, 1));
+ hash.initReader(reader, 6, 0);
+ assertTrue(assertDocAndFreq(reader, (FreqProxTermsWriterPerField.FreqProxPostingsArray) hash.postingsArray, 0, 6, 3, 1));
+ }
+
+ public void testAddAndUpdateRandom() throws IOException {
+ AtomicInteger newCalled = new AtomicInteger(0);
+ AtomicInteger addCalled = new AtomicInteger(0);
+ TermsHashPerField hash = createNewHash(newCalled, addCalled);
+ hash.start(null, true);
+ class Posting {
+ int termId = -1;
+ final TreeMap<Integer, Integer> docAndFreq = new TreeMap<>();
+ }
+ Map<BytesRef, Posting> postingMap = new HashMap<>();
+ int numStrings = 1 + random().nextInt(200);
+ for (int i = 0; i < numStrings; i++) {
+ String randomString = RandomStrings.randomRealisticUnicodeOfCodepointLengthBetween(random(), 1, 10);
+ postingMap.putIfAbsent(new BytesRef(randomString), new Posting());
+ }
+ List<BytesRef> bytesRefs = Arrays.asList(postingMap.keySet().toArray(new BytesRef[0]));
+ Collections.sort(bytesRefs);
+ int numDocs = 1 + random().nextInt(200);
+ int termOrd = 0;
+ for (int i = 0; i < numDocs; i++) {
+ int numTerms = 1 + random().nextInt(200);
+ int doc = i;
+ for (int j = 0; j < numTerms; j++) {
+ BytesRef ref = RandomPicks.randomFrom(random(), bytesRefs);
+ Posting posting = postingMap.get(ref);
+ if (posting.termId == -1) {
+ posting.termId = termOrd++;
+ }
+ posting.docAndFreq.putIfAbsent(doc, 0);
+ posting.docAndFreq.compute(doc, (key, oldVal) -> oldVal+1);
+ hash.add(ref, doc);
+ }
+ hash.finish();
+ }
+ List<Posting> values = postingMap.values().stream().filter(x -> x.termId != -1)
+ .collect(Collectors.toList());
+ Collections.shuffle(values, random()); // term order doesn't matter
+ final ByteSliceReader reader = new ByteSliceReader();
+ for (Posting p : values) {
+ hash.initReader(reader, p.termId, 0);
+ boolean eof = false;
+ int prevDoc = 0;
+ for (Map.Entry<Integer, Integer> entry : p.docAndFreq.entrySet()) {
+ assertFalse("the reader must not be EOF here", eof);
+ eof = assertDocAndFreq(reader, (FreqProxTermsWriterPerField.FreqProxPostingsArray) hash.postingsArray,
+ prevDoc, p.termId, entry.getKey(), entry.getValue());
+ prevDoc = entry.getKey();
+ }
+ assertTrue("the last posting must be EOF on the reader", eof);
+ }
+ }
+}
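assertDocAndFreq above mirrors the byte-slice postings encoding written by the newTerm/addTerm overrides: each entry stores the doc delta shifted left by one bit, with the low bit flagging a frequency of exactly 1 so the freq VInt can be omitted. The decode step, as a worked sketch:

```java
int code = reader.readVInt();
int doc = prevDoc + (code >>> 1);  // the upper bits carry the doc delta
int freq = (code & 1) != 0
    ? 1                            // low bit set: freq == 1, no extra VInt
    : reader.readVInt();           // otherwise the frequency follows
// e.g. prevDoc=0, code=(5 << 1) | 1 -> doc 5, freq 1
// e.g. prevDoc=5, code=(2 << 1), next VInt 3 -> doc 7, freq 3
```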
diff --git a/lucene/core/src/test/org/apache/lucene/search/MultiCollectorTest.java b/lucene/core/src/test/org/apache/lucene/search/MultiCollectorTest.java
index c3b4f42650ac..80a5a9a99583 100644
--- a/lucene/core/src/test/org/apache/lucene/search/MultiCollectorTest.java
+++ b/lucene/core/src/test/org/apache/lucene/search/MultiCollectorTest.java
@@ -163,4 +163,176 @@ public void testCacheScoresIfNecessary() throws IOException {
reader.close();
dir.close();
}
+
+ public void testScorerWrappingForTopScores() throws IOException {
+ Directory dir = newDirectory();
+ RandomIndexWriter iw = new RandomIndexWriter(random(), dir);
+ iw.addDocument(new Document());
+ DirectoryReader reader = iw.getReader();
+ iw.close();
+ final LeafReaderContext ctx = reader.leaves().get(0);
+ Collector c1 = collector(ScoreMode.TOP_SCORES, MultiCollector.MinCompetitiveScoreAwareScorable.class);
+ Collector c2 = collector(ScoreMode.TOP_SCORES, MultiCollector.MinCompetitiveScoreAwareScorable.class);
+ MultiCollector.wrap(c1, c2).getLeafCollector(ctx).setScorer(new ScoreAndDoc());
+
+ c1 = collector(ScoreMode.TOP_SCORES, ScoreCachingWrappingScorer.class);
+ c2 = collector(ScoreMode.COMPLETE, ScoreCachingWrappingScorer.class);
+ MultiCollector.wrap(c1, c2).getLeafCollector(ctx).setScorer(new ScoreAndDoc());
+
+ reader.close();
+ dir.close();
+ }
+
+ public void testMinCompetitiveScore() throws IOException {
+ float[] currentMinScores = new float[3];
+ float[] minCompetitiveScore = new float[1];
+ Scorable scorer = new Scorable() {
+
+ @Override
+ public float score() throws IOException {
+ return 0;
+ }
+
+ @Override
+ public int docID() {
+ return 0;
+ }
+
+ @Override
+ public void setMinCompetitiveScore(float minScore) throws IOException {
+ minCompetitiveScore[0] = minScore;
+ }
+ };
+ Scorable s0 = new MultiCollector.MinCompetitiveScoreAwareScorable(scorer, 0, currentMinScores);
+ Scorable s1 = new MultiCollector.MinCompetitiveScoreAwareScorable(scorer, 1, currentMinScores);
+ Scorable s2 = new MultiCollector.MinCompetitiveScoreAwareScorable(scorer, 2, currentMinScores);
+ assertEquals(0f, minCompetitiveScore[0], 0);
+ s0.setMinCompetitiveScore(0.5f);
+ assertEquals(0f, minCompetitiveScore[0], 0);
+ s1.setMinCompetitiveScore(0.8f);
+ assertEquals(0f, minCompetitiveScore[0], 0);
+ s2.setMinCompetitiveScore(0.3f);
+ assertEquals(0.3f, minCompetitiveScore[0], 0);
+ s2.setMinCompetitiveScore(0.1f);
+ assertEquals(0.3f, minCompetitiveScore[0], 0);
+ s1.setMinCompetitiveScore(Float.MAX_VALUE);
+ assertEquals(0.3f, minCompetitiveScore[0], 0);
+ s2.setMinCompetitiveScore(Float.MAX_VALUE);
+ assertEquals(0.5f, minCompetitiveScore[0], 0);
+ s0.setMinCompetitiveScore(Float.MAX_VALUE);
+ assertEquals(Float.MAX_VALUE, minCompetitiveScore[0], 0);
+ }
+
+ public void testCollectionTermination() throws IOException {
+ Directory dir = newDirectory();
+ RandomIndexWriter iw = new RandomIndexWriter(random(), dir);
+ iw.addDocument(new Document());
+ DirectoryReader reader = iw.getReader();
+ iw.close();
+ final LeafReaderContext ctx = reader.leaves().get(0);
+ DummyCollector c1 = new TerminatingDummyCollector(1, ScoreMode.COMPLETE);
+ DummyCollector c2 = new TerminatingDummyCollector(2, ScoreMode.COMPLETE);
+
+ Collector mc = MultiCollector.wrap(c1, c2);
+ LeafCollector lc = mc.getLeafCollector(ctx);
+ lc.setScorer(new ScoreAndDoc());
+ lc.collect(0); // OK
+ assertTrue("c1's collect should be called", c1.collectCalled);
+ assertTrue("c2's collect should be called", c2.collectCalled);
+ c1.collectCalled = false;
+ c2.collectCalled = false;
+ lc.collect(1); // OK, but c1 should terminate
+ assertFalse("c1 should be removed already", c1.collectCalled);
+ assertTrue("c2's collect should be called", c2.collectCalled);
+ c2.collectCalled = false;
+
+ expectThrows(CollectionTerminatedException.class, () -> {
+ lc.collect(2);
+ });
+ assertFalse("c1 should be removed already", c1.collectCalled);
+ assertFalse("c2 should be removed already", c2.collectCalled);
+
+ reader.close();
+ dir.close();
+ }
+
+ public void testSetScorerOnCollectionTerminationSkipNonCompetitive() throws IOException {
+ doTestSetScorerOnCollectionTermination(true);
+ }
+
+ public void testSetScorerOnCollectionTerminationSkipNoSkips() throws IOException {
+ doTestSetScorerOnCollectionTermination(false);
+ }
+
+ private void doTestSetScorerOnCollectionTermination(boolean allowSkipNonCompetitive) throws IOException {
+ Directory dir = newDirectory();
+ RandomIndexWriter iw = new RandomIndexWriter(random(), dir);
+ iw.addDocument(new Document());
+ DirectoryReader reader = iw.getReader();
+ iw.close();
+ final LeafReaderContext ctx = reader.leaves().get(0);
+
+ DummyCollector c1 = new TerminatingDummyCollector(1, allowSkipNonCompetitive ? ScoreMode.TOP_SCORES : ScoreMode.COMPLETE);
+ DummyCollector c2 = new TerminatingDummyCollector(2, allowSkipNonCompetitive ? ScoreMode.TOP_SCORES : ScoreMode.COMPLETE);
+
+ Collector mc = MultiCollector.wrap(c1, c2);
+ LeafCollector lc = mc.getLeafCollector(ctx);
+ assertFalse(c1.setScorerCalled);
+ assertFalse(c2.setScorerCalled);
+ lc.setScorer(new ScoreAndDoc());
+ assertTrue(c1.setScorerCalled);
+ assertTrue(c2.setScorerCalled);
+ c1.setScorerCalled = false;
+ c2.setScorerCalled = false;
+ lc.collect(0); // OK
+
+ lc.setScorer(new ScoreAndDoc());
+ assertTrue(c1.setScorerCalled);
+ assertTrue(c2.setScorerCalled);
+ c1.setScorerCalled = false;
+ c2.setScorerCalled = false;
+
+ lc.collect(1); // OK, but c1 should terminate
+ lc.setScorer(new ScoreAndDoc());
+ assertFalse(c1.setScorerCalled);
+ assertTrue(c2.setScorerCalled);
+ c2.setScorerCalled = false;
+
+ expectThrows(CollectionTerminatedException.class, () -> {
+ lc.collect(2);
+ });
+ lc.setScorer(new ScoreAndDoc());
+ assertFalse(c1.setScorerCalled);
+ assertFalse(c2.setScorerCalled);
+
+ reader.close();
+ dir.close();
+ }
+
+ private static class TerminatingDummyCollector extends DummyCollector {
+
+ private final int terminateOnDoc;
+ private final ScoreMode scoreMode;
+
+ public TerminatingDummyCollector(int terminateOnDoc, ScoreMode scoreMode) {
+ super();
+ this.terminateOnDoc = terminateOnDoc;
+ this.scoreMode = scoreMode;
+ }
+
+ @Override
+ public void collect(int doc) throws IOException {
+ if (doc == terminateOnDoc) {
+ throw new CollectionTerminatedException();
+ }
+ super.collect(doc);
+ }
+
+ @Override
+ public ScoreMode scoreMode() {
+ return scoreMode;
+ }
+
+ }
+
}
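The expectations in testMinCompetitiveScore encode two rules for MinCompetitiveScoreAwareScorable: the value forwarded to the shared scorer is the minimum across all sub-collectors' requests (each starting at 0), and the forwarded value is monotonic, i.e. it never decreases once raised. A sketch of that bookkeeping, with hypothetical names:

```java
float[] mins = new float[numCollectors]; // per-collector requests, start at 0
float forwarded = 0f;

// when sub-collector i calls setMinCompetitiveScore(requested):
mins[i] = requested;
float candidate = mins[0];
for (float m : mins) {
  candidate = Math.min(candidate, m); // no sub-collector is left behind
}
if (candidate > forwarded) {
  forwarded = candidate;                    // only ever increases
  scorer.setMinCompetitiveScore(forwarded); // now safe for every collector
}
```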
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestBoolean2.java b/lucene/core/src/test/org/apache/lucene/search/TestBoolean2.java
index 2ccfd9aa8cb7..3400f0e6dd67 100644
--- a/lucene/core/src/test/org/apache/lucene/search/TestBoolean2.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestBoolean2.java
@@ -96,7 +96,7 @@ public static void beforeClass() throws Exception {
IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random()));
// randomized codecs are sometimes too costly for this test:
- iwc.setCodec(Codec.forName("Lucene84"));
+ iwc.setCodec(Codec.forName("Lucene86"));
iwc.setMergePolicy(newLogMergePolicy());
RandomIndexWriter writer= new RandomIndexWriter(random(), directory, iwc);
// we'll make a ton of docs, disable store/norms/vectors
@@ -141,7 +141,7 @@ public static void beforeClass() throws Exception {
iwc = newIndexWriterConfig(new MockAnalyzer(random()));
// we need docID order to be preserved:
// randomized codecs are sometimes too costly for this test:
- iwc.setCodec(Codec.forName("Lucene84"));
+ iwc.setCodec(Codec.forName("Lucene86"));
iwc.setMergePolicy(newLogMergePolicy());
try (IndexWriter w = new IndexWriter(singleSegmentDirectory, iwc)) {
w.forceMerge(1, true);
@@ -167,7 +167,7 @@ public static void beforeClass() throws Exception {
iwc = newIndexWriterConfig(new MockAnalyzer(random()));
// randomized codecs are sometimes too costly for this test:
- iwc.setCodec(Codec.forName("Lucene84"));
+ iwc.setCodec(Codec.forName("Lucene86"));
RandomIndexWriter w = new RandomIndexWriter(random(), dir2, iwc);
w.addIndexes(copy);
copy.close();
@@ -179,7 +179,7 @@ public static void beforeClass() throws Exception {
iwc = newIndexWriterConfig(new MockAnalyzer(random()));
iwc.setMaxBufferedDocs(TestUtil.nextInt(random(), 50, 1000));
// randomized codecs are sometimes too costly for this test:
- iwc.setCodec(Codec.forName("Lucene84"));
+ iwc.setCodec(Codec.forName("Lucene86"));
RandomIndexWriter w = new RandomIndexWriter(random(), dir2, iwc);
doc = new Document();
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestFuzzyQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestFuzzyQuery.java
index a95ad39cf59a..4e9bf9dcfdb1 100644
--- a/lucene/core/src/test/org/apache/lucene/search/TestFuzzyQuery.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestFuzzyQuery.java
@@ -24,6 +24,8 @@
import java.util.HashSet;
import java.util.List;
import java.util.Set;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.function.Supplier;
import com.carrotsearch.randomizedtesting.RandomizedTest;
import org.apache.lucene.analysis.MockAnalyzer;
@@ -36,15 +38,15 @@
import org.apache.lucene.index.MultiReader;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
-import org.apache.lucene.index.Terms;
-import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.similarities.ClassicSimilarity;
import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
+import org.apache.lucene.util.automaton.ByteRunAutomaton;
import org.apache.lucene.util.automaton.LevenshteinAutomata;
import org.apache.lucene.util.automaton.Operations;
@@ -222,7 +224,46 @@ public void testFuzziness() throws Exception {
reader.close();
directory.close();
}
-
+
+ public void testPrefixLengthEqualStringLength() throws Exception {
+ Directory directory = newDirectory();
+ RandomIndexWriter writer = new RandomIndexWriter(random(), directory);
+ addDoc("b*a", writer);
+ addDoc("b*ab", writer);
+ addDoc("b*abc", writer);
+ addDoc("b*abcd", writer);
+ String multibyte = "아프리카코끼리속";
+ addDoc(multibyte, writer);
+ IndexReader reader = writer.getReader();
+ IndexSearcher searcher = newSearcher(reader);
+ writer.close();
+
+ int maxEdits = 0;
+ int prefixLength = 3;
+ FuzzyQuery query = new FuzzyQuery(new Term("field", "b*a"), maxEdits, prefixLength);
+ ScoreDoc[] hits = searcher.search(query, 1000).scoreDocs;
+ assertEquals(1, hits.length);
+
+ maxEdits = 1;
+ query = new FuzzyQuery(new Term("field", "b*a"), maxEdits, prefixLength);
+ hits = searcher.search(query, 1000).scoreDocs;
+ assertEquals(2, hits.length);
+
+ maxEdits = 2;
+ query = new FuzzyQuery(new Term("field", "b*a"), maxEdits, prefixLength);
+ hits = searcher.search(query, 1000).scoreDocs;
+ assertEquals(3, hits.length);
+
+ maxEdits = 1;
+ prefixLength = multibyte.length() - 1;
+ query = new FuzzyQuery(new Term("field", multibyte.substring(0, prefixLength)), maxEdits, prefixLength);
+ hits = searcher.search(query, 1000).scoreDocs;
+ assertEquals(1, hits.length);
+
+ reader.close();
+ directory.close();
+ }
+
public void test2() throws Exception {
Directory directory = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random(), directory, new MockAnalyzer(random(), MockTokenizer.KEYWORD, false));
@@ -409,7 +450,6 @@ public void testBoostOnlyRewrite() throws Exception {
public void testGiga() throws Exception {
- MockAnalyzer analyzer = new MockAnalyzer(random());
Directory index = newDirectory();
RandomIndexWriter w = new RandomIndexWriter(random(), index);
@@ -441,6 +481,7 @@ public void testGiga() throws Exception {
assertEquals(1, hits.length);
assertEquals("Giga byte", searcher.doc(hits[0].doc).get("field"));
r.close();
+ w.close();
index.close();
}
@@ -515,54 +556,13 @@ public void testErrorMessage() {
final String value = randomRealisticMultiByteUnicode(length);
FuzzyTermsEnum.FuzzyTermsException expected = expectThrows(FuzzyTermsEnum.FuzzyTermsException.class, () -> {
- new FuzzyQuery(new Term("field", value)).getTermsEnum(new Terms() {
- @Override
- public TermsEnum iterator() {
- return TermsEnum.EMPTY;
- }
-
- @Override
- public long size() {
- return 0;
- }
-
- @Override
- public long getSumTotalTermFreq() {
- throw new UnsupportedOperationException();
- }
-
- @Override
- public long getSumDocFreq() {
- throw new UnsupportedOperationException();
- }
-
- @Override
- public int getDocCount() {
- throw new UnsupportedOperationException();
- }
-
- @Override
- public boolean hasFreqs() {
- throw new UnsupportedOperationException();
- }
-
- @Override
- public boolean hasOffsets() {
- throw new UnsupportedOperationException();
- }
-
- @Override
- public boolean hasPositions() {
- throw new UnsupportedOperationException();
- }
-
- @Override
- public boolean hasPayloads() {
- throw new UnsupportedOperationException();
- }
- });
+ new FuzzyAutomatonBuilder(value, 2, 0, true).buildMaxEditAutomaton();
});
assertThat(expected.getMessage(), containsString(value));
+
+ expected = expectThrows(FuzzyTermsEnum.FuzzyTermsException.class,
+ () -> new FuzzyAutomatonBuilder(value, 2, 0, true).buildAutomatonSet());
+ assertThat(expected.getMessage(), containsString(value));
}
private void addDoc(String text, RandomIndexWriter writer) throws IOException {
@@ -600,6 +600,7 @@ public void testRandom() throws Exception {
w.addDocument(doc);
}
DirectoryReader r = w.getReader();
+ w.close();
//System.out.println("TEST: reader=" + r);
IndexSearcher s = newSearcher(r);
int iters = atLeast(200);
@@ -677,7 +678,7 @@ public void testRandom() throws Exception {
}
}
- IOUtils.close(r, w, dir);
+ IOUtils.close(r, dir);
}
private static class TermAndScore implements Comparable<TermAndScore> {
@@ -777,4 +778,31 @@ private static IntsRef toIntsRef(String s) {
}
return ref;
}
+
+ public void testVisitor() {
+ FuzzyQuery q = new FuzzyQuery(new Term("field", "blob"), 2);
+ AtomicBoolean visited = new AtomicBoolean(false);
+ q.visit(new QueryVisitor() {
+ @Override
+ public void consumeTermsMatching(Query query, String field, Supplier<ByteRunAutomaton> automaton) {
+ visited.set(true);
+ ByteRunAutomaton a = automaton.get();
+ assertMatches(a, "blob");
+ assertMatches(a, "bolb");
+ assertMatches(a, "blobby");
+ assertNoMatches(a, "bolbby");
+ }
+ });
+ assertTrue(visited.get());
+ }
+
+ private static void assertMatches(ByteRunAutomaton automaton, String text) {
+ BytesRef b = new BytesRef(text);
+ assertTrue(automaton.run(b.bytes, b.offset, b.length));
+ }
+
+ private static void assertNoMatches(ByteRunAutomaton automaton, String text) {
+ BytesRef b = new BytesRef(text);
+ assertFalse(automaton.run(b.bytes, b.offset, b.length));
+ }
}
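testPrefixLengthEqualStringLength covers the boundary where prefixLength equals the query term's length: the prefix must match exactly and edits can only apply past it, so the extra matches are terms that append up to maxEdits characters. For example:

```java
// the first 3 code points ("b*a") are fixed; one edit is allowed after them
FuzzyQuery q = new FuzzyQuery(new Term("field", "b*a"),
    /* maxEdits */ 1, /* prefixLength */ 3);
// against the index built above this matches "b*a" (0 edits)
// and "b*ab" (1 insertion), but not "b*abc" (2 insertions)
```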
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestLRUQueryCache.java b/lucene/core/src/test/org/apache/lucene/search/TestLRUQueryCache.java
index 7993beb7427e..ef023752a66b 100644
--- a/lucene/core/src/test/org/apache/lucene/search/TestLRUQueryCache.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestLRUQueryCache.java
@@ -181,14 +181,17 @@ public ScoreMode scoreMode() {
thread.join();
}
- if (error.get() != null) {
- throw error.get();
+ try {
+ if (error.get() != null) {
+ throw error.get();
+ }
+ queryCache.assertConsistent();
+ } finally {
+ mgr.close();
+ w.close();
+ dir.close();
+ queryCache.assertConsistent();
}
- queryCache.assertConsistent();
- mgr.close();
- w.close();
- dir.close();
- queryCache.assertConsistent();
}
public void testLRUEviction() throws Exception {
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestMultiTermConstantScore.java b/lucene/core/src/test/org/apache/lucene/search/TestMultiTermConstantScore.java
index 69e1e1039b22..f4a2543925f9 100644
--- a/lucene/core/src/test/org/apache/lucene/search/TestMultiTermConstantScore.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestMultiTermConstantScore.java
@@ -28,13 +28,12 @@
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.junit.AfterClass;
+import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Test;
import java.io.IOException;
-import junit.framework.Assert;
-
public class TestMultiTermConstantScore extends BaseTestRangeFilter {
/** threshold for comparing floats */
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestPointQueries.java b/lucene/core/src/test/org/apache/lucene/search/TestPointQueries.java
index 3c6e29403e52..7b0c41896e4e 100644
--- a/lucene/core/src/test/org/apache/lucene/search/TestPointQueries.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestPointQueries.java
@@ -38,8 +38,8 @@
import org.apache.lucene.codecs.PointsFormat;
import org.apache.lucene.codecs.PointsReader;
import org.apache.lucene.codecs.PointsWriter;
-import org.apache.lucene.codecs.lucene60.Lucene60PointsReader;
-import org.apache.lucene.codecs.lucene60.Lucene60PointsWriter;
+import org.apache.lucene.codecs.lucene86.Lucene86PointsReader;
+import org.apache.lucene.codecs.lucene86.Lucene86PointsWriter;
import org.apache.lucene.document.BinaryPoint;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.DoublePoint;
@@ -1173,12 +1173,12 @@ public PointsFormat pointsFormat() {
return new PointsFormat() {
@Override
public PointsWriter fieldsWriter(SegmentWriteState writeState) throws IOException {
- return new Lucene60PointsWriter(writeState, maxPointsInLeafNode, maxMBSortInHeap);
+ return new Lucene86PointsWriter(writeState, maxPointsInLeafNode, maxMBSortInHeap);
}
@Override
public PointsReader fieldsReader(SegmentReadState readState) throws IOException {
- return new Lucene60PointsReader(readState);
+ return new Lucene86PointsReader(readState);
}
};
}
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestRegexpQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestRegexpQuery.java
index 6a8e183e0d45..767ee20a7d45 100644
--- a/lucene/core/src/test/org/apache/lucene/search/TestRegexpQuery.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestRegexpQuery.java
@@ -50,7 +50,7 @@ public void setUp() throws Exception {
directory = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random(), directory);
Document doc = new Document();
- doc.add(newTextField(FN, "the quick brown fox jumps over the lazy ??? dog 493432 49344", Field.Store.NO));
+ doc.add(newTextField(FN, "the quick brown fox jumps over the lazy ??? dog 493432 49344 [foo] 12.3 \\", Field.Store.NO));
writer.addDocument(doc);
reader = writer.getReader();
writer.close();
@@ -90,6 +90,41 @@ public void testNumericRange() throws IOException {
assertEquals(0, regexQueryNrHits("<493433-600000>"));
}
+ public void testCharacterClasses() throws IOException {
+ assertEquals(0, regexQueryNrHits("\\d"));
+ assertEquals(1, regexQueryNrHits("\\d*"));
+ assertEquals(1, regexQueryNrHits("\\d{6}"));
+ assertEquals(1, regexQueryNrHits("[a\\d]{6}"));
+ assertEquals(1, regexQueryNrHits("\\d{2,7}"));
+ assertEquals(0, regexQueryNrHits("\\d{4}"));
+ assertEquals(0, regexQueryNrHits("\\dog"));
+ assertEquals(1, regexQueryNrHits("493\\d32"));
+
+ assertEquals(1, regexQueryNrHits("\\wox"));
+ assertEquals(1, regexQueryNrHits("493\\w32"));
+ assertEquals(1, regexQueryNrHits("\\?\\?\\?"));
+ assertEquals(1, regexQueryNrHits("\\?\\W\\?"));
+ assertEquals(1, regexQueryNrHits("\\?\\S\\?"));
+
+ assertEquals(1, regexQueryNrHits("\\[foo\\]"));
+ assertEquals(1, regexQueryNrHits("\\[\\w{3}\\]"));
+
+ assertEquals(0, regexQueryNrHits("\\s.*")); // no matches because all whitespace stripped
+ assertEquals(1, regexQueryNrHits("\\S*ck")); //matches quick
+ assertEquals(1, regexQueryNrHits("[\\d\\.]{3,10}")); // matches 12.3
+ assertEquals(1, regexQueryNrHits("\\d{1,3}(\\.(\\d{1,2}))+")); // matches 12.3
+
+ assertEquals(1, regexQueryNrHits("\\\\"));
+ assertEquals(1, regexQueryNrHits("\\\\.*"));
+
+ IllegalArgumentException expected = expectThrows(
+ IllegalArgumentException.class, () -> {
+ regexQueryNrHits("\\p");
+ }
+ );
+ assertTrue(expected.getMessage().contains("invalid character class"));
+ }
+
public void testRegexComplement() throws IOException {
assertEquals(1, regexQueryNrHits("4934~[3]"));
// not the empty lang, i.e. match all docs
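A quick usage sketch of the escapes exercised in testCharacterClasses above; this is not from the patch, the field constant FN is the test's own, and the query strings are illustrative:

    // \d digit, \D non-digit, \s whitespace, \S non-whitespace, \w word char, \W non-word
    Query sixDigits = new RegexpQuery(new Term(FN, "\\d{6}"));       // matches "493432"
    Query bracketed = new RegexpQuery(new Term(FN, "\\[\\w{3}\\]")); // matches "[foo]"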
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestSameScoresWithThreads.java b/lucene/core/src/test/org/apache/lucene/search/TestSameScoresWithThreads.java
index a615a6a7fbb6..4b284dfd31b0 100644
--- a/lucene/core/src/test/org/apache/lucene/search/TestSameScoresWithThreads.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestSameScoresWithThreads.java
@@ -119,6 +119,7 @@ public void run() {
thread.join();
}
}
+ docs.close();
r.close();
dir.close();
}
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestSearchAfter.java b/lucene/core/src/test/org/apache/lucene/search/TestSearchAfter.java
index 9dc9843d5106..59a8de9027fb 100644
--- a/lucene/core/src/test/org/apache/lucene/search/TestSearchAfter.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestSearchAfter.java
@@ -295,7 +295,7 @@ void assertPage(int pageStart, TopDocs all, TopDocs paged) throws IOException {
assertEquals(sd1.score, sd2.score, 0f);
if (sd1 instanceof FieldDoc) {
assertTrue(sd2 instanceof FieldDoc);
- assertEquals(((FieldDoc) sd1).fields, ((FieldDoc) sd2).fields);
+ assertArrayEquals(((FieldDoc) sd1).fields, ((FieldDoc) sd2).fields);
}
}
}
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestSearcherManager.java b/lucene/core/src/test/org/apache/lucene/search/TestSearcherManager.java
index b92386669163..1d8edccf092d 100644
--- a/lucene/core/src/test/org/apache/lucene/search/TestSearcherManager.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestSearcherManager.java
@@ -310,6 +310,7 @@ public void testReferenceDecrementIllegally() throws Exception {
Directory dir = newDirectory();
IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(
new MockAnalyzer(random())).setMergeScheduler(new ConcurrentMergeScheduler()));
+ @SuppressWarnings("resource")
SearcherManager sm = new SearcherManager(writer, false, false, new SearcherFactory());
writer.addDocument(new Document());
writer.commit();
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestTermQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestTermQuery.java
index e460e26ddba3..65986d968485 100644
--- a/lucene/core/src/test/org/apache/lucene/search/TestTermQuery.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestTermQuery.java
@@ -21,6 +21,7 @@
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.StringField;
+import org.apache.lucene.index.CompositeReaderContext;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.FilterDirectoryReader;
import org.apache.lucene.index.FilterLeafReader;
@@ -47,9 +48,13 @@ public void testEquals() throws IOException {
QueryUtils.checkUnequal(
new TermQuery(new Term("foo", "bar")),
new TermQuery(new Term("foo", "baz")));
+ final CompositeReaderContext context;
+ try (MultiReader multiReader = new MultiReader()) {
+ context = multiReader.getContext();
+ }
QueryUtils.checkEqual(
new TermQuery(new Term("foo", "bar")),
- new TermQuery(new Term("foo", "bar"), TermStates.build(new MultiReader().getContext(), new Term("foo", "bar"), true)));
+ new TermQuery(new Term("foo", "bar"), TermStates.build(context, new Term("foo", "bar"), true)));
}
public void testCreateWeightDoesNotSeekIfScoresAreNotNeeded() throws IOException {
diff --git a/lucene/core/src/test/org/apache/lucene/util/TestCharsRef.java b/lucene/core/src/test/org/apache/lucene/util/TestCharsRef.java
index 079b3b774b53..fd9d639ea888 100644
--- a/lucene/core/src/test/org/apache/lucene/util/TestCharsRef.java
+++ b/lucene/core/src/test/org/apache/lucene/util/TestCharsRef.java
@@ -20,6 +20,8 @@
public class TestCharsRef extends LuceneTestCase {
+
+ @SuppressWarnings("deprecation")
public void testUTF16InUTF8Order() {
final int numStrings = atLeast(1000);
BytesRef utf8[] = new BytesRef[numStrings];
diff --git a/lucene/core/src/test/org/apache/lucene/util/TestOfflineSorter.java b/lucene/core/src/test/org/apache/lucene/util/TestOfflineSorter.java
index 927af148afc7..f902cd58cc05 100644
--- a/lucene/core/src/test/org/apache/lucene/util/TestOfflineSorter.java
+++ b/lucene/core/src/test/org/apache/lucene/util/TestOfflineSorter.java
@@ -17,7 +17,6 @@
package org.apache.lucene.util;
-import java.io.EOFException;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Path;
@@ -351,14 +350,12 @@ protected void corruptFile() throws IOException {
IndexOutput unsorted = dir.createTempOutput("unsorted", "tmp", IOContext.DEFAULT);
writeAll(unsorted, generateFixed(5*1024));
- // This corruption made OfflineSorter fail with its own exception, but we verify it also went and added (as suppressed) that the
- // checksum was wrong:
- EOFException e = expectThrows(EOFException.class, () -> {
+ // This corruption used to make OfflineSorter fail with its own exception; checksums are now
+ // verified up front, so a CorruptIndexException is thrown instead when they don't match.
+ CorruptIndexException e = expectThrows(CorruptIndexException.class, () -> {
new OfflineSorter(dir, "foo").sort(unsorted.getName());
});
- assertEquals(1, e.getSuppressed().length);
- assertTrue(e.getSuppressed()[0] instanceof CorruptIndexException);
- assertTrue(e.getSuppressed()[0].getMessage().contains("checksum failed (hardware problem?)"));
+ assertTrue(e.getMessage().contains("checksum failed (hardware problem?)"));
}
}
@@ -436,12 +433,10 @@ protected void corruptFile() throws IOException {
IndexOutput unsorted = dir.createTempOutput("unsorted", "tmp", IOContext.DEFAULT);
writeAll(unsorted, generateFixed((int) (OfflineSorter.MB * 3)));
- EOFException e = expectThrows(EOFException.class, () -> {
+ CorruptIndexException e = expectThrows(CorruptIndexException.class, () -> {
new OfflineSorter(dir, "foo", OfflineSorter.DEFAULT_COMPARATOR, BufferSize.megabytes(1), 10, -1, null, 0).sort(unsorted.getName());
});
- assertEquals(1, e.getSuppressed().length);
- assertTrue(e.getSuppressed()[0] instanceof CorruptIndexException);
- assertTrue(e.getSuppressed()[0].getMessage().contains("checksum failed (hardware problem?)"));
+ assertTrue(e.getMessage().contains("checksum failed (hardware problem?)"));
}
}
diff --git a/lucene/core/src/test/org/apache/lucene/util/TestVersion.java b/lucene/core/src/test/org/apache/lucene/util/TestVersion.java
index e19c61508128..02d566ea10ae 100644
--- a/lucene/core/src/test/org/apache/lucene/util/TestVersion.java
+++ b/lucene/core/src/test/org/apache/lucene/util/TestVersion.java
@@ -23,6 +23,7 @@
import java.util.Locale;
import java.util.Random;
+@SuppressWarnings("deprecation")
public class TestVersion extends LuceneTestCase {
public void testOnOrAfter() throws Exception {
diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExp.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExp.java
index 7d24939c3478..ce36eacfb10d 100644
--- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExp.java
+++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExp.java
@@ -17,8 +17,12 @@
package org.apache.lucene.util.automaton;
+import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
public class TestRegExp extends LuceneTestCase {
/**
@@ -83,4 +87,148 @@ public void testRepeatWithEmptyLanguage() throws Exception {
a = new RegExp("#?").toAutomaton(1000);
assertTrue(a.toString().length() > 0);
}
+
+ public void testCoreJavaParity() {
+ // Generate random doc values and random regular expressions
+ // and check for same matching behaviour as Java's Pattern class.
+ for (int i = 0; i < 1000; i++) {
+ checkRandomExpression(randomDocValue(1 + random().nextInt(30)));
+ }
+ }
+
+ public void testIllegalBackslashChars() {
+ String illegalChars = "abcefghijklmnopqrtuvxyzABCEFGHIJKLMNOPQRTUVXYZ";
+ for (int i = 0; i < illegalChars.length(); i++) {
+ String illegalExpression = "\\" + illegalChars.charAt(i);
+ IllegalArgumentException expected = expectThrows(
+ IllegalArgumentException.class, () -> {
+ new RegExp(illegalExpression);
+ }
+ );
+ assertTrue(expected.getMessage().contains("invalid character class"));
+ }
+ }
+
+ public void testLegalBackslashChars() {
+ String legalChars = "dDsSWw0123456789[]*&^$@!{}\\/";
+ for (int i = 0; i < legalChars.length(); i++) {
+ String legalExpression = "\\" + legalChars.charAt(i);
+ new RegExp(legalExpression);
+ }
+ }
+
+ static String randomDocValue(int minLength) {
+ String charPalette = "AAAaaaBbbCccc123456 \t";
+ StringBuilder sb = new StringBuilder();
+ for (int i = 0; i < minLength; i++) {
+ sb.append(charPalette.charAt(randomInt(charPalette.length() - 1)));
+ }
+ return sb.toString();
+ }
+
+ private static int randomInt(int bound) {
+ return bound == 0 ? 0 : random().nextInt(bound);
+ }
+
+ protected String checkRandomExpression(String docValue) {
+ // Generate and test a random regular expression which should match the given docValue
+ StringBuilder result = new StringBuilder();
+ // Pick a part of the string to change
+ int substitutionPoint = randomInt(docValue.length() - 1);
+ int substitutionLength = 1 + randomInt(Math.min(10, docValue.length() - substitutionPoint));
+
+ // Add any head to the result, unchanged
+ if (substitutionPoint > 0) {
+ result.append(docValue.substring(0, substitutionPoint));
+ }
+
+ // Modify the middle...
+ String replacementPart = docValue.substring(substitutionPoint, substitutionPoint + substitutionLength);
+ int mutation = random().nextInt(14);
+ switch (mutation) {
+ case 0:
+ // OR with random alpha of same length
+ result.append("(" + replacementPart + "|d" + randomDocValue(replacementPart.length()) + ")");
+ break;
+ case 1:
+ // OR with non-existent value
+ result.append("(" + replacementPart + "|doesnotexist)");
+ break;
+ case 2:
+ // OR with another randomised regex (used to create nested levels of expression).
+ result.append("(" + checkRandomExpression(replacementPart) + "|doesnotexist)");
+ break;
+ case 3:
+ // Star-replace all ab sequences.
+ result.append(replacementPart.replaceAll("ab", ".*"));
+ break;
+ case 4:
+ // .-replace all b chars
+ result.append(replacementPart.replaceAll("b", "."));
+ break;
+ case 5:
+ // length-limited wildcard .{1,n}
+ result.append(".{1," + replacementPart.length() + "}");
+ break;
+ case 6:
+ // replace all chars with .
+ result.append(replacementPart.replaceAll(".", "."));
+ break;
+ case 7:
+ // OR with uppercase chars eg [aA] (many of these sorts of expressions appear in the wild)
+ char[] chars = replacementPart.toCharArray();
+ for (char c : chars) {
+ result.append("[" + c + Character.toUpperCase(c) + "]");
+ }
+ break;
+ case 8:
+ // NOT a character - replace all b's with "not a"
+ result.append(replacementPart.replaceAll("b", "[^a]"));
+ break;
+ case 9:
+ // Make whole part repeatable 1 or more times
+ result.append("(" + replacementPart + ")+");
+ break;
+ case 10:
+ // Make whole part repeatable 0 or more times
+ result.append("(" + replacementPart + ")?");
+ break;
+ case 11:
+ // Replace any digits with the \d character class
+ result.append(replacementPart.replaceAll("\\d", "\\\\d"));
+ break;
+ case 12:
+ // Replace any whitespace chars with the non-word class \W (which matches whitespace)
+ result.append(replacementPart.replaceAll("\\s", "\\\\W"));
+ break;
+ case 13:
+ // Replace any whitespace chars with the whitespace class \s
+ result.append(replacementPart.replaceAll("\\s", "\\\\s"));
+ break;
+ default:
+ break;
+ }
+ // add any remaining tail, unchanged
+ if (substitutionPoint + substitutionLength <= docValue.length() - 1) {
+ result.append(docValue.substring(substitutionPoint + substitutionLength));
+ }
+
+ String regexPattern = result.toString();
+ // Assert our randomly generated regex actually matches the provided raw input using java's expression matcher
+ Pattern pattern = Pattern.compile(regexPattern);
+ Matcher matcher = pattern.matcher(docValue);
+ assertTrue("Java regex " + regexPattern + " did not match doc value " + docValue, matcher.matches());
+
+ RegExp regex = new RegExp(regexPattern);
+ Automaton automaton = regex.toAutomaton();
+ ByteRunAutomaton bytesMatcher = new ByteRunAutomaton(automaton);
+ BytesRef br = new BytesRef(docValue);
+ assertTrue(
+ "[" + regexPattern + "]should match [" + docValue + "]" + substitutionPoint + "-" + substitutionLength + "/"
+ + docValue.length(),
+ bytesMatcher.run(br.bytes, br.offset, br.length)
+ );
+ return regexPattern;
+ }
+
}
diff --git a/lucene/core/src/test/org/apache/lucene/util/bkd/Test2BBKDPoints.java b/lucene/core/src/test/org/apache/lucene/util/bkd/Test2BBKDPoints.java
index deccdf0498a6..27b5511a9822 100644
--- a/lucene/core/src/test/org/apache/lucene/util/bkd/Test2BBKDPoints.java
+++ b/lucene/core/src/test/org/apache/lucene/util/bkd/Test2BBKDPoints.java
@@ -58,12 +58,14 @@ public void test1D() throws Exception {
}
}
IndexOutput out = dir.createOutput("1d.bkd", IOContext.DEFAULT);
- long indexFP = w.finish(out);
+ Runnable finalizer = w.finish(out, out, out);
+ long indexFP = out.getFilePointer();
+ finalizer.run();
out.close();
IndexInput in = dir.openInput("1d.bkd", IOContext.DEFAULT);
in.seek(indexFP);
- BKDReader r = new BKDReader(in);
+ BKDReader r = new BKDReader(in, in, in);
CheckIndex.VerifyPointsVisitor visitor = new CheckIndex.VerifyPointsVisitor("1d", numDocs, r);
r.intersect(visitor);
assertEquals(r.size(), visitor.getPointCountSeen());
@@ -98,12 +100,14 @@ public void test2D() throws Exception {
}
}
IndexOutput out = dir.createOutput("2d.bkd", IOContext.DEFAULT);
- long indexFP = w.finish(out);
+ Runnable finalizer = w.finish(out, out, out);
+ long indexFP = out.getFilePointer();
+ finalizer.run();
out.close();
IndexInput in = dir.openInput("2d.bkd", IOContext.DEFAULT);
in.seek(indexFP);
- BKDReader r = new BKDReader(in);
+ BKDReader r = new BKDReader(in, in, in);
CheckIndex.VerifyPointsVisitor visitor = new CheckIndex.VerifyPointsVisitor("2d", numDocs, r);
r.intersect(visitor);
assertEquals(r.size(), visitor.getPointCountSeen());
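The pattern above is the new three-output BKDWriter contract: finish(metaOut, indexOut, dataOut) returns a Runnable, and the index structure is written only when that Runnable runs, which is why the tests record the file pointer first. A condensed sketch, not from the patch; the file name is illustrative and a single output serves all three roles as in the tests:

    try (IndexOutput out = dir.createOutput("points.bkd", IOContext.DEFAULT)) {
      Runnable finalizer = w.finish(out, out, out); // meta, index and data share one file here
      long indexFP = out.getFilePointer();          // the index structure will start here...
      finalizer.run();                              // ...and is written only now
    }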
diff --git a/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java b/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java
index 38d91f20f9b0..f9bb9ea56b59 100644
--- a/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java
+++ b/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java
@@ -46,8 +46,6 @@
import org.apache.lucene.util.NumericUtils;
import org.apache.lucene.util.TestUtil;
-import static com.carrotsearch.randomizedtesting.RandomizedTest.randomBoolean;
-
public class TestBKD extends LuceneTestCase {
public void testBasicInts1D() throws Exception {
@@ -61,12 +59,14 @@ public void testBasicInts1D() throws Exception {
long indexFP;
try (IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT)) {
- indexFP = w.finish(out);
+ Runnable finalizer = w.finish(out, out, out);
+ indexFP = out.getFilePointer();
+ finalizer.run();
}
try (IndexInput in = dir.openInput("bkd", IOContext.DEFAULT)) {
in.seek(indexFP);
- BKDReader r = new BKDReader(in, randomBoolean());
+ BKDReader r = new BKDReader(in, in, in);
// Simple 1D range query:
final int queryMin = 42;
@@ -163,12 +163,14 @@ public void testRandomIntsNDims() throws Exception {
long indexFP;
try (IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT)) {
- indexFP = w.finish(out);
+ Runnable finalizer = w.finish(out, out, out);
+ indexFP = out.getFilePointer();
+ finalizer.run();
}
try (IndexInput in = dir.openInput("bkd", IOContext.DEFAULT)) {
in.seek(indexFP);
- BKDReader r = new BKDReader(in, randomBoolean());
+ BKDReader r = new BKDReader(in, in, in);
byte[] minPackedValue = r.getMinPackedValue();
byte[] maxPackedValue = r.getMaxPackedValue();
@@ -290,13 +292,15 @@ public void testBigIntNDims() throws Exception {
}
long indexFP;
- try (IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT)) {
- indexFP = w.finish(out);
+ try (IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT)) {
+ Runnable finalizer = w.finish(out, out, out);
+ indexFP = out.getFilePointer();
+ finalizer.run();
}
try (IndexInput in = dir.openInput("bkd", IOContext.DEFAULT)) {
in.seek(indexFP);
- BKDReader r = new BKDReader(in, randomBoolean());
+ BKDReader r = new BKDReader(in, in, in);
int iters = atLeast(100);
for(int iter=0;iter<iters;iter++) {
if (segCount > 0) {
- toMerge.add(w.finish(out));
+ Runnable finalizer = w.finish(out, out, out);
+ toMerge.add(out.getFilePointer());
+ finalizer.run();
final int curDocIDBase = lastDocIDBase;
docMaps.add(new MergeState.DocMap() {
@Override
@@ -788,21 +796,25 @@ public int get(int docID) {
List<BKDReader> readers = new ArrayList<>();
for(long fp : toMerge) {
in.seek(fp);
- readers.add(new BKDReader(in, randomBoolean()));
+ readers.add(new BKDReader(in, in, in));
}
out = dir.createOutput("bkd2", IOContext.DEFAULT);
- indexFP = w.merge(out, docMaps, readers);
+ Runnable finalizer = w.merge(out, out, out, docMaps, readers);
+ indexFP = out.getFilePointer();
+ finalizer.run();
out.close();
in.close();
in = dir.openInput("bkd2", IOContext.DEFAULT);
} else {
- indexFP = w.finish(out);
+ Runnable finalizer = w.finish(out, out, out);
+ indexFP = out.getFilePointer();
+ finalizer.run();
out.close();
in = dir.openInput("bkd", IOContext.DEFAULT);
}
in.seek(indexFP);
- BKDReader r = new BKDReader(in, randomBoolean());
+ BKDReader r = new BKDReader(in, in, in);
int iters = atLeast(100);
for(int iter=0;iter<iters;iter++) {
- BKDReader points = new BKDReader(pointsIn);
- int actualMaxPointsInLeafNode = numValues;
- while (actualMaxPointsInLeafNode > maxPointsInLeafNode) {
- actualMaxPointsInLeafNode = (actualMaxPointsInLeafNode + 1) / 2;
- }
+ BKDReader points = new BKDReader(pointsIn, pointsIn, pointsIn);
// If all points match, then the point count is numLeaves * maxPointsInLeafNode
- final int numLeaves = Integer.highestOneBit((numValues - 1) / actualMaxPointsInLeafNode) << 1;
- assertEquals(numLeaves * actualMaxPointsInLeafNode,
+ int numLeaves = numValues / maxPointsInLeafNode;
+ if (numValues % maxPointsInLeafNode != 0) {
+ numLeaves++;
+ }
+ assertEquals(numLeaves * maxPointsInLeafNode,
points.estimatePointCount(new IntersectVisitor() {
@Override
public void visit(int docID, byte[] packedValue) throws IOException {}
@@ -1363,8 +1383,8 @@ public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
}
});
assertTrue(""+pointCount,
- pointCount == (actualMaxPointsInLeafNode + 1) / 2 || // common case
- pointCount == 2*((actualMaxPointsInLeafNode + 1) / 2)); // if the point is a split value
+ pointCount == (maxPointsInLeafNode + 1) / 2 || // common case
+ pointCount == 2*((maxPointsInLeafNode + 1) / 2)); // if the point is a split value
pointsIn.close();
dir.close();
@@ -1452,7 +1472,7 @@ public byte getByteAt(int i, int k) {
BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP, numValues);
expectThrows(IllegalStateException.class, () -> {
try (IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT)) {
- w.writeField(out, "test_field_name", reader);
+ w.writeField(out, out, out, "test_field_name", reader);
} finally {
w.close();
dir.close();
@@ -1563,7 +1583,7 @@ public int getDocCount() {
}
};
try (IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT)) {
- IllegalStateException ex = expectThrows(IllegalStateException.class, () -> { w.writeField(out, "", val);});
+ IllegalStateException ex = expectThrows(IllegalStateException.class, () -> { w.writeField(out, out, out, "", val);});
assertEquals("totalPointCount=10 was passed when we were created, but we just hit 11 values", ex.getMessage());
w.close();
}
diff --git a/lucene/core/src/test/org/apache/lucene/util/fst/Test2BFST.java b/lucene/core/src/test/org/apache/lucene/util/fst/Test2BFST.java
index 9ee6947b2e3f..8725b032af1f 100644
--- a/lucene/core/src/test/org/apache/lucene/util/fst/Test2BFST.java
+++ b/lucene/core/src/test/org/apache/lucene/util/fst/Test2BFST.java
@@ -119,10 +119,10 @@ public void test() throws Exception {
if (verify == 0) {
System.out.println("\nTEST: save/load FST and re-verify");
IndexOutput out = dir.createOutput("fst", IOContext.DEFAULT);
- fst.save(out);
+ fst.save(out, out);
out.close();
IndexInput in = dir.openInput("fst", IOContext.DEFAULT);
- fst = new FST<>(in, outputs);
+ fst = new FST<>(in, in, outputs);
in.close();
} else {
dir.deleteFile("fst");
@@ -198,10 +198,10 @@ public void test() throws Exception {
if (verify == 0) {
System.out.println("\nTEST: save/load FST and re-verify");
IndexOutput out = dir.createOutput("fst", IOContext.DEFAULT);
- fst.save(out);
+ fst.save(out, out);
out.close();
IndexInput in = dir.openInput("fst", IOContext.DEFAULT);
- fst = new FST<>(in, outputs);
+ fst = new FST<>(in, in, outputs);
in.close();
} else {
dir.deleteFile("fst");
@@ -256,7 +256,9 @@ public void test() throws Exception {
// forward lookup:
assertEquals(output, Util.get(fst, input).longValue());
// reverse lookup:
- assertEquals(input, Util.getByOutput(fst, output));
+ @SuppressWarnings("deprecation")
+ IntsRef inputResult = Util.getByOutput(fst, output);
+ assertEquals(input, inputResult);
output += 1 + r.nextInt(10);
nextInput(r, ints);
}
@@ -284,10 +286,10 @@ public void test() throws Exception {
if (verify == 0) {
System.out.println("\nTEST: save/load FST and re-verify");
IndexOutput out = dir.createOutput("fst", IOContext.DEFAULT);
- fst.save(out);
+ fst.save(out, out);
out.close();
IndexInput in = dir.openInput("fst", IOContext.DEFAULT);
- fst = new FST<>(in, outputs);
+ fst = new FST<>(in, in, outputs);
in.close();
} else {
dir.deleteFile("fst");
diff --git a/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTDirectAddressing.java b/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTDirectAddressing.java
index 25ea6f6fe0e4..6cf2d76f3f14 100644
--- a/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTDirectAddressing.java
+++ b/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTDirectAddressing.java
@@ -174,7 +174,7 @@ public static void main(String... args) throws Exception {
private static void countFSTArcs(String fstFilePath) throws IOException {
byte[] buf = Files.readAllBytes(Paths.get(fstFilePath));
DataInput in = new ByteArrayDataInput(buf);
- FST<BytesRef> fst = new FST<>(in, ByteSequenceOutputs.getSingleton());
+ FST<BytesRef> fst = new FST<>(in, in, ByteSequenceOutputs.getSingleton());
BytesRefFSTEnum<BytesRef> fstEnum = new BytesRefFSTEnum<>(fst);
int binarySearchArcCount = 0, directAddressingArcCount = 0, listArcCount = 0;
while(fstEnum.next() != null) {
@@ -228,7 +228,7 @@ private static void recompileAndWalk(String fstFilePath) throws IOException {
System.out.println("Reading FST");
long startTimeMs = System.currentTimeMillis();
- FST<CharsRef> originalFst = new FST<>(in, CharSequenceOutputs.getSingleton());
+ FST<CharsRef> originalFst = new FST<>(in, in, CharSequenceOutputs.getSingleton());
long endTimeMs = System.currentTimeMillis();
System.out.println("time = " + (endTimeMs - startTimeMs) + " ms");
diff --git a/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java b/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java
index e081f6c2f5e6..10319f9eb54e 100644
--- a/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java
+++ b/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java
@@ -529,7 +529,7 @@ public void run(int limit, boolean verify, boolean verifyByOutput) throws IOExce
Directory dir = FSDirectory.open(dirOut);
IndexOutput out = dir.createOutput("fst.bin", IOContext.DEFAULT);
- fst.save(out);
+ fst.save(out, out);
out.close();
System.out.println("Saved FST to fst.bin.");
@@ -570,7 +570,7 @@ public void run(int limit, boolean verify, boolean verifyByOutput) throws IOExce
} else {
// Get by output
final Long output = (Long) getOutput(intsRef.get(), ord);
- @SuppressWarnings("unchecked") final IntsRef actual = Util.getByOutput((FST) fst, output.longValue());
+ @SuppressWarnings({"unchecked", "deprecation"}) final IntsRef actual = Util.getByOutput((FST) fst, output.longValue());
if (actual == null) {
throw new RuntimeException("unexpected null input from output=" + output);
}
@@ -833,13 +833,17 @@ public void testSimple() throws Exception {
assertEquals(b, seekResult.input);
assertEquals(42, (long) seekResult.output);
- assertEquals(Util.toIntsRef(new BytesRef("c"), new IntsRefBuilder()),
- Util.getByOutput(fst, 13824324872317238L));
- assertNull(Util.getByOutput(fst, 47));
- assertEquals(Util.toIntsRef(new BytesRef("b"), new IntsRefBuilder()),
- Util.getByOutput(fst, 42));
- assertEquals(Util.toIntsRef(new BytesRef("a"), new IntsRefBuilder()),
- Util.getByOutput(fst, 17));
+ @SuppressWarnings("deprecation") IntsRef byOutput = Util.getByOutput(fst, 13824324872317238L);
+ assertEquals(Util.toIntsRef(new BytesRef("c"), new IntsRefBuilder()), byOutput);
+
+ @SuppressWarnings("deprecation") IntsRef byOutput47 = Util.getByOutput(fst, 47);
+ assertNull(byOutput47);
+
+ @SuppressWarnings("deprecation") IntsRef byOutput42 = Util.getByOutput(fst, 42);
+ assertEquals(Util.toIntsRef(new BytesRef("b"), new IntsRefBuilder()), byOutput42);
+
+ @SuppressWarnings("deprecation") IntsRef byOutput17 = Util.getByOutput(fst, 17);
+ assertEquals(Util.toIntsRef(new BytesRef("a"), new IntsRefBuilder()), byOutput17);
}
public void testPrimaryKeys() throws Exception {
@@ -1191,11 +1195,11 @@ public void testNonFinalStopNode() throws Exception {
// Make sure it still works after save/load:
Directory dir = newDirectory();
IndexOutput out = dir.createOutput("fst", IOContext.DEFAULT);
- fst.save(out);
+ fst.save(out, out);
out.close();
IndexInput in = dir.openInput("fst", IOContext.DEFAULT);
- final FST<Long> fst2 = new FST<>(in, outputs);
+ final FST<Long> fst2 = new FST<>(in, in, outputs);
checkStopNodes(fst2, outputs);
in.close();
dir.close();
diff --git a/lucene/core/src/test/org/apache/lucene/util/packed/TestPackedInts.java b/lucene/core/src/test/org/apache/lucene/util/packed/TestPackedInts.java
index 27ac460e9c4e..497052baed99 100644
--- a/lucene/core/src/test/org/apache/lucene/util/packed/TestPackedInts.java
+++ b/lucene/core/src/test/org/apache/lucene/util/packed/TestPackedInts.java
@@ -836,7 +836,9 @@ public void testEncodeDecode() {
final long[] blocks = new long[blocksOffset + blocksLen];
for (int i = 0; i < blocks.length; ++i) {
blocks[i] = random().nextLong();
- if (format == PackedInts.Format.PACKED_SINGLE_BLOCK && 64 % bpv != 0) {
+ @SuppressWarnings("deprecation")
+ PackedInts.Format PACKED_SINGLE_BLOCK = PackedInts.Format.PACKED_SINGLE_BLOCK;
+ if (format == PACKED_SINGLE_BLOCK && 64 % bpv != 0) {
// clear highest bits for packed
final int toClear = 64 % bpv;
blocks[i] = (blocks[i] << toClear) >>> toClear;
diff --git a/lucene/demo/build.gradle b/lucene/demo/build.gradle
index 2871774bbf88..8407fa6b5d63 100644
--- a/lucene/demo/build.gradle
+++ b/lucene/demo/build.gradle
@@ -17,6 +17,8 @@
apply plugin: 'java-library'
+description = 'Simple example code for Apache Lucene'
+
dependencies {
implementation project(':lucene:core')
implementation project(':lucene:facet')
diff --git a/lucene/expressions/build.gradle b/lucene/expressions/build.gradle
index 62d197830e76..7ba76a7f3a97 100644
--- a/lucene/expressions/build.gradle
+++ b/lucene/expressions/build.gradle
@@ -17,6 +17,8 @@
apply plugin: 'java-library'
+description = 'Dynamically computed values to sort/facet/search on based on a pluggable grammar'
+
dependencies {
api project(':lucene:core')
diff --git a/lucene/expressions/src/java/org/apache/lucene/expressions/ExpressionValueSource.java b/lucene/expressions/src/java/org/apache/lucene/expressions/ExpressionValueSource.java
index f4fa894af8a6..1f8713dd38f3 100644
--- a/lucene/expressions/src/java/org/apache/lucene/expressions/ExpressionValueSource.java
+++ b/lucene/expressions/src/java/org/apache/lucene/expressions/ExpressionValueSource.java
@@ -174,7 +174,7 @@ public DoubleValuesSource rewrite(IndexSearcher searcher) throws IOException {
DoubleValuesSource[] rewritten = new DoubleValuesSource[variables.length];
for (int i = 0; i < variables.length; i++) {
rewritten[i] = variables[i].rewrite(searcher);
- changed |= (rewritten[i] == variables[i]);
+ changed |= (rewritten[i] != variables[i]);
}
if (changed) {
return new ExpressionValueSource(rewritten, expression, needsScores);
diff --git a/lucene/expressions/src/test/org/apache/lucene/expressions/TestExpressionValueSource.java b/lucene/expressions/src/test/org/apache/lucene/expressions/TestExpressionValueSource.java
index d5dbabee5478..3b5589b2d7e3 100644
--- a/lucene/expressions/src/test/org/apache/lucene/expressions/TestExpressionValueSource.java
+++ b/lucene/expressions/src/test/org/apache/lucene/expressions/TestExpressionValueSource.java
@@ -28,9 +28,12 @@
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.search.DoubleValues;
import org.apache.lucene.search.DoubleValuesSource;
+import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
+import java.io.IOException;
+
public class TestExpressionValueSource extends LuceneTestCase {
DirectoryReader reader;
Directory dir;
@@ -125,4 +128,59 @@ public void testDoubleValuesSourceEquals() throws Exception {
assertFalse(vs1.equals(vs4));
}
+ public void testRewrite() throws Exception {
+ Expression expr = JavascriptCompiler.compile("a");
+
+ ExpressionValueSource rewritingExpressionSource = new ExpressionValueSource(
+ new DoubleValuesSource[]{createDoubleValuesSourceMock(true)},
+ expr,
+ false);
+ ExpressionValueSource notRewritingExpressionSource = new ExpressionValueSource(
+ new DoubleValuesSource[]{createDoubleValuesSourceMock(false)},
+ expr,
+ false);
+
+ assertNotSame(rewritingExpressionSource, rewritingExpressionSource.rewrite(null));
+ assertSame(notRewritingExpressionSource, notRewritingExpressionSource.rewrite(null));
+ }
+
+ private static DoubleValuesSource createDoubleValuesSourceMock(boolean rewriting) {
+ return new DoubleValuesSource() {
+ @Override
+ public DoubleValues getValues(LeafReaderContext ctx, DoubleValues scores) throws IOException {
+ return null;
+ }
+
+ @Override
+ public boolean needsScores() {
+ return false;
+ }
+
+ @Override
+ public DoubleValuesSource rewrite(IndexSearcher reader) throws IOException {
+ return rewriting ? createDoubleValuesSourceMock(true) : this;
+ }
+
+ @Override
+ public int hashCode() {
+ return 0;
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ return false;
+ }
+
+ @Override
+ public String toString() {
+ return null;
+ }
+
+ @Override
+ public boolean isCacheable(LeafReaderContext ctx) {
+ return false;
+ }
+ };
+ }
+
}
diff --git a/lucene/facet/build.gradle b/lucene/facet/build.gradle
index c0ffc91e9298..6b6a6ef137e8 100644
--- a/lucene/facet/build.gradle
+++ b/lucene/facet/build.gradle
@@ -18,6 +18,7 @@
apply plugin: 'java-library'
+description = 'Faceted indexing and search capabilities'
dependencies {
api project(':lucene:core')
diff --git a/lucene/facet/src/java/org/apache/lucene/facet/DrillSideways.java b/lucene/facet/src/java/org/apache/lucene/facet/DrillSideways.java
index 15df7e3d957b..6dbacf19578f 100644
--- a/lucene/facet/src/java/org/apache/lucene/facet/DrillSideways.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/DrillSideways.java
@@ -299,7 +299,7 @@ public DrillSidewaysResult search(ScoreDoc after, DrillDownQuery query, int topN
if (executor != null) { // We have an executor, let's use the multi-threaded version
final CollectorManager<TopScoreDocCollector, TopDocs> collectorManager =
- new CollectorManager<TopScoreDocCollector, TopDocs>() {
+ new CollectorManager<>() {
@Override
public TopScoreDocCollector newCollector() throws IOException {
@@ -404,6 +404,7 @@ private DrillDownQuery getDrillDownQuery(final DrillDownQuery query, Query[] que
}
/** Runs a search, using a {@link CollectorManager} to gather and merge search results */
+ @SuppressWarnings("unchecked")
public <R> ConcurrentDrillSidewaysResult<R> search(final DrillDownQuery query,
final CollectorManager<?, R> hitCollectorManager) throws IOException {
diff --git a/lucene/facet/src/java/org/apache/lucene/facet/FacetsConfig.java b/lucene/facet/src/java/org/apache/lucene/facet/FacetsConfig.java
index f958af187805..84f69d7bc0a4 100644
--- a/lucene/facet/src/java/org/apache/lucene/facet/FacetsConfig.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/FacetsConfig.java
@@ -496,8 +496,15 @@ private void checkTaxoWriter(TaxonomyWriter taxoWriter) {
}
}
- // Joins the path components together:
- private static final char DELIM_CHAR = '\u001F';
+ /**
+ * Character used to join the category path components together into a single
+ * drill down term for indexing. Applications and unit-tests can reference this for
+ * creating their own drill-down terms, or use existing APIs (for example,
+ * {@link #pathToString}).
+ *
+ * @lucene.internal
+ */
+ public static final char DELIM_CHAR = '\u001F';
// Escapes any occurrence of the path component inside the label:
private static final char ESCAPE_CHAR = '\u001E';
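With DELIM_CHAR now public, callers can build drill-down terms themselves or verify the joined form; pathToString performs the same join. A small sketch, not from the patch, with illustrative category values:

    String indexedForm = FacetsConfig.pathToString("Author", new String[] { "Bob" });
    // equivalent to "Author" + FacetsConfig.DELIM_CHAR + "Bob"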
diff --git a/lucene/grouping/build.gradle b/lucene/grouping/build.gradle
index b18d3d9cf4dd..b0838e3dfd23 100644
--- a/lucene/grouping/build.gradle
+++ b/lucene/grouping/build.gradle
@@ -18,6 +18,8 @@
apply plugin: 'java-library'
+description = 'Collectors for grouping search results'
+
dependencies {
api project(':lucene:core')
diff --git a/lucene/grouping/src/java/org/apache/lucene/search/grouping/BlockGroupingCollector.java b/lucene/grouping/src/java/org/apache/lucene/search/grouping/BlockGroupingCollector.java
index 23601ca994b0..bbeb2ee204b4 100644
--- a/lucene/grouping/src/java/org/apache/lucene/search/grouping/BlockGroupingCollector.java
+++ b/lucene/grouping/src/java/org/apache/lucene/search/grouping/BlockGroupingCollector.java
@@ -67,6 +67,10 @@
* @lucene.experimental
*/
+// TODO: TopGroups.merge() won't work with TopGroups returned by this collector, because
+// each block will be on a different shard. Add a specialized merge() static method
+// to this collector?
+
public class BlockGroupingCollector extends SimpleCollector {
private int[] pendingSubDocs;
diff --git a/lucene/grouping/src/java/org/apache/lucene/search/grouping/DoubleRange.java b/lucene/grouping/src/java/org/apache/lucene/search/grouping/DoubleRange.java
new file mode 100644
index 000000000000..df34f6b1e0df
--- /dev/null
+++ b/lucene/grouping/src/java/org/apache/lucene/search/grouping/DoubleRange.java
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.search.grouping;
+
+import java.util.Objects;
+
+/**
+ * Represents a contiguous range of double values, with an inclusive minimum and
+ * exclusive maximum
+ */
+public class DoubleRange {
+
+ /** The inclusive minimum value of this range */
+ public double min;
+ /** The exclusive maximum value of this range */
+ public double max;
+
+ /**
+ * Creates a new double range, running from {@code min} inclusive to {@code max} exclusive
+ */
+ public DoubleRange(double min, double max) {
+ this.min = min;
+ this.max = max;
+ }
+
+ @Override
+ public String toString() {
+ return "DoubleRange(" + min + ", " + max + ")";
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (this == o) return true;
+ if (o == null || getClass() != o.getClass()) return false;
+ DoubleRange that = (DoubleRange) o;
+ return Double.compare(that.min, min) == 0 &&
+ Double.compare(that.max, max) == 0;
+ }
+
+ @Override
+ public int hashCode() {
+ return Objects.hash(min, max);
+ }
+}
diff --git a/lucene/grouping/src/java/org/apache/lucene/search/grouping/DoubleRangeFactory.java b/lucene/grouping/src/java/org/apache/lucene/search/grouping/DoubleRangeFactory.java
new file mode 100644
index 000000000000..3ea4606a6431
--- /dev/null
+++ b/lucene/grouping/src/java/org/apache/lucene/search/grouping/DoubleRangeFactory.java
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.search.grouping;
+
+/**
+ * Groups double values into ranges
+ */
+public class DoubleRangeFactory {
+
+ private final double min;
+ private final double width;
+ private final double max;
+
+ /**
+ * Creates a new DoubleRangeFactory
+ * @param min a minimum value; all doubles below this value are grouped into a single range
+ * @param width a standard width; all ranges between {@code min} and {@code max} are this wide,
+ * with the exception of the final range, which may be narrower. Ranges
+ * are inclusive at the lower end, and exclusive at the upper end.
+ * @param max a maximum value; all doubles above this value are grouped into a single range
+ */
+ public DoubleRangeFactory(double min, double width, double max) {
+ this.min = min;
+ this.width = width;
+ this.max = max;
+ }
+
+ /**
+ * Finds the DoubleRange that a value should be grouped into
+ * @param value the value to group
+ * @param reuse an existing DoubleRange object to reuse
+ */
+ public DoubleRange getRange(double value, DoubleRange reuse) {
+ if (reuse == null)
+ reuse = new DoubleRange(Double.MIN_VALUE, Double.MAX_VALUE);
+ if (value < min) {
+ reuse.max = min;
+ reuse.min = Double.MIN_VALUE;
+ return reuse;
+ }
+ if (value >= max) {
+ reuse.min = max;
+ reuse.max = Double.MAX_VALUE;
+ return reuse;
+ }
+ double bucket = Math.floor((value - min) / width);
+ reuse.min = min + (bucket * width);
+ reuse.max = reuse.min + width;
+ return reuse;
+ }
+
+}
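A worked example of the bucketing arithmetic above (parameter values are illustrative, not from the patch): values below min fall into one underflow range, values at or above max into one overflow range, and everything else into fixed-width buckets.

    DoubleRangeFactory factory = new DoubleRangeFactory(0, 10, 100); // min=0, width=10, max=100
    DoubleRange r = factory.getRange(25.0, null); // bucket = floor((25 - 0) / 10) = 2 -> [20, 30)
    r = factory.getRange(-5.0, r);                // below min       -> [Double.MIN_VALUE, 0)
    r = factory.getRange(150.0, r);               // at or above max -> [100, Double.MAX_VALUE)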
diff --git a/lucene/grouping/src/java/org/apache/lucene/search/grouping/DoubleRangeGroupSelector.java b/lucene/grouping/src/java/org/apache/lucene/search/grouping/DoubleRangeGroupSelector.java
new file mode 100644
index 000000000000..4a6a65a72366
--- /dev/null
+++ b/lucene/grouping/src/java/org/apache/lucene/search/grouping/DoubleRangeGroupSelector.java
@@ -0,0 +1,100 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.search.grouping;
+
+import java.io.IOException;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.search.DoubleValues;
+import org.apache.lucene.search.DoubleValuesSource;
+import org.apache.lucene.search.Scorable;
+
+/**
+ * A GroupSelector implementation that groups documents by double values
+ */
+public class DoubleRangeGroupSelector extends GroupSelector<DoubleRange> {
+
+ private final DoubleValuesSource source;
+ private final DoubleRangeFactory rangeFactory;
+
+ private Set<DoubleRange> inSecondPass;
+ private boolean includeEmpty = true;
+ private boolean positioned;
+ private DoubleRange current;
+
+ private LeafReaderContext context;
+ private DoubleValues values;
+
+ /**
+ * Creates a new DoubleRangeGroupSelector
+ * @param source a DoubleValuesSource to retrieve double values per document
+ * @param rangeFactory a DoubleRangeFactory that defines how to group the double values into range buckets
+ */
+ public DoubleRangeGroupSelector(DoubleValuesSource source, DoubleRangeFactory rangeFactory) {
+ this.source = source;
+ this.rangeFactory = rangeFactory;
+ }
+
+ @Override
+ public void setNextReader(LeafReaderContext readerContext) throws IOException {
+ this.context = readerContext;
+ }
+
+ @Override
+ public void setScorer(Scorable scorer) throws IOException {
+ this.values = source.getValues(context, DoubleValuesSource.fromScorer(scorer));
+ }
+
+ @Override
+ public State advanceTo(int doc) throws IOException {
+ positioned = values.advanceExact(doc);
+ if (positioned == false) {
+ return includeEmpty ? State.ACCEPT : State.SKIP;
+ }
+ this.current = rangeFactory.getRange(values.doubleValue(), this.current);
+ if (inSecondPass == null) {
+ return State.ACCEPT;
+ }
+ return inSecondPass.contains(this.current) ? State.ACCEPT : State.SKIP;
+ }
+
+ @Override
+ public DoubleRange currentValue() throws IOException {
+ return positioned ? this.current : null;
+ }
+
+ @Override
+ public DoubleRange copyValue() throws IOException {
+ return positioned ? new DoubleRange(this.current.min, this.current.max) : null;
+ }
+
+ @Override
+ public void setGroups(Collection<SearchGroup<DoubleRange>> searchGroups) {
+ inSecondPass = new HashSet<>();
+ includeEmpty = false;
+ for (SearchGroup<DoubleRange> group : searchGroups) {
+ if (group.groupValue == null)
+ includeEmpty = true;
+ else
+ inSecondPass.add(group.groupValue);
+ }
+ }
+}
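A usage sketch tying this selector to the GroupingSearch(GroupSelector<?>) constructor added further down in this patch; this is not from the patch itself, and the field name, bucket parameters, searcher and query are assumed:

    DoubleRangeGroupSelector selector = new DoubleRangeGroupSelector(
        DoubleValuesSource.fromDoubleField("price"),
        new DoubleRangeFactory(0, 10, 100));
    GroupingSearch grouping = new GroupingSearch(selector);
    TopGroups<DoubleRange> groups = grouping.search(searcher, query, 0, 10);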
diff --git a/lucene/grouping/src/java/org/apache/lucene/search/grouping/FirstPassGroupingCollector.java b/lucene/grouping/src/java/org/apache/lucene/search/grouping/FirstPassGroupingCollector.java
index 6a745b8e7352..f5b05974e416 100644
--- a/lucene/grouping/src/java/org/apache/lucene/search/grouping/FirstPassGroupingCollector.java
+++ b/lucene/grouping/src/java/org/apache/lucene/search/grouping/FirstPassGroupingCollector.java
@@ -151,6 +151,7 @@ public Collection<SearchGroup<T>> getTopGroups(int groupOffset) throws IOExcepti
@Override
public void setScorer(Scorable scorer) throws IOException {
+ groupSelector.setScorer(scorer);
for (LeafFieldComparator comparator : leafComparators) {
comparator.setScorer(scorer);
}
diff --git a/lucene/grouping/src/java/org/apache/lucene/search/grouping/GroupSelector.java b/lucene/grouping/src/java/org/apache/lucene/search/grouping/GroupSelector.java
index dbb09329f8ae..92962a4d4570 100644
--- a/lucene/grouping/src/java/org/apache/lucene/search/grouping/GroupSelector.java
+++ b/lucene/grouping/src/java/org/apache/lucene/search/grouping/GroupSelector.java
@@ -21,6 +21,7 @@
import java.util.Collection;
import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.search.Scorable;
/**
* Defines a group, for use by grouping collectors
@@ -43,6 +44,11 @@ public enum State { SKIP, ACCEPT }
*/
public abstract void setNextReader(LeafReaderContext readerContext) throws IOException;
+ /**
+ * Set the current Scorer
+ */
+ public abstract void setScorer(Scorable scorer) throws IOException;
+
/**
* Advance the GroupSelector's iterator to the given document
*/
@@ -53,12 +59,12 @@ public enum State { SKIP, ACCEPT }
*
* N.B. this object may be reused, for a persistent version use {@link #copyValue()}
*/
- public abstract T currentValue();
+ public abstract T currentValue() throws IOException;
/**
* @return a copy of the group value of the current document
*/
- public abstract T copyValue();
+ public abstract T copyValue() throws IOException;
/**
* Set a restriction on the group values returned by this selector
diff --git a/lucene/grouping/src/java/org/apache/lucene/search/grouping/GroupingSearch.java b/lucene/grouping/src/java/org/apache/lucene/search/grouping/GroupingSearch.java
index b88fb743f087..25ed3770addc 100644
--- a/lucene/grouping/src/java/org/apache/lucene/search/grouping/GroupingSearch.java
+++ b/lucene/grouping/src/java/org/apache/lucene/search/grouping/GroupingSearch.java
@@ -71,6 +71,14 @@ public GroupingSearch(String groupField) {
this(new TermGroupSelector(groupField), null);
}
+ /**
+ * Constructs a GroupingSearch instance that groups documents using a {@link GroupSelector}
+ * @param groupSelector a {@link GroupSelector} that defines groups for this GroupingSearch
+ */
+ public GroupingSearch(GroupSelector<?> groupSelector) {
+ this(groupSelector, null);
+ }
+
/**
* Constructs a GroupingSearch instance that groups documents by function using a {@link ValueSource}
* instance.
@@ -78,7 +86,7 @@ public GroupingSearch(String groupField) {
* @param groupFunction The function to group by specified as {@link ValueSource}
* @param valueSourceContext The context of the specified groupFunction
*/
- public GroupingSearch(ValueSource groupFunction, Map<?, ?> valueSourceContext) {
+ public GroupingSearch(ValueSource groupFunction, Map