Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rebase on vanilla Solr 9.3 #9826

Merged
merged 4 commits into from
Aug 24, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
123 changes: 60 additions & 63 deletions conf/solr/9.3.0/schema.xml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@


For more information, on how to customize this file, please see
http://lucene.apache.org/solr/guide/documents-fields-and-schema-design.html
https://solr.apache.org/guide/solr/latest/indexing-guide/schema-elements.html

PERFORMANCE NOTE: this schema includes many optional features and should not
be used for benchmarking. To improve performance one could
Expand All @@ -38,7 +38,7 @@
catchall "text" field, and use that for searching.
-->

<schema name="default-config" version="1.7">
<schema name="default-config" version="1.6">
<!-- attribute "name" is the name of this schema and is only used for display purposes.
version="x.y" is Solr's version number for the schema syntax and
semantics. It should not normally be changed by applications.
Expand Down Expand Up @@ -129,15 +129,8 @@
<!-- catchall text field that indexes tokens both normally and in reverse for efficient
leading wildcard queries. -->
<field name="text_rev" type="text_general_rev" indexed="true" stored="false" multiValued="true"/>
<field name="name" type="text_en" indexed="true" stored="true"/>








<field name="name" type="text_en" indexed="true" stored="true"/>

<field name="definitionPointDocId" type="string" stored="true" indexed="true" multiValued="false"/>
<field name="definitionPointDvObjectId" type="string" stored="true" indexed="true" multiValued="false"/>
<field name="discoverableBy" type="string" stored="true" indexed="true" multiValued="true"/>
Expand All @@ -163,7 +156,7 @@

<field name="publicationStatus" type="string" stored="true" indexed="true" multiValued="true"/>
<field name="externalStatus" type="string" stored="true" indexed="true" multiValued="false"/>
<field name="embargoEndDate" type="long" stored="true" indexed="true" multiValued="false"/>
<field name="embargoEndDate" type="plong" stored="true" indexed="true" multiValued="false"/>
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@qqmyers do you remember why you picked long for embargoEndDate when releasedate uses int?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not really. I see in the IndexServiceBean that the embargo date is turned into a long via toEpochDay() where the release date is indexed directly as a timestamp, but I'm not sure why that is.


<field name="subtreePaths" type="string" stored="true" indexed="true" multiValued="true"/>

Expand Down Expand Up @@ -200,28 +193,28 @@
<field name="identifier" type="string" stored="true" indexed="true" multiValued="false"/>
<field name="persistentUrl" type="string" stored="true" indexed="false" multiValued="false"/>
<field name="unf" type="string" stored="true" indexed="true" multiValued="false"/>
<field name="fileSizeInBytes" type="long" stored="true" indexed="true" multiValued="false"/>
<field name="fileSizeInBytes" type="plong" stored="true" indexed="true" multiValued="false"/>
<field name="fileMd5" type="string" stored="true" indexed="true" multiValued="false"/>
<field name="fileChecksumType" type="string" stored="true" indexed="true" multiValued="false"/>
<field name="fileChecksumValue" type="string" stored="true" indexed="true" multiValued="false"/>
<field name="fileContentType" type="string" stored="true" indexed="true" multiValued="false"/>
<field name="deaccessionReason" type="string" stored="true" indexed="false" multiValued="false"/>

<!-- Added for Dataverse 4.0 alpha 1. This is a required field so we don't have to go to the database to get the database id of the entity. On cards we use the id in links -->
<field name="entityId" type="long" stored="true" indexed="true" multiValued="false"/>
<field name="entityId" type="plong" stored="true" indexed="true" multiValued="false"/>

<field name="datasetVersionId" type="long" stored="true" indexed="true" multiValued="false"/>
<field name="datasetVersionId" type="plong" stored="true" indexed="true" multiValued="false"/>

<!-- Added for Dataverse 4.0 alpha 1 to sort by name -->
<!-- https://redmine.hmdc.harvard.edu/issues/3482 -->
<!-- 'Sorting can be done on the "score" of the document, or on any multiValued="false" indexed="true" field provided that field is either non-tokenized (ie: has no Analyzer) or uses an Analyzer that only produces a single Term (ie: uses the KeywordTokenizer)' http://wiki.apache.org/solr/CommonQueryParameters#sort -->
<!-- http://stackoverflow.com/questions/13360706/solr-4-0-alphabetical-sorting-trouble/13361226#13361226 -->
<field name="nameSort" type="alphaOnlySort" indexed="true" stored="true"/>

<field name="dateSort" type="date" indexed="true" stored="true"/>
<field name="dateSort" type="pdate" indexed="true" stored="true"/>

<!-- Added for Dataverse 4.0: release date https://redmine.hmdc.harvard.edu/issues/3592 -->
<field name="releasedate" type="int" indexed="true" stored="true"/>
<field name="releasedate" type="pint" indexed="true" stored="true"/>

<!-- Added for Dataverse 4.0: do we want a description field that applies to dataverses, datasets, and files? https://redmine.hmdc.harvard.edu/issues/3745 -->
<field name="description" type="text_en" multiValued="false" stored="true" indexed="true"/>
Expand Down Expand Up @@ -658,27 +651,32 @@
<!-- Dynamic field definitions allow using convention over configuration
for fields via the specification of patterns to match field names.
EXAMPLE: name="*_i" will match any field ending in _i (like myid_i, z_i)
RESTRICTION: the glob-like pattern in the name attribute must have a "*" only at the start or the end. -->
RESTRICTION: the glob-like pattern in the name attribute must have a "*"
only at the start or the end. -->

<dynamicField name="*_i" type="pint" indexed="true" stored="true"/>
<dynamicField name="*_is" type="pints" indexed="true" stored="true"/>
<dynamicField name="*_s" type="string" indexed="true" stored="true" />
<dynamicField name="*_ss" type="strings" indexed="true" stored="true"/>
<dynamicField name="*_l" type="plong" indexed="true" stored="true"/>
<dynamicField name="*_ls" type="plongs" indexed="true" stored="true"/>
<dynamicField name="*_txt" type="text_general" indexed="true" stored="true"/>
<dynamicField name="*_b" type="boolean" indexed="true" stored="true"/>
<dynamicField name="*_bs" type="booleans" indexed="true" stored="true"/>
<dynamicField name="*_f" type="pfloat" indexed="true" stored="true"/>
<dynamicField name="*_fs" type="pfloats" indexed="true" stored="true"/>
<dynamicField name="*_d" type="pdouble" indexed="true" stored="true"/>
<dynamicField name="*_ds" type="pdoubles" indexed="true" stored="true"/>
<dynamicField name="*_dt" type="pdate" indexed="true" stored="true"/>
<dynamicField name="*_dts" type="pdates" indexed="true" stored="true"/>
<dynamicField name="*_t" type="text_general" indexed="true" stored="true" multiValued="false"/>
<dynamicField name="*_txt" type="text_general" indexed="true" stored="true"/>

<dynamicField name="random_*" type="random"/>
<dynamicField name="ignored_*" type="ignored"/>

<!-- Type used for data-driven schema, to add a string copy for each text field -->
<dynamicField name="*_str" type="strings" stored="false" docValues="true" indexed="false" />

<dynamicField name="*_dt" type="pdate" indexed="true" stored="true"/>
<dynamicField name="*_dts" type="pdate" indexed="true" stored="true" multiValued="true"/>
<dynamicField name="*_str" type="strings" stored="false" docValues="true" indexed="false" useDocValuesAsStored="false" />

<dynamicField name="*_p" type="location" indexed="true" stored="true"/>
<dynamicField name="*_srpt" type="location_rpt" indexed="true" stored="true"/>

Expand Down Expand Up @@ -724,43 +722,6 @@
field first in an ascending sort and last in a descending sort.
-->

<fieldType name="int" class="solr.TrieIntField" precisionStep="0" positionIncrementGap="0"/>
<fieldType name="float" class="solr.TrieFloatField" precisionStep="0" positionIncrementGap="0"/>
<fieldType name="long" class="solr.TrieLongField" precisionStep="0" positionIncrementGap="0"/>
<fieldType name="double" class="solr.TrieDoubleField" precisionStep="0" positionIncrementGap="0"/>

<fieldType name="tint" class="solr.TrieIntField" precisionStep="8" positionIncrementGap="0"/>
<fieldType name="tfloat" class="solr.TrieFloatField" precisionStep="8" positionIncrementGap="0"/>
<fieldType name="tlong" class="solr.TrieLongField" precisionStep="8" positionIncrementGap="0"/>
<fieldType name="tdouble" class="solr.TrieDoubleField" precisionStep="8" positionIncrementGap="0"/>

<!-- The format for this date field is of the form 1995-12-31T23:59:59Z, and
is a more restricted form of the canonical representation of dateTime
http://www.w3.org/TR/xmlschema-2/#dateTime
The trailing "Z" designates UTC time and is mandatory.
Optional fractional seconds are allowed: 1995-12-31T23:59:59.999Z
All other components are mandatory.

Expressions can also be used to denote calculations that should be
performed relative to "NOW" to determine the value, ie...

NOW/HOUR
... Round to the start of the current hour
NOW-1DAY
... Exactly 1 day prior to now
NOW/DAY+6MONTHS+3DAYS
... 6 months and 3 days in the future from the start of
the current day

Consult the DateField javadocs for more information.

Note: For faster range queries, consider the tdate type
-->
<fieldType name="date" class="solr.TrieDateField" precisionStep="0" positionIncrementGap="0"/>

<!-- A Trie based date field for faster date range queries and date faceting. -->
<fieldType name="tdate" class="solr.TrieDateField" precisionStep="6" positionIncrementGap="0"/>

<!-- This is an example of using the KeywordTokenizer along
With various TokenFilterFactories to produce a sortable field
that does not include some properties of the source text
Expand Down Expand Up @@ -815,6 +776,11 @@
<fieldType name="pfloats" class="solr.FloatPointField" docValues="true" multiValued="true"/>
<fieldType name="plongs" class="solr.LongPointField" docValues="true" multiValued="true"/>
<fieldType name="pdoubles" class="solr.DoublePointField" docValues="true" multiValued="true"/>
<fieldType name="random" class="solr.RandomSortField" indexed="true"/>

<!-- since fields of this type are by default not stored or indexed,
any data added to them will be ignored outright. -->
<fieldType name="ignored" stored="false" indexed="false" multiValued="true" class="solr.StrField" />

<!-- The format for this date field is of the form 1995-12-31T23:59:59Z, and
is a more restricted form of the canonical representation of dateTime
Expand All @@ -841,7 +807,14 @@

<!--Binary data type. The data should be sent/retrieved in as Base64 encoded Strings -->
<fieldType name="binary" class="solr.BinaryField"/>


<!--
RankFields can be used to store scoring factors to improve document ranking. They should be used
in combination with RankQParserPlugin.
(experimental)
-->
<fieldType name="rank" class="solr.RankField"/>

<!-- solr.TextField allows the specification of custom text analyzers
specified as a tokenizer and a list of token filters. Different
analyzers may be specified for indexing and querying.
Expand All @@ -851,7 +824,7 @@
matching across fields.

For more info on customizing your analyzer chain, please see
http://lucene.apache.org/solr/guide/understanding-analyzers-tokenizers-and-filters.html#understanding-analyzers-tokenizers-and-filters
https://solr.apache.org/guide/solr/latest/indexing-guide/document-analysis.html#using-analyzers-tokenizers-and-filters
-->

<!-- One can also specify an existing Analyzer class that has a
Expand All @@ -866,7 +839,7 @@
<dynamicField name="*_ws" type="text_ws" indexed="true" stored="true"/>
<fieldType name="text_ws" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<tokenizer name="whitespace"/>
</analyzer>
</fieldType>

Expand All @@ -893,6 +866,30 @@
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
</fieldType>

<!-- SortableTextField generaly functions exactly like TextField,
except that it supports, and by default uses, docValues for sorting (or faceting)
on the first 1024 characters of the original field values (which is configurable).

This makes it a bit more useful then TextField in many situations, but the trade-off
is that it takes up more space on disk; which is why it's not used in place of TextField
for every fieldType in this _default schema.
-->
<dynamicField name="*_t_sort" type="text_gen_sort" indexed="true" stored="true" multiValued="false"/>
<dynamicField name="*_txt_sort" type="text_gen_sort" indexed="true" stored="true"/>
<fieldType name="text_gen_sort" class="solr.SortableTextField" positionIncrementGap="100" multiValued="true">
<analyzer type="index">
<tokenizer name="standard"/>
<filter name="stop" ignoreCase="true" words="stopwords.txt" />
<filter name="lowercase"/>
</analyzer>
<analyzer type="query">
<tokenizer name="standard"/>
<filter name="stop" ignoreCase="true" words="stopwords.txt" />
<filter name="synonymGraph" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
<filter name="lowercase"/>
</analyzer>
</fieldType>

<!-- A text field with defaults appropriate for English: it tokenizes with StandardTokenizer,
removes English stop words (lang/stopwords_en.txt), down cases, protects words from protwords.txt, and
Expand Down