Merge pull request #10050 from IQSS/9635-solr-search-improvements
9635 solr improvements
sekmiller committed Nov 29, 2023
2 parents 1ea692b + 8403a59 commit 960751f
Showing 17 changed files with 526 additions and 218 deletions.
3 changes: 3 additions & 0 deletions conf/solr/9.3.0/solrconfig.xml
@@ -588,6 +588,7 @@
check for "Circuit Breakers tripped" in logs and the corresponding error message should tell
you what transpired (if the failure was caused by tripped circuit breakers).
-->

<!--
<str name="memEnabled">true</str>
<str name="memThreshold">75</str>
@@ -599,10 +600,12 @@
whether the circuit breaker is enabled and the average load over the last minute at which the
circuit breaker should start rejecting queries.
-->

<!--
<str name="cpuEnabled">true</str>
<str name="cpuThreshold">75</str>
-->

</circuitBreaker>

<!-- Request Dispatcher
4 changes: 4 additions & 0 deletions doc/release-notes/9635-solr-improvements.md
@@ -0,0 +1,4 @@
- As of this release, application-side support has been added for the "circuit breaker" mechanism in Solr, which makes it drop requests more gracefully when the search engine is experiencing load issues.

Please see the "Installing Solr" section of the Installation Prerequisites guide.

16 changes: 15 additions & 1 deletion doc/sphinx-guides/source/installation/config.rst
@@ -2868,7 +2868,6 @@ To enable setting file-level PIDs per collection::

When :AllowEnablingFilePIDsPerCollection is true, setting File PIDs to be enabled/disabled for a given collection can be done via the Native API - see :ref:`collection-attributes-api` in the Native API Guide.


.. _:IndependentHandleService:

:IndependentHandleService
@@ -3109,6 +3108,21 @@ If ``:SolrFullTextIndexing`` is set to true, the content of files of any size wi

``curl -X PUT -d 314572800 http://localhost:8080/api/admin/settings/:SolrMaxFileSizeForFullTextIndexing``


.. _:DisableSolrFacets:

:DisableSolrFacets
++++++++++++++++++

Setting this to ``true`` will make the collection ("dataverse") page start showing search results without the usual search facets on the left side of the page. A message will be shown in that column informing the users that facets are temporarily unavailable. Generating the facets is more resource-intensive for Solr than the main search results themselves, so applying this measure will significantly reduce the load on the search engine when its performance becomes an issue.

This setting can be used in combination with the "circuit breaker" mechanism on the Solr side (see the "Installing Solr" section of the Installation Prerequisites guide). An admin can choose to enable it manually, or even set up an automated process that enables it whenever Solr begins dropping incoming requests with HTTP code 503 (a minimal sketch of such a watchdog follows the example below).

To enable the setting::

curl -X PUT -d true "http://localhost:8080/api/admin/settings/:DisableSolrFacets"
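
For illustration only (this is not part of Dataverse), such an automated toggle could be a small scheduled job along the following lines. This is a minimal sketch, assuming the default Solr port 8983, the core name "collection1", and Java 11+ with its built-in ``java.net.http`` client; the class name and URLs are hypothetical and would need to be adjusted for your installation::

    import java.net.URI;
    import java.net.http.HttpClient;
    import java.net.http.HttpRequest;
    import java.net.http.HttpResponse;

    public class SolrFacetWatchdog {
        public static void main(String[] args) throws Exception {
            HttpClient http = HttpClient.newHttpClient();
            // Cheap probe: ask Solr for zero rows. A tripped circuit breaker
            // answers with HTTP 503 instead of search results.
            HttpResponse<String> probe = http.send(
                    HttpRequest.newBuilder(URI.create(
                            "http://localhost:8983/solr/collection1/select?q=*:*&rows=0"))
                            .GET().build(),
                    HttpResponse.BodyHandlers.ofString());
            if (probe.statusCode() == 503) {
                // Solr is shedding load: disable the facets via the admin API
                // (the same effect as the curl command above).
                http.send(
                        HttpRequest.newBuilder(URI.create(
                                "http://localhost:8080/api/admin/settings/:DisableSolrFacets"))
                                .PUT(HttpRequest.BodyPublishers.ofString("true"))
                                .build(),
                        HttpResponse.BodyHandlers.ofString());
            }
        }
    }

Once Solr has recovered, remember to remove the setting again (``curl -X DELETE http://localhost:8080/api/admin/settings/:DisableSolrFacets``), or the facets will stay disabled.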


.. _:SignUpUrl:

:SignUpUrl
19 changes: 19 additions & 0 deletions doc/sphinx-guides/source/installation/prerequisites.rst
@@ -211,6 +211,25 @@ Finally, you need to tell Solr to create the core "collection1" on startup::

echo "name=collection1" > /usr/local/solr/solr-9.3.0/server/solr/collection1/core.properties

The Dataverse collection ("dataverse") page uses Solr very heavily. On a busy instance this may make the search engine a performance bottleneck, causing these pages to take increasingly longer to load and potentially affecting the overall performance of the application and/or causing Solr itself to crash. If this is observed on your instance, we recommend uncommenting the following lines in the ``<circuitBreaker ...>`` section of the ``solrconfig.xml`` file::

<str name="memEnabled">true</str>
<str name="memThreshold">75</str>

and::

<str name="cpuEnabled">true</str>
<str name="cpuThreshold">75</str>

This will activate Solr's "circuit breaker" mechanisms, which make it start dropping incoming requests with HTTP code 503 when it begins experiencing load issues. As of Dataverse 6.1, the collection page will recognize this condition and display a customizable message informing users that the search engine is unavailable because of heavy load, on the assumption that the condition is transient, and suggesting that they try again later. This is still an inconvenience to users, but it handles the problem more gracefully than letting pages time out or crash. You may need to experiment and adjust the threshold values defined in the lines above.
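
For illustration, here is a minimal sketch (not the actual Dataverse code) of what recognizing this condition can look like for a SolrJ client. It assumes SolrJ 9.x, the default Solr URL, and that a request rejected by the circuit breaker surfaces as a ``SolrException`` carrying code 503::

    import org.apache.solr.client.solrj.SolrQuery;
    import org.apache.solr.client.solrj.impl.Http2SolrClient;
    import org.apache.solr.client.solrj.response.QueryResponse;
    import org.apache.solr.common.SolrException;

    public class BreakerAwareQuery {
        public static void main(String[] args) throws Exception {
            try (Http2SolrClient solr = new Http2SolrClient.Builder(
                    "http://localhost:8983/solr/collection1").build()) {
                try {
                    QueryResponse rsp = solr.query(new SolrQuery("*:*").setRows(0));
                    System.out.println("Solr is serving queries; "
                            + rsp.getResults().getNumFound() + " documents indexed.");
                } catch (SolrException ex) {
                    if (ex.code() == 503) {
                        // Circuit breaker tripped: show a friendly, transient
                        // "try again later" message instead of failing the page.
                        System.out.println("Search is temporarily unavailable due to heavy load.");
                    } else {
                        throw ex;
                    }
                }
            }
        }
    }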

If this becomes a common issue, another temporary workaround is for an admin to enable the following setting::

curl -X PUT -d true "http://localhost:8080/api/admin/settings/:DisableSolrFacets"

This will make the collection page show the search results without the usual search facets on the left side of the page. Another customizable message will be shown in that column informing users that facets are temporarily unavailable. Generating these facets is more resource-intensive for Solr than the main search results themselves, so applying this measure will significantly reduce the load on the search engine.


Solr Init Script
================

49 changes: 41 additions & 8 deletions src/main/java/edu/harvard/iq/dataverse/DatasetPage.java
@@ -754,17 +754,29 @@ public boolean isIndexedVersion() {
if (isIndexedVersion != null) {
return isIndexedVersion;
}

// Just like on the collection page, facets on the Dataset page can be
// disabled instance-wide by an admin:
if (settingsWrapper.isTrueForKey(SettingsServiceBean.Key.DisableSolrFacets, false)) {
return isIndexedVersion = false;
}

// The version is SUPPOSED to be indexed if it's the latest published version, or a
// draft. So if none of the above is true, we can return false right away.
if (!(workingVersion.isDraft() || isThisLatestReleasedVersion())) {
return isIndexedVersion = false;
}

// If this is the latest published version, we want to confirm that this
// version was successfully indexed after the last publication

if (isThisLatestReleasedVersion()) {
return isIndexedVersion = (workingVersion.getDataset().getIndexTime() != null)
&& workingVersion.getDataset().getIndexTime().after(workingVersion.getReleaseTime());
}

// Drafts don't have the indextime stamps set/incremented when indexed,
// so we'll just assume it is indexed, and will then hope for the best.
return isIndexedVersion = true;
}

/**
@@ -820,8 +832,18 @@ public List<FacetLabel> getFileTagsFacetLabels() {
/**
* Verifies that solr is running and that the version is indexed and searchable
* @return boolean
* Commenting out this method for now, since we have decided it was not
* necessary to query solr just to figure out if we can query solr. We will
* rely solely on the latest-released status and the indexed timestamp from
* the database for that. - L.A.
*
public boolean isThisVersionSearchable() {
// Just like on the collection page, facets on the Dataset page can be
// disabled instance-wide by an admin:
if (settingsWrapper.isTrueForKey(SettingsServiceBean.Key.DisableSolrFacets, false)) {
return false;
}
SolrQuery solrQuery = new SolrQuery();
solrQuery.setQuery(SearchUtil.constructQuery(SearchFields.ENTITY_ID, workingVersion.getDataset().getId().toString()));
@@ -856,6 +878,7 @@ public boolean isThisVersionSearchable() {
return false;
}
*/

/**
* Finds the list of numeric datafile ids in the Version specified, by running
@@ -967,10 +990,19 @@ public Set<Long> getFileIdsInVersionFromSolr(Long datasetVersionId, String patte
logger.fine("Remote Solr Exception: " + ex.getLocalizedMessage());
String msg = ex.getLocalizedMessage();
if (msg.contains(SearchFields.FILE_DELETED)) {
// This is a backward compatibility hook put in place many versions
// ago, to accommodate instances running Solr with schemas that
// don't include this flag yet. Running Solr with an up-to-date
// schema has been a hard requirement for a while now; should we
// remove it at this point? - L.A.
fileDeletedFlagNotIndexed = true;
} else {
isIndexedVersion = false;
return resultIds;
}
} catch (Exception ex) {
logger.warning("Solr exception: " + ex.getLocalizedMessage());
isIndexedVersion = false;
return resultIds;
}

@@ -983,6 +1015,7 @@ public Set<Long> getFileIdsInVersionFromSolr(Long datasetVersionId, String patte
queryResponse = solrClientService.getSolrClient().query(solrQuery);
} catch (Exception ex) {
logger.warning("Caught a Solr exception (again!): " + ex.getLocalizedMessage());
isIndexedVersion = false;
return resultIds;
}
}
18 changes: 10 additions & 8 deletions src/main/java/edu/harvard/iq/dataverse/GuestbookPage.java
@@ -288,19 +288,21 @@ public String save() {

Command<Dataverse> cmd;
try {
// Per recent #dv-tech conversation w/ Jim - copying the code
// below from his QDR branch; the code that used to be here called
// UpdateDataverseCommand when saving new guestbooks, and that involved
// an unnecessary reindexing of the dataverse (and, in some cases,
// reindexing of the underlying datasets). - L.A.
if (editMode == EditMode.CREATE || editMode == EditMode.CLONE ) {
guestbook.setCreateTime(new Timestamp(new Date().getTime()));
guestbook.setUsageCount(Long.valueOf(0));
guestbook.setEnabled(true);
dataverse.getGuestbooks().add(guestbook);
create = true;
}
cmd = new UpdateDataverseGuestbookCommand(dataverse, guestbook, dvRequestService.getDataverseRequest());
commandEngine.submit(cmd);

} catch (EJBException ex) {
StringBuilder error = new StringBuilder();
error.append(ex).append(" ");
4 changes: 3 additions & 1 deletion src/main/java/edu/harvard/iq/dataverse/api/Search.java
@@ -157,7 +157,9 @@ public Response search(
numResultsPerPage,
true, //SEK get query entities always for search API additional Dataset Information 6300 12/6/2019
geoPoint,
geoRadius,
showFacets, // facets are expensive, no need to ask for them if not requested
showRelevance // no need for highlights unless requested either
);
} catch (SearchException ex) {
Throwable cause = ex;
@@ -32,6 +32,8 @@ public class UpdateDataverseCommand extends AbstractCommand<Dataverse> {
private final List<DatasetFieldType> facetList;
private final List<Dataverse> featuredDataverseList;
private final List<DataverseFieldTypeInputLevel> inputLevelList;

private boolean datasetsReindexRequired = false;

public UpdateDataverseCommand(Dataverse editedDv, List<DatasetFieldType> facetList, List<Dataverse> featuredDataverseList,
DataverseRequest aRequest, List<DataverseFieldTypeInputLevel> inputLevelList ) {
@@ -74,9 +76,13 @@ public Dataverse execute(CommandContext ctxt) throws CommandException {
}
}

Dataverse oldDv = ctxt.dataverses().find(editedDv.getId());

DataverseType oldDvType = oldDv.getDataverseType();
String oldDvAlias = oldDv.getAlias();
String oldDvName = oldDv.getName();
oldDv = null;

Dataverse result = ctxt.dataverses().save(editedDv);

if ( facetList != null ) {
Expand All @@ -101,6 +107,14 @@ public Dataverse execute(CommandContext ctxt) throws CommandException {
}
}

// We don't want to reindex the child datasets unnecessarily:
// when any of these values change, all child datasets need to be reindexed.
// This check is not recursive, as all the values just report the immediate parent.
if (!oldDvType.equals(editedDv.getDataverseType())
|| !oldDvName.equals(editedDv.getName())
|| !oldDvAlias.equals(editedDv.getAlias())) {
datasetsReindexRequired = true;
}

return result;
}
@@ -110,9 +124,16 @@ public boolean onSuccess(CommandContext ctxt, Object r) {

// first kick off the async indexing of datasets
// TODO: is this actually needed? Is there a better way to handle
// It appears that we at some point lost some extra logic here, where
// we only reindex the underlying datasets if one or more of the specific set
// of fields have been changed (since these values are included in the
// indexed solr documents for datasets). So I'm putting that back. -L.A.
Dataverse result = (Dataverse) r;

if (datasetsReindexRequired) {
List<Dataset> datasets = ctxt.datasets().findByOwnerId(result.getId());
ctxt.index().asyncIndexDatasetList(datasets, true);
}

return ctxt.dataverses().index((Dataverse) r);
}
@@ -39,7 +39,6 @@
import jakarta.ws.rs.Path;
import jakarta.ws.rs.Produces;
import jakarta.ws.rs.QueryParam;
import jakarta.ws.rs.DefaultValue;
import jakarta.ws.rs.container.ContainerRequestContext;
import jakarta.ws.rs.core.Context;

@@ -226,7 +225,12 @@ private SolrQueryResponse getTotalCountsFromSolr(DataverseRequest dataverseReque
//SearchFields.RELEASE_OR_CREATE_DATE, SortBy.DESCENDING,
0, //paginationStart,
true, // dataRelatedToMe
SearchConstants.NUM_SOLR_DOCS_TO_RETRIEVE, //10 // SearchFields.NUM_SOLR_DOCS_TO_RETRIEVE
true,
null,
null,
false, // no need to request facets here ...
false // ... same for highlights
);
} catch (SearchException ex) {
logger.severe("Search for total counts failed with filter query");
@@ -420,6 +420,7 @@ public void asyncIndexDataset(Dataset dataset, boolean doNormalSolrDocCleanUp) {
}
}

@Asynchronous
public void asyncIndexDatasetList(List<Dataset> datasets, boolean doNormalSolrDocCleanUp) {
for(Dataset dataset : datasets) {
asyncIndexDataset(dataset, true);

