Skip to content

Commit

Permalink
Merge pull request #9558 from ErykKul/9557_dataset_update_api_perform…
Browse files Browse the repository at this point in the history
…ance

async indexing after update command
  • Loading branch information
kcondon committed Jun 15, 2023
2 parents 9feb865 + 56c4a38 commit 49f42ba
Show file tree
Hide file tree
Showing 44 changed files with 394 additions and 807 deletions.
3 changes: 3 additions & 0 deletions doc/release-notes/9558-async-indexing.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Performance improvements, especially for large datasets containing thousands of files.
Uploading files one by one to the dataset is much faster now, allowing uploading thousands of files in an acceptable timeframe. Not only uploading a file, but all edit operations on datasets containing many files, got faster.
Performance tweaks include indexing of the datasets in the background and optimizations that reduce the number of indexing operations needed. Furthermore, updates to the dataset no longer wait for ingesting to finish. Ingesting was already running in the background, but it took a lock, preventing updates to the dataset and degrading performance for datasets containing many files.
5 changes: 5 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,11 @@
<scope>provided</scope>
<!-- no version here as managed by Payara BOM above! -->
</dependency>
<dependency>
<groupId>fish.payara.api</groupId>
<artifactId>payara-api</artifactId>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>com.sun.mail</groupId>
<artifactId>jakarta.mail</artifactId>
Expand Down
366 changes: 0 additions & 366 deletions src/main/java/edu/harvard/iq/dataverse/DataFileServiceBean.java

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions src/main/java/edu/harvard/iq/dataverse/Dataset.java
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,10 @@
* @author skraffmiller
*/
@NamedQueries({
// Dataset.findById should only be used if you're going to iterate over files (otherwise, lazy loading in DatasetService.find() is better).
// If you are going to iterate over files, preferably call the DatasetService.findDeep() method instead of using this query directly.
@NamedQuery(name = "Dataset.findById",
query = "SELECT o FROM Dataset o LEFT JOIN FETCH o.files WHERE o.id=:id"),
@NamedQuery(name = "Dataset.findIdStale",
query = "SELECT d.id FROM Dataset d WHERE d.indexTime is NULL OR d.indexTime < d.modificationTime"),
@NamedQuery(name = "Dataset.findIdStalePermission",
Expand Down
106 changes: 47 additions & 59 deletions src/main/java/edu/harvard/iq/dataverse/DatasetPage.java
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
import edu.harvard.iq.dataverse.ingest.IngestServiceBean;
import edu.harvard.iq.dataverse.license.LicenseServiceBean;
import edu.harvard.iq.dataverse.metadataimport.ForeignMetadataImportServiceBean;
import edu.harvard.iq.dataverse.pidproviders.PidUtil;
import edu.harvard.iq.dataverse.privateurl.PrivateUrl;
import edu.harvard.iq.dataverse.privateurl.PrivateUrlServiceBean;
import edu.harvard.iq.dataverse.privateurl.PrivateUrlUtil;
Expand Down Expand Up @@ -81,6 +82,8 @@
import java.util.Set;
import java.util.Collection;
import java.util.logging.Logger;
import java.util.stream.Collectors;

import javax.ejb.EJB;
import javax.ejb.EJBException;
import javax.faces.application.FacesMessage;
Expand Down Expand Up @@ -233,6 +236,8 @@ public enum DisplayMode {
ExternalToolServiceBean externalToolService;
@EJB
SolrClientService solrClientService;
@EJB
DvObjectServiceBean dvObjectService;
@Inject
DataverseRequestServiceBean dvRequestService;
@Inject
Expand Down Expand Up @@ -678,48 +683,43 @@ public void showAll(){
}

private List<FileMetadata> selectFileMetadatasForDisplay() {
Set<Long> searchResultsIdSet = null;

if (isIndexedVersion()) {
final Set<Long> searchResultsIdSet;
if (StringUtil.isEmpty(fileLabelSearchTerm) && StringUtil.isEmpty(fileTypeFacet) && StringUtil.isEmpty(fileAccessFacet) && StringUtil.isEmpty(fileTagsFacet)) {
// But, if no search terms were specified, we return the full
// list of the files in the version:
// Since the search results should include the full set of fmds if all the
// terms/facets are empty, setting them to null should just be
// an optimization for the loop below
searchResultsIdSet = null;
} else if (isIndexedVersion()) {
// We run the search even if no search term and/or facets are
// specified - to generate the facet labels list:
searchResultsIdSet = getFileIdsInVersionFromSolr(workingVersion.getId(), this.fileLabelSearchTerm);
// But, if no search terms were specified, we return the full
// list of the files in the version:
if (StringUtil.isEmpty(fileLabelSearchTerm)
&& StringUtil.isEmpty(fileTypeFacet)
&& StringUtil.isEmpty(fileAccessFacet)
&& StringUtil.isEmpty(fileTagsFacet)) {
// Since the search results should include the full set of fmds if all the
// terms/facets are empty, setting them to null should just be
// an optimization for the loop below
searchResultsIdSet = null;
}
} else {
} else if (!StringUtil.isEmpty(this.fileLabelSearchTerm)) {
// No, this is not an indexed version.
// If the search term was specified, we'll run a search in the db;
// if not - return the full list of files in the version.
// (no facets without solr!)
if (!StringUtil.isEmpty(this.fileLabelSearchTerm)) {
searchResultsIdSet = getFileIdsInVersionFromDb(workingVersion.getId(), this.fileLabelSearchTerm);
}
searchResultsIdSet = getFileIdsInVersionFromDb(workingVersion.getId(), this.fileLabelSearchTerm);
} else {
searchResultsIdSet = null;
}

List<FileMetadata> retList = new ArrayList<>();

for (FileMetadata fileMetadata : workingVersion.getFileMetadatas()) {
if (searchResultsIdSet == null || searchResultsIdSet.contains(fileMetadata.getDataFile().getId())) {
retList.add(fileMetadata);
}
final List<FileMetadata> md = workingVersion.getFileMetadatas();
final List<FileMetadata> retList;
if (searchResultsIdSet == null) {
retList = new ArrayList<>(md);
} else {
retList = md.stream().filter(x -> searchResultsIdSet.contains(x.getDataFile().getId())).collect(Collectors.toList());
}
sortFileMetadatas(retList);
return retList;
}

private void sortFileMetadatas(List<FileMetadata> fileList) {
private void sortFileMetadatas(final List<FileMetadata> fileList) {

DataFileComparator dfc = new DataFileComparator();
Comparator<FileMetadata> comp = dfc.compareBy(folderPresort, tagPresort, fileSortField, !"desc".equals(fileSortOrder));
final DataFileComparator dfc = new DataFileComparator();
final Comparator<FileMetadata> comp = dfc.compareBy(folderPresort, tagPresort, fileSortField, !"desc".equals(fileSortOrder));
Collections.sort(fileList, comp);
}

Expand Down Expand Up @@ -1843,6 +1843,17 @@ public boolean webloaderUploadSupported() {
return settingsWrapper.isWebloaderUpload() && StorageIO.isDirectUploadEnabled(dataset.getEffectiveStorageDriverId());
}

/**
 * Resolves the persistentId request parameter to a database id and, when a match
 * exists, stores it via setId(). The primary global-id lookup is tried first; if
 * it finds nothing, the alternative (legacy) global-id lookup is used as a fallback.
 */
private void setIdByPersistentId() {
    final GlobalId globalId = PidUtil.parseAsGlobalID(persistentId);
    Long datasetId = dvObjectService.findIdByGlobalId(globalId, DvObject.DType.Dataset);
    if (datasetId == null) {
        datasetId = dvObjectService.findIdByAltGlobalId(globalId, DvObject.DType.Dataset);
    }
    if (datasetId == null) {
        return;
    }
    this.setId(datasetId);
}

private String init(boolean initFull) {

//System.out.println("_YE_OLDE_QUERY_COUNTER_"); // for debug purposes
Expand All @@ -1866,23 +1877,11 @@ private String init(boolean initFull) {
// Set the workingVersion and Dataset
// ---------------------------------------
if (persistentId != null) {
logger.fine("initializing DatasetPage with persistent ID " + persistentId);
// Set Working Version and Dataset by PersistentID
dataset = datasetService.findByGlobalId(persistentId);
if (dataset == null) {
logger.warning("No such dataset: "+persistentId);
return permissionsWrapper.notFound();
}
logger.fine("retrieved dataset, id="+dataset.getId());

retrieveDatasetVersionResponse = datasetVersionService.selectRequestedVersion(dataset.getVersions(), version);
//retrieveDatasetVersionResponse = datasetVersionService.retrieveDatasetVersionByPersistentId(persistentId, version);
this.workingVersion = retrieveDatasetVersionResponse.getDatasetVersion();
logger.fine("retrieved version: id: " + workingVersion.getId() + ", state: " + this.workingVersion.getVersionState());

} else if (this.getId() != null) {
setIdByPersistentId();
}
if (this.getId() != null) {
// Set Working Version and Dataset by Datasaet Id and Version
dataset = datasetService.find(this.getId());
dataset = datasetService.findDeep(this.getId());
if (dataset == null) {
logger.warning("No such dataset: "+dataset);
return permissionsWrapper.notFound();
Expand Down Expand Up @@ -1978,11 +1977,6 @@ private String init(boolean initFull) {
// init the list of FileMetadatas
if (workingVersion.isDraft() && canUpdateDataset()) {
readOnly = false;
} else {
// an attempt to retreive both the filemetadatas and datafiles early on, so that
// we don't have to do so later (possibly, many more times than necessary):
AuthenticatedUser au = session.getUser() instanceof AuthenticatedUser ? (AuthenticatedUser) session.getUser() : null;
datafileService.findFileMetadataOptimizedExperimental(dataset, workingVersion, au);
}
// This will default to all the files in the version, if the search term
// parameter hasn't been specified yet:
Expand Down Expand Up @@ -2849,15 +2843,14 @@ public String refresh() {
DatasetVersionServiceBean.RetrieveDatasetVersionResponse retrieveDatasetVersionResponse = null;

if (persistentId != null) {
//retrieveDatasetVersionResponse = datasetVersionService.retrieveDatasetVersionByPersistentId(persistentId, version);
dataset = datasetService.findByGlobalId(persistentId);
setIdByPersistentId();
}
if (dataset.getId() != null) {
//retrieveDatasetVersionResponse = datasetVersionService.retrieveDatasetVersionById(dataset.getId(), version);
dataset = datasetService.findDeep(dataset.getId());
retrieveDatasetVersionResponse = datasetVersionService.selectRequestedVersion(dataset.getVersions(), version);
} else if (versionId != null) {
retrieveDatasetVersionResponse = datasetVersionService.retrieveDatasetVersionByVersionId(versionId);
} else if (dataset.getId() != null) {
//retrieveDatasetVersionResponse = datasetVersionService.retrieveDatasetVersionById(dataset.getId(), version);
dataset = datasetService.find(dataset.getId());
retrieveDatasetVersionResponse = datasetVersionService.selectRequestedVersion(dataset.getVersions(), version);
}

if (retrieveDatasetVersionResponse == null) {
Expand All @@ -2882,11 +2875,6 @@ public String refresh() {
this.dataset = this.workingVersion.getDataset();
}

if (readOnly) {
AuthenticatedUser au = session.getUser() instanceof AuthenticatedUser ? (AuthenticatedUser) session.getUser() : null;
datafileService.findFileMetadataOptimizedExperimental(dataset, workingVersion, au);
}

fileMetadatasSearch = selectFileMetadatasForDisplay();

displayCitation = dataset.getCitation(true, workingVersion);
Expand Down
33 changes: 33 additions & 0 deletions src/main/java/edu/harvard/iq/dataverse/DatasetServiceBean.java
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
import javax.ejb.TransactionAttributeType;
import javax.inject.Named;
import javax.persistence.EntityManager;
import javax.persistence.LockModeType;
import javax.persistence.NoResultException;
import javax.persistence.PersistenceContext;
import javax.persistence.Query;
Expand Down Expand Up @@ -105,6 +106,38 @@ public Dataset find(Object pk) {
return em.find(Dataset.class, pk);
}

/**
 * Retrieves a dataset together with its deep underlying structure in a single query
 * execution. This is the better choice when the files of a dataset are going to be
 * accessed: the plain find() method does not pre-fetch the file objects, which results
 * in point queries whenever those objects are touched later. Because files have a deep
 * structure, findDeep() avoids a large number of queries, especially for datasets
 * containing many files that are iterated over completely.
 * When the file objects are not needed, prefer the default find() method and its lazy
 * loading instead.
 * @param pk the dataset primary key
 * @return the dataset with its file objects pre-fetched
 */
public Dataset findDeep(Object pk) {
    // Optimization hints: fetch the whole structure in one query execution, so that
    // iterating over the files later does not trigger per-object point queries.
    // Repeating the "eclipselink.left-join-fetch" hint accumulates the fetch paths.
    final String[] fetchPaths = {
        "o.files.ingestRequest",
        "o.files.thumbnailForDataset",
        "o.files.dataTables",
        "o.files.auxiliaryFiles",
        "o.files.ingestReports",
        "o.files.dataFileTags",
        "o.files.fileMetadatas",
        "o.files.fileMetadatas.fileCategories",
        "o.files.guestbookResponses",
        "o.files.embargo",
        "o.files.fileAccessRequests",
        "o.files.owner",
        "o.files.releaseUser",
        "o.files.creator",
        "o.files.alternativePersistentIndentifiers",
        "o.files.roleAssignments"
    };
    Query query = em.createNamedQuery("Dataset.findById").setParameter("id", pk);
    for (String path : fetchPaths) {
        query = query.setHint("eclipselink.left-join-fetch", path);
    }
    return (Dataset) query.getSingleResult();
}

// Convenience overload delegating to the two-arg findByOwnerId with the flag set
// to false. The flag's exact meaning is defined by that overload (not visible
// here) - presumably an "only published/released" filter; verify there.
public List<Dataset> findByOwnerId(Long ownerId) {
    return findByOwnerId(ownerId, false);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1118,13 +1118,7 @@ public JsonObjectBuilder fixMissingUnf(String datasetVersionId, boolean forceRec

// reindexing the dataset, to make sure the new UNF is in SOLR:
boolean doNormalSolrDocCleanUp = true;
try {
Future<String> indexingResult = indexService.indexDataset(datasetVersion.getDataset(), doNormalSolrDocCleanUp);
} catch (IOException | SolrServerException e) {
String failureLogText = "Post UNF update indexing failed. You can kickoff a re-index of this dataset with: \r\n curl http://localhost:8080/api/admin/index/datasets/" + datasetVersion.getDataset().getId().toString();
failureLogText += "\r\n" + e.getLocalizedMessage();
LoggingUtil.writeOnSuccessFailureLog(null, failureLogText, datasetVersion.getDataset());
}
indexService.asyncIndexDataset(datasetVersion.getDataset(), doNormalSolrDocCleanUp);
return info;
}

Expand Down
4 changes: 4 additions & 0 deletions src/main/java/edu/harvard/iq/dataverse/DvObject.java
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,13 @@
query="SELECT COUNT(obj) FROM DvObject obj WHERE obj.owner.id=:id"),
@NamedQuery(name = "DvObject.findByGlobalId",
query = "SELECT o FROM DvObject o WHERE o.identifier=:identifier and o.authority=:authority and o.protocol=:protocol and o.dtype=:dtype"),
@NamedQuery(name = "DvObject.findIdByGlobalId",
query = "SELECT o.id FROM DvObject o WHERE o.identifier=:identifier and o.authority=:authority and o.protocol=:protocol and o.dtype=:dtype"),

@NamedQuery(name = "DvObject.findByAlternativeGlobalId",
query = "SELECT o FROM DvObject o, AlternativePersistentIdentifier a WHERE o.id = a.dvObject.id and a.identifier=:identifier and a.authority=:authority and a.protocol=:protocol and o.dtype=:dtype"),
@NamedQuery(name = "DvObject.findIdByAlternativeGlobalId",
query = "SELECT o.id FROM DvObject o, AlternativePersistentIdentifier a WHERE o.id = a.dvObject.id and a.identifier=:identifier and a.authority=:authority and a.protocol=:protocol and o.dtype=:dtype"),

@NamedQuery(name = "DvObject.findByProtocolIdentifierAuthority",
query = "SELECT o FROM DvObject o WHERE o.identifier=:identifier and o.authority=:authority and o.protocol=:protocol"),
Expand Down
31 changes: 31 additions & 0 deletions src/main/java/edu/harvard/iq/dataverse/DvObjectServiceBean.java
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,16 @@ public DvObject findByAltGlobalId(GlobalId globalId, DvObject.DType dtype) {
return runFindByGlobalId(query, globalId, dtype);
}

/**
 * Looks up the database id of the DvObject with the given (primary) global id and type.
 * @return the id, or null when there is no match
 */
public Long findIdByGlobalId(GlobalId globalId, DvObject.DType dtype) {
    return runFindIdByGlobalId(em.createNamedQuery("DvObject.findIdByGlobalId"), globalId, dtype);
}

/**
 * Looks up the database id of the DvObject with the given alternative (legacy)
 * global id and type.
 * @return the id, or null when there is no match
 */
public Long findIdByAltGlobalId(GlobalId globalId, DvObject.DType dtype) {
    return runFindIdByGlobalId(em.createNamedQuery("DvObject.findIdByAlternativeGlobalId"), globalId, dtype);
}

private DvObject runFindByGlobalId(Query query, GlobalId gid, DvObject.DType dtype) {
DvObject foundDvObject = null;
try {
Expand All @@ -136,6 +146,27 @@ private DvObject runFindByGlobalId(Query query, GlobalId gid, DvObject.DType dty
}
return foundDvObject;
}

/**
 * Resolves a global id to the database id of the matching DvObject of the given type.
 * The supplied named query must select the object id and accept the identifier,
 * protocol, authority and dtype parameters.
 * @param query named query selecting o.id by the global-id components
 * @param gid   the global (persistent) identifier to look up
 * @param dtype the expected DvObject type
 * @return the database id, or null when no match exists or the lookup fails
 */
private Long runFindIdByGlobalId(Query query, GlobalId gid, DvObject.DType dtype) {
    try {
        query.setParameter("identifier", gid.getIdentifier());
        query.setParameter("protocol", gid.getProtocol());
        query.setParameter("authority", gid.getAuthority());
        query.setParameter("dtype", dtype.getDType());
        // The local previously named "foundDvObject" held a Long id, not a DvObject;
        // returning directly makes that clear.
        return (Long) query.getSingleResult();
    } catch (javax.persistence.NoResultException e) {
        // (logged at .fine: this can fill the log file with thousands of
        // these messages during a large harvest run)
        logger.fine("no dvObject found: " + gid.asString());
        // Do nothing, just return null.
        return null;
    } catch (Exception ex) {
        // Log message previously said "findByGlobalId" (copy-paste from
        // runFindByGlobalId); corrected so the log points at the right method.
        logger.info("Exception caught in runFindIdByGlobalId: " + ex.getLocalizedMessage());
        return null;
    }
}

public DvObject findByGlobalId(GlobalId globalId) {
return (DvObject) em.createNamedQuery("DvObject.findByProtocolIdentifierAuthority")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -750,6 +750,14 @@ else if (dataset.isLockedFor(DatasetLock.Reason.InReview)) {
}
}
}

/**
 * Lock check for dataset-version updates: an Ingest lock alone does not block the
 * update; any lock with another reason delegates to the regular edit-lock check,
 * which may throw.
 * @throws IllegalCommandException when a non-ingest lock forbids the update
 */
public void checkUpdateDatasetVersionLock(Dataset dataset, DataverseRequest dataverseRequest, Command<?> command) throws IllegalCommandException {
    if (!dataset.isLocked()) {
        return;
    }
    for (DatasetLock lock : dataset.getLocks()) {
        if (!DatasetLock.Reason.Ingest.equals(lock.getReason())) {
            // At least one lock that is not an ingest lock: apply the standard edit check.
            checkEditDatasetLock(dataset, dataverseRequest, command);
            return;
        }
    }
}

public void checkPublishDatasetLock(Dataset dataset, DataverseRequest dataverseRequest, Command command) throws IllegalCommandException {
if (dataset.isLocked()) {
Expand Down

0 comments on commit 49f42ba

Please sign in to comment.