Skip to content

Commit

Permalink
Merge pull request #9558 from ErykKul/9557_dataset_update_api_perform…
Browse files Browse the repository at this point in the history
…ance

async indexing after update command
  • Loading branch information
kcondon committed Jun 15, 2023
2 parents 9feb865 + 56c4a38 commit 49f42ba
Show file tree
Hide file tree
Showing 44 changed files with 394 additions and 807 deletions.
3 changes: 3 additions & 0 deletions doc/release-notes/9558-async-indexing.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Performance improvements, especially for large datasets containing thousands of files.
Uploading files one by one to the dataset is much faster now, allowing uploading thousands of files in an acceptable timeframe. Not only uploading a file, but all edit operations on datasets containing many files, got faster.
Performance tweaks include indexing of the datasets in the background and optimizations that reduce the number of indexing operations needed. Furthermore, updates to the dataset no longer wait for ingesting to finish. Ingesting was already running in the background, but it took a lock, preventing updates to the dataset and degrading performance for datasets containing many files.
5 changes: 5 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,11 @@
<scope>provided</scope>
<!-- no version here as managed by Payara BOM above! -->
</dependency>
<dependency>
<groupId>fish.payara.api</groupId>
<artifactId>payara-api</artifactId>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>com.sun.mail</groupId>
<artifactId>jakarta.mail</artifactId>
Expand Down
366 changes: 0 additions & 366 deletions src/main/java/edu/harvard/iq/dataverse/DataFileServiceBean.java

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions src/main/java/edu/harvard/iq/dataverse/Dataset.java
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,10 @@
* @author skraffmiller
*/
@NamedQueries({
// Dataset.findById should only be used if you're going to iterate over files (otherwise, lazy loading in DatasetService.find() is better).
// If you are going to iterate over files, preferably call the DatasetService.findDeep() method instead of using this query directly.
@NamedQuery(name = "Dataset.findById",
query = "SELECT o FROM Dataset o LEFT JOIN FETCH o.files WHERE o.id=:id"),
@NamedQuery(name = "Dataset.findIdStale",
query = "SELECT d.id FROM Dataset d WHERE d.indexTime is NULL OR d.indexTime < d.modificationTime"),
@NamedQuery(name = "Dataset.findIdStalePermission",
Expand Down
106 changes: 47 additions & 59 deletions src/main/java/edu/harvard/iq/dataverse/DatasetPage.java
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
import edu.harvard.iq.dataverse.ingest.IngestServiceBean;
import edu.harvard.iq.dataverse.license.LicenseServiceBean;
import edu.harvard.iq.dataverse.metadataimport.ForeignMetadataImportServiceBean;
import edu.harvard.iq.dataverse.pidproviders.PidUtil;
import edu.harvard.iq.dataverse.privateurl.PrivateUrl;
import edu.harvard.iq.dataverse.privateurl.PrivateUrlServiceBean;
import edu.harvard.iq.dataverse.privateurl.PrivateUrlUtil;
Expand Down Expand Up @@ -81,6 +82,8 @@
import java.util.Set;
import java.util.Collection;
import java.util.logging.Logger;
import java.util.stream.Collectors;

import javax.ejb.EJB;
import javax.ejb.EJBException;
import javax.faces.application.FacesMessage;
Expand Down Expand Up @@ -233,6 +236,8 @@ public enum DisplayMode {
ExternalToolServiceBean externalToolService;
@EJB
SolrClientService solrClientService;
@EJB
DvObjectServiceBean dvObjectService;
@Inject
DataverseRequestServiceBean dvRequestService;
@Inject
Expand Down Expand Up @@ -678,48 +683,43 @@ public void showAll(){
}

private List<FileMetadata> selectFileMetadatasForDisplay() {
Set<Long> searchResultsIdSet = null;

if (isIndexedVersion()) {
final Set<Long> searchResultsIdSet;
if (StringUtil.isEmpty(fileLabelSearchTerm) && StringUtil.isEmpty(fileTypeFacet) && StringUtil.isEmpty(fileAccessFacet) && StringUtil.isEmpty(fileTagsFacet)) {
// But, if no search terms were specified, we return the full
// list of the files in the version:
// Since the search results should include the full set of fmds if all the
// terms/facets are empty, setting them to null should just be
// an optimization for the loop below
searchResultsIdSet = null;
} else if (isIndexedVersion()) {
// We run the search even if no search term and/or facets are
// specified - to generate the facet labels list:
searchResultsIdSet = getFileIdsInVersionFromSolr(workingVersion.getId(), this.fileLabelSearchTerm);
// But, if no search terms were specified, we return the full
// list of the files in the version:
if (StringUtil.isEmpty(fileLabelSearchTerm)
&& StringUtil.isEmpty(fileTypeFacet)
&& StringUtil.isEmpty(fileAccessFacet)
&& StringUtil.isEmpty(fileTagsFacet)) {
// Since the search results should include the full set of fmds if all the
// terms/facets are empty, setting them to null should just be
// an optimization for the loop below
searchResultsIdSet = null;
}
} else {
} else if (!StringUtil.isEmpty(this.fileLabelSearchTerm)) {
// No, this is not an indexed version.
// If the search term was specified, we'll run a search in the db;
// if not - return the full list of files in the version.
// (no facets without solr!)
if (!StringUtil.isEmpty(this.fileLabelSearchTerm)) {
searchResultsIdSet = getFileIdsInVersionFromDb(workingVersion.getId(), this.fileLabelSearchTerm);
}
searchResultsIdSet = getFileIdsInVersionFromDb(workingVersion.getId(), this.fileLabelSearchTerm);
} else {
searchResultsIdSet = null;
}

List<FileMetadata> retList = new ArrayList<>();

for (FileMetadata fileMetadata : workingVersion.getFileMetadatas()) {
if (searchResultsIdSet == null || searchResultsIdSet.contains(fileMetadata.getDataFile().getId())) {
retList.add(fileMetadata);
}
final List<FileMetadata> md = workingVersion.getFileMetadatas();
final List<FileMetadata> retList;
if (searchResultsIdSet == null) {
retList = new ArrayList<>(md);
} else {
retList = md.stream().filter(x -> searchResultsIdSet.contains(x.getDataFile().getId())).collect(Collectors.toList());
}
sortFileMetadatas(retList);
return retList;
}

private void sortFileMetadatas(List<FileMetadata> fileList) {
private void sortFileMetadatas(final List<FileMetadata> fileList) {

DataFileComparator dfc = new DataFileComparator();
Comparator<FileMetadata> comp = dfc.compareBy(folderPresort, tagPresort, fileSortField, !"desc".equals(fileSortOrder));
final DataFileComparator dfc = new DataFileComparator();
final Comparator<FileMetadata> comp = dfc.compareBy(folderPresort, tagPresort, fileSortField, !"desc".equals(fileSortOrder));
Collections.sort(fileList, comp);
}

Expand Down Expand Up @@ -1843,6 +1843,17 @@ public boolean webloaderUploadSupported() {
return settingsWrapper.isWebloaderUpload() && StorageIO.isDirectUploadEnabled(dataset.getEffectiveStorageDriverId());
}

/**
 * Resolves the persistentId request parameter to a database id and, when a match
 * exists, stores it via setId(). The primary global-id lookup is tried first; if
 * it finds nothing, the alternative (legacy) global-id lookup is used as a fallback.
 */
private void setIdByPersistentId() {
    final GlobalId globalId = PidUtil.parseAsGlobalID(persistentId);
    Long datasetId = dvObjectService.findIdByGlobalId(globalId, DvObject.DType.Dataset);
    if (datasetId == null) {
        datasetId = dvObjectService.findIdByAltGlobalId(globalId, DvObject.DType.Dataset);
    }
    if (datasetId == null) {
        return;
    }
    this.setId(datasetId);
}

private String init(boolean initFull) {

//System.out.println("_YE_OLDE_QUERY_COUNTER_"); // for debug purposes
Expand All @@ -1866,23 +1877,11 @@ private String init(boolean initFull) {
// Set the workingVersion and Dataset
// ---------------------------------------
if (persistentId != null) {
logger.fine("initializing DatasetPage with persistent ID " + persistentId);
// Set Working Version and Dataset by PersistentID
dataset = datasetService.findByGlobalId(persistentId);
if (dataset == null) {
logger.warning("No such dataset: "+persistentId);
return permissionsWrapper.notFound();
}
logger.fine("retrieved dataset, id="+dataset.getId());

retrieveDatasetVersionResponse = datasetVersionService.selectRequestedVersion(dataset.getVersions(), version);
//retrieveDatasetVersionResponse = datasetVersionService.retrieveDatasetVersionByPersistentId(persistentId, version);
this.workingVersion = retrieveDatasetVersionResponse.getDatasetVersion();
logger.fine("retrieved version: id: " + workingVersion.getId() + ", state: " + this.workingVersion.getVersionState());

} else if (this.getId() != null) {
setIdByPersistentId();
}
if (this.getId() != null) {
// Set Working Version and Dataset by Datasaet Id and Version
dataset = datasetService.find(this.getId());
dataset = datasetService.findDeep(this.getId());
if (dataset == null) {
logger.warning("No such dataset: "+dataset);
return permissionsWrapper.notFound();
Expand Down Expand Up @@ -1978,11 +1977,6 @@ private String init(boolean initFull) {
// init the list of FileMetadatas
if (workingVersion.isDraft() && canUpdateDataset()) {
readOnly = false;
} else {
// an attempt to retreive both the filemetadatas and datafiles early on, so that
// we don't have to do so later (possibly, many more times than necessary):
AuthenticatedUser au = session.getUser() instanceof AuthenticatedUser ? (AuthenticatedUser) session.getUser() : null;
datafileService.findFileMetadataOptimizedExperimental(dataset, workingVersion, au);
}
// This will default to all the files in the version, if the search term
// parameter hasn't been specified yet:
Expand Down Expand Up @@ -2849,15 +2843,14 @@ public String refresh() {
DatasetVersionServiceBean.RetrieveDatasetVersionResponse retrieveDatasetVersionResponse = null;

if (persistentId != null) {
//retrieveDatasetVersionResponse = datasetVersionService.retrieveDatasetVersionByPersistentId(persistentId, version);
dataset = datasetService.findByGlobalId(persistentId);
setIdByPersistentId();
}
if (dataset.getId() != null) {
//retrieveDatasetVersionResponse = datasetVersionService.retrieveDatasetVersionById(dataset.getId(), version);
dataset = datasetService.findDeep(dataset.getId());
retrieveDatasetVersionResponse = datasetVersionService.selectRequestedVersion(dataset.getVersions(), version);
} else if (versionId != null) {
retrieveDatasetVersionResponse = datasetVersionService.retrieveDatasetVersionByVersionId(versionId);
} else if (dataset.getId() != null) {
//retrieveDatasetVersionResponse = datasetVersionService.retrieveDatasetVersionById(dataset.getId(), version);
dataset = datasetService.find(dataset.getId());
retrieveDatasetVersionResponse = datasetVersionService.selectRequestedVersion(dataset.getVersions(), version);
}

if (retrieveDatasetVersionResponse == null) {
Expand All @@ -2882,11 +2875,6 @@ public String refresh() {
this.dataset = this.workingVersion.getDataset();
}

if (readOnly) {
AuthenticatedUser au = session.getUser() instanceof AuthenticatedUser ? (AuthenticatedUser) session.getUser() : null;
datafileService.findFileMetadataOptimizedExperimental(dataset, workingVersion, au);
}

fileMetadatasSearch = selectFileMetadatasForDisplay();

displayCitation = dataset.getCitation(true, workingVersion);
Expand Down
33 changes: 33 additions & 0 deletions src/main/java/edu/harvard/iq/dataverse/DatasetServiceBean.java
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
import javax.ejb.TransactionAttributeType;
import javax.inject.Named;
import javax.persistence.EntityManager;
import javax.persistence.LockModeType;
import javax.persistence.NoResultException;
import javax.persistence.PersistenceContext;
import javax.persistence.Query;
Expand Down Expand Up @@ -105,6 +106,38 @@ public Dataset find(Object pk) {
return em.find(Dataset.class, pk);
}

/**
 * Retrieves a dataset together with its deep underlying structure in a single query
 * execution. This is the better choice when the files of a dataset are going to be
 * accessed: the plain find() method does not pre-fetch the file objects, which results
 * in point queries whenever those objects are touched later. Because files have a deep
 * structure, findDeep() avoids a large number of queries, especially for datasets
 * containing many files that are iterated over completely.
 * When the file objects are not needed, prefer the default find() method and its lazy
 * loading instead.
 * @param pk the dataset primary key
 * @return the dataset with its file objects pre-fetched
 */
public Dataset findDeep(Object pk) {
    // Optimization hints: fetch the whole structure in one query execution, so that
    // iterating over the files later does not trigger per-object point queries.
    // Repeating the "eclipselink.left-join-fetch" hint accumulates the fetch paths.
    final String[] fetchPaths = {
        "o.files.ingestRequest",
        "o.files.thumbnailForDataset",
        "o.files.dataTables",
        "o.files.auxiliaryFiles",
        "o.files.ingestReports",
        "o.files.dataFileTags",
        "o.files.fileMetadatas",
        "o.files.fileMetadatas.fileCategories",
        "o.files.guestbookResponses",
        "o.files.embargo",
        "o.files.fileAccessRequests",
        "o.files.owner",
        "o.files.releaseUser",
        "o.files.creator",
        "o.files.alternativePersistentIndentifiers",
        "o.files.roleAssignments"
    };
    Query query = em.createNamedQuery("Dataset.findById").setParameter("id", pk);
    for (String path : fetchPaths) {
        query = query.setHint("eclipselink.left-join-fetch", path);
    }
    return (Dataset) query.getSingleResult();
}

// Convenience overload delegating to the two-arg findByOwnerId with the flag set
// to false. The flag's exact meaning is defined by that overload (not visible
// here) - presumably an "only published/released" filter; verify there.
public List<Dataset> findByOwnerId(Long ownerId) {
    return findByOwnerId(ownerId, false);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1118,13 +1118,7 @@ public JsonObjectBuilder fixMissingUnf(String datasetVersionId, boolean forceRec

// reindexing the dataset, to make sure the new UNF is in SOLR:
boolean doNormalSolrDocCleanUp = true;
try {
Future<String> indexingResult = indexService.indexDataset(datasetVersion.getDataset(), doNormalSolrDocCleanUp);
} catch (IOException | SolrServerException e) {
String failureLogText = "Post UNF update indexing failed. You can kickoff a re-index of this dataset with: \r\n curl http://localhost:8080/api/admin/index/datasets/" + datasetVersion.getDataset().getId().toString();
failureLogText += "\r\n" + e.getLocalizedMessage();
LoggingUtil.writeOnSuccessFailureLog(null, failureLogText, datasetVersion.getDataset());
}
indexService.asyncIndexDataset(datasetVersion.getDataset(), doNormalSolrDocCleanUp);
return info;
}

Expand Down
4 changes: 4 additions & 0 deletions src/main/java/edu/harvard/iq/dataverse/DvObject.java
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,13 @@
query="SELECT COUNT(obj) FROM DvObject obj WHERE obj.owner.id=:id"),
@NamedQuery(name = "DvObject.findByGlobalId",
query = "SELECT o FROM DvObject o WHERE o.identifier=:identifier and o.authority=:authority and o.protocol=:protocol and o.dtype=:dtype"),
@NamedQuery(name = "DvObject.findIdByGlobalId",
query = "SELECT o.id FROM DvObject o WHERE o.identifier=:identifier and o.authority=:authority and o.protocol=:protocol and o.dtype=:dtype"),

@NamedQuery(name = "DvObject.findByAlternativeGlobalId",
query = "SELECT o FROM DvObject o, AlternativePersistentIdentifier a WHERE o.id = a.dvObject.id and a.identifier=:identifier and a.authority=:authority and a.protocol=:protocol and o.dtype=:dtype"),
@NamedQuery(name = "DvObject.findIdByAlternativeGlobalId",
query = "SELECT o.id FROM DvObject o, AlternativePersistentIdentifier a WHERE o.id = a.dvObject.id and a.identifier=:identifier and a.authority=:authority and a.protocol=:protocol and o.dtype=:dtype"),

@NamedQuery(name = "DvObject.findByProtocolIdentifierAuthority",
query = "SELECT o FROM DvObject o WHERE o.identifier=:identifier and o.authority=:authority and o.protocol=:protocol"),
Expand Down
31 changes: 31 additions & 0 deletions src/main/java/edu/harvard/iq/dataverse/DvObjectServiceBean.java
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,16 @@ public DvObject findByAltGlobalId(GlobalId globalId, DvObject.DType dtype) {
return runFindByGlobalId(query, globalId, dtype);
}

/**
 * Looks up the database id of the DvObject with the given (primary) global id and type.
 * @return the id, or null when there is no match
 */
public Long findIdByGlobalId(GlobalId globalId, DvObject.DType dtype) {
    return runFindIdByGlobalId(em.createNamedQuery("DvObject.findIdByGlobalId"), globalId, dtype);
}

/**
 * Looks up the database id of the DvObject with the given alternative (legacy)
 * global id and type.
 * @return the id, or null when there is no match
 */
public Long findIdByAltGlobalId(GlobalId globalId, DvObject.DType dtype) {
    return runFindIdByGlobalId(em.createNamedQuery("DvObject.findIdByAlternativeGlobalId"), globalId, dtype);
}

private DvObject runFindByGlobalId(Query query, GlobalId gid, DvObject.DType dtype) {
DvObject foundDvObject = null;
try {
Expand All @@ -136,6 +146,27 @@ private DvObject runFindByGlobalId(Query query, GlobalId gid, DvObject.DType dty
}
return foundDvObject;
}

/**
 * Resolves a global id to the database id of the matching DvObject of the given type.
 * The supplied named query must select the object id and accept the identifier,
 * protocol, authority and dtype parameters.
 * @param query named query selecting o.id by the global-id components
 * @param gid   the global (persistent) identifier to look up
 * @param dtype the expected DvObject type
 * @return the database id, or null when no match exists or the lookup fails
 */
private Long runFindIdByGlobalId(Query query, GlobalId gid, DvObject.DType dtype) {
    try {
        query.setParameter("identifier", gid.getIdentifier());
        query.setParameter("protocol", gid.getProtocol());
        query.setParameter("authority", gid.getAuthority());
        query.setParameter("dtype", dtype.getDType());
        // The local previously named "foundDvObject" held a Long id, not a DvObject;
        // returning directly makes that clear.
        return (Long) query.getSingleResult();
    } catch (javax.persistence.NoResultException e) {
        // (logged at .fine: this can fill the log file with thousands of
        // these messages during a large harvest run)
        logger.fine("no dvObject found: " + gid.asString());
        // Do nothing, just return null.
        return null;
    } catch (Exception ex) {
        // Log message previously said "findByGlobalId" (copy-paste from
        // runFindByGlobalId); corrected so the log points at the right method.
        logger.info("Exception caught in runFindIdByGlobalId: " + ex.getLocalizedMessage());
        return null;
    }
}

public DvObject findByGlobalId(GlobalId globalId) {
return (DvObject) em.createNamedQuery("DvObject.findByProtocolIdentifierAuthority")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -750,6 +750,14 @@ else if (dataset.isLockedFor(DatasetLock.Reason.InReview)) {
}
}
}

/**
 * Lock check for dataset-version updates: an Ingest lock alone does not block the
 * update; any lock with another reason delegates to the regular edit-lock check,
 * which may throw.
 * @throws IllegalCommandException when a non-ingest lock forbids the update
 */
public void checkUpdateDatasetVersionLock(Dataset dataset, DataverseRequest dataverseRequest, Command<?> command) throws IllegalCommandException {
    if (!dataset.isLocked()) {
        return;
    }
    for (DatasetLock lock : dataset.getLocks()) {
        if (!DatasetLock.Reason.Ingest.equals(lock.getReason())) {
            // At least one lock that is not an ingest lock: apply the standard edit check.
            checkEditDatasetLock(dataset, dataverseRequest, command);
            return;
        }
    }
}

public void checkPublishDatasetLock(Dataset dataset, DataverseRequest dataverseRequest, Command command) throws IllegalCommandException {
if (dataset.isLocked()) {
Expand Down

0 comments on commit 49f42ba

Please sign in to comment.