From 1ab8b570254a70c1511ff97852c700caca8c5b74 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Tue, 25 Mar 2025 16:18:20 -0400 Subject: [PATCH 01/83] add debug index logging --- .../edu/harvard/iq/dataverse/search/IndexServiceBean.java | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java index a8e6c0661d7..16c30c069fa 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java @@ -1395,6 +1395,8 @@ public SolrInputDocuments toSolrDocs(IndexableDataset indexableDataset, Set Date: Tue, 25 Mar 2025 17:16:13 -0400 Subject: [PATCH 02/83] use loop constants, etc. --- .../iq/dataverse/search/IndexServiceBean.java | 136 ++++++++++-------- 1 file changed, 76 insertions(+), 60 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java index 16c30c069fa..57da8b4954e 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java @@ -1394,31 +1394,61 @@ public SolrInputDocuments toSolrDocs(IndexableDataset indexableDataset, Set datasetPublicationStatuses = new HashSet(); + if (dataset.getReleasedVersion() == null && !dataset.isHarvested()) { + datasetPublicationStatuses.add(UNPUBLISHED_STRING); + } + + if (datasetVersion.isInReview()) { + datasetPublicationStatuses.add(IN_REVIEW_STRING); + } + + if (indexableDataset.getDatasetState().equals(DatasetState.PUBLISHED)) { + datasetPublicationStatuses.add(PUBLISHED_STRING); + } else { + if (indexableDataset.getDatasetState().equals(DatasetState.WORKING_COPY)) { + datasetPublicationStatuses.add(DRAFT_STRING); + } + } + + String datasetVersionId = datasetVersion.getId().toString(); + boolean indexThisMetadata = 
indexableDataset.isFilesShouldBeIndexed(); + String datasetPersistentURL = dataset.getPersistentURL(); + for (FileMetadata fileMetadata : fileMetadatas) { long startTime = System.currentTimeMillis(); - + DataFile datafile = fileMetadata.getDataFile(); LocalDate end = null; LocalDate start = null; - Embargo emb= fileMetadata.getDataFile().getEmbargo(); + Embargo emb= datafile.getEmbargo(); if(emb!=null) { end = emb.getDateAvailable(); if(embargoEndDate==null || end.isAfter(embargoEndDate)) { embargoEndDate=end; } } - Retention ret= fileMetadata.getDataFile().getRetention(); + Retention ret= datafile.getRetention(); if(ret!=null) { start = ret.getDateUnavailable(); if(retentionEndDate==null || start.isBefore(retentionEndDate)) { retentionEndDate=start; } } - boolean indexThisMetadata = indexableDataset.isFilesShouldBeIndexed(); + if (indexThisMetadata && checkForDuplicateMetadata && !releasedFileMetadatas.isEmpty()) { logger.fine("Checking if this file metadata is a duplicate."); - FileMetadata getFromMap = fileMap.get(fileMetadata.getDataFile().getId()); + FileMetadata getFromMap = fileMap.get(datafile.getId()); if (getFromMap != null) { - if ((fileMetadata.getDataFile().isRestricted() == getFromMap.getDataFile().isRestricted())) { + if ((datafile.isRestricted() == getFromMap.getDataFile().isRestricted())) { if (fileMetadata.contentEquals(getFromMap) && VariableMetadataUtil.compareVariableMetadata(getFromMap, fileMetadata)) { indexThisMetadata = false; @@ -1434,11 +1464,11 @@ public SolrInputDocuments toSolrDocs(IndexableDataset indexableDataset, Set accessObject = null; InputStream instream = null; ContentHandler textHandler = null; try { - accessObject = DataAccess.getStorageIO(fileMetadata.getDataFile(), + accessObject = DataAccess.getStorageIO(datafile, new DataAccessRequest()); if (accessObject != null) { accessObject.open(); @@ -1466,10 +1497,8 @@ public SolrInputDocuments toSolrDocs(IndexableDataset indexableDataset, Set 
datafileSolrInputDocument.addField(SearchFields.PUBLICATION_STATUS,s)); String fileSolrDocId = solrDocIdentifierFile + fileEntityId; - if (indexableDataset.getDatasetState().equals(indexableDataset.getDatasetState().PUBLISHED)) { - fileSolrDocId = solrDocIdentifierFile + fileEntityId; - datafileSolrInputDocument.addField(SearchFields.PUBLICATION_STATUS, PUBLISHED_STRING); + indexableDataset.getDatasetState(); + if (datasetPublicationStatuses.contains(PUBLISHED_STRING)) { if (FeatureFlags.ADD_PUBLICOBJECT_SOLR_FIELD.enabled()) { datafileSolrInputDocument.addField(SearchFields.PUBLIC_OBJECT, true); } - // datafileSolrInputDocument.addField(SearchFields.PERMS, publicGroupString); addDatasetReleaseDateToSolrDoc(datafileSolrInputDocument, dataset); // has this published file been deleted from the current draft version? if (datafilesInDraftVersion != null && !datafilesInDraftVersion.contains(datafile.getId())) { datafileSolrInputDocument.addField(SearchFields.FILE_DELETED, true); } - } else if (indexableDataset.getDatasetState().equals(indexableDataset.getDatasetState().WORKING_COPY)) { - fileSolrDocId = solrDocIdentifierFile + fileEntityId + indexableDataset.getDatasetState().getSuffix(); - datafileSolrInputDocument.addField(SearchFields.PUBLICATION_STATUS, DRAFT_STRING); + } else { + indexableDataset.getDatasetState(); + if (datasetPublicationStatuses.contains(DRAFT_STRING)) { + fileSolrDocId = solrDocIdentifierFile + fileEntityId + indexableDataset.getDatasetState().getSuffix(); + } } datafileSolrInputDocument.addField(SearchFields.ID, fileSolrDocId); - datafileSolrInputDocument.addField(SearchFields.FILE_TYPE_FRIENDLY, fileMetadata.getDataFile().getFriendlyType()); - datafileSolrInputDocument.addField(SearchFields.FILE_CONTENT_TYPE, fileMetadata.getDataFile().getContentType()); - datafileSolrInputDocument.addField(SearchFields.FILE_TYPE_SEARCHABLE, fileMetadata.getDataFile().getFriendlyType()); + datafileSolrInputDocument.addField(SearchFields.FILE_TYPE_FRIENDLY, 
datafile.getFriendlyType()); + datafileSolrInputDocument.addField(SearchFields.FILE_CONTENT_TYPE, datafile.getContentType()); + datafileSolrInputDocument.addField(SearchFields.FILE_TYPE_SEARCHABLE, datafile.getFriendlyType()); // For the file type facets, we have a property file that maps mime types // to facet-friendly names; "application/fits" should become "FITS", etc.: - datafileSolrInputDocument.addField(SearchFields.FILE_TYPE, FileUtil.getIndexableFacetFileType(fileMetadata.getDataFile())); - datafileSolrInputDocument.addField(SearchFields.FILE_TYPE_SEARCHABLE, FileUtil.getIndexableFacetFileType(fileMetadata.getDataFile())); - datafileSolrInputDocument.addField(SearchFields.FILE_SIZE_IN_BYTES, fileMetadata.getDataFile().getFilesize()); - if (DataFile.ChecksumType.MD5.equals(fileMetadata.getDataFile().getChecksumType())) { + datafileSolrInputDocument.addField(SearchFields.FILE_TYPE, FileUtil.getIndexableFacetFileType(datafile)); + datafileSolrInputDocument.addField(SearchFields.FILE_TYPE_SEARCHABLE, FileUtil.getIndexableFacetFileType(datafile)); + datafileSolrInputDocument.addField(SearchFields.FILE_SIZE_IN_BYTES, datafile.getFilesize()); + if (DataFile.ChecksumType.MD5.equals(datafile.getChecksumType())) { /** * @todo Someday we should probably deprecate this * FILE_MD5 in favor of a combination of * FILE_CHECKSUM_TYPE and FILE_CHECKSUM_VALUE. 
*/ - datafileSolrInputDocument.addField(SearchFields.FILE_MD5, fileMetadata.getDataFile().getChecksumValue()); + datafileSolrInputDocument.addField(SearchFields.FILE_MD5, datafile.getChecksumValue()); } - datafileSolrInputDocument.addField(SearchFields.FILE_CHECKSUM_TYPE, fileMetadata.getDataFile().getChecksumType().toString()); - datafileSolrInputDocument.addField(SearchFields.FILE_CHECKSUM_VALUE, fileMetadata.getDataFile().getChecksumValue()); - datafileSolrInputDocument.addField(SearchFields.FILE_RESTRICTED, fileMetadata.getDataFile().isRestricted()); + datafileSolrInputDocument.addField(SearchFields.FILE_CHECKSUM_TYPE, datafile.getChecksumType().toString()); + datafileSolrInputDocument.addField(SearchFields.FILE_CHECKSUM_VALUE, datafile.getChecksumValue()); + datafileSolrInputDocument.addField(SearchFields.FILE_RESTRICTED, datafile.isRestricted()); datafileSolrInputDocument.addField(SearchFields.DESCRIPTION, fileMetadata.getDescription()); datafileSolrInputDocument.addField(SearchFields.FILE_DESCRIPTION, fileMetadata.getDescription()); - GlobalId filePid = fileMetadata.getDataFile().getGlobalId(); + GlobalId filePid = datafile.getGlobalId(); datafileSolrInputDocument.addField(SearchFields.FILE_PERSISTENT_ID, (filePid != null) ? 
filePid.toString() : null); - datafileSolrInputDocument.addField(SearchFields.UNF, fileMetadata.getDataFile().getUnf()); + datafileSolrInputDocument.addField(SearchFields.UNF, datafile.getUnf()); datafileSolrInputDocument.addField(SearchFields.SUBTREE, dataversePaths); // datafileSolrInputDocument.addField(SearchFields.HOST_DATAVERSE, // dataFile.getOwner().getOwner().getName()); @@ -1669,9 +1685,9 @@ public SolrInputDocuments toSolrDocs(IndexableDataset indexableDataset, Set variables = fileMetadata.getDataFile().getDataTable().getDataVariables(); - Long observations = fileMetadata.getDataFile().getDataTable().getCaseQuantity(); + if (datafile.isTabularData()) { + List variables = datafile.getDataTable().getDataVariables(); + Long observations = datafile.getDataTable().getCaseQuantity(); datafileSolrInputDocument.addField(SearchFields.OBSERVATIONS, observations); datafileSolrInputDocument.addField(SearchFields.VARIABLE_COUNT, variables.size()); @@ -1728,7 +1744,7 @@ public SolrInputDocuments toSolrDocs(IndexableDataset indexableDataset, Set Date: Tue, 25 Mar 2025 17:43:49 -0400 Subject: [PATCH 03/83] minimize work when details false, check restrict earlier/simplier --- .../iq/dataverse/FileVersionDifference.java | 25 +++++++++++-------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/FileVersionDifference.java b/src/main/java/edu/harvard/iq/dataverse/FileVersionDifference.java index e0dea739edc..5247cedd81c 100644 --- a/src/main/java/edu/harvard/iq/dataverse/FileVersionDifference.java +++ b/src/main/java/edu/harvard/iq/dataverse/FileVersionDifference.java @@ -64,7 +64,9 @@ When there are changes (after v4.19)to the file metadata data model this method if (newFileMetadata.getDataFile() == null && originalFileMetadata != null){ //File Deleted - updateDifferenceSummary("", BundleUtil.getStringFromBundle("file.versionDifferences.fileGroupTitle"), 0, 0, 1, 0); + if(details) { + updateDifferenceSummary("", 
BundleUtil.getStringFromBundle("file.versionDifferences.fileGroupTitle"), 0, 0, 1, 0); + } return false; } @@ -83,6 +85,17 @@ When there are changes (after v4.19)to the file metadata data model this method retVal = false; } + /* + Get Restriction Differences + */ + if (originalFileMetadata.isRestricted() != newFileMetadata.isRestricted()) { + if(details) { + String value2 = newFileMetadata.isRestricted() ? BundleUtil.getStringFromBundle("file.versionDifferences.fileRestricted") : BundleUtil.getStringFromBundle("file.versionDifferences.fileUnrestricted"); + updateDifferenceSummary(BundleUtil.getStringFromBundle("file.versionDifferences.fileAccessTitle"), value2, 0, 0, 0, 0); + } + retVal = false; + } + if ( originalFileMetadata != null) { if (!newFileMetadata.getLabel().equals(originalFileMetadata.getLabel())) { if (details) { @@ -224,15 +237,7 @@ When there are changes (after v4.19)to the file metadata data model this method retVal = false; } - /* - Get Restriction Differences - */ - value1 = originalFileMetadata.isRestricted() ? BundleUtil.getStringFromBundle("file.versionDifferences.fileRestricted") : BundleUtil.getStringFromBundle("file.versionDifferences.fileUnrestricted"); - value2 = newFileMetadata.isRestricted() ? 
BundleUtil.getStringFromBundle("file.versionDifferences.fileRestricted") : BundleUtil.getStringFromBundle("file.versionDifferences.fileUnrestricted"); - if (!value1.equals(value2)) { - updateDifferenceSummary(BundleUtil.getStringFromBundle("file.versionDifferences.fileAccessTitle"), value2, 0, 0, 0, 0); - retVal = false; - } + } return retVal; } From 3b746f7b175837a942492995e5f5dedb03c61771 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Tue, 25 Mar 2025 17:49:08 -0400 Subject: [PATCH 04/83] really fix test --- .../edu/harvard/iq/dataverse/search/IndexServiceBeanTest.java | 1 + 1 file changed, 1 insertion(+) diff --git a/src/test/java/edu/harvard/iq/dataverse/search/IndexServiceBeanTest.java b/src/test/java/edu/harvard/iq/dataverse/search/IndexServiceBeanTest.java index 2b54a4b12cd..eda9b995db5 100644 --- a/src/test/java/edu/harvard/iq/dataverse/search/IndexServiceBeanTest.java +++ b/src/test/java/edu/harvard/iq/dataverse/search/IndexServiceBeanTest.java @@ -126,6 +126,7 @@ private IndexableDataset createIndexableDataset() { final Dataset dataset = MocksFactory.makeDataset(); dataset.setGlobalId(new GlobalId(AbstractDOIProvider.DOI_PROTOCOL,"10.666", "FAKE/fake", "/", AbstractDOIProvider.DOI_RESOLVER_URL, null)); final DatasetVersion datasetVersion = dataset.getCreateVersion(null); + datasetVersion.setId(1L); DatasetField field = createCVVField("language", "English", false); datasetVersion.getDatasetFields().add(field); final IndexableDataset indexableDataset = new IndexableDataset(datasetVersion); From f23a274c0de273c2a203a526b612d11942c653b2 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Tue, 25 Mar 2025 18:44:44 -0400 Subject: [PATCH 05/83] simplify - fix restrict bug --- .../iq/dataverse/FileVersionDifference.java | 239 ++++++++---------- 1 file changed, 109 insertions(+), 130 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/FileVersionDifference.java b/src/main/java/edu/harvard/iq/dataverse/FileVersionDifference.java index 
5247cedd81c..f2b7b37605c 100644 --- a/src/main/java/edu/harvard/iq/dataverse/FileVersionDifference.java +++ b/src/main/java/edu/harvard/iq/dataverse/FileVersionDifference.java @@ -16,23 +16,21 @@ * @author skraffmi */ public final class FileVersionDifference { - - private FileMetadata newFileMetadata; - private FileMetadata originalFileMetadata; + + private FileMetadata newFileMetadata; + private FileMetadata originalFileMetadata; private boolean details = false; private boolean same = false; - - - private List differenceSummaryGroups = new ArrayList<>(); + private List differenceSummaryGroups = new ArrayList<>(); private List differenceDetailItems = new ArrayList<>(); - + public FileVersionDifference(FileMetadata newFileMetadata, FileMetadata originalFileMetadata) { - this(newFileMetadata, originalFileMetadata, false); - - } - + this(newFileMetadata, originalFileMetadata, false); + + } + public FileVersionDifference(FileMetadata newFileMetadata, FileMetadata originalFileMetadata, boolean details) { this.newFileMetadata = newFileMetadata; @@ -40,77 +38,77 @@ public FileVersionDifference(FileMetadata newFileMetadata, FileMetadata original this.details = details; this.same = compareMetadata(newFileMetadata, originalFileMetadata); - //Compare versions - File Metadata first + // Compare versions - File Metadata first + + } - } - - public boolean compareMetadata(FileMetadata newFileMetadata, FileMetadata originalFileMetadata) { /* - This method both determines if there has been a change in file metadata between the two versions supplied - and it updates the FileVersionDifference object which is used to display the differences on the dataset versions tab. - The return value is used by the index service bean tomark whether a file needs to be re-indexed in the context of a dataset update. - When there are changes (after v4.19)to the file metadata data model this method must be updated. - retVal of True means metadatas are equal. 
- */ - + * This method both determines if there has been a change in file metadata + * between the two versions supplied and it updates the FileVersionDifference + * object which is used to display the differences on the dataset versions tab. + * The return value is used by the index service bean to mark whether a file + * needs to be re-indexed in the context of a dataset update. When there are + * changes (after v4.19)to the file metadata data model this method must be + * updated. retVal of True means metadatas are equal. + */ + boolean retVal = true; - if (newFileMetadata.getDataFile() == null && originalFileMetadata == null){ - //File in neither version - //Don't add any groups + if (newFileMetadata.getDataFile() == null && originalFileMetadata == null) { + // File in neither version + // Don't add any groups return true; } - - if (newFileMetadata.getDataFile() == null && originalFileMetadata != null){ - //File Deleted - if(details) { - updateDifferenceSummary("", BundleUtil.getStringFromBundle("file.versionDifferences.fileGroupTitle"), 0, 0, 1, 0); + + if (newFileMetadata.getDataFile() == null && originalFileMetadata != null) { + // File Deleted + if (details) { + updateDifferenceSummary("", BundleUtil.getStringFromBundle("file.versionDifferences.fileGroupTitle"), 0, 0, 1, 0); } return false; } - - if (this.originalFileMetadata == null && this.newFileMetadata.getDataFile() != null ){ - //File Added - if (!details) return false; - retVal = false; - updateDifferenceSummary( "", BundleUtil.getStringFromBundle("file.versionDifferences.fileGroupTitle"), 1, 0, 0, 0); - } - - //Check to see if File replaced - if (originalFileMetadata != null && - newFileMetadata.getDataFile() != null && originalFileMetadata.getDataFile() != null &&!this.originalFileMetadata.getDataFile().equals(this.newFileMetadata.getDataFile())){ - if (!details) return false; - updateDifferenceSummary( "", BundleUtil.getStringFromBundle("file.versionDifferences.fileGroupTitle"), 0, 0, 0, 1); + + 
if (this.originalFileMetadata == null && this.newFileMetadata.getDataFile() != null) { + // File Added + if (!details) + return false; retVal = false; + updateDifferenceSummary("", BundleUtil.getStringFromBundle("file.versionDifferences.fileGroupTitle"), 1, 0, 0, 0); } - /* - Get Restriction Differences - */ - if (originalFileMetadata.isRestricted() != newFileMetadata.isRestricted()) { - if(details) { - String value2 = newFileMetadata.isRestricted() ? BundleUtil.getStringFromBundle("file.versionDifferences.fileRestricted") : BundleUtil.getStringFromBundle("file.versionDifferences.fileUnrestricted"); - updateDifferenceSummary(BundleUtil.getStringFromBundle("file.versionDifferences.fileAccessTitle"), value2, 0, 0, 0, 0); + if (originalFileMetadata != null) { + // Check to see if File replaced + if (newFileMetadata.getDataFile() != null && originalFileMetadata.getDataFile() != null && !this.originalFileMetadata.getDataFile().equals(this.newFileMetadata.getDataFile())) { + if (!details) + return false; + updateDifferenceSummary("", BundleUtil.getStringFromBundle("file.versionDifferences.fileGroupTitle"), 0, 0, 0, 1); + retVal = false; } - retVal = false; - } - - if ( originalFileMetadata != null) { + + /* + * Get Restriction Differences + */ + if (originalFileMetadata.isRestricted() != newFileMetadata.isRestricted()) { + if (details) { + String value2 = newFileMetadata.isRestricted() ? 
BundleUtil.getStringFromBundle("file.versionDifferences.fileRestricted") : BundleUtil.getStringFromBundle("file.versionDifferences.fileUnrestricted"); + updateDifferenceSummary(BundleUtil.getStringFromBundle("file.versionDifferences.fileAccessTitle"), value2, 0, 0, 0, 0); + } + retVal = false; + } + if (!newFileMetadata.getLabel().equals(originalFileMetadata.getLabel())) { if (details) { differenceDetailItems.add(new FileDifferenceDetailItem(BundleUtil.getStringFromBundle("file.versionDifferences.fileNameDetailTitle"), originalFileMetadata.getLabel(), newFileMetadata.getLabel())); - } else{ + } else { return false; } updateDifferenceSummary(BundleUtil.getStringFromBundle("file.versionDifferences.fileMetadataGroupTitle"), BundleUtil.getStringFromBundle("file.versionDifferences.fileNameDetailTitle"), 0, 1, 0, 0); retVal = false; } - } - //Description differences - if ( originalFileMetadata != null) { + // Description differences if (newFileMetadata.getDescription() != null && originalFileMetadata.getDescription() != null && !newFileMetadata.getDescription().equals(originalFileMetadata.getDescription())) { @@ -124,8 +122,7 @@ When there are changes (after v4.19)to the file metadata data model this method retVal = false; } if (newFileMetadata.getDescription() != null - && originalFileMetadata.getDescription() == null - ) { + && originalFileMetadata.getDescription() == null) { if (details) { differenceDetailItems.add(new FileDifferenceDetailItem(BundleUtil.getStringFromBundle("file.versionDifferences.descriptionDetailTitle"), "", newFileMetadata.getDescription())); } else { @@ -136,10 +133,9 @@ When there are changes (after v4.19)to the file metadata data model this method retVal = false; } if (newFileMetadata.getDescription() == null - && originalFileMetadata.getDescription() != null - ) { + && originalFileMetadata.getDescription() != null) { if (details) { - differenceDetailItems.add(new 
FileDifferenceDetailItem(BundleUtil.getStringFromBundle("file.versionDifferences.descriptionDetailTitle"), originalFileMetadata.getDescription(), "" )); + differenceDetailItems.add(new FileDifferenceDetailItem(BundleUtil.getStringFromBundle("file.versionDifferences.descriptionDetailTitle"), originalFileMetadata.getDescription(), "")); } else { return false; } @@ -147,9 +143,7 @@ When there are changes (after v4.19)to the file metadata data model this method BundleUtil.getStringFromBundle("file.versionDifferences.descriptionDetailTitle"), 0, 0, 1, 0); retVal = false; } - } - //Provenance Description differences - if ( originalFileMetadata != null) { + // Provenance Description differences if ((newFileMetadata.getProvFreeForm() != null && !newFileMetadata.getProvFreeForm().isEmpty()) && (originalFileMetadata.getProvFreeForm() != null && !originalFileMetadata.getProvFreeForm().isEmpty()) && !newFileMetadata.getProvFreeForm().equals(originalFileMetadata.getProvFreeForm())) { @@ -163,8 +157,7 @@ When there are changes (after v4.19)to the file metadata data model this method retVal = false; } if ((newFileMetadata.getProvFreeForm() != null && !newFileMetadata.getProvFreeForm().isEmpty()) - && (originalFileMetadata.getProvFreeForm() == null || originalFileMetadata.getProvFreeForm().isEmpty()) - ) { + && (originalFileMetadata.getProvFreeForm() == null || originalFileMetadata.getProvFreeForm().isEmpty())) { if (details) { differenceDetailItems.add(new FileDifferenceDetailItem(BundleUtil.getStringFromBundle("file.versionDifferences.provenanceDetailTitle"), "", newFileMetadata.getProvFreeForm())); } else { @@ -175,10 +168,9 @@ When there are changes (after v4.19)to the file metadata data model this method retVal = false; } if ((newFileMetadata.getProvFreeForm() == null || newFileMetadata.getProvFreeForm().isEmpty()) - && (originalFileMetadata.getProvFreeForm() != null && !originalFileMetadata.getProvFreeForm().isEmpty()) - ) { + && (originalFileMetadata.getProvFreeForm() != 
null && !originalFileMetadata.getProvFreeForm().isEmpty())) { if (details) { - differenceDetailItems.add(new FileDifferenceDetailItem(BundleUtil.getStringFromBundle("file.versionDifferences.provenanceDetailTitle"), originalFileMetadata.getProvFreeForm(), "" )); + differenceDetailItems.add(new FileDifferenceDetailItem(BundleUtil.getStringFromBundle("file.versionDifferences.provenanceDetailTitle"), originalFileMetadata.getProvFreeForm(), "")); } else { return false; } @@ -186,11 +178,9 @@ When there are changes (after v4.19)to the file metadata data model this method BundleUtil.getStringFromBundle("file.versionDifferences.provenanceDetailTitle"), 0, 0, 1, 0); retVal = false; } - } - if (originalFileMetadata != null) { /* - get Tags differences - */ + * get Tags differences + */ String value1 = originalFileMetadata.getCategoriesByName().toString(); String value2 = newFileMetadata.getCategoriesByName().toString(); if (value1 == null || value1.isEmpty() || value1.equals(" ")) { @@ -201,57 +191,56 @@ When there are changes (after v4.19)to the file metadata data model this method } if (!value1.equals(value2)) { - if (!details) return false; + if (!details) + return false; int added = 0; int deleted = 0; - + added = newFileMetadata.getCategoriesByName().stream().map((tag) -> { boolean found = false; - for (String tagOld : originalFileMetadata.getCategoriesByName() ){ - if (tag.equals(tagOld)){ + for (String tagOld : originalFileMetadata.getCategoriesByName()) { + if (tag.equals(tagOld)) { found = true; break; } } return found; }).filter((found) -> (!found)).map((_item) -> 1).reduce(added, Integer::sum); - + for (String tag : originalFileMetadata.getCategoriesByName()) { boolean found = false; - for (String tagNew : newFileMetadata.getCategoriesByName() ){ - if (tag.equals(tagNew)){ + for (String tagNew : newFileMetadata.getCategoriesByName()) { + if (tag.equals(tagNew)) { found = true; break; } } - if (!found){ + if (!found) { deleted++; } } - if (added > 0){ + if (added > 
0) { updateDifferenceSummary(BundleUtil.getStringFromBundle("file.versionDifferences.fileTagsGroupTitle"), "", added, 0, 0, 0, true); } - if (deleted > 0){ + if (deleted > 0) { updateDifferenceSummary(BundleUtil.getStringFromBundle("file.versionDifferences.fileTagsGroupTitle"), "", 0, 0, deleted, 0, true); } retVal = false; } - } return retVal; } - - private void updateDifferenceSummary(String groupLabel, String itemLabel, int added, int changed, int deleted, int replaced) { + + private void updateDifferenceSummary(String groupLabel, String itemLabel, int added, int changed, int deleted, int replaced) { updateDifferenceSummary(groupLabel, itemLabel, added, changed, deleted, replaced, false); } - - + private void updateDifferenceSummary(String groupLabel, String itemLabel, int added, int changed, int deleted, int replaced, boolean multiple) { FileDifferenceSummaryGroup summaryGroup = new FileDifferenceSummaryGroup(groupLabel); FileDifferenceSummaryItem summaryItem = new FileDifferenceSummaryItem(itemLabel, added, changed, deleted, replaced, multiple); - - if (!this.differenceSummaryGroups.contains(summaryGroup)) { + + if (!this.differenceSummaryGroups.contains(summaryGroup)) { summaryGroup.getFileDifferenceSummaryItems().add(summaryItem); this.differenceSummaryGroups.add(summaryGroup); } else { @@ -260,23 +249,23 @@ private void updateDifferenceSummary(String groupLabel, String itemLabel, int ad }); } } - - public FileMetadata getNewFileMetadata(){ + + public FileMetadata getNewFileMetadata() { return this.newFileMetadata; } - - public void setNewFileMetadata(FileMetadata in){ - this.newFileMetadata = in; + + public void setNewFileMetadata(FileMetadata in) { + this.newFileMetadata = in; } - public FileMetadata getOriginalFileMetadata() { + public FileMetadata getOriginalFileMetadata() { return originalFileMetadata; } public void setOriginalFileMetadata(FileMetadata originalFileMetadata) { this.originalFileMetadata = originalFileMetadata; } - + public boolean 
isSame() { return same; } @@ -284,8 +273,7 @@ public boolean isSame() { public void setSame(boolean same) { this.same = same; } - - + public List getDifferenceSummaryGroups() { return differenceSummaryGroups; } @@ -294,19 +282,17 @@ public void setDifferenceSummaryGroups(List differen this.differenceSummaryGroups = differenceSummaryGroups; } - public class FileDifferenceSummaryGroup { - - + public class FileDifferenceSummaryGroup { private String name; private List fileDifferenceSummaryItems; - + public FileDifferenceSummaryGroup(String name) { this.name = name; this.fileDifferenceSummaryItems = new ArrayList<>(); - + } - + public String getName() { return name; } @@ -322,23 +308,22 @@ public List getFileDifferenceSummaryItems() { public void setFileDifferenceSummaryItems(List fileDifferenceSummaryItems) { this.fileDifferenceSummaryItems = fileDifferenceSummaryItems; } - + @Override public String toString() { - + String retval = getName(); - if (!retval.isEmpty()){ + if (!retval.isEmpty()) { retval += ": "; } - - for (FileDifferenceSummaryItem item : this.fileDifferenceSummaryItems){ + + for (FileDifferenceSummaryItem item : this.fileDifferenceSummaryItems) { retval += " " + item.toString(); } - + return retval; } - - + @Override public int hashCode() { int hash = 5; @@ -361,8 +346,8 @@ public boolean equals(Object obj) { return Objects.equals(this.name, other.name); } } - - public final class FileDifferenceDetailItem{ + + public final class FileDifferenceDetailItem { private String displayName; private String originalValue; private String newValue; @@ -372,9 +357,7 @@ public FileDifferenceDetailItem(String displayName, String originalValue, String this.originalValue = originalValue; this.newValue = newValue; } - - - + public String getDisplayName() { return displayName; } @@ -400,11 +383,8 @@ public void setNewValue(String newValue) { } } - - - - public class FileDifferenceSummaryItem{ + public class FileDifferenceSummaryItem { private String name; private int 
added; @@ -412,7 +392,7 @@ public class FileDifferenceSummaryItem{ private int deleted; private int replaced; private boolean multiple; - + public FileDifferenceSummaryItem(String name, int added, int changed, int deleted, int replaced, boolean multiple) { this.name = name; this.added = added; @@ -421,7 +401,7 @@ public FileDifferenceSummaryItem(String name, int added, int changed, int delete this.replaced = replaced; this.multiple = multiple; } - + public String getName() { return name; } @@ -469,8 +449,7 @@ public boolean isMultiple() { public void setMultiple(boolean multiple) { this.multiple = multiple; } - - - } - + + } + } From 8f89906da0e09da19ed01853274e295b7d85bb14 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Wed, 26 Mar 2025 10:52:52 -0400 Subject: [PATCH 06/83] release note --- doc/release-notes/11374-indexing-improvement.md | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 doc/release-notes/11374-indexing-improvement.md diff --git a/doc/release-notes/11374-indexing-improvement.md b/doc/release-notes/11374-indexing-improvement.md new file mode 100644 index 00000000000..5547a498987 --- /dev/null +++ b/doc/release-notes/11374-indexing-improvement.md @@ -0,0 +1,3 @@ +### Solr Indexing speed improved + +The performance of Solr indexing for files has been improved by ~30-40% From 17cd5b552d593cbe334df671fad9c99d08f7b9bb Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Wed, 26 Mar 2025 11:28:02 -0400 Subject: [PATCH 07/83] fix compile issue, additional tweaks --- .../iq/dataverse/search/IndexServiceBean.java | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java index 57da8b4954e..1e0d6590288 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java @@ -1395,7 +1395,6 @@ public SolrInputDocuments
toSolrDocs(IndexableDataset indexableDataset, Set 0) && (size <= maxSize)) { textHandler = new BodyContentHandler(-1); Metadata metadata = new Metadata(); /* @@ -1518,7 +1518,6 @@ public SolrInputDocuments toSolrDocs(IndexableDataset indexableDataset, Set d try { solrClientIndexService.getSolrClient().add(docs.getDocuments()); } catch (SolrServerException | IOException ex) { + logger.warning("Check process-failures logs re: " + ex.getLocalizedMessage()); if (ex.getCause() instanceof SolrServerException) { throw new SolrServerException(ex); } else if (ex.getCause() instanceof IOException) { From fb36f3b6abe3a16ff21deebaad10df0c4559aca8 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Thu, 27 Mar 2025 18:00:20 -0400 Subject: [PATCH 08/83] try parallel file loop --- .../iq/dataverse/search/IndexServiceBean.java | 67 ++++++++++--------- 1 file changed, 36 insertions(+), 31 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java index 1e0d6590288..3963fe66e99 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java @@ -69,6 +69,7 @@ import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.Future; import java.util.concurrent.Semaphore; +import java.util.concurrent.atomic.AtomicReference; import java.util.function.Function; import java.util.logging.Level; import java.util.logging.Logger; @@ -90,6 +91,7 @@ import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; +import org.apache.logging.log4j.util.Strings; import org.apache.solr.client.solrj.SolrQuery; import org.apache.solr.client.solrj.SolrQuery.SortClause; import org.apache.solr.client.solrj.SolrServerException; @@ -1376,31 +1378,35 @@ public SolrInputDocuments toSolrDocs(IndexableDataset indexableDataset, Set filesIndexed = new ArrayList<>(); if (datasetVersion != null) { 
List fileMetadatas = datasetVersion.getFileMetadatas(); - List releasedFileMetadatas = new ArrayList<>(); + List rfm = new ArrayList<>(); Map fileMap = new HashMap<>(); - boolean checkForDuplicateMetadata = false; + boolean check = false; if (datasetVersion.isDraft() && dataset.isReleased() && dataset.getReleasedVersion() != null) { - checkForDuplicateMetadata = true; - releasedFileMetadatas = dataset.getReleasedVersion().getFileMetadatas(); - for(FileMetadata released: releasedFileMetadatas){ + check = true; + rfm = dataset.getReleasedVersion().getFileMetadatas(); + for (FileMetadata released : rfm) { fileMap.put(released.getDataFile().getId(), released); } logger.fine( "We are indexing a draft version of a dataset that has a released version. We'll be checking file metadatas if they are exact clones of the released versions."); } - LocalDate embargoEndDate=null; - LocalDate retentionEndDate=null; - final String datasetCitation = (dataset.isReleased() && dataset.getReleasedVersion() != null) ? - dataset.getCitation(dataset.getReleasedVersion()) : dataset.getCitation(); + final List releasedFileMetadatas = rfm; + final boolean checkForDuplicateMetadata = check; + AtomicReference embargoEndDateRef = new AtomicReference<>(null); + AtomicReference retentionEndDateRef = new AtomicReference<>(null); + final String datasetCitation = (dataset.isReleased() && dataset.getReleasedVersion() != null) ? 
dataset.getCitation(dataset.getReleasedVersion()) : dataset.getCitation(); final Long datasetId = dataset.getId(); final String datasetGlobalId = dataset.getGlobalId().toString(); + final String parentTitle = parentDatasetTitle; - AutoDetectParser autoParser = null; - ParseContext context = null; + AutoDetectParser ap = null; + ParseContext ct = null; if(doFullTextIndexing) { - autoParser = new AutoDetectParser(); - context = new ParseContext(); + ap = new AutoDetectParser(); + ct = new ParseContext(); } + final AutoDetectParser autoParser = ap; + final ParseContext context = ct; Set datasetPublicationStatuses = new HashSet(); if (dataset.getReleasedVersion() == null && !dataset.isHarvested()) { @@ -1424,25 +1430,19 @@ public SolrInputDocuments toSolrDocs(IndexableDataset indexableDataset, Set { DataFile datafile = fileMetadata.getDataFile(); - LocalDate end = null; - LocalDate start = null; Embargo emb= datafile.getEmbargo(); + LocalDate end = emb.getDateAvailable(); if(emb!=null) { - end = emb.getDateAvailable(); - if(embargoEndDate==null || end.isAfter(embargoEndDate)) { - embargoEndDate=end; + embargoEndDateRef.updateAndGet(current -> (current == null || end.isAfter(current)) ? end : current); } - } Retention ret= datafile.getRetention(); + LocalDate start = ret.getDateUnavailable(); if(ret!=null) { - start = ret.getDateUnavailable(); - if(retentionEndDate==null || start.isBefore(retentionEndDate)) { - retentionEndDate=start; + retentionEndDateRef.updateAndGet(current -> (current == null || start.isBefore(current)) ? 
start : current); } - } - + boolean indexThisFile=indexThisMetadata; if (indexThisMetadata && checkForDuplicateMetadata && !releasedFileMetadatas.isEmpty()) { logger.fine("Checking if this file metadata is a duplicate."); FileMetadata getFromMap = fileMap.get(datafile.getId()); @@ -1450,7 +1450,7 @@ public SolrInputDocuments toSolrDocs(IndexableDataset indexableDataset, Set variablesByMetadata = variableService.findVarMetByFileMetaId(fileMetadata.getId()); variableMap = - variablesByMetadata.stream().collect(Collectors.toMap(VariableMetadata::getId, Function.identity())); + variablesByMetadata.stream().collect(Collectors.toMap(VariableMetadata::getId, Function.identity())); for (DataVariable var : variables) { @@ -1750,13 +1750,18 @@ public SolrInputDocuments toSolrDocs(IndexableDataset indexableDataset, Set Date: Thu, 27 Mar 2025 18:12:44 -0400 Subject: [PATCH 09/83] fix NPE and final issues --- .../iq/dataverse/search/IndexServiceBean.java | 22 +++++++++++-------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java index 3963fe66e99..1ed33d6dbf9 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java @@ -1431,16 +1431,20 @@ public SolrInputDocuments toSolrDocs(IndexableDataset indexableDataset, Set { - DataFile datafile = fileMetadata.getDataFile(); - Embargo emb= datafile.getEmbargo(); - LocalDate end = emb.getDateAvailable(); - if(emb!=null) { - embargoEndDateRef.updateAndGet(current -> (current == null || end.isAfter(current)) ? end : current); + DataFile datafile = fileMetadata.getDataFile(); + Embargo emb = datafile.getEmbargo(); + LocalDate end = null; + if (emb != null) { + final LocalDate endDate = emb.getDateAvailable(); + embargoEndDateRef.updateAndGet(current -> (current == null || endDate.isAfter(current)) ? 
endDate : current); + end = endDate; } - Retention ret= datafile.getRetention(); - LocalDate start = ret.getDateUnavailable(); - if(ret!=null) { - retentionEndDateRef.updateAndGet(current -> (current == null || start.isBefore(current)) ? start : current); + Retention ret = datafile.getRetention(); + LocalDate start = null; + if (ret != null) { + final LocalDate startDate = ret.getDateUnavailable(); + retentionEndDateRef.updateAndGet(current -> (current == null || startDate.isBefore(current)) ? startDate : current); + start = startDate; } boolean indexThisFile=indexThisMetadata; if (indexThisMetadata && checkForDuplicateMetadata && !releasedFileMetadatas.isEmpty()) { From 646bb835c2e28b9d367033e9c48c7dafa06d40da Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Thu, 27 Mar 2025 18:42:35 -0400 Subject: [PATCH 10/83] try finddeep --- src/main/java/edu/harvard/iq/dataverse/api/Index.java | 1 + 1 file changed, 1 insertion(+) diff --git a/src/main/java/edu/harvard/iq/dataverse/api/Index.java b/src/main/java/edu/harvard/iq/dataverse/api/Index.java index bc9a8ae692b..1a95c55ea0c 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/Index.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/Index.java @@ -303,6 +303,7 @@ public Response indexDatasetByPersistentId(@QueryParam("persistentId") String pe Dataset dataset = null; try { dataset = datasetService.findByGlobalId(persistentId); + dataset = datasetService.findDeep(dataset.getId()); } catch (Exception ex) { return error(Status.BAD_REQUEST, "Problem looking up dataset with persistent id \"" + persistentId + "\". 
Error: " + ex.getMessage()); } From 612e5219cacba482bd6b86f3c6e9b46445dbf12c Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Fri, 28 Mar 2025 09:19:23 -0400 Subject: [PATCH 11/83] avoid double loop --- .../datavariable/VariableMetadataUtil.java | 35 +++++++++---------- 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/datavariable/VariableMetadataUtil.java b/src/main/java/edu/harvard/iq/dataverse/datavariable/VariableMetadataUtil.java index 209ffd93fe3..136bd1b7bae 100644 --- a/src/main/java/edu/harvard/iq/dataverse/datavariable/VariableMetadataUtil.java +++ b/src/main/java/edu/harvard/iq/dataverse/datavariable/VariableMetadataUtil.java @@ -3,34 +3,33 @@ import edu.harvard.iq.dataverse.FileMetadata; import java.util.Collection; +import java.util.HashMap; +import java.util.Map; public class VariableMetadataUtil { public static boolean compareVariableMetadata(FileMetadata fmdo, FileMetadata fmdn) { Collection vmlo = fmdo.getVariableMetadatas(); Collection vmln = fmdn.getVariableMetadatas(); - - int count = 0; + if (vmlo.size() != vmln.size()) { return false; - } else { - for (VariableMetadata vmo : vmlo) { - for (VariableMetadata vmn : vmln) { - if (vmo.getDataVariable().getId().equals(vmn.getDataVariable().getId())) { - count++; - if (!compareVarMetadata(vmo, vmn)) { - return false; - } - } - } - } } - if (count == vmlo.size()) { - return true; - } else { - return false; + + Map vmnMap = new HashMap<>(); + for (VariableMetadata vmn : vmln) { + vmnMap.put(vmn.getDataVariable().getId(), vmn); } - + + for (VariableMetadata vmo : vmlo) { + Long id = vmo.getDataVariable().getId(); + VariableMetadata vmn = vmnMap.get(id); + if (vmn == null || !compareVarMetadata(vmo, vmn)) { + return false; + } + } + + return true; } public static boolean compareVarMetadata(VariableMetadata vmOld, VariableMetadata vmNew) { From 0d6f7bec1d6bd81a7ae608a0ad8f8f6cd13d9c03 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Fri, 28 Mar 2025 
11:24:02 -0400 Subject: [PATCH 12/83] diff by query --- .../iq/dataverse/search/IndexServiceBean.java | 107 ++++++++++++------ 1 file changed, 72 insertions(+), 35 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java index 1ed33d6dbf9..22e1e03c276 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java @@ -88,6 +88,7 @@ import jakarta.json.JsonObject; import jakarta.persistence.EntityManager; import jakarta.persistence.PersistenceContext; +import jakarta.persistence.Query; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; @@ -1375,23 +1376,53 @@ public SolrInputDocuments toSolrDocs(IndexableDataset indexableDataset, Set filesIndexed = new ArrayList<>(); - if (datasetVersion != null) { - List fileMetadatas = datasetVersion.getFileMetadatas(); + List filesIndexed = new ArrayList<>(); + final List changedFileMetadataIds = new ArrayList<>(); + if (datasetVersion != null) { + List fileMetadatas = datasetVersion.getFileMetadatas(); List rfm = new ArrayList<>(); - Map fileMap = new HashMap<>(); - boolean check = false; - if (datasetVersion.isDraft() && dataset.isReleased() && dataset.getReleasedVersion() != null) { - check = true; + Map fileMap = new HashMap<>(); + if (datasetVersion.isDraft() && dataset.isReleased() && dataset.getReleasedVersion() != null) { rfm = dataset.getReleasedVersion().getFileMetadatas(); for (FileMetadata released : rfm) { - fileMap.put(released.getDataFile().getId(), released); + fileMap.put(released.getDataFile().getId(), released); + } + + String compareFileMetadataQuery = "WITH fm_categories AS (" + + " SELECT fmd.filemetadatas_id, " + + " STRING_AGG(dfc.name, ',' ORDER BY dfc.name) AS categories " + + " FROM FileMetadata_DataFileCategory fmd " + + " JOIN DataFileCategory dfc ON fmd.filecategories_id = 
dfc.id " + + " GROUP BY fmd.filemetadatas_id " + + ") " + + "SELECT fm1.id " + + "FROM FileMetadata fm1 " + + "LEFT JOIN FileMetadata fm2 ON fm1.datafile_id = fm2.datafile_id " + + " AND fm2.datasetversion_id = :releasedVersionId " + + "LEFT JOIN fm_categories fc1 ON fc1.filemetadatas_id = fm1.id " + + "LEFT JOIN fm_categories fc2 ON fc2.filemetadatas_id = fm2.id " + + "WHERE fm1.datasetversion_id = :currentVersionId " + + " AND (fm2.id IS NULL " + + " OR (fm1.datafile_id = fm2.datafile_id " + + " AND (fm2.description IS DISTINCT FROM fm1.description " + + " OR fm2.directoryLabel IS DISTINCT FROM fm1.directoryLabel " + + " OR fm2.label != fm1.label " + + " OR fm2.restricted IS DISTINCT FROM fm1.restricted " + + " OR fm2.prov_freeform IS DISTINCT FROM fm1.prov_freeform " + + " OR fc1.categories IS DISTINCT FROM fc2.categories " + + " ) " + + " ) " + + " )"; + + Query query = em.createNativeQuery(compareFileMetadataQuery); + query.setParameter("releasedVersionId", dataset.getReleasedVersion().getId()); + query.setParameter("currentVersionId", datasetVersion.getId()); + + changedFileMetadataIds.addAll(query.getResultList()); + logger.fine( + "We are indexing a draft version of a dataset that has a released version. We'll be checking file metadatas if they are exact clones of the released versions."); } - logger.fine( - "We are indexing a draft version of a dataset that has a released version. We'll be checking file metadatas if they are exact clones of the released versions."); - } - final List releasedFileMetadatas = rfm; - final boolean checkForDuplicateMetadata = check; + AtomicReference embargoEndDateRef = new AtomicReference<>(null); AtomicReference retentionEndDateRef = new AtomicReference<>(null); final String datasetCitation = (dataset.isReleased() && dataset.getReleasedVersion() != null) ? 
dataset.getCitation(dataset.getReleasedVersion()) : dataset.getCitation(); @@ -1423,13 +1454,22 @@ public SolrInputDocuments toSolrDocs(IndexableDataset indexableDataset, Set { DataFile datafile = fileMetadata.getDataFile(); Embargo emb = datafile.getEmbargo(); @@ -1446,21 +1486,18 @@ public SolrInputDocuments toSolrDocs(IndexableDataset indexableDataset, Set (current == null || startDate.isBefore(current)) ? startDate : current); start = startDate; } - boolean indexThisFile=indexThisMetadata; - if (indexThisMetadata && checkForDuplicateMetadata && !releasedFileMetadatas.isEmpty()) { - logger.fine("Checking if this file metadata is a duplicate."); - FileMetadata getFromMap = fileMap.get(datafile.getId()); - if (getFromMap != null) { - if ((datafile.isRestricted() == getFromMap.getDataFile().isRestricted())) { - if (fileMetadata.contentEquals(getFromMap) - && VariableMetadataUtil.compareVariableMetadata(getFromMap, fileMetadata)) { - indexThisFile = false; - logger.fine("This file metadata hasn't changed since the released version; skipping indexing."); - } else { - logger.fine("This file metadata has changed since the released version; we want to index it!"); - } - } else { - logger.fine("This file's restricted status has changed since the released version; we want to index it!"); + boolean indexThisFile=false; + + if (indexThisMetadata && changedFileMetadataIds.contains(fileMetadata.getId())) { + indexThisFile=true; + } else if(indexThisMetadata) { + logger.fine("Checking if this file metadata is a duplicate."); + FileMetadata getFromMap = fileMap.get(datafile.getId()); + if (getFromMap != null) { + if (!VariableMetadataUtil.compareVariableMetadata(getFromMap, fileMetadata)) { + indexThisFile = true; + logger.fine("This file metadata hasn't changed since the released version; skipping indexing."); + } } } } From e2d4e9896df234cdaf2bd642b84a0122eb268c46 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Fri, 28 Mar 2025 11:42:54 -0400 Subject: [PATCH 13/83] numeric 
params --- .../edu/harvard/iq/dataverse/search/IndexServiceBean.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java index 22e1e03c276..9b4a2875fbd 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java @@ -1398,10 +1398,10 @@ public SolrInputDocuments toSolrDocs(IndexableDataset indexableDataset, Set Date: Fri, 28 Mar 2025 12:49:56 -0400 Subject: [PATCH 14/83] fix merge issues, change doFullText logic --- .../iq/dataverse/search/IndexServiceBean.java | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java index 9b4a2875fbd..f411b4b5f6d 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java @@ -1454,12 +1454,6 @@ public SolrInputDocuments toSolrDocs(IndexableDataset indexableDataset, Set accessObject = null; InputStream instream = null; @@ -1536,8 +1530,6 @@ public SolrInputDocuments toSolrDocs(IndexableDataset indexableDataset, Set 0) && (size <= maxSize)) { textHandler = new BodyContentHandler(-1); Metadata metadata = new Metadata(); /* @@ -1548,7 +1540,6 @@ public SolrInputDocuments toSolrDocs(IndexableDataset indexableDataset, Set Date: Fri, 28 Mar 2025 12:54:57 -0400 Subject: [PATCH 15/83] formatting --- .../iq/dataverse/search/IndexServiceBean.java | 396 +++++++++--------- 1 file changed, 195 insertions(+), 201 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java index f411b4b5f6d..d6d62657df6 100644 --- 
a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java @@ -924,7 +924,7 @@ private String addOrUpdateDataset(IndexableDataset indexableDataset) throws Sol return result; } - public SolrInputDocuments toSolrDocs(IndexableDataset indexableDataset, Set datafilesInDraftVersion) throws SolrServerException, IOException { + public SolrInputDocuments toSolrDocs(IndexableDataset indexableDataset, Set datafilesInDraftVersion) throws SolrServerException, IOException { IndexableDataset.DatasetState state = indexableDataset.getDatasetState(); Dataset dataset = indexableDataset.getDatasetVersion().getDataset(); logger.fine("adding or updating Solr document for dataset id " + dataset.getId()); @@ -946,7 +946,7 @@ public SolrInputDocuments toSolrDocs(IndexableDataset indexableDataset, Set cvocMap = datasetFieldService.getCVocConf(true); Map> cvocManagedFieldMap = new HashMap<>(); for (Map.Entry cvocEntry : cvocMap.entrySet()) { - if(cvocEntry.getValue().containsKey("managed-fields")) { + if (cvocEntry.getValue().containsKey("managed-fields")) { JsonObject managedFields = cvocEntry.getValue().getJsonObject("managed-fields"); Set managedFieldValues = new HashSet<>(); for (String s : managedFields.keySet()) { @@ -1053,8 +1053,6 @@ public SolrInputDocuments toSolrDocs(IndexableDataset indexableDataset, Set metadataBlocksWithValue = new HashSet<>(); for (DatasetField dsf : datasetVersion.getFlatDatasetFields()) { @@ -1123,7 +1121,7 @@ public SolrInputDocuments toSolrDocs(IndexableDataset indexableDataset, Set vals = dsf.getValues_nondisplay(); Set searchStrings = new HashSet<>(); - for (String val: vals) { + for (String val : vals) { searchStrings.add(val); // Try to get string values from externalvocabularyvalue using val as termUri searchStrings.addAll(datasetFieldService.getIndexableStringsByTermUri(val, cvocMap.get(dsfType.getId()), dsfType.getName())); - 
if(dsfType.getParentDatasetFieldType()!=null) { + if (dsfType.getParentDatasetFieldType() != null) { List childDatasetFields = dsf.getParentDatasetFieldCompoundValue().getChildDatasetFields(); for (DatasetField df : childDatasetFields) { - if(cvocManagedFieldMap.containsKey(dsfType.getId()) && cvocManagedFieldMap.get(dsfType.getId()).contains(df.getDatasetFieldType().getName())) { + if (cvocManagedFieldMap.containsKey(dsfType.getId()) && cvocManagedFieldMap.get(dsfType.getId()).contains(df.getDatasetFieldType().getName())) { String solrManagedFieldSearchable = df.getDatasetFieldType().getSolrField().getNameSearchable(); // Try to get string values from externalvocabularyvalue but for a managed fields of the CVOCConf Set stringsForManagedField = datasetFieldService.getIndexableStringsByTermUri(val, cvocMap.get(dsfType.getId()), df.getDatasetFieldType().getName()); logger.fine(solrManagedFieldSearchable + " filled with externalvocabularyvalue : " + stringsForManagedField); - //.addField works as addition of value not a replace of value + // .addField works as addition of value not a replace of value // it allows to add mapped values by CVOCConf before or after indexing real DatasetField value(s) of solrManagedFieldSearchable solrInputDocument.addField(solrManagedFieldSearchable, stringsForManagedField); } @@ -1241,7 +1239,7 @@ public SolrInputDocuments toSolrDocs(IndexableDataset indexableDataset, Set Float.parseFloat(westLon)) { - minWestLon=westLon; + // Find the overall bounding box that includes all bounding boxes + if (minWestLon == null || Float.parseFloat(minWestLon) > Float.parseFloat(westLon)) { + minWestLon = westLon; } - if(maxEastLon==null || Float.parseFloat(maxEastLon) < Float.parseFloat(eastLon)) { - maxEastLon=eastLon; + if (maxEastLon == null || Float.parseFloat(maxEastLon) < Float.parseFloat(eastLon)) { + maxEastLon = eastLon; } - if(minSouthLat==null || Float.parseFloat(minSouthLat) > Float.parseFloat(southLat)) { - minSouthLat=southLat; + if 
(minSouthLat == null || Float.parseFloat(minSouthLat) > Float.parseFloat(southLat)) { + minSouthLat = southLat; } - if(maxNorthLat==null || Float.parseFloat(maxNorthLat) < Float.parseFloat(northLat)) { - maxNorthLat=northLat; + if (maxNorthLat == null || Float.parseFloat(maxNorthLat) < Float.parseFloat(northLat)) { + maxNorthLat = northLat; } if (DatasetFieldValueValidator.validateBoundingBox(westLon, eastLon, northLat, southLat)) { - //W, E, N, S + // W, E, N, S solrInputDocument.addField(SearchFields.GEOLOCATION, "ENVELOPE(" + westLon + "," + eastLon + "," + northLat + "," + southLat + ")"); } } } - //Only one bbox per dataset - //W, E, N, S + // Only one bbox per dataset + // W, E, N, S if (DatasetFieldValueValidator.validateBoundingBox(minWestLon, maxEastLon, maxNorthLat, minSouthLat) && (minWestLon != null || maxEastLon != null) && (maxNorthLat != null || minSouthLat != null)) { solrInputDocument.addField(SearchFields.BOUNDING_BOX, "ENVELOPE(" + minWestLon + "," + maxEastLon + "," + maxNorthLat + "," + minSouthLat + ")"); @@ -1345,12 +1343,12 @@ public SolrInputDocuments toSolrDocs(IndexableDataset indexableDataset, Set dataversePaths = retrieveDVOPaths(dataset); + + List dataversePaths = retrieveDVOPaths(dataset); solrInputDocument.addField(SearchFields.SUBTREE, dataversePaths); // solrInputDocument.addField(SearchFields.HOST_DATAVERSE, // dataset.getOwner().getName()); @@ -1376,125 +1374,123 @@ public SolrInputDocuments toSolrDocs(IndexableDataset indexableDataset, Set filesIndexed = new ArrayList<>(); - final List changedFileMetadataIds = new ArrayList<>(); - if (datasetVersion != null) { - List fileMetadatas = datasetVersion.getFileMetadatas(); - List rfm = new ArrayList<>(); - Map fileMap = new HashMap<>(); - if (datasetVersion.isDraft() && dataset.isReleased() && dataset.getReleasedVersion() != null) { - rfm = dataset.getReleasedVersion().getFileMetadatas(); - for (FileMetadata released : rfm) { - fileMap.put(released.getDataFile().getId(), released); - 
} - - String compareFileMetadataQuery = "WITH fm_categories AS (" + - " SELECT fmd.filemetadatas_id, " + - " STRING_AGG(dfc.name, ',' ORDER BY dfc.name) AS categories " + - " FROM FileMetadata_DataFileCategory fmd " + - " JOIN DataFileCategory dfc ON fmd.filecategories_id = dfc.id " + - " GROUP BY fmd.filemetadatas_id " + - ") " + - "SELECT fm1.id " + - "FROM FileMetadata fm1 " + - "LEFT JOIN FileMetadata fm2 ON fm1.datafile_id = fm2.datafile_id " + - " AND fm2.datasetversion_id = ?1 " + - "LEFT JOIN fm_categories fc1 ON fc1.filemetadatas_id = fm1.id " + - "LEFT JOIN fm_categories fc2 ON fc2.filemetadatas_id = fm2.id " + - "WHERE fm1.datasetversion_id = ?2 " + - " AND (fm2.id IS NULL " + - " OR (fm1.datafile_id = fm2.datafile_id " + - " AND (fm2.description IS DISTINCT FROM fm1.description " + - " OR fm2.directoryLabel IS DISTINCT FROM fm1.directoryLabel " + - " OR fm2.label != fm1.label " + - " OR fm2.restricted IS DISTINCT FROM fm1.restricted " + - " OR fm2.prov_freeform IS DISTINCT FROM fm1.prov_freeform " + - " OR fc1.categories IS DISTINCT FROM fc2.categories " + - " ) " + - " ) " + - " )"; - - Query query = em.createNativeQuery(compareFileMetadataQuery); - query.setParameter(1, dataset.getReleasedVersion().getId()); - query.setParameter(2, datasetVersion.getId()); - - changedFileMetadataIds.addAll(query.getResultList()); - logger.fine( - "We are indexing a draft version of a dataset that has a released version. 
We'll be checking file metadatas if they are exact clones of the released versions."); + List filesIndexed = new ArrayList<>(); + final List changedFileMetadataIds = new ArrayList<>(); + if (datasetVersion != null) { + List fileMetadatas = datasetVersion.getFileMetadatas(); + List rfm = new ArrayList<>(); + Map fileMap = new HashMap<>(); + if (datasetVersion.isDraft() && dataset.isReleased() && dataset.getReleasedVersion() != null) { + rfm = dataset.getReleasedVersion().getFileMetadatas(); + for (FileMetadata released : rfm) { + fileMap.put(released.getDataFile().getId(), released); } - AtomicReference embargoEndDateRef = new AtomicReference<>(null); - AtomicReference retentionEndDateRef = new AtomicReference<>(null); - final String datasetCitation = (dataset.isReleased() && dataset.getReleasedVersion() != null) ? dataset.getCitation(dataset.getReleasedVersion()) : dataset.getCitation(); + String compareFileMetadataQuery = "WITH fm_categories AS (" + + " SELECT fmd.filemetadatas_id, " + + " STRING_AGG(dfc.name, ',' ORDER BY dfc.name) AS categories " + + " FROM FileMetadata_DataFileCategory fmd " + + " JOIN DataFileCategory dfc ON fmd.filecategories_id = dfc.id " + + " GROUP BY fmd.filemetadatas_id " + + ") " + + "SELECT fm1.id " + + "FROM FileMetadata fm1 " + + "LEFT JOIN FileMetadata fm2 ON fm1.datafile_id = fm2.datafile_id " + + " AND fm2.datasetversion_id = ?1 " + + "LEFT JOIN fm_categories fc1 ON fc1.filemetadatas_id = fm1.id " + + "LEFT JOIN fm_categories fc2 ON fc2.filemetadatas_id = fm2.id " + + "WHERE fm1.datasetversion_id = ?2 " + + " AND (fm2.id IS NULL " + + " OR (fm1.datafile_id = fm2.datafile_id " + + " AND (fm2.description IS DISTINCT FROM fm1.description " + + " OR fm2.directoryLabel IS DISTINCT FROM fm1.directoryLabel " + + " OR fm2.label != fm1.label " + + " OR fm2.restricted IS DISTINCT FROM fm1.restricted " + + " OR fm2.prov_freeform IS DISTINCT FROM fm1.prov_freeform " + + " OR fc1.categories IS DISTINCT FROM fc2.categories " + + " ) " + + " ) " 
+ + " )"; + + Query query = em.createNativeQuery(compareFileMetadataQuery); + query.setParameter(1, dataset.getReleasedVersion().getId()); + query.setParameter(2, datasetVersion.getId()); + + changedFileMetadataIds.addAll(query.getResultList()); + logger.fine( + "We are indexing a draft version of a dataset that has a released version. We'll be checking file metadatas if they are exact clones of the released versions."); + } + + AtomicReference embargoEndDateRef = new AtomicReference<>(null); + AtomicReference retentionEndDateRef = new AtomicReference<>(null); + final String datasetCitation = (dataset.isReleased() && dataset.getReleasedVersion() != null) ? dataset.getCitation(dataset.getReleasedVersion()) : dataset.getCitation(); final Long datasetId = dataset.getId(); final String datasetGlobalId = dataset.getGlobalId().toString(); - final String parentTitle = parentDatasetTitle; - - AutoDetectParser ap = null; - ParseContext ct = null; - if(doFullTextIndexing) { - ap = new AutoDetectParser(); - ct = new ParseContext(); + final String parentTitle = parentDatasetTitle; + + AutoDetectParser ap = null; + ParseContext ct = null; + if (doFullTextIndexing) { + ap = new AutoDetectParser(); + ct = new ParseContext(); } - final AutoDetectParser autoParser = ap; - final ParseContext context = ct; + final AutoDetectParser autoParser = ap; + final ParseContext context = ct; Set datasetPublicationStatuses = new HashSet(); if (dataset.getReleasedVersion() == null && !dataset.isHarvested()) { datasetPublicationStatuses.add(UNPUBLISHED_STRING); - } + } if (datasetVersion.isInReview()) { datasetPublicationStatuses.add(IN_REVIEW_STRING); } - + if (indexableDataset.getDatasetState().equals(DatasetState.PUBLISHED)) { datasetPublicationStatuses.add(PUBLISHED_STRING); } else { if (indexableDataset.getDatasetState().equals(DatasetState.WORKING_COPY)) { datasetPublicationStatuses.add(DRAFT_STRING); } - } + } - String datasetVersionId = datasetVersion.getId().toString(); - boolean 
indexThisMetadata = indexableDataset.isFilesShouldBeIndexed(); - - - - String datasetPersistentURL = dataset.getPersistentURL(); - boolean isHarvested = dataset.isHarvested(); - long startTime = System.currentTimeMillis(); - fileMetadatas.parallelStream().forEach(fileMetadata -> { - DataFile datafile = fileMetadata.getDataFile(); - Embargo emb = datafile.getEmbargo(); - LocalDate end = null; - if (emb != null) { - final LocalDate endDate = emb.getDateAvailable(); - embargoEndDateRef.updateAndGet(current -> (current == null || endDate.isAfter(current)) ? endDate : current); - end = endDate; - } - Retention ret = datafile.getRetention(); - LocalDate start = null; - if (ret != null) { - final LocalDate startDate = ret.getDateUnavailable(); - retentionEndDateRef.updateAndGet(current -> (current == null || startDate.isBefore(current)) ? startDate : current); - start = startDate; - } - boolean indexThisFile=false; - - if (indexThisMetadata && changedFileMetadataIds.contains(fileMetadata.getId())) { - indexThisFile=true; - } else if(indexThisMetadata) { - logger.fine("Checking if this file metadata is a duplicate."); - FileMetadata getFromMap = fileMap.get(datafile.getId()); - if (getFromMap != null) { - if (!VariableMetadataUtil.compareVariableMetadata(getFromMap, fileMetadata)) { - indexThisFile = true; - logger.fine("This file metadata hasn't changed since the released version; skipping indexing."); - } + String datasetVersionId = datasetVersion.getId().toString(); + boolean indexThisMetadata = indexableDataset.isFilesShouldBeIndexed(); + + String datasetPersistentURL = dataset.getPersistentURL(); + boolean isHarvested = dataset.isHarvested(); + long startTime = System.currentTimeMillis(); + fileMetadatas.parallelStream().forEach(fileMetadata -> { + DataFile datafile = fileMetadata.getDataFile(); + Embargo emb = datafile.getEmbargo(); + LocalDate end = null; + if (emb != null) { + final LocalDate endDate = emb.getDateAvailable(); + 
embargoEndDateRef.updateAndGet(current -> (current == null || endDate.isAfter(current)) ? endDate : current); + end = endDate; + } + Retention ret = datafile.getRetention(); + LocalDate start = null; + if (ret != null) { + final LocalDate startDate = ret.getDateUnavailable(); + retentionEndDateRef.updateAndGet(current -> (current == null || startDate.isBefore(current)) ? startDate : current); + start = startDate; + } + boolean indexThisFile = false; + + if (indexThisMetadata && changedFileMetadataIds.contains(fileMetadata.getId())) { + indexThisFile = true; + } else if (indexThisMetadata) { + logger.fine("Checking if this file metadata is a duplicate."); + FileMetadata getFromMap = fileMap.get(datafile.getId()); + if (getFromMap != null) { + if (!VariableMetadataUtil.compareVariableMetadata(getFromMap, fileMetadata)) { + indexThisFile = true; + logger.fine("This file metadata hasn't changed since the released version; skipping indexing."); } } - if (indexThisFile) { + } + if (indexThisFile) { SolrInputDocument datafileSolrInputDocument = new SolrInputDocument(); Long fileEntityId = datafile.getId(); @@ -1504,10 +1500,10 @@ public SolrInputDocuments toSolrDocs(IndexableDataset indexableDataset, Set datafileSolrInputDocument.addField(SearchFields.PUBLICATION_STATUS,s)); + datasetPublicationStatuses.forEach(s -> datafileSolrInputDocument.addField(SearchFields.PUBLICATION_STATUS, s)); String fileSolrDocId = solrDocIdentifierFile + fileEntityId; indexableDataset.getDatasetState(); @@ -1666,7 +1662,7 @@ public SolrInputDocuments toSolrDocs(IndexableDataset indexableDataset, Set variableMap = null; List variablesByMetadata = variableService.findVarMetByFileMetaId(fileMetadata.getId()); - variableMap = - variablesByMetadata.stream().collect(Collectors.toMap(VariableMetadata::getId, Function.identity())); - - + variableMap = variablesByMetadata.stream().collect(Collectors.toMap(VariableMetadata::getId, Function.identity())); + for (DataVariable var : variables) { // 
Hard-coded search fields, for now: // TODO: eventually: review, decide how datavariables should @@ -1740,19 +1734,19 @@ public SolrInputDocuments toSolrDocs(IndexableDataset indexableDataset, Set datafilesInDraftVersion) throws SolrServerException, IOException { final SolrInputDocuments docs = toSolrDocs(indexableDataset, datafilesInDraftVersion); From 3d2c408f0de3b1850eb6e89978acfde0a06815a5 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Fri, 28 Mar 2025 13:45:59 -0400 Subject: [PATCH 16/83] restore indexing of released files --- .../edu/harvard/iq/dataverse/search/IndexServiceBean.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java index d6d62657df6..2a83c60002d 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java @@ -1456,6 +1456,7 @@ public SolrInputDocuments toSolrDocs(IndexableDataset indexableDataset, Set Date: Fri, 28 Mar 2025 14:46:57 -0400 Subject: [PATCH 17/83] delay getting dataset until semaphore is available --- .../iq/dataverse/search/IndexServiceBean.java | 21 ++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java index 2a83c60002d..b2848fbf976 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java @@ -342,9 +342,7 @@ public Future indexDataverse(Dataverse dataverse, boolean processPaths) @TransactionAttribute(REQUIRES_NEW) public void indexDatasetInNewTransaction(Long datasetId) { //Dataset dataset) { boolean doNormalSolrDocCleanUp = false; - Dataset dataset = datasetService.findDeep(datasetId); - asyncIndexDataset(dataset, 
doNormalSolrDocCleanUp); - dataset = null; + asyncIndexDataset(datasetId, doNormalSolrDocCleanUp); } // The following two variables are only used in the synchronized getNextToIndex method and do not need to be synchronized themselves @@ -433,6 +431,23 @@ public void asyncIndexDataset(Dataset dataset, boolean doNormalSolrDocCleanUp) { } } + @Asynchronous + public void asyncIndexDataset(Long datasetId, boolean doNormalSolrDocCleanUp) { + //Initialize dataset here for logging (LoggingUtil) purposes + Dataset dataset = new Dataset(); + dataset.setId(datasetId); + try { + acquirePermitFromSemaphore(); + dataset = datasetService.findDeep(datasetId); + doAsyncIndexDataset(dataset, doNormalSolrDocCleanUp); + } catch (InterruptedException e) { + String failureLogText = "Indexing failed: interrupted. You can kickoff a re-index of this dataset with: \r\n curl http://localhost:8080/api/admin/index/datasets/" + datasetId.toString(); + failureLogText += "\r\n" + e.getLocalizedMessage(); + LoggingUtil.writeOnSuccessFailureLog(null, failureLogText, dataset); + } finally { + ASYNC_INDEX_SEMAPHORE.release(); + } + } private void doAsyncIndexDataset(Dataset dataset, boolean doNormalSolrDocCleanUp) { Long id = dataset.getId(); Dataset next = getNextToIndex(id, dataset); // if there is an ongoing index job for this dataset, next is null (ongoing index job will reindex the newest version after current indexing finishes) From 1b2548a643e188d0d94730a54a2ae20d23d679c0 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Fri, 28 Mar 2025 15:40:40 -0400 Subject: [PATCH 18/83] restore transaction, don't finddeep --- .../java/edu/harvard/iq/dataverse/search/IndexServiceBean.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java index b2848fbf976..626c85e9952 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java +++ 
b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java @@ -438,7 +438,7 @@ public void asyncIndexDataset(Long datasetId, boolean doNormalSolrDocCleanUp) { dataset.setId(datasetId); try { acquirePermitFromSemaphore(); - dataset = datasetService.findDeep(datasetId); + dataset = datasetService.find(datasetId); doAsyncIndexDataset(dataset, doNormalSolrDocCleanUp); } catch (InterruptedException e) { String failureLogText = "Indexing failed: interrupted. You can kickoff a re-index of this dataset with: \r\n curl http://localhost:8080/api/admin/index/datasets/" + datasetId.toString(); From 9deef72b272e364a927a3f0afc3a1ee3fcd39e89 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Fri, 28 Mar 2025 16:48:00 -0400 Subject: [PATCH 19/83] simplify ToU logic --- .../java/edu/harvard/iq/dataverse/search/IndexServiceBean.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java index 626c85e9952..5308bb27547 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java @@ -29,6 +29,7 @@ import edu.harvard.iq.dataverse.GlobalId; import edu.harvard.iq.dataverse.PermissionServiceBean; import edu.harvard.iq.dataverse.Retention; +import edu.harvard.iq.dataverse.TermsOfUseAndAccess; import edu.harvard.iq.dataverse.authorization.AuthenticationServiceBean; import edu.harvard.iq.dataverse.authorization.providers.builtin.BuiltinUserServiceBean; import edu.harvard.iq.dataverse.batch.util.LoggingUtil; @@ -1993,7 +1994,7 @@ private void addLicenseToSolrDoc(SolrInputDocument solrInputDocument, DatasetVer if (datasetVersion != null && datasetVersion.getTermsOfUseAndAccess() != null) { //test to see if the terms of use are the default set in 5.10 - if so and there's no license then don't add license to solr doc. 
//fixes 10513 - if (datasetVersionService.isVersionDefaultCustomTerms(datasetVersion)){ + if(TermsOfUseAndAccess.DEFAULT_NOTERMS.equals(datasetVersion.getTermsOfUseAndAccess().getTermsOfUse())) { return; } From 9e5ea007318f2be2f0d1b282412befe1891c4bc3 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Fri, 28 Mar 2025 17:15:19 -0400 Subject: [PATCH 20/83] avoid keeping files in List --- .../search/SolrIndexServiceBean.java | 30 +++++++++---------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java index 2b4f08807ef..b2dc947779d 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java @@ -407,15 +407,17 @@ public IndexResponse indexPermissionsOnSelfAndChildren(DvObject definitionPoint) for (Dataset dataset : directChildDatasetsOfDvDefPoint) { indexPermissionsForOneDvObject(dataset); numObjects++; - for (DataFile datafile : filesToReIndexPermissionsFor(dataset)) { - filesToReindexAsBatch.add(datafile); - i++; - if (i % 100 == 0) { - reindexFilesInBatches(filesToReindexAsBatch); - filesToReindexAsBatch.clear(); - } - if (i % 1000 == 0) { - logger.fine("Progress: " +i + " files permissions reindexed"); + for (DatasetVersion version : versionsToReIndexPermissionsFor(dataset)) { + for (FileMetadata fmd : version.getFileMetadatas()) { + filesToReindexAsBatch.add(fmd.getDataFile()); + i++; + if (i % 100 == 0) { + reindexFilesInBatches(filesToReindexAsBatch); + filesToReindexAsBatch.clear(); + } + if (i % 1000 == 0) { + logger.fine("Progress: " + i + "files permissions reindexed"); + } } } logger.fine("Progress : dataset " + dataset.getId() + " permissions reindexed"); @@ -509,18 +511,16 @@ private String reindexFilesInBatches(List filesToReindexPermissionsFor } } - private List filesToReIndexPermissionsFor(Dataset 
dataset) { - List filesToReindexPermissionsFor = new ArrayList<>(); + private List versionsToReIndexPermissionsFor(Dataset dataset) { + List versionsToReindexPermissionsFor = new ArrayList<>(); Map desiredCards = searchPermissionsService.getDesiredCards(dataset); for (DatasetVersion version : datasetVersionsToBuildCardsFor(dataset)) { boolean cardShouldExist = desiredCards.get(version.getVersionState()); if (cardShouldExist) { - for (FileMetadata fileMetadata : version.getFileMetadatas()) { - filesToReindexPermissionsFor.add(fileMetadata.getDataFile()); - } + versionsToReindexPermissionsFor.add(version); } } - return filesToReindexPermissionsFor; + return versionsToReindexPermissionsFor; } public IndexResponse deleteMultipleSolrIds(List solrIdsToDelete) { From b7924a360756857c4977bc9eb91e931e7612b792 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Fri, 28 Mar 2025 17:24:23 -0400 Subject: [PATCH 21/83] change dataset case too --- .../iq/dataverse/search/SolrIndexServiceBean.java | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java index b2dc947779d..4eadc884aa2 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java @@ -427,12 +427,14 @@ public IndexResponse indexPermissionsOnSelfAndChildren(DvObject definitionPoint) numObjects++; // index files Dataset dataset = (Dataset) definitionPoint; - for (DataFile datafile : filesToReIndexPermissionsFor(dataset)) { - filesToReindexAsBatch.add(datafile); - i++; - if (i % 100 == 0) { - reindexFilesInBatches(filesToReindexAsBatch); - filesToReindexAsBatch.clear(); + for (DatasetVersion version : versionsToReIndexPermissionsFor(dataset)) { + for (FileMetadata fmd : version.getFileMetadatas()) { + filesToReindexAsBatch.add(fmd.getDataFile()); + i++; + if 
(i % 100 == 0) { + reindexFilesInBatches(filesToReindexAsBatch); + filesToReindexAsBatch.clear(); + } } } } else { From 6f6e32ee5160b31415cd3ad8cb60d97a9eb95779 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Fri, 28 Mar 2025 18:05:38 -0400 Subject: [PATCH 22/83] avoid variableservice saw IndirectList failure in this section, simplifying --- .../iq/dataverse/search/IndexServiceBean.java | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java index 5308bb27547..f63489392fc 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java @@ -4,6 +4,7 @@ import edu.harvard.iq.dataverse.DataFile; import edu.harvard.iq.dataverse.DataFileServiceBean; import edu.harvard.iq.dataverse.DataFileTag; +import edu.harvard.iq.dataverse.DataTable; import edu.harvard.iq.dataverse.Dataset; import edu.harvard.iq.dataverse.DatasetField; import edu.harvard.iq.dataverse.DatasetFieldCompoundValue; @@ -40,7 +41,6 @@ import edu.harvard.iq.dataverse.datavariable.DataVariable; import edu.harvard.iq.dataverse.datavariable.VariableMetadata; import edu.harvard.iq.dataverse.datavariable.VariableMetadataUtil; -import edu.harvard.iq.dataverse.datavariable.VariableServiceBean; import edu.harvard.iq.dataverse.harvest.client.HarvestingClient; import edu.harvard.iq.dataverse.search.IndexableDataset.DatasetState; import edu.harvard.iq.dataverse.settings.FeatureFlags; @@ -157,9 +157,6 @@ public class IndexServiceBean { @EJB DataFileServiceBean dataFileService; - @EJB - VariableServiceBean variableService; - @EJB DatasetFieldServiceBean datasetFieldService; @@ -1726,16 +1723,19 @@ public SolrInputDocuments toSolrDocs(IndexableDataset indexableDataset, Set variables = datafile.getDataTable().getDataVariables(); - Long observations = 
datafile.getDataTable().getCaseQuantity(); + DataTable dtable = datafile.getDataTable(); + if (dtable != null) { + List variables = dtable.getDataVariables(); + Long observations = dtable.getCaseQuantity(); datafileSolrInputDocument.addField(SearchFields.OBSERVATIONS, observations); datafileSolrInputDocument.addField(SearchFields.VARIABLE_COUNT, variables.size()); + Map variableMap = null; + Collection variablesByMetadata = fileMetadata.getVariableMetadatas(); + Map variableMap = null; List variablesByMetadata = variableService.findVarMetByFileMetaId(fileMetadata.getId()); From dfbf6038be9bce69b1634b6dea41159b1d4680ca Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Fri, 28 Mar 2025 18:37:05 -0400 Subject: [PATCH 23/83] try EAGER --- src/main/java/edu/harvard/iq/dataverse/DataTable.java | 3 ++- .../java/edu/harvard/iq/dataverse/search/IndexServiceBean.java | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/DataTable.java b/src/main/java/edu/harvard/iq/dataverse/DataTable.java index 95f3aed0f40..2782429cccf 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DataTable.java +++ b/src/main/java/edu/harvard/iq/dataverse/DataTable.java @@ -10,6 +10,7 @@ import java.util.List; import jakarta.persistence.CascadeType; import jakarta.persistence.Entity; +import jakarta.persistence.FetchType; import jakarta.persistence.GeneratedValue; import jakarta.persistence.GenerationType; import jakarta.persistence.Id; @@ -83,7 +84,7 @@ public DataTable() { /* * DataVariables in this DataTable: */ - @OneToMany (mappedBy="dataTable", cascade={ CascadeType.REMOVE, CascadeType.MERGE,CascadeType.PERSIST}) + @OneToMany (mappedBy="dataTable",cascade={ CascadeType.REMOVE, CascadeType.MERGE,CascadeType.PERSIST}, fetch = FetchType.EAGER) @OrderBy ("fileOrder") private List dataVariables; diff --git a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java index 
f63489392fc..f8dc141ebde 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java @@ -1730,8 +1730,8 @@ public SolrInputDocuments toSolrDocs(IndexableDataset indexableDataset, Set variables = dtable.getDataVariables(); Long observations = dtable.getCaseQuantity(); - datafileSolrInputDocument.addField(SearchFields.OBSERVATIONS, observations); datafileSolrInputDocument.addField(SearchFields.VARIABLE_COUNT, variables.size()); + datafileSolrInputDocument.addField(SearchFields.OBSERVATIONS, observations); Map variableMap = null; Collection variablesByMetadata = fileMetadata.getVariableMetadatas(); From 7e508b624dc3ce442d046194e0d83e4c133be8b1 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Fri, 28 Mar 2025 19:00:41 -0400 Subject: [PATCH 24/83] avoid isTabularData --- .../edu/harvard/iq/dataverse/search/IndexServiceBean.java | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java index f8dc141ebde..f72bb4c96de 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java @@ -1712,7 +1712,7 @@ public SolrInputDocuments toSolrDocs(IndexableDataset indexableDataset, Set variableMap = null; Collection variablesByMetadata = fileMetadata.getVariableMetadatas(); - Map variableMap = null; - List variablesByMetadata = variableService.findVarMetByFileMetaId(fileMetadata.getId()); - variableMap = variablesByMetadata.stream().collect(Collectors.toMap(VariableMetadata::getId, Function.identity())); for (DataVariable var : variables) { From 7296db3c8dc3931a45bdf8abae9c456b0ea300be Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Sat, 29 Mar 2025 11:46:48 -0400 Subject: [PATCH 25/83] restore indexing new files in first versions --- 
.../edu/harvard/iq/dataverse/search/IndexServiceBean.java | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java index f72bb4c96de..8f351c921a8 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java @@ -1464,6 +1464,13 @@ public SolrInputDocuments toSolrDocs(IndexableDataset indexableDataset, Set Date: Sun, 30 Mar 2025 11:59:46 -0400 Subject: [PATCH 26/83] revert to loop, add try around datatable part --- .../edu/harvard/iq/dataverse/search/IndexServiceBean.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java index 8f351c921a8..b22f2f76e69 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java @@ -1481,7 +1481,7 @@ public SolrInputDocuments toSolrDocs(IndexableDataset indexableDataset, Set { + for (FileMetadata fileMetadata : fileMetadatas) { DataFile datafile = fileMetadata.getDataFile(); Embargo emb = datafile.getEmbargo(); LocalDate end = null; @@ -1806,7 +1806,7 @@ public SolrInputDocuments toSolrDocs(IndexableDataset indexableDataset, Set Date: Mon, 31 Mar 2025 14:48:28 -0400 Subject: [PATCH 27/83] messed merge --- .../iq/dataverse/search/IndexServiceBean.java | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java index b22f2f76e69..1b9539fd02b 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java @@ -1432,6 
+1432,13 @@ public SolrInputDocuments toSolrDocs(IndexableDataset indexableDataset, Set embargoEndDateRef = new AtomicReference<>(null); @@ -1464,13 +1471,6 @@ public SolrInputDocuments toSolrDocs(IndexableDataset indexableDataset, Set Date: Sun, 30 Mar 2025 12:48:25 -0400 Subject: [PATCH 28/83] Shift dataset-level constants out of loops --- .../search/SolrIndexServiceBean.java | 65 +++++++++---------- 1 file changed, 30 insertions(+), 35 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java index 4eadc884aa2..b6a22400aaf 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java @@ -275,7 +275,7 @@ public IndexResponse indexAllPermissions() { Map> filesPerDataset = new HashMap<>(); List allExceptFiles = dvObjectService.findAll(); for (DvObject dvObject : allExceptFiles) { - logger.info("determining definition points for dvobject id " + dvObject.getId()); + logger.fine("determining definition points for dvobject id " + dvObject.getId()); if (dvObject.isInstanceofDataFile()) { Long dataset = dvObject.getOwner().getId(); Long datafile = dvObject.getId(); @@ -303,9 +303,9 @@ public IndexResponse indexAllPermissions() { } for (DvObjectSolrDoc dvObjectSolrDoc : definitionPoints) { - logger.info("creating solr doc in memory for " + dvObjectSolrDoc.getSolrId()); + logger.fine("creating solr doc in memory for " + dvObjectSolrDoc.getSolrId()); SolrInputDocument solrInputDocument = SearchUtil.createSolrDoc(dvObjectSolrDoc); - logger.info("adding to list of docs to index " + dvObjectSolrDoc.getSolrId()); + logger.fine("adding to list of docs to index " + dvObjectSolrDoc.getSolrId()); docs.add(solrInputDocument); } try { @@ -407,12 +407,16 @@ public IndexResponse indexPermissionsOnSelfAndChildren(DvObject definitionPoint) for (Dataset dataset : 
directChildDatasetsOfDvDefPoint) { indexPermissionsForOneDvObject(dataset); numObjects++; + + Map desiredCards = searchPermissionsService.getDesiredCards(dataset); + Set datasetVersions = datasetVersionsToBuildCardsFor(dataset); + for (DatasetVersion version : versionsToReIndexPermissionsFor(dataset)) { for (FileMetadata fmd : version.getFileMetadatas()) { filesToReindexAsBatch.add(fmd.getDataFile()); i++; if (i % 100 == 0) { - reindexFilesInBatches(filesToReindexAsBatch); + reindexFilesInBatches(filesToReindexAsBatch, desiredCards, datasetVersions); filesToReindexAsBatch.clear(); } if (i % 1000 == 0) { @@ -420,6 +424,7 @@ public IndexResponse indexPermissionsOnSelfAndChildren(DvObject definitionPoint) } } } + reindexFilesInBatches(filesToReindexAsBatch, desiredCards, datasetVersions); logger.fine("Progress : dataset " + dataset.getId() + " permissions reindexed"); } } else if (definitionPoint.isInstanceofDataset()) { @@ -427,16 +432,20 @@ public IndexResponse indexPermissionsOnSelfAndChildren(DvObject definitionPoint) numObjects++; // index files Dataset dataset = (Dataset) definitionPoint; + Map desiredCards = searchPermissionsService.getDesiredCards(dataset); + Set datasetVersions = datasetVersionsToBuildCardsFor(dataset); + for (DatasetVersion version : versionsToReIndexPermissionsFor(dataset)) { for (FileMetadata fmd : version.getFileMetadatas()) { filesToReindexAsBatch.add(fmd.getDataFile()); i++; if (i % 100 == 0) { - reindexFilesInBatches(filesToReindexAsBatch); + reindexFilesInBatches(filesToReindexAsBatch, desiredCards, datasetVersions); filesToReindexAsBatch.clear(); } } } + reindexFilesInBatches(filesToReindexAsBatch, desiredCards, datasetVersions); } else { indexPermissionsForOneDvObject(definitionPoint); numObjects++; @@ -448,64 +457,50 @@ public IndexResponse indexPermissionsOnSelfAndChildren(DvObject definitionPoint) * @todo Should update timestamps, probably, even thought these are * files, see https://github.com/IQSS/dataverse/issues/2421 */ - 
reindexFilesInBatches(filesToReindexAsBatch); logger.fine("Reindexed permissions for " + i + " files and " + numObjects + " datasets/collections"); return new IndexResponse("Number of dvObject permissions indexed for " + definitionPoint + ": " + numObjects); } - private String reindexFilesInBatches(List filesToReindexPermissionsFor) { + private String reindexFilesInBatches(List filesToReindexPermissionsFor, + Map desiredCards, + Set datasetVersions) { List docs = new ArrayList<>(); - Map> byParentId = new HashMap<>(); Map> permStringByDatasetVersion = new HashMap<>(); - int i = 0; try { - for (DataFile file : filesToReindexPermissionsFor) { - Dataset dataset = (Dataset) file.getOwner(); - Map desiredCards = searchPermissionsService.getDesiredCards(dataset); - for (DatasetVersion datasetVersionFileIsAttachedTo : datasetVersionsToBuildCardsFor(dataset)) { - boolean cardShouldExist = desiredCards.get(datasetVersionFileIsAttachedTo.getVersionState()); - if (cardShouldExist) { + // Assume all files have the same owner + if (filesToReindexPermissionsFor.isEmpty()) { + return "No files to reindex"; + } + + for (DatasetVersion datasetVersionFileIsAttachedTo : datasetVersions) { + boolean cardShouldExist = desiredCards.get(datasetVersionFileIsAttachedTo.getVersionState()); + if (cardShouldExist) { + for (DataFile file : filesToReindexPermissionsFor) { List cachedPermission = permStringByDatasetVersion.get(datasetVersionFileIsAttachedTo.getId()); if (cachedPermission == null) { logger.finest("no cached permission! 
Looking it up..."); - List fileSolrDocs = constructDatafileSolrDocs((DataFile) file, permStringByDatasetVersion); + List fileSolrDocs = constructDatafileSolrDocs(file, permStringByDatasetVersion); for (DvObjectSolrDoc fileSolrDoc : fileSolrDocs) { Long datasetVersionId = fileSolrDoc.getDatasetVersionId(); if (datasetVersionId != null) { permStringByDatasetVersion.put(datasetVersionId, fileSolrDoc.getPermissions()); SolrInputDocument solrDoc = SearchUtil.createSolrDoc(fileSolrDoc); docs.add(solrDoc); - i++; } } } else { logger.finest("cached permission is " + cachedPermission); - List fileSolrDocsBasedOnCachedPermissions = constructDatafileSolrDocs((DataFile) file, permStringByDatasetVersion); + List fileSolrDocsBasedOnCachedPermissions = constructDatafileSolrDocs(file, permStringByDatasetVersion); for (DvObjectSolrDoc fileSolrDoc : fileSolrDocsBasedOnCachedPermissions) { SolrInputDocument solrDoc = SearchUtil.createSolrDoc(fileSolrDoc); docs.add(solrDoc); - i++; } } - if (i % 20 == 0) { - persistToSolr(docs); - docs = new ArrayList<>(); - } } } - Long parent = file.getOwner().getId(); - List existingList = byParentId.get(parent); - if (existingList == null) { - List empty = new ArrayList<>(); - byParentId.put(parent, empty); - } else { - List updatedList = existingList; - updatedList.add(file.getId()); - byParentId.put(parent, updatedList); - } } - + persistToSolr(docs); return " " + filesToReindexPermissionsFor.size() + " files indexed across " + docs.size() + " Solr documents "; } catch (SolrServerException | IOException ex) { @@ -542,7 +537,7 @@ public IndexResponse deleteMultipleSolrIds(List solrIdsToDelete) { public JsonObjectBuilder deleteAllFromSolrAndResetIndexTimes() throws SolrServerException, IOException { JsonObjectBuilder response = Json.createObjectBuilder(); - logger.info("attempting to delete all Solr documents before a complete re-index"); + logger.fine("attempting to delete all Solr documents before a complete re-index"); 
solrClientService.getSolrClient().deleteByQuery("*:*"); int numRowsAffected = dvObjectService.clearAllIndexTimes(); response.add(numRowsClearedByClearAllIndexTimes, numRowsAffected); From 9cc7ba46b3446a82a631cb7ed2635dd8a896621b Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Sun, 30 Mar 2025 13:07:32 -0400 Subject: [PATCH 29/83] Calculate desired cards once --- .../iq/dataverse/search/SolrIndexServiceBean.java | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java index b6a22400aaf..9c5fe7cb30c 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java @@ -91,7 +91,8 @@ public List determineSolrDocs(DvObject dvObject) { solrDocs.addAll(datasetSolrDocs); } else if (dvObject.isInstanceofDataFile()) { Map> permStringByDatasetVersion = new HashMap<>(); - List fileSolrDocs = constructDatafileSolrDocs((DataFile) dvObject, permStringByDatasetVersion); + Map desiredCards = searchPermissionsService.getDesiredCards(((DataFile) dvObject).getOwner()); + List fileSolrDocs = constructDatafileSolrDocs((DataFile) dvObject, permStringByDatasetVersion, desiredCards); solrDocs.addAll(fileSolrDocs); } else { logger.info("Unexpected DvObject: " + dvObject.getClass().getName()); @@ -148,9 +149,8 @@ private List constructDatasetSolrDocs(Dataset dataset) { } // private List constructDatafileSolrDocs(DataFile dataFile) { - private List constructDatafileSolrDocs(DataFile dataFile, Map> permStringByDatasetVersion) { + private List constructDatafileSolrDocs(DataFile dataFile, Map> permStringByDatasetVersion, Map desiredCards) { List datafileSolrDocs = new ArrayList<>(); - Map desiredCards = searchPermissionsService.getDesiredCards(dataFile.getOwner()); for (DatasetVersion datasetVersionFileIsAttachedTo : 
datasetVersionsToBuildCardsFor(dataFile.getOwner())) { boolean cardShouldExist = desiredCards.get(datasetVersionFileIsAttachedTo.getVersionState()); /* @@ -480,7 +480,7 @@ private String reindexFilesInBatches(List filesToReindexPermissionsFor List cachedPermission = permStringByDatasetVersion.get(datasetVersionFileIsAttachedTo.getId()); if (cachedPermission == null) { logger.finest("no cached permission! Looking it up..."); - List fileSolrDocs = constructDatafileSolrDocs(file, permStringByDatasetVersion); + List fileSolrDocs = constructDatafileSolrDocs(file, permStringByDatasetVersion, desiredCards); for (DvObjectSolrDoc fileSolrDoc : fileSolrDocs) { Long datasetVersionId = fileSolrDoc.getDatasetVersionId(); if (datasetVersionId != null) { @@ -491,7 +491,7 @@ private String reindexFilesInBatches(List filesToReindexPermissionsFor } } else { logger.finest("cached permission is " + cachedPermission); - List fileSolrDocsBasedOnCachedPermissions = constructDatafileSolrDocs(file, permStringByDatasetVersion); + List fileSolrDocsBasedOnCachedPermissions = constructDatafileSolrDocs(file, permStringByDatasetVersion, desiredCards); for (DvObjectSolrDoc fileSolrDoc : fileSolrDocsBasedOnCachedPermissions) { SolrInputDocument solrDoc = SearchUtil.createSolrDoc(fileSolrDoc); docs.add(solrDoc); From 829826b0543f7a3c8f4f07b295230c70eaeea43f Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Sun, 30 Mar 2025 13:47:07 -0400 Subject: [PATCH 30/83] calc datasetVersionsToBuildCardsFor once --- .../dataverse/search/SolrIndexServiceBean.java | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java index 9c5fe7cb30c..e5a999f8edc 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java @@ -91,8 +91,11 @@ public List 
determineSolrDocs(DvObject dvObject) { solrDocs.addAll(datasetSolrDocs); } else if (dvObject.isInstanceofDataFile()) { Map> permStringByDatasetVersion = new HashMap<>(); - Map desiredCards = searchPermissionsService.getDesiredCards(((DataFile) dvObject).getOwner()); - List fileSolrDocs = constructDatafileSolrDocs((DataFile) dvObject, permStringByDatasetVersion, desiredCards); + DataFile datafile = (DataFile) dvObject; + Map desiredCards = searchPermissionsService.getDesiredCards(datafile.getOwner()); + Set datasetVersions = datasetVersionsToBuildCardsFor(datafile.getOwner()); + + List fileSolrDocs = constructDatafileSolrDocs(datafile, permStringByDatasetVersion, desiredCards, datasetVersions); solrDocs.addAll(fileSolrDocs); } else { logger.info("Unexpected DvObject: " + dvObject.getClass().getName()); @@ -149,9 +152,9 @@ private List constructDatasetSolrDocs(Dataset dataset) { } // private List constructDatafileSolrDocs(DataFile dataFile) { - private List constructDatafileSolrDocs(DataFile dataFile, Map> permStringByDatasetVersion, Map desiredCards) { + private List constructDatafileSolrDocs(DataFile dataFile, Map> permStringByDatasetVersion, Map desiredCards, Set datasetVersions) { List datafileSolrDocs = new ArrayList<>(); - for (DatasetVersion datasetVersionFileIsAttachedTo : datasetVersionsToBuildCardsFor(dataFile.getOwner())) { + for (DatasetVersion datasetVersionFileIsAttachedTo : datasetVersions) { boolean cardShouldExist = desiredCards.get(datasetVersionFileIsAttachedTo.getVersionState()); /* * Since datasetVersionFileIsAttachedTo should be a draft or the most recent @@ -475,12 +478,13 @@ private String reindexFilesInBatches(List filesToReindexPermissionsFor for (DatasetVersion datasetVersionFileIsAttachedTo : datasetVersions) { boolean cardShouldExist = desiredCards.get(datasetVersionFileIsAttachedTo.getVersionState()); + if (cardShouldExist) { for (DataFile file : filesToReindexPermissionsFor) { List cachedPermission = 
permStringByDatasetVersion.get(datasetVersionFileIsAttachedTo.getId()); if (cachedPermission == null) { logger.finest("no cached permission! Looking it up..."); - List fileSolrDocs = constructDatafileSolrDocs(file, permStringByDatasetVersion, desiredCards); + List fileSolrDocs = constructDatafileSolrDocs(file, permStringByDatasetVersion, desiredCards, datasetVersions); for (DvObjectSolrDoc fileSolrDoc : fileSolrDocs) { Long datasetVersionId = fileSolrDoc.getDatasetVersionId(); if (datasetVersionId != null) { @@ -491,7 +495,7 @@ private String reindexFilesInBatches(List filesToReindexPermissionsFor } } else { logger.finest("cached permission is " + cachedPermission); - List fileSolrDocsBasedOnCachedPermissions = constructDatafileSolrDocs(file, permStringByDatasetVersion, desiredCards); + List fileSolrDocsBasedOnCachedPermissions = constructDatafileSolrDocs(file, permStringByDatasetVersion, desiredCards, datasetVersions); for (DvObjectSolrDoc fileSolrDoc : fileSolrDocsBasedOnCachedPermissions) { SolrInputDocument solrDoc = SearchUtil.createSolrDoc(fileSolrDoc); docs.add(solrDoc); From 2aa27095449eed667dc10b0c1f4d849acae80f95 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Sun, 30 Mar 2025 15:34:01 -0400 Subject: [PATCH 31/83] Custom permission query for filedownloaders --- .../iq/dataverse/RoleAssigneeServiceBean.java | 13 +++++++++++++ .../search/SearchPermissionsServiceBean.java | 15 +++++++++++++++ .../iq/dataverse/search/SolrIndexServiceBean.java | 1 + 3 files changed, 29 insertions(+) diff --git a/src/main/java/edu/harvard/iq/dataverse/RoleAssigneeServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/RoleAssigneeServiceBean.java index 88acc1916cf..db00d9a2bd0 100644 --- a/src/main/java/edu/harvard/iq/dataverse/RoleAssigneeServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/RoleAssigneeServiceBean.java @@ -395,6 +395,19 @@ public List filterRoleAssignees(String query, DvObject dvObject, L return roleAssigneeList; } + + public List 
findFileDownloaders(Long fileId) { + String sql = "SELECT DISTINCT assigneeidentifier FROM roleassignment ra, dataverserole dr, datafile df " + + "WHERE ra.role_id = dr.id " + + "AND get_bit(dr.permissionbits::bit(64), 59) = '1' " + + "AND ra.definitionpoint_id = df.id " + + "AND df.restricted = 't' " + + "AND df.id = ? " + + "GROUP BY assigneeidentifier"; + + return em.createNativeQuery(sql).setParameter(1, fileId).getResultList(); + + } private void msg(String s) { //System.out.println(s); diff --git a/src/main/java/edu/harvard/iq/dataverse/search/SearchPermissionsServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/SearchPermissionsServiceBean.java index 0dd2153f75b..3f7b4d276cc 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/SearchPermissionsServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/SearchPermissionsServiceBean.java @@ -99,6 +99,21 @@ public List findDvObjectPerms(DvObject dvObject) { resetRoleAssigneeCache(); return permStrings; } + + public List findRestrictedDatafilePerms(long fileId) { + List permStrings = new ArrayList<>(); + + List assigneeIdStrings = roleAssigneeService.findFileDownloaders(fileId); + for (String id : assigneeIdStrings) { + // Don't need to cache RoleAssignees since each is unique + RoleAssignee userOrGroup = roleAssigneeService.getRoleAssignee(id); + String indexableUserOrGroupPermissionString = getIndexableStringForUserOrGroup(userOrGroup); + if (indexableUserOrGroupPermissionString != null) { + permStrings.add(indexableUserOrGroupPermissionString); + } + } + return permStrings; + } private void resetRoleAssigneeCache() { roleAssigneeCache.clear(); diff --git a/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java index e5a999f8edc..a18370963af 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java +++ 
b/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java @@ -169,6 +169,7 @@ private List constructDatafileSolrDocs(DataFile dataFile, Map perms = new ArrayList<>(); + if (unpublishedDataRelatedToMeModeEnabled) { List cachedPerms = null; if (permStringByDatasetVersion != null) { From a1f624dd7631b118248429f8af395b8111f88a8e Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Sun, 30 Mar 2025 16:40:22 -0400 Subject: [PATCH 32/83] replace findDvObjectPerms --- .../iq/dataverse/RoleAssigneeServiceBean.java | 15 ++++--- .../dataverse/authorization/Permission.java | 3 +- .../search/SearchPermissionsServiceBean.java | 44 +++++++++++++++++-- .../search/SolrIndexServiceBean.java | 1 + 4 files changed, 50 insertions(+), 13 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/RoleAssigneeServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/RoleAssigneeServiceBean.java index db00d9a2bd0..a8137b62cf3 100644 --- a/src/main/java/edu/harvard/iq/dataverse/RoleAssigneeServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/RoleAssigneeServiceBean.java @@ -2,6 +2,7 @@ import edu.harvard.iq.dataverse.authorization.AuthenticationServiceBean; import edu.harvard.iq.dataverse.authorization.DataverseRole; +import edu.harvard.iq.dataverse.authorization.Permission; import edu.harvard.iq.dataverse.authorization.RoleAssignee; import edu.harvard.iq.dataverse.authorization.groups.Group; import edu.harvard.iq.dataverse.authorization.groups.GroupServiceBean; @@ -396,16 +397,16 @@ public List filterRoleAssignees(String query, DvObject dvObject, L return roleAssigneeList; } - public List findFileDownloaders(Long fileId) { - String sql = "SELECT DISTINCT assigneeidentifier FROM roleassignment ra, dataverserole dr, datafile df " + + public List findAssigneesWithPermissionOnDvObject(Long fileId, Permission permission) { + int bitpos = 63 - permission.ordinal(); + String sql = "SELECT DISTINCT assigneeidentifier FROM roleassignment ra, dataverserole dr, dvobject dob " 
+ "WHERE ra.role_id = dr.id " + - "AND get_bit(dr.permissionbits::bit(64), 59) = '1' " + - "AND ra.definitionpoint_id = df.id " + - "AND df.restricted = 't' " + - "AND df.id = ? " + + "AND get_bit(dr.permissionbits::bit(64), ?1) = '1' " + + "AND ra.definitionpoint_id = dob.id " + + "AND df.id = ?2 " + "GROUP BY assigneeidentifier"; - return em.createNativeQuery(sql).setParameter(1, fileId).getResultList(); + return em.createNativeQuery(sql).setParameter(1, bitpos).setParameter(2, fileId).getResultList(); } diff --git a/src/main/java/edu/harvard/iq/dataverse/authorization/Permission.java b/src/main/java/edu/harvard/iq/dataverse/authorization/Permission.java index 32937098118..2ad8881155f 100644 --- a/src/main/java/edu/harvard/iq/dataverse/authorization/Permission.java +++ b/src/main/java/edu/harvard/iq/dataverse/authorization/Permission.java @@ -5,6 +5,7 @@ import edu.harvard.iq.dataverse.Dataverse; import edu.harvard.iq.dataverse.DvObject; import java.util.Arrays; +import java.util.Enumeration; import java.util.HashSet; import java.util.Set; import edu.harvard.iq.dataverse.util.BundleUtil; @@ -100,6 +101,4 @@ public boolean appliesTo(Class aClass) { public boolean requiresAuthenticatedUser() { return requiresAuthenticatedUser; } - - } diff --git a/src/main/java/edu/harvard/iq/dataverse/search/SearchPermissionsServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/SearchPermissionsServiceBean.java index 3f7b4d276cc..3722af4c1e3 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/SearchPermissionsServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/SearchPermissionsServiceBean.java @@ -71,6 +71,14 @@ public List findDataversePerms(Dataverse dataverse) { return permStrings; } + public List findDataFilePermsforDatasetVersion(DataFile dataFile, DatasetVersion version) { + if (dataFile.isRestricted()) { + return(findDvObjectPerms(dataFile)); + } else { + return findDatasetVersionPerms(version); + } + } + public List 
findDatasetVersionPerms(DatasetVersion version) { List perms = new ArrayList<>(); if (version.isReleased()) { @@ -82,6 +90,7 @@ public List findDatasetVersionPerms(DatasetVersion version) { return perms; } + /* public List findDvObjectPerms(DvObject dvObject) { List permStrings = new ArrayList<>(); resetRoleAssigneeCache(); @@ -95,11 +104,34 @@ public List findDvObjectPerms(DvObject dvObject) { permStrings.add(indexableUserOrGroupPermissionString); } } + for (String id : assigneeIdStrings) { + // Don't need to cache RoleAssignees since each is unique + RoleAssignee userOrGroup = roleAssigneeService.getRoleAssignee(id); + String indexableUserOrGroupPermissionString = getIndexableStringForUserOrGroup(userOrGroup); + if (indexableUserOrGroupPermissionString != null) { + permStrings.add(indexableUserOrGroupPermissionString); + } } - resetRoleAssigneeCache(); return permStrings; } - +*/ + public List findDvObjectPerms(DvObject dvObject) { + List permStrings = new ArrayList<>(); + Permission p = getRequiredSearchPermission(dvObject); + + List assigneeIdStrings = roleAssigneeService. 
findAssigneesWithPermissionOnDvObject(dvObject.getId(), p); + for (String id : assigneeIdStrings) { + // Don't need to cache RoleAssignees since each is unique + RoleAssignee userOrGroup = roleAssigneeService.getRoleAssignee(id); + String indexableUserOrGroupPermissionString = getIndexableStringForUserOrGroup(userOrGroup); + if (indexableUserOrGroupPermissionString != null) { + permStrings.add(indexableUserOrGroupPermissionString); + } + } + return permStrings; + } + + /* public List findRestrictedDatafilePerms(long fileId) { List permStrings = new ArrayList<>(); @@ -114,11 +146,13 @@ public List findRestrictedDatafilePerms(long fileId) { } return permStrings; } - +*/ + @Deprecated private void resetRoleAssigneeCache() { roleAssigneeCache.clear(); } + @Deprecated private RoleAssignee getRoleAssignee(String idtf) { RoleAssignee ra = roleAssigneeCache.get(idtf); if (ra != null) { @@ -239,8 +273,10 @@ private boolean hasBeenPublished(Dataverse dataverse) { private Permission getRequiredSearchPermission(DvObject dvObject) { if (dvObject.isInstanceofDataverse()) { return Permission.ViewUnpublishedDataverse; - } else { + } else if(dvObject.isInstanceofDataset()) { return Permission.ViewUnpublishedDataset; + } else { + return Permission.DownloadFile; } } diff --git a/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java index a18370963af..588ce81fd87 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java @@ -154,6 +154,7 @@ private List constructDatasetSolrDocs(Dataset dataset) { // private List constructDatafileSolrDocs(DataFile dataFile) { private List constructDatafileSolrDocs(DataFile dataFile, Map> permStringByDatasetVersion, Map desiredCards, Set datasetVersions) { List datafileSolrDocs = new ArrayList<>(); + for (DatasetVersion datasetVersionFileIsAttachedTo : 
datasetVersions) { boolean cardShouldExist = desiredCards.get(datasetVersionFileIsAttachedTo.getVersionState()); /* From 6611cfd2ec6b72f0db0465ba0ff43ee2283ac919 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Sun, 30 Mar 2025 16:48:48 -0400 Subject: [PATCH 33/83] typo --- .../java/edu/harvard/iq/dataverse/RoleAssigneeServiceBean.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/RoleAssigneeServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/RoleAssigneeServiceBean.java index a8137b62cf3..40943ef6395 100644 --- a/src/main/java/edu/harvard/iq/dataverse/RoleAssigneeServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/RoleAssigneeServiceBean.java @@ -403,7 +403,7 @@ public List findAssigneesWithPermissionOnDvObject(Long fileId, Permissio "WHERE ra.role_id = dr.id " + "AND get_bit(dr.permissionbits::bit(64), ?1) = '1' " + "AND ra.definitionpoint_id = dob.id " + - "AND df.id = ?2 " + + "AND dob.id = ?2 " + "GROUP BY assigneeidentifier"; return em.createNativeQuery(sql).setParameter(1, bitpos).setParameter(2, fileId).getResultList(); From 80a3fb27803854cbf7a46e102324570273d91b84 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Sun, 30 Mar 2025 17:24:14 -0400 Subject: [PATCH 34/83] cache up front --- .../iq/dataverse/search/SolrIndexServiceBean.java | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java index 588ce81fd87..79b64620851 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java @@ -478,6 +478,13 @@ private String reindexFilesInBatches(List filesToReindexPermissionsFor return "No files to reindex"; } + for (DatasetVersion datasetVersionFileIsAttachedTo : datasetVersions) { + if(datasetVersionFileIsAttachedTo.getId() != null) 
{ + permStringByDatasetVersion.put(datasetVersionFileIsAttachedTo.getId(), searchPermissionsService.findDatasetVersionPerms(datasetVersionFileIsAttachedTo)); + } + } + + //ToDo - are we creating these docs twice - both this loop and constructDatafileSolrDocs go through all versions? for (DatasetVersion datasetVersionFileIsAttachedTo : datasetVersions) { boolean cardShouldExist = desiredCards.get(datasetVersionFileIsAttachedTo.getVersionState()); @@ -485,7 +492,7 @@ private String reindexFilesInBatches(List filesToReindexPermissionsFor for (DataFile file : filesToReindexPermissionsFor) { List cachedPermission = permStringByDatasetVersion.get(datasetVersionFileIsAttachedTo.getId()); if (cachedPermission == null) { - logger.finest("no cached permission! Looking it up..."); + logger.warning("no cached permission! Looking it up..."); List fileSolrDocs = constructDatafileSolrDocs(file, permStringByDatasetVersion, desiredCards, datasetVersions); for (DvObjectSolrDoc fileSolrDoc : fileSolrDocs) { Long datasetVersionId = fileSolrDoc.getDatasetVersionId(); From 4cd01b879f73a498fcdeb2251ac1078960eb1030 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Sun, 30 Mar 2025 17:26:50 -0400 Subject: [PATCH 35/83] avoid duplicate loop over datasetVersions --- .../search/SolrIndexServiceBean.java | 34 ++++--------------- 1 file changed, 6 insertions(+), 28 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java index 79b64620851..b05346c4f8f 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java @@ -483,34 +483,12 @@ private String reindexFilesInBatches(List filesToReindexPermissionsFor permStringByDatasetVersion.put(datasetVersionFileIsAttachedTo.getId(), searchPermissionsService.findDatasetVersionPerms(datasetVersionFileIsAttachedTo)); } } - - //ToDo - are we 
creating these docs twice - both this loop and constructDatafileSolrDocs go through all versions? - for (DatasetVersion datasetVersionFileIsAttachedTo : datasetVersions) { - boolean cardShouldExist = desiredCards.get(datasetVersionFileIsAttachedTo.getVersionState()); - - if (cardShouldExist) { - for (DataFile file : filesToReindexPermissionsFor) { - List cachedPermission = permStringByDatasetVersion.get(datasetVersionFileIsAttachedTo.getId()); - if (cachedPermission == null) { - logger.warning("no cached permission! Looking it up..."); - List fileSolrDocs = constructDatafileSolrDocs(file, permStringByDatasetVersion, desiredCards, datasetVersions); - for (DvObjectSolrDoc fileSolrDoc : fileSolrDocs) { - Long datasetVersionId = fileSolrDoc.getDatasetVersionId(); - if (datasetVersionId != null) { - permStringByDatasetVersion.put(datasetVersionId, fileSolrDoc.getPermissions()); - SolrInputDocument solrDoc = SearchUtil.createSolrDoc(fileSolrDoc); - docs.add(solrDoc); - } - } - } else { - logger.finest("cached permission is " + cachedPermission); - List fileSolrDocsBasedOnCachedPermissions = constructDatafileSolrDocs(file, permStringByDatasetVersion, desiredCards, datasetVersions); - for (DvObjectSolrDoc fileSolrDoc : fileSolrDocsBasedOnCachedPermissions) { - SolrInputDocument solrDoc = SearchUtil.createSolrDoc(fileSolrDoc); - docs.add(solrDoc); - } - } - } + + for (DataFile file : filesToReindexPermissionsFor) { + List fileSolrDocsBasedOnCachedPermissions = constructDatafileSolrDocs(file, permStringByDatasetVersion, desiredCards, datasetVersions); + for (DvObjectSolrDoc fileSolrDoc : fileSolrDocsBasedOnCachedPermissions) { + SolrInputDocument solrDoc = SearchUtil.createSolrDoc(fileSolrDoc); + docs.add(solrDoc); } } From c3ab56f84a785a6c470d2fccceca5060114d35bb Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Mon, 31 Mar 2025 08:32:54 -0400 Subject: [PATCH 36/83] let exceptions bubble up --- .../java/edu/harvard/iq/dataverse/search/IndexServiceBean.java | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java index 1b9539fd02b..58b3639cd30 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java @@ -1432,7 +1432,7 @@ public SolrInputDocuments toSolrDocs(IndexableDataset indexableDataset, Set Date: Mon, 31 Mar 2025 10:52:54 -0400 Subject: [PATCH 37/83] remove deprecated always true boolean --- .../search/SolrIndexServiceBean.java | 79 +++++-------------- 1 file changed, 18 insertions(+), 61 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java index b05346c4f8f..8b648fa0e95 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java @@ -51,32 +51,6 @@ public class SolrIndexServiceBean { public static String numRowsClearedByClearAllIndexTimes = "numRowsClearedByClearAllIndexTimes"; public static String messageString = "message"; - /** - * @deprecated Now that MyData has shipped in 4.1 we have no plans to change - * the unpublishedDataRelatedToMeModeEnabled boolean to false. We should - * probably remove the boolean altogether to simplify the code. - * - * This non-default mode changes the behavior of the "Data Related To Me" - * feature to be more like "**Unpublished** Data Related to Me" after you - * have changed this boolean to true and run "index all". - * - * The "Data Related to Me" feature relies on *always* indexing permissions - * regardless of if the DvObject is published or not. - * - * In "Unpublished Data Related to Me" mode, we first check if the DvObject - * is published. 
If it's published, we set the search permissions to *only* - * contain "group_public", which is quick and cheap to do. If the DvObject - * in question is *not* public, we perform the expensive operation of - * rooting around in the system to determine who should be able to - * "discover" the unpublished version of DvObject. By default this mode is - * *not* enabled. If you want to enable it, change the boolean to true and - * run "index all". - * - * See also https://github.com/IQSS/dataverse/issues/50 - */ - @Deprecated - private boolean unpublishedDataRelatedToMeModeEnabled = true; - public List determineSolrDocs(DvObject dvObject) { List emptyList = new ArrayList<>(); if (dvObject == null) { @@ -123,12 +97,8 @@ private List determineSolrDocsForFilesFromDataset(Map.Entry perms = new ArrayList<>(); - if (unpublishedDataRelatedToMeModeEnabled) { - if (dataverse.isReleased()) { - perms.add(IndexServiceBean.getPublicGroupString()); - } else { - perms = searchPermissionsService.findDataversePerms(dataverse); - } + if (dataverse.isReleased()) { + perms.add(IndexServiceBean.getPublicGroupString()); } else { perms = searchPermissionsService.findDataversePerms(dataverse); } @@ -171,24 +141,19 @@ private List constructDatafileSolrDocs(DataFile dataFile, Map perms = new ArrayList<>(); - if (unpublishedDataRelatedToMeModeEnabled) { - List cachedPerms = null; - if (permStringByDatasetVersion != null) { - cachedPerms = permStringByDatasetVersion.get(datasetVersionFileIsAttachedTo.getId()); - } - if (cachedPerms != null) { - logger.finest("reusing cached perms for file " + dataFile.getId()); - perms = cachedPerms; - } else if (datasetVersionFileIsAttachedTo.isReleased()) { - logger.finest("no cached perms, file is public/discoverable/searchable for file " + dataFile.getId()); - perms.add(IndexServiceBean.getPublicGroupString()); - } else { - // go to the well (slow) - logger.finest("no cached perms, file is not public, finding perms for file " + dataFile.getId()); - perms = 
searchPermissionsService.findDatasetVersionPerms(datasetVersionFileIsAttachedTo); - } + List cachedPerms = null; + if (permStringByDatasetVersion != null) { + cachedPerms = permStringByDatasetVersion.get(datasetVersionFileIsAttachedTo.getId()); + } + if (cachedPerms != null) { + logger.finest("reusing cached perms for file " + dataFile.getId()); + perms = cachedPerms; + } else if (datasetVersionFileIsAttachedTo.isReleased()) { + logger.finest("no cached perms, file is public/discoverable/searchable for file " + dataFile.getId()); + perms.add(IndexServiceBean.getPublicGroupString()); } else { - // This should never be executed per the deprecation notice on the boolean. + // go to the well (slow) + logger.finest("no cached perms, file is not public, finding perms for file " + dataFile.getId()); perms = searchPermissionsService.findDatasetVersionPerms(datasetVersionFileIsAttachedTo); } DvObjectSolrDoc dataFileSolrDoc = new DvObjectSolrDoc(dataFile.getId().toString(), solrId, datasetVersionFileIsAttachedTo.getId(), dataFile.getDisplayName(), perms); @@ -206,12 +171,8 @@ private List constructDatafileSolrDocsFromDataset(Dataset datas boolean cardShouldExist = desiredCards.get(datasetVersionFileIsAttachedTo.getVersionState()); if (cardShouldExist) { List perms = new ArrayList<>(); - if (unpublishedDataRelatedToMeModeEnabled) { - if (datasetVersionFileIsAttachedTo.isReleased()) { - perms.add(IndexServiceBean.getPublicGroupString()); - } else { - perms = searchPermissionsService.findDatasetVersionPerms(datasetVersionFileIsAttachedTo); - } + if (datasetVersionFileIsAttachedTo.isReleased()) { + perms.add(IndexServiceBean.getPublicGroupString()); } else { perms = searchPermissionsService.findDatasetVersionPerms(datasetVersionFileIsAttachedTo); } @@ -249,12 +210,8 @@ private DvObjectSolrDoc makeDatasetSolrDoc(DatasetVersion version) { String solrId = solrIdStart + solrIdEnd; String name = version.getTitle(); List perms = new ArrayList<>(); - if 
(unpublishedDataRelatedToMeModeEnabled) { - if (version.isReleased()) { - perms.add(IndexServiceBean.getPublicGroupString()); - } else { - perms = searchPermissionsService.findDatasetVersionPerms(version); - } + if (version.isReleased()) { + perms.add(IndexServiceBean.getPublicGroupString()); } else { perms = searchPermissionsService.findDatasetVersionPerms(version); } From 857b474159a898a123110c0ff9ff49146462c6f7 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Mon, 31 Mar 2025 15:56:46 -0400 Subject: [PATCH 38/83] avoid duplicate doc generation --- .../search/SolrIndexServiceBean.java | 20 +++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java index 8b648fa0e95..dea06b761ea 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java @@ -374,9 +374,14 @@ public IndexResponse indexPermissionsOnSelfAndChildren(DvObject definitionPoint) Set datasetVersions = datasetVersionsToBuildCardsFor(dataset); for (DatasetVersion version : versionsToReIndexPermissionsFor(dataset)) { + boolean isDraft = version.isDraft(); for (FileMetadata fmd : version.getFileMetadatas()) { - filesToReindexAsBatch.add(fmd.getDataFile()); - i++; + DataFile file = fmd.getDataFile(); + //Since reindexFilesInBatches() re-indexes a file in all versions needed, we should not send a file already in the released version twice + if (isDraft && !file.isReleased()) { + filesToReindexAsBatch.add(file); + i++; + } if (i % 100 == 0) { reindexFilesInBatches(filesToReindexAsBatch, desiredCards, datasetVersions); filesToReindexAsBatch.clear(); @@ -386,6 +391,7 @@ public IndexResponse indexPermissionsOnSelfAndChildren(DvObject definitionPoint) } } } + //Re-index any remaining files in the dataset (so that desiredCards and datasetVersions remain 
constants for all files in the batch) reindexFilesInBatches(filesToReindexAsBatch, desiredCards, datasetVersions); logger.fine("Progress : dataset " + dataset.getId() + " permissions reindexed"); } @@ -398,15 +404,21 @@ public IndexResponse indexPermissionsOnSelfAndChildren(DvObject definitionPoint) Set datasetVersions = datasetVersionsToBuildCardsFor(dataset); for (DatasetVersion version : versionsToReIndexPermissionsFor(dataset)) { + boolean isDraft = version.isDraft(); for (FileMetadata fmd : version.getFileMetadatas()) { - filesToReindexAsBatch.add(fmd.getDataFile()); - i++; + DataFile file = fmd.getDataFile(); + //Since reindexFilesInBatches() re-indexes a file in all versions needed, we should not send a file already in the released version twice + if (isDraft && !file.isReleased()) { + filesToReindexAsBatch.add(file); + i++; + } if (i % 100 == 0) { reindexFilesInBatches(filesToReindexAsBatch, desiredCards, datasetVersions); filesToReindexAsBatch.clear(); } } } + //Re-index any remaining files in the dataset (so that desiredCards and datasetVersions remain constants for all files in the batch) reindexFilesInBatches(filesToReindexAsBatch, desiredCards, datasetVersions); } else { indexPermissionsForOneDvObject(definitionPoint); From 54f4f41e551985f856af1bdb66b392fd76e4f357 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Mon, 31 Mar 2025 16:07:41 -0400 Subject: [PATCH 39/83] add debug timing, remove unused code --- .../search/SolrIndexServiceBean.java | 21 ++++++------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java index dea06b761ea..ee02260f1d3 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java @@ -325,17 +325,7 @@ private void persistToSolr(Collection docs) throws SolrServer /** * @todo 
Do something with these responses from Solr. */ - UpdateResponse addResponse = solrClientService.getSolrClient().add(docs); - } - - public IndexResponse indexPermissionsOnSelfAndChildren(long definitionPointId) { - DvObject definitionPoint = dvObjectService.findDvObject(definitionPointId); - if (definitionPoint == null) { - logger.log(Level.WARNING, "Cannot find a DvOpbject with id of {0}", definitionPointId); - return null; - } else { - return indexPermissionsOnSelfAndChildren(definitionPoint); - } + solrClientService.getSolrClient().add(docs); } /** @@ -359,6 +349,7 @@ public IndexResponse indexPermissionsOnSelfAndChildren(DvObject definitionPoint) // so don't create a Solr "permission" doc either. int i = 0; int numObjects = 0; + long globalStartTime = System.currentTimeMillis(); if (definitionPoint.isInstanceofDataverse()) { Dataverse selfDataverse = (Dataverse) definitionPoint; if (!selfDataverse.equals(dataverseService.findRootDataverse())) { @@ -372,7 +363,7 @@ public IndexResponse indexPermissionsOnSelfAndChildren(DvObject definitionPoint) Map desiredCards = searchPermissionsService.getDesiredCards(dataset); Set datasetVersions = datasetVersionsToBuildCardsFor(dataset); - + long startTime = System.currentTimeMillis(); for (DatasetVersion version : versionsToReIndexPermissionsFor(dataset)) { boolean isDraft = version.isDraft(); for (FileMetadata fmd : version.getFileMetadatas()) { @@ -393,7 +384,7 @@ public IndexResponse indexPermissionsOnSelfAndChildren(DvObject definitionPoint) } //Re-index any remaining files in the dataset (so that desiredCards and datasetVersions remain constants for all files in the batch) reindexFilesInBatches(filesToReindexAsBatch, desiredCards, datasetVersions); - logger.fine("Progress : dataset " + dataset.getId() + " permissions reindexed"); + logger.fine("Progress : dataset " + dataset.getId() + " permissions reindexed in " + (System.currentTimeMillis() - startTime) + " ms"); } } else if (definitionPoint.isInstanceofDataset()) { 
indexPermissionsForOneDvObject(definitionPoint); @@ -402,7 +393,7 @@ public IndexResponse indexPermissionsOnSelfAndChildren(DvObject definitionPoint) Dataset dataset = (Dataset) definitionPoint; Map desiredCards = searchPermissionsService.getDesiredCards(dataset); Set datasetVersions = datasetVersionsToBuildCardsFor(dataset); - + for (DatasetVersion version : versionsToReIndexPermissionsFor(dataset)) { boolean isDraft = version.isDraft(); for (FileMetadata fmd : version.getFileMetadatas()) { @@ -431,7 +422,7 @@ public IndexResponse indexPermissionsOnSelfAndChildren(DvObject definitionPoint) * @todo Should update timestamps, probably, even thought these are * files, see https://github.com/IQSS/dataverse/issues/2421 */ - logger.fine("Reindexed permissions for " + i + " files and " + numObjects + " datasets/collections"); + logger.fine("Reindexed permissions for " + i + " files and " + numObjects + "datasets/collections in " + (System.currentTimeMillis() - globalStartTime) + " ms"); return new IndexResponse("Number of dvObject permissions indexed for " + definitionPoint + ": " + numObjects); } From 04895f8c0e21433ef6351dd362353754a53d1e96 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Mon, 31 Mar 2025 16:23:33 -0400 Subject: [PATCH 40/83] fix logic --- .../edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java index ee02260f1d3..ae1fb808bfa 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java @@ -369,7 +369,7 @@ public IndexResponse indexPermissionsOnSelfAndChildren(DvObject definitionPoint) for (FileMetadata fmd : version.getFileMetadatas()) { DataFile file = fmd.getDataFile(); //Since reindexFilesInBatches() re-indexes a file in all versions 
needed, we should not send a file already in the released version twice - if (isDraft && !file.isReleased()) { + if (!isDraft || !file.isReleased()) { filesToReindexAsBatch.add(file); i++; } @@ -399,7 +399,7 @@ public IndexResponse indexPermissionsOnSelfAndChildren(DvObject definitionPoint) for (FileMetadata fmd : version.getFileMetadatas()) { DataFile file = fmd.getDataFile(); //Since reindexFilesInBatches() re-indexes a file in all versions needed, we should not send a file already in the released version twice - if (isDraft && !file.isReleased()) { + if (!isDraft || !file.isReleased()) { filesToReindexAsBatch.add(file); i++; } From e5e9fd40be27487c2e9a44d6aa0303705ddf394a Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Mon, 31 Mar 2025 17:28:45 -0400 Subject: [PATCH 41/83] use named query, log no cache case (should never be true now) --- .../iq/dataverse/RoleAssigneeServiceBean.java | 28 ++++++++++++------- .../search/SearchPermissionsServiceBean.java | 2 +- .../search/SolrIndexServiceBean.java | 2 +- 3 files changed, 20 insertions(+), 12 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/RoleAssigneeServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/RoleAssigneeServiceBean.java index 40943ef6395..b6423930f12 100644 --- a/src/main/java/edu/harvard/iq/dataverse/RoleAssigneeServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/RoleAssigneeServiceBean.java @@ -28,6 +28,7 @@ import jakarta.ejb.Stateless; import jakarta.inject.Named; import jakarta.persistence.EntityManager; +import jakarta.persistence.NamedNativeQuery; import jakarta.persistence.PersistenceContext; import org.apache.commons.lang3.StringUtils; @@ -38,8 +39,19 @@ */ @Stateless @Named +@NamedNativeQuery( + name = "RoleAssignment.findAssigneesWithPermissionOnDvObject", + query = "SELECT DISTINCT ra.assigneeidentifier FROM roleassignment ra " + + "JOIN dataverserole dr ON ra.role_id = dr.id " + + "JOIN dvobject dob ON ra.definitionpoint_id = dob.id " + + "WHERE 
get_bit(dr.permissionbits::bit(64), :bitpos) = '1' " + + "AND dob.id = :objectId", + resultClass = String.class + ) public class RoleAssigneeServiceBean { + + private static final Logger logger = Logger.getLogger(RoleAssigneeServiceBean.class.getName()); @PersistenceContext(unitName = "VDCNet-ejbPU") private EntityManager em; @@ -397,17 +409,13 @@ public List filterRoleAssignees(String query, DvObject dvObject, L return roleAssigneeList; } - public List findAssigneesWithPermissionOnDvObject(Long fileId, Permission permission) { - int bitpos = 63 - permission.ordinal(); - String sql = "SELECT DISTINCT assigneeidentifier FROM roleassignment ra, dataverserole dr, dvobject dob " + - "WHERE ra.role_id = dr.id " + - "AND get_bit(dr.permissionbits::bit(64), ?1) = '1' " + - "AND ra.definitionpoint_id = dob.id " + - "AND dob.id = ?2 " + - "GROUP BY assigneeidentifier"; - - return em.createNativeQuery(sql).setParameter(1, bitpos).setParameter(2, fileId).getResultList(); + public List findAssigneesWithPermissionOnDvObject(Long objectId, Permission permission) { + int bitpos = 63 - permission.ordinal(); + return em.createNamedQuery("RoleAssignment.findAssigneesWithPermissionOnDvObject", String.class) + .setParameter("bitpos", bitpos) + .setParameter("objectId", objectId) + .getResultList(); } private void msg(String s) { diff --git a/src/main/java/edu/harvard/iq/dataverse/search/SearchPermissionsServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/SearchPermissionsServiceBean.java index 3722af4c1e3..6891cc4307f 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/SearchPermissionsServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/SearchPermissionsServiceBean.java @@ -119,7 +119,7 @@ public List findDvObjectPerms(DvObject dvObject) { List permStrings = new ArrayList<>(); Permission p = getRequiredSearchPermission(dvObject); - List assigneeIdStrings = roleAssigneeService. 
findAssigneesWithPermissionOnDvObject(dvObject.getId(), p); + List assigneeIdStrings = roleAssigneeService.findAssigneesWithPermissionOnDvObject(dvObject.getId(), p); for (String id : assigneeIdStrings) { // Don't need to cache RoleAssignees since each is unique RoleAssignee userOrGroup = roleAssigneeService.getRoleAssignee(id); diff --git a/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java index ae1fb808bfa..cbb4afe7901 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java @@ -153,7 +153,7 @@ private List constructDatafileSolrDocs(DataFile dataFile, Map Date: Mon, 31 Mar 2025 17:52:58 -0400 Subject: [PATCH 42/83] move query --- .../harvard/iq/dataverse/RoleAssigneeServiceBean.java | 9 --------- .../java/edu/harvard/iq/dataverse/RoleAssignment.java | 10 ++++++++++ 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/RoleAssigneeServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/RoleAssigneeServiceBean.java index b6423930f12..54af470e364 100644 --- a/src/main/java/edu/harvard/iq/dataverse/RoleAssigneeServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/RoleAssigneeServiceBean.java @@ -39,15 +39,6 @@ */ @Stateless @Named -@NamedNativeQuery( - name = "RoleAssignment.findAssigneesWithPermissionOnDvObject", - query = "SELECT DISTINCT ra.assigneeidentifier FROM roleassignment ra " + - "JOIN dataverserole dr ON ra.role_id = dr.id " + - "JOIN dvobject dob ON ra.definitionpoint_id = dob.id " + - "WHERE get_bit(dr.permissionbits::bit(64), :bitpos) = '1' " + - "AND dob.id = :objectId", - resultClass = String.class - ) public class RoleAssigneeServiceBean { diff --git a/src/main/java/edu/harvard/iq/dataverse/RoleAssignment.java b/src/main/java/edu/harvard/iq/dataverse/RoleAssignment.java index 
df004fe1357..5a1357dd711 100644 --- a/src/main/java/edu/harvard/iq/dataverse/RoleAssignment.java +++ b/src/main/java/edu/harvard/iq/dataverse/RoleAssignment.java @@ -12,6 +12,7 @@ import jakarta.persistence.Index; import jakarta.persistence.JoinColumn; import jakarta.persistence.ManyToOne; +import jakarta.persistence.NamedNativeQuery; import jakarta.persistence.NamedQueries; import jakarta.persistence.NamedQuery; import jakarta.persistence.Table; @@ -52,6 +53,15 @@ @NamedQuery( name = "RoleAssignment.deleteAllByAssigneeIdentifier_Definition_PointId_RoleType", query = "DELETE FROM RoleAssignment r WHERE r.assigneeIdentifier=:assigneeIdentifier AND r.role.id=:roleId and r.definitionPoint.id=:definitionPointId") }) +@NamedNativeQuery( + name = "RoleAssignment.findAssigneesWithPermissionOnDvObject", + query = "SELECT DISTINCT ra.assigneeidentifier FROM roleassignment ra " + + "JOIN dataverserole dr ON ra.role_id = dr.id " + + "JOIN dvobject dob ON ra.definitionpoint_id = dob.id " + + "WHERE get_bit(dr.permissionbits::bit(64), :bitpos) = '1' " + + "AND dob.id = :objectId", + resultClass = String.class + ) public class RoleAssignment implements java.io.Serializable { @Id @GeneratedValue(strategy = GenerationType.IDENTITY) From b1233abda8c464bc4f4f6da3826fc98578289887 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Mon, 31 Mar 2025 18:13:57 -0400 Subject: [PATCH 43/83] drop String.class --- .../edu/harvard/iq/dataverse/RoleAssignment.java | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/RoleAssignment.java b/src/main/java/edu/harvard/iq/dataverse/RoleAssignment.java index 5a1357dd711..96e912e6f2d 100644 --- a/src/main/java/edu/harvard/iq/dataverse/RoleAssignment.java +++ b/src/main/java/edu/harvard/iq/dataverse/RoleAssignment.java @@ -54,14 +54,13 @@ query = "DELETE FROM RoleAssignment r WHERE r.assigneeIdentifier=:assigneeIdentifier AND r.role.id=:roleId and r.definitionPoint.id=:definitionPointId") 
}) @NamedNativeQuery( - name = "RoleAssignment.findAssigneesWithPermissionOnDvObject", - query = "SELECT DISTINCT ra.assigneeidentifier FROM roleassignment ra " + - "JOIN dataverserole dr ON ra.role_id = dr.id " + - "JOIN dvobject dob ON ra.definitionpoint_id = dob.id " + - "WHERE get_bit(dr.permissionbits::bit(64), :bitpos) = '1' " + - "AND dob.id = :objectId", - resultClass = String.class - ) + name = "RoleAssignment.findAssigneesWithPermissionOnDvObject", + query = "SELECT DISTINCT ra.assigneeidentifier FROM roleassignment ra " + + "JOIN dataverserole dr ON ra.role_id = dr.id " + + "JOIN dvobject dob ON ra.definitionpoint_id = dob.id " + + "WHERE get_bit(dr.permissionbits::bit(64), :bitpos) = '1' " + + "AND dob.id = :objectId" +)) public class RoleAssignment implements java.io.Serializable { @Id @GeneratedValue(strategy = GenerationType.IDENTITY) From 9550fc2d2089b87742c723c30c3f08222266656a Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Mon, 31 Mar 2025 18:21:42 -0400 Subject: [PATCH 44/83] try result set mapping --- .../java/edu/harvard/iq/dataverse/RoleAssignment.java | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/RoleAssignment.java b/src/main/java/edu/harvard/iq/dataverse/RoleAssignment.java index 96e912e6f2d..be406c7ce14 100644 --- a/src/main/java/edu/harvard/iq/dataverse/RoleAssignment.java +++ b/src/main/java/edu/harvard/iq/dataverse/RoleAssignment.java @@ -5,6 +5,7 @@ import java.util.Objects; import jakarta.persistence.CascadeType; import jakarta.persistence.Column; +import jakarta.persistence.ColumnResult; import jakarta.persistence.Entity; import jakarta.persistence.GeneratedValue; import jakarta.persistence.GenerationType; @@ -15,6 +16,7 @@ import jakarta.persistence.NamedNativeQuery; import jakarta.persistence.NamedQueries; import jakarta.persistence.NamedQuery; +import jakarta.persistence.SqlResultSetMapping; import jakarta.persistence.Table; import 
jakarta.persistence.UniqueConstraint; @@ -59,8 +61,13 @@ "JOIN dataverserole dr ON ra.role_id = dr.id " + "JOIN dvobject dob ON ra.definitionpoint_id = dob.id " + "WHERE get_bit(dr.permissionbits::bit(64), :bitpos) = '1' " + - "AND dob.id = :objectId" -)) + "AND dob.id = :objectId", + resultSetMapping = "AssigneeIdentifierMapping" +) +@SqlResultSetMapping( + name = "AssigneeIdentifierMapping", + columns = @ColumnResult(name = "assigneeidentifier") + ) public class RoleAssignment implements java.io.Serializable { @Id @GeneratedValue(strategy = GenerationType.IDENTITY) From dc01500cf6b88e681ff07c7a7dc85225fa28b2bb Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Mon, 31 Mar 2025 18:36:38 -0400 Subject: [PATCH 45/83] numeric params --- .../edu/harvard/iq/dataverse/RoleAssigneeServiceBean.java | 4 ++-- src/main/java/edu/harvard/iq/dataverse/RoleAssignment.java | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/RoleAssigneeServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/RoleAssigneeServiceBean.java index 54af470e364..1ba5768d46a 100644 --- a/src/main/java/edu/harvard/iq/dataverse/RoleAssigneeServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/RoleAssigneeServiceBean.java @@ -404,8 +404,8 @@ public List filterRoleAssignees(String query, DvObject dvObject, L public List findAssigneesWithPermissionOnDvObject(Long objectId, Permission permission) { int bitpos = 63 - permission.ordinal(); return em.createNamedQuery("RoleAssignment.findAssigneesWithPermissionOnDvObject", String.class) - .setParameter("bitpos", bitpos) - .setParameter("objectId", objectId) + .setParameter(1, bitpos) + .setParameter(2, objectId) .getResultList(); } diff --git a/src/main/java/edu/harvard/iq/dataverse/RoleAssignment.java b/src/main/java/edu/harvard/iq/dataverse/RoleAssignment.java index be406c7ce14..88ae443ed37 100644 --- a/src/main/java/edu/harvard/iq/dataverse/RoleAssignment.java +++ 
b/src/main/java/edu/harvard/iq/dataverse/RoleAssignment.java @@ -60,8 +60,8 @@ query = "SELECT DISTINCT ra.assigneeidentifier FROM roleassignment ra " + "JOIN dataverserole dr ON ra.role_id = dr.id " + "JOIN dvobject dob ON ra.definitionpoint_id = dob.id " + - "WHERE get_bit(dr.permissionbits::bit(64), :bitpos) = '1' " + - "AND dob.id = :objectId", + "WHERE get_bit(dr.permissionbits::bit(64), ?1) = '1' " + + "AND dob.id = ?2", resultSetMapping = "AssigneeIdentifierMapping" ) @SqlResultSetMapping( From e85cd2294ca9406a1081f1d1f12f6afef34c50a0 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Tue, 1 Apr 2025 08:18:29 -0400 Subject: [PATCH 46/83] test bypassing query --- .../edu/harvard/iq/dataverse/RoleAssigneeServiceBean.java | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/main/java/edu/harvard/iq/dataverse/RoleAssigneeServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/RoleAssigneeServiceBean.java index 1ba5768d46a..487939ddcda 100644 --- a/src/main/java/edu/harvard/iq/dataverse/RoleAssigneeServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/RoleAssigneeServiceBean.java @@ -402,6 +402,11 @@ public List filterRoleAssignees(String query, DvObject dvObject, L public List findAssigneesWithPermissionOnDvObject(Long objectId, Permission permission) { + if(permission.ordinal()==4) { + List authenticatedUsersList = new ArrayList<>(); + authenticatedUsersList.add(":authenticated-users"); + return authenticatedUsersList; + } int bitpos = 63 - permission.ordinal(); return em.createNamedQuery("RoleAssignment.findAssigneesWithPermissionOnDvObject", String.class) .setParameter(1, bitpos) From 0469d35b071ea8804464389875e1d6cbf2ce0f31 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Tue, 1 Apr 2025 08:50:30 -0400 Subject: [PATCH 47/83] query hardcode to role 2 --- .../iq/dataverse/RoleAssigneeServiceBean.java | 7 +++++ .../harvard/iq/dataverse/RoleAssignment.java | 27 ++++++++++++------- .../search/SearchPermissionsServiceBean.java | 25 ++++++++++++++++- 3 
files changed, 49 insertions(+), 10 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/RoleAssigneeServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/RoleAssigneeServiceBean.java index 487939ddcda..aab40b5a312 100644 --- a/src/main/java/edu/harvard/iq/dataverse/RoleAssigneeServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/RoleAssigneeServiceBean.java @@ -413,6 +413,13 @@ public List findAssigneesWithPermissionOnDvObject(Long objectId, Permiss .setParameter(2, objectId) .getResultList(); } + + public List findAssigneesWithRoleOnDvObject(Long objectId, List dataverseRoleIds) { + return em.createNamedQuery("RoleAssignment.findAssigneesWithRoleOnDvObject", String.class) + .setParameter(1, dataverseRoleIds) + .setParameter(2, objectId) + .getResultList(); + } private void msg(String s) { //System.out.println(s); diff --git a/src/main/java/edu/harvard/iq/dataverse/RoleAssignment.java b/src/main/java/edu/harvard/iq/dataverse/RoleAssignment.java index 88ae443ed37..c1a20ed09ed 100644 --- a/src/main/java/edu/harvard/iq/dataverse/RoleAssignment.java +++ b/src/main/java/edu/harvard/iq/dataverse/RoleAssignment.java @@ -13,6 +13,7 @@ import jakarta.persistence.Index; import jakarta.persistence.JoinColumn; import jakarta.persistence.ManyToOne; +import jakarta.persistence.NamedNativeQueries; import jakarta.persistence.NamedNativeQuery; import jakarta.persistence.NamedQueries; import jakarta.persistence.NamedQuery; @@ -55,15 +56,23 @@ @NamedQuery( name = "RoleAssignment.deleteAllByAssigneeIdentifier_Definition_PointId_RoleType", query = "DELETE FROM RoleAssignment r WHERE r.assigneeIdentifier=:assigneeIdentifier AND r.role.id=:roleId and r.definitionPoint.id=:definitionPointId") }) -@NamedNativeQuery( - name = "RoleAssignment.findAssigneesWithPermissionOnDvObject", - query = "SELECT DISTINCT ra.assigneeidentifier FROM roleassignment ra " + - "JOIN dataverserole dr ON ra.role_id = dr.id " + - "JOIN dvobject dob ON ra.definitionpoint_id = dob.id " + - 
"WHERE get_bit(dr.permissionbits::bit(64), ?1) = '1' " + - "AND dob.id = ?2", - resultSetMapping = "AssigneeIdentifierMapping" -) +@NamedNativeQueries({ + @NamedNativeQuery( + name = "RoleAssignment.findAssigneesWithRoleOnDvObject", + query = "SELECT DISTINCT ra.assigneeidentifier FROM roleassignment ra " + + "WHERE ra.role_id IN (?1) " + + "AND ra.definitionpoint_id = ?2", + resultSetMapping = "AssigneeIdentifierMapping"), + @NamedNativeQuery( + name = "RoleAssignment.findAssigneesWithPermissionOnDvObject", + query = "SELECT DISTINCT ra.assigneeidentifier FROM roleassignment ra " + + "JOIN dataverserole dr ON ra.role_id = dr.id " + + "JOIN dvobject dob ON ra.definitionpoint_id = dob.id " + + "WHERE get_bit(dr.permissionbits::bit(64), ?1) = '1' " + + "AND dob.id = ?2", + resultSetMapping = "AssigneeIdentifierMapping" + ) +}) @SqlResultSetMapping( name = "AssigneeIdentifierMapping", columns = @ColumnResult(name = "assigneeidentifier") diff --git a/src/main/java/edu/harvard/iq/dataverse/search/SearchPermissionsServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/SearchPermissionsServiceBean.java index 6891cc4307f..695eaf4b43c 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/SearchPermissionsServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/SearchPermissionsServiceBean.java @@ -119,7 +119,15 @@ public List findDvObjectPerms(DvObject dvObject) { List permStrings = new ArrayList<>(); Permission p = getRequiredSearchPermission(dvObject); - List assigneeIdStrings = roleAssigneeService.findAssigneesWithPermissionOnDvObject(dvObject.getId(), p); + List assigneeIdStrings = null; + if(dvObject instanceof DataFile) { + List downloadRole = new ArrayList; + downloadRole.add(2L); + assigneeIdStrings = roleAssigneeService.findAssigneesWithRoleOnDvObject(dvObject.getId(), ); + + } else { + assigneeIdStrings = roleAssigneeService.findAssigneesWithPermissionOnDvObject(dvObject.getId(), p); + } for (String id : assigneeIdStrings) { // Don't 
need to cache RoleAssignees since each is unique RoleAssignee userOrGroup = roleAssigneeService.getRoleAssignee(id); @@ -130,7 +138,22 @@ public List findDvObjectPerms(DvObject dvObject) { } return permStrings; } + + /* public List findDvObjectPerms(DvObject dvObject, List dataverseRoleIds) { + List permStrings = new ArrayList<>(); + List assigneeIdStrings = roleAssigneeService.findAssigneesWithPermissionOnDvObject(dvObject.getId(), dataverseRoleIds); + for (String id : assigneeIdStrings) { + // Don't need to cache RoleAssignees since each is unique + RoleAssignee userOrGroup = roleAssigneeService.getRoleAssignee(id); + String indexableUserOrGroupPermissionString = getIndexableStringForUserOrGroup(userOrGroup); + if (indexableUserOrGroupPermissionString != null) { + permStrings.add(indexableUserOrGroupPermissionString); + } + } + return permStrings; + } +*/ /* public List findRestrictedDatafilePerms(long fileId) { List permStrings = new ArrayList<>(); From 3a6faeed56e9dde2a052130ea930b6d809bccaac Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Tue, 1 Apr 2025 08:53:14 -0400 Subject: [PATCH 48/83] typos --- .../iq/dataverse/search/SearchPermissionsServiceBean.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/search/SearchPermissionsServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/SearchPermissionsServiceBean.java index 695eaf4b43c..ebdff5ba518 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/SearchPermissionsServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/SearchPermissionsServiceBean.java @@ -121,9 +121,9 @@ public List findDvObjectPerms(DvObject dvObject) { List assigneeIdStrings = null; if(dvObject instanceof DataFile) { - List downloadRole = new ArrayList; + List downloadRole = new ArrayList(); downloadRole.add(2L); - assigneeIdStrings = roleAssigneeService.findAssigneesWithRoleOnDvObject(dvObject.getId(), ); + assigneeIdStrings = 
roleAssigneeService.findAssigneesWithRoleOnDvObject(dvObject.getId(), downloadRole); } else { assigneeIdStrings = roleAssigneeService.findAssigneesWithPermissionOnDvObject(dvObject.getId(), p); From 81d54a74470b7513084fe5b6df054023abe7c9f2 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Tue, 1 Apr 2025 09:17:30 -0400 Subject: [PATCH 49/83] use array and any --- .../edu/harvard/iq/dataverse/RoleAssigneeServiceBean.java | 4 ++-- src/main/java/edu/harvard/iq/dataverse/RoleAssignment.java | 2 +- .../iq/dataverse/search/SearchPermissionsServiceBean.java | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/RoleAssigneeServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/RoleAssigneeServiceBean.java index aab40b5a312..042379f36bb 100644 --- a/src/main/java/edu/harvard/iq/dataverse/RoleAssigneeServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/RoleAssigneeServiceBean.java @@ -414,9 +414,9 @@ public List findAssigneesWithPermissionOnDvObject(Long objectId, Permiss .getResultList(); } - public List findAssigneesWithRoleOnDvObject(Long objectId, List dataverseRoleIds) { + public List findAssigneesWithRoleOnDvObject(Long objectId, Long[] downloadRole) { return em.createNamedQuery("RoleAssignment.findAssigneesWithRoleOnDvObject", String.class) - .setParameter(1, dataverseRoleIds) + .setParameter(1, downloadRole) .setParameter(2, objectId) .getResultList(); } diff --git a/src/main/java/edu/harvard/iq/dataverse/RoleAssignment.java b/src/main/java/edu/harvard/iq/dataverse/RoleAssignment.java index c1a20ed09ed..c8a5e7658e9 100644 --- a/src/main/java/edu/harvard/iq/dataverse/RoleAssignment.java +++ b/src/main/java/edu/harvard/iq/dataverse/RoleAssignment.java @@ -60,7 +60,7 @@ @NamedNativeQuery( name = "RoleAssignment.findAssigneesWithRoleOnDvObject", query = "SELECT DISTINCT ra.assigneeidentifier FROM roleassignment ra " + - "WHERE ra.role_id IN (?1) " + + "WHERE ra.role_id = ANY(?1) " + "AND 
ra.definitionpoint_id = ?2", resultSetMapping = "AssigneeIdentifierMapping"), @NamedNativeQuery( diff --git a/src/main/java/edu/harvard/iq/dataverse/search/SearchPermissionsServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/SearchPermissionsServiceBean.java index ebdff5ba518..859f0846e84 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/SearchPermissionsServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/SearchPermissionsServiceBean.java @@ -121,8 +121,8 @@ public List findDvObjectPerms(DvObject dvObject) { List assigneeIdStrings = null; if(dvObject instanceof DataFile) { - List downloadRole = new ArrayList(); - downloadRole.add(2L); + Long[] downloadRole = new Long[1]; + downloadRole[0] =2L; assigneeIdStrings = roleAssigneeService.findAssigneesWithRoleOnDvObject(dvObject.getId(), downloadRole); } else { From eadb0b4262ce2ed28ec191645416c3b9bcd9e12e Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Tue, 1 Apr 2025 10:04:35 -0400 Subject: [PATCH 50/83] add per batch logging --- .../edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java index cbb4afe7901..4f358da5d1d 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java @@ -404,8 +404,10 @@ public IndexResponse indexPermissionsOnSelfAndChildren(DvObject definitionPoint) i++; } if (i % 100 == 0) { + long startTime = System.currentTimeMillis(); reindexFilesInBatches(filesToReindexAsBatch, desiredCards, datasetVersions); filesToReindexAsBatch.clear(); + logger.info("Progress: 100 file permissions at " + i + "files reindexed in " + (System.currentTimeMillis() - startTime) + " ms"); } } } From f542e55ef9320bbd4622b6ff9c5ed4b06af6a954 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: 
Tue, 1 Apr 2025 12:29:55 -0400 Subject: [PATCH 51/83] try stream --- .../search/SolrIndexServiceBean.java | 44 +++++++++---------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java index 4f358da5d1d..8b0f969000b 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java @@ -347,7 +347,7 @@ public IndexResponse indexPermissionsOnSelfAndChildren(DvObject definitionPoint) // We don't create a Solr "primary/content" doc for the root dataverse // so don't create a Solr "permission" doc either. - int i = 0; + final int[] counter = {0}; int numObjects = 0; long globalStartTime = System.currentTimeMillis(); if (definitionPoint.isInstanceofDataverse()) { @@ -371,14 +371,14 @@ public IndexResponse indexPermissionsOnSelfAndChildren(DvObject definitionPoint) //Since reindexFilesInBatches() re-indexes a file in all versions needed, we should not send a file already in the released version twice if (!isDraft || !file.isReleased()) { filesToReindexAsBatch.add(file); - i++; + counter[0]++; } - if (i % 100 == 0) { + if (counter[0] % 100 == 0) { reindexFilesInBatches(filesToReindexAsBatch, desiredCards, datasetVersions); filesToReindexAsBatch.clear(); } - if (i % 1000 == 0) { - logger.fine("Progress: " + i + "files permissions reindexed"); + if (counter[0] % 1000 == 0) { + logger.info("Progress: " + counter[0] + "files permissions reindexed"); } } } @@ -393,24 +393,24 @@ public IndexResponse indexPermissionsOnSelfAndChildren(DvObject definitionPoint) Dataset dataset = (Dataset) definitionPoint; Map desiredCards = searchPermissionsService.getDesiredCards(dataset); Set datasetVersions = datasetVersionsToBuildCardsFor(dataset); - for (DatasetVersion version : versionsToReIndexPermissionsFor(dataset)) { boolean isDraft = 
version.isDraft(); - for (FileMetadata fmd : version.getFileMetadatas()) { - DataFile file = fmd.getDataFile(); - //Since reindexFilesInBatches() re-indexes a file in all versions needed, we should not send a file already in the released version twice - if (!isDraft || !file.isReleased()) { - filesToReindexAsBatch.add(file); - i++; - } - if (i % 100 == 0) { - long startTime = System.currentTimeMillis(); - reindexFilesInBatches(filesToReindexAsBatch, desiredCards, datasetVersions); - filesToReindexAsBatch.clear(); - logger.info("Progress: 100 file permissions at " + i + "files reindexed in " + (System.currentTimeMillis() - startTime) + " ms"); - } - } - } + version.getFileMetadatas().stream() + .forEach(fmd -> { + DataFile file = fmd.getDataFile(); + // Since reindexFilesInBatches() re-indexes a file in all versions needed, we should not send a file already in the released version twice + if (!isDraft || !file.isReleased()) { + filesToReindexAsBatch.add(file); + counter[0]++; + } + if (counter[0] % 100 == 0) { + long startTime = System.currentTimeMillis(); + reindexFilesInBatches(filesToReindexAsBatch, desiredCards, datasetVersions); + filesToReindexAsBatch.clear(); + logger.info("Progress: 100 file permissions at " + counter[0] + "files reindexed in " + (System.currentTimeMillis() - startTime) + " ms"); + } + }); + }; //Re-index any remaining files in the dataset (so that desiredCards and datasetVersions remain constants for all files in the batch) reindexFilesInBatches(filesToReindexAsBatch, desiredCards, datasetVersions); } else { @@ -424,7 +424,7 @@ public IndexResponse indexPermissionsOnSelfAndChildren(DvObject definitionPoint) * @todo Should update timestamps, probably, even thought these are * files, see https://github.com/IQSS/dataverse/issues/2421 */ - logger.fine("Reindexed permissions for " + i + " files and " + numObjects + "datasets/collections in " + (System.currentTimeMillis() - globalStartTime) + " ms"); + logger.fine("Reindexed permissions for " + 
counter[0] + " files and " + numObjects + "datasets/collections in " + (System.currentTimeMillis() - globalStartTime) + " ms"); return new IndexResponse("Number of dvObject permissions indexed for " + definitionPoint + ": " + numObjects); } From dbca9554a8f14b2aa119bc18f051610c0bf16411 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Tue, 1 Apr 2025 12:44:50 -0400 Subject: [PATCH 52/83] remove eager on datatable --- src/main/java/edu/harvard/iq/dataverse/DataTable.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/DataTable.java b/src/main/java/edu/harvard/iq/dataverse/DataTable.java index 2782429cccf..acf70360edf 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DataTable.java +++ b/src/main/java/edu/harvard/iq/dataverse/DataTable.java @@ -84,7 +84,7 @@ public DataTable() { /* * DataVariables in this DataTable: */ - @OneToMany (mappedBy="dataTable",cascade={ CascadeType.REMOVE, CascadeType.MERGE,CascadeType.PERSIST}, fetch = FetchType.EAGER) + @OneToMany (mappedBy="dataTable",cascade={ CascadeType.REMOVE, CascadeType.MERGE,CascadeType.PERSIST}) @OrderBy ("fileOrder") private List dataVariables; From 82fe5df4f71145dec29162540c0201e026bf4883 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Tue, 1 Apr 2025 16:32:02 -0400 Subject: [PATCH 53/83] revert to original query --- .../iq/dataverse/search/SearchPermissionsServiceBean.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/search/SearchPermissionsServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/SearchPermissionsServiceBean.java index 859f0846e84..a5d4fcbce0e 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/SearchPermissionsServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/SearchPermissionsServiceBean.java @@ -120,14 +120,15 @@ public List findDvObjectPerms(DvObject dvObject) { Permission p = getRequiredSearchPermission(dvObject); List assigneeIdStrings 
= null; - if(dvObject instanceof DataFile) { +/* if(dvObject instanceof DataFile) { Long[] downloadRole = new Long[1]; downloadRole[0] =2L; assigneeIdStrings = roleAssigneeService.findAssigneesWithRoleOnDvObject(dvObject.getId(), downloadRole); } else { + */ assigneeIdStrings = roleAssigneeService.findAssigneesWithPermissionOnDvObject(dvObject.getId(), p); - } +// } for (String id : assigneeIdStrings) { // Don't need to cache RoleAssignees since each is unique RoleAssignee userOrGroup = roleAssigneeService.getRoleAssignee(id); From 1ece7f12651454c7ced66303cc42d6a9a401df50 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Tue, 1 Apr 2025 16:33:28 -0400 Subject: [PATCH 54/83] one version at a time in construct docs --- .../search/SolrIndexServiceBean.java | 93 ++++++++----------- 1 file changed, 41 insertions(+), 52 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java index 8b0f969000b..952021b0880 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java @@ -68,9 +68,13 @@ public List determineSolrDocs(DvObject dvObject) { DataFile datafile = (DataFile) dvObject; Map desiredCards = searchPermissionsService.getDesiredCards(datafile.getOwner()); Set datasetVersions = datasetVersionsToBuildCardsFor(datafile.getOwner()); - - List fileSolrDocs = constructDatafileSolrDocs(datafile, permStringByDatasetVersion, desiredCards, datasetVersions); - solrDocs.addAll(fileSolrDocs); + for (DatasetVersion datasetVersion : datasetVersions) { + if(desiredCards.containsKey(datasetVersion.getVersionState()) && desiredCards.get(datasetVersion.getVersionState()) && datafile.isInDatasetVersion(datasetVersion)) { + + DvObjectSolrDoc fileSolrDoc = constructDatafileSolrDoc(datafile, datasetVersion, permStringByDatasetVersion); + solrDocs.add(fileSolrDoc); + } + } } else { 
logger.info("Unexpected DvObject: " + dvObject.getClass().getName()); } @@ -121,47 +125,26 @@ private List constructDatasetSolrDocs(Dataset dataset) { return solrDocs; } - // private List constructDatafileSolrDocs(DataFile dataFile) { - private List constructDatafileSolrDocs(DataFile dataFile, Map> permStringByDatasetVersion, Map desiredCards, Set datasetVersions) { - List datafileSolrDocs = new ArrayList<>(); + private DvObjectSolrDoc constructDatafileSolrDoc(DataFile dataFile, DatasetVersion version, Map> permStringByDatasetVersion) { - for (DatasetVersion datasetVersionFileIsAttachedTo : datasetVersions) { - boolean cardShouldExist = desiredCards.get(datasetVersionFileIsAttachedTo.getVersionState()); - /* - * Since datasetVersionFileIsAttachedTo should be a draft or the most recent - * released one, it could be more efficient to stop the search through - * FileMetadatas after those two (versus continuing through all prior versions - * as in isInDatasetVersion). Alternately, perhaps filesToReIndexPermissionsFor - * should not combine the list of files for the different datsetversions into a - * single list to start with. 
- */ - if (cardShouldExist && dataFile.isInDatasetVersion(datasetVersionFileIsAttachedTo)) { - String solrIdStart = IndexServiceBean.solrDocIdentifierFile + dataFile.getId(); - String solrIdEnd = getDatasetOrDataFileSolrEnding(datasetVersionFileIsAttachedTo.getVersionState()); - String solrId = solrIdStart + solrIdEnd; - List perms = new ArrayList<>(); + String solrIdStart = IndexServiceBean.solrDocIdentifierFile + dataFile.getId(); + String solrIdEnd = getDatasetOrDataFileSolrEnding(version.getVersionState()); + String solrId = solrIdStart + solrIdEnd; + List perms = new ArrayList<>(); - List cachedPerms = null; - if (permStringByDatasetVersion != null) { - cachedPerms = permStringByDatasetVersion.get(datasetVersionFileIsAttachedTo.getId()); - } - if (cachedPerms != null) { - logger.finest("reusing cached perms for file " + dataFile.getId()); - perms = cachedPerms; - } else if (datasetVersionFileIsAttachedTo.isReleased()) { - logger.finest("no cached perms, file is public/discoverable/searchable for file " + dataFile.getId()); - perms.add(IndexServiceBean.getPublicGroupString()); - } else { - // go to the well (slow) - logger.info("no cached perms, file is not public, finding perms for file " + dataFile.getId()); - perms = searchPermissionsService.findDatasetVersionPerms(datasetVersionFileIsAttachedTo); - } - DvObjectSolrDoc dataFileSolrDoc = new DvObjectSolrDoc(dataFile.getId().toString(), solrId, datasetVersionFileIsAttachedTo.getId(), dataFile.getDisplayName(), perms); - datafileSolrDocs.add(dataFileSolrDoc); - } + List cachedPerms = permStringByDatasetVersion.get(version.getId()); + if (cachedPerms != null) { + logger.finest("reusing cached perms for file " + dataFile.getId()); + perms = cachedPerms; + } else if (version.isReleased()) { + logger.finest("no cached perms, file is public/discoverable/searchable for file " + dataFile.getId()); + perms.add(IndexServiceBean.getPublicGroupString()); + } else { + // go to the well (slow) + logger.info("no cached 
perms, file is not public, finding perms for file " + dataFile.getId()); + perms = searchPermissionsService.findDatasetVersionPerms(version); } - - return datafileSolrDocs; + return new DvObjectSolrDoc(dataFile.getId().toString(), solrId, version.getId(), dataFile.getDisplayName(), perms, ftperms); } private List constructDatafileSolrDocsFromDataset(Dataset dataset) { @@ -395,6 +378,9 @@ public IndexResponse indexPermissionsOnSelfAndChildren(DvObject definitionPoint) Set datasetVersions = datasetVersionsToBuildCardsFor(dataset); for (DatasetVersion version : versionsToReIndexPermissionsFor(dataset)) { boolean isDraft = version.isDraft(); + if(version.getFileMetadatas().size()>1000) { + + } else { version.getFileMetadatas().stream() .forEach(fmd -> { DataFile file = fmd.getDataFile(); @@ -410,6 +396,7 @@ public IndexResponse indexPermissionsOnSelfAndChildren(DvObject definitionPoint) logger.info("Progress: 100 file permissions at " + counter[0] + "files reindexed in " + (System.currentTimeMillis() - startTime) + " ms"); } }); + } }; //Re-index any remaining files in the dataset (so that desiredCards and datasetVersions remain constants for all files in the batch) reindexFilesInBatches(filesToReindexAsBatch, desiredCards, datasetVersions); @@ -440,20 +427,22 @@ private String reindexFilesInBatches(List filesToReindexPermissionsFor return "No files to reindex"; } - for (DatasetVersion datasetVersionFileIsAttachedTo : datasetVersions) { - if(datasetVersionFileIsAttachedTo.getId() != null) { - permStringByDatasetVersion.put(datasetVersionFileIsAttachedTo.getId(), searchPermissionsService.findDatasetVersionPerms(datasetVersionFileIsAttachedTo)); - } - } + for (DatasetVersion datasetVersion : datasetVersions) { + if (desiredCards.get(datasetVersion.getVersionState())) { + if (datasetVersion.getId() != null) { + permStringByDatasetVersion.put(datasetVersion.getId(), searchPermissionsService.findDatasetVersionPerms(datasetVersion)); + } - for (DataFile file : 
filesToReindexPermissionsFor) { - List fileSolrDocsBasedOnCachedPermissions = constructDatafileSolrDocs(file, permStringByDatasetVersion, desiredCards, datasetVersions); - for (DvObjectSolrDoc fileSolrDoc : fileSolrDocsBasedOnCachedPermissions) { - SolrInputDocument solrDoc = SearchUtil.createSolrDoc(fileSolrDoc); - docs.add(solrDoc); + for (DataFile file : filesToReindexPermissionsFor) { + if (file.isInDatasetVersion(datasetVersion)) { + + DvObjectSolrDoc fileSolrDoc = constructDatafileSolrDoc(file, datasetVersion, permStringByDatasetVersion); + SolrInputDocument solrDoc = SearchUtil.createSolrDoc(fileSolrDoc); + docs.add(solrDoc); + } + } } } - persistToSolr(docs); return " " + filesToReindexPermissionsFor.size() + " files indexed across " + docs.size() + " Solr documents "; } catch (SolrServerException | IOException ex) { From ba13ddab5eaf6807820ae3150ed4e359e2c8f881 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Tue, 1 Apr 2025 17:18:07 -0400 Subject: [PATCH 55/83] avoid getting list of filemetadatas --- .../iq/dataverse/DatasetServiceBean.java | 2 +- .../dataverse/DatasetVersionServiceBean.java | 8 +++++ .../search/IndexBatchServiceBean.java | 2 +- .../search/SolrIndexServiceBean.java | 34 +++++++++++++++++-- 4 files changed, 42 insertions(+), 4 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/DatasetServiceBean.java index 9a8c43668cb..21463bcc897 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetServiceBean.java @@ -284,7 +284,7 @@ public List findAllOrSubsetOrderByFilesOwned(boolean skipIndexed) { "left join dvobject f on f.owner_id = o.id where o.dtype = 'Dataset' " + skipClause + " group by o.id " - + "ORDER BY count(f.id) asc, o.id"); + + "ORDER BY count(f.id) desc, o.id"); List queryResults; queryResults = query.getResultList(); diff --git 
a/src/main/java/edu/harvard/iq/dataverse/DatasetVersionServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/DatasetVersionServiceBean.java index 7e9b778c6f3..42e88dff61c 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetVersionServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetVersionServiceBean.java @@ -1260,4 +1260,12 @@ public List getUnarchivedDatasetVersions(){ return null; } } // end getUnarchivedDatasetVersions + public List getDataFileInfoForPermissionIndexing(Long id) { + String query = "SELECT fm.label, df.id, df.restricted, df.publicationDate" + + "FROM filemetadata fm " + + "JOIN datafile df ON fm.datafile_id = df.id " + + "WHERE fm.datasetversion_id = ?"; + return em.createNativeQuery(query).setParameter(1, id).getResultList(); + } + } // end class diff --git a/src/main/java/edu/harvard/iq/dataverse/search/IndexBatchServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/IndexBatchServiceBean.java index 3f7a7bb3363..6e47d1938c1 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/IndexBatchServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/IndexBatchServiceBean.java @@ -175,7 +175,7 @@ public Future indexAllOrSubset(long numPartitions, long partitionId, boo // List dataverses = dataverseService.findAllOrSubset(numPartitions, partitionId, skipIndexed); // Note: no support for "partitions" in this experimental branch. // The method below returns the ids of all the unindexed dataverses. 
- List dataverseIds = dataverseIds = dataverseService.findDataverseIdsForIndexing(skipIndexed); + List dataverseIds = dataverseService.findDataverseIdsForIndexing(skipIndexed); int dataverseIndexCount = 0; int dataverseFailureCount = 0; diff --git a/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java index 952021b0880..12c1363b698 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java @@ -4,6 +4,7 @@ import edu.harvard.iq.dataverse.Dataset; import edu.harvard.iq.dataverse.DatasetServiceBean; import edu.harvard.iq.dataverse.DatasetVersion; +import edu.harvard.iq.dataverse.DatasetVersionServiceBean; import edu.harvard.iq.dataverse.Dataverse; import edu.harvard.iq.dataverse.DataverseRoleServiceBean; import edu.harvard.iq.dataverse.DataverseServiceBean; @@ -44,6 +45,8 @@ public class SolrIndexServiceBean { @EJB DatasetServiceBean datasetService; @EJB + DatasetVersionServiceBean datasetVersionService; + @EJB DataverseRoleServiceBean rolesSvc; @EJB SolrClientIndexService solrClientService; @@ -144,7 +147,9 @@ private DvObjectSolrDoc constructDatafileSolrDoc(DataFile dataFile, DatasetVersi logger.info("no cached perms, file is not public, finding perms for file " + dataFile.getId()); perms = searchPermissionsService.findDatasetVersionPerms(version); } - return new DvObjectSolrDoc(dataFile.getId().toString(), solrId, version.getId(), dataFile.getDisplayName(), perms, ftperms); + //Temporary kludge to test performance + String name = dataFile.getDisplayName() == null? 
dataFile.getProvEntityName() : dataFile.getDisplayName(); + return new DvObjectSolrDoc(dataFile.getId().toString(), solrId, version.getId(), name, perms, ftperms); } private List constructDatafileSolrDocsFromDataset(Dataset dataset) { @@ -379,8 +384,33 @@ public IndexResponse indexPermissionsOnSelfAndChildren(DvObject definitionPoint) for (DatasetVersion version : versionsToReIndexPermissionsFor(dataset)) { boolean isDraft = version.isDraft(); if(version.getFileMetadatas().size()>1000) { + // For large datasets, use a more efficient SQL query instead of loading all file metadata objects + List fileInfoList = datasetVersionService.getDataFileInfoForPermissionIndexing(version.getId()); - } else { + for (Object[] fileInfo : fileInfoList) { + String label = (String) fileInfo[0]; + Long fileId = ((Number) fileInfo[1]).longValue(); + boolean restricted = (boolean) fileInfo[2]; + boolean isReleased = fileInfo[3] != null; + // Since reindexFilesInBatches() re-indexes a file in all versions needed, + // we should not send a file already in the released version twice + if (!isDraft || !isReleased) { + DataFile file = new DataFile(); + file.setId(fileId); + file.setRestricted(restricted); + file.setProvEntityName(label); + filesToReindexAsBatch.add(file); + counter[0]++; + } + + if (counter[0] % 100 == 0) { + long startTime = System.currentTimeMillis(); + reindexFilesInBatches(filesToReindexAsBatch, desiredCards, datasetVersions); + filesToReindexAsBatch.clear(); + logger.info("Progress: 100 file permissions at " + counter[0] + " files reindexed in " + (System.currentTimeMillis() - startTime) + " ms"); + } + } + } else { version.getFileMetadatas().stream() .forEach(fmd -> { DataFile file = fmd.getDataFile(); From 49e7d321da6dab4ce9284b57cb1c532143e6ccc0 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Tue, 1 Apr 2025 17:25:14 -0400 Subject: [PATCH 56/83] typo --- .../edu/harvard/iq/dataverse/DatasetVersionServiceBean.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) 
diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetVersionServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/DatasetVersionServiceBean.java index 42e88dff61c..2f62e9b1471 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetVersionServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetVersionServiceBean.java @@ -1261,7 +1261,7 @@ public List getUnarchivedDatasetVersions(){ } } // end getUnarchivedDatasetVersions public List getDataFileInfoForPermissionIndexing(Long id) { - String query = "SELECT fm.label, df.id, df.restricted, df.publicationDate" + + String query = "SELECT fm.label, df.id, df.restricted, df.publicationDate " + "FROM filemetadata fm " + "JOIN datafile df ON fm.datafile_id = df.id " + "WHERE fm.datasetversion_id = ?"; From eff01f90002f5ae3cb9d45fd8a9ec6184b3f6dd2 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Tue, 1 Apr 2025 17:32:11 -0400 Subject: [PATCH 57/83] fix pub date source --- .../edu/harvard/iq/dataverse/DatasetVersionServiceBean.java | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetVersionServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/DatasetVersionServiceBean.java index 2f62e9b1471..59838cdf17f 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetVersionServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetVersionServiceBean.java @@ -1260,10 +1260,12 @@ public List getUnarchivedDatasetVersions(){ return null; } } // end getUnarchivedDatasetVersions - public List getDataFileInfoForPermissionIndexing(Long id) { - String query = "SELECT fm.label, df.id, df.restricted, df.publicationDate " + + + public List getDataFileInfoForPermissionIndexing(Long id) { + String query = "SELECT fm.label, df.id, df.restricted, dvo.publicationDate " + "FROM filemetadata fm " + "JOIN datafile df ON fm.datafile_id = df.id " + + "JOIN dvobject dvo ON df.id = dvo.id " + "WHERE fm.datasetversion_id = ?"; return 
em.createNativeQuery(query).setParameter(1, id).getResultList(); } From 6b6fc54eaace8284afbea1c27cc9edc462943843 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Wed, 2 Apr 2025 08:47:32 -0400 Subject: [PATCH 58/83] try all files --- .../edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java index 12c1363b698..0f78f726c42 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java @@ -383,7 +383,7 @@ public IndexResponse indexPermissionsOnSelfAndChildren(DvObject definitionPoint) Set datasetVersions = datasetVersionsToBuildCardsFor(dataset); for (DatasetVersion version : versionsToReIndexPermissionsFor(dataset)) { boolean isDraft = version.isDraft(); - if(version.getFileMetadatas().size()>1000) { + if(version.getFileMetadatas().size()>0) { // For large datasets, use a more efficient SQL query instead of loading all file metadata objects List fileInfoList = datasetVersionService.getDataFileInfoForPermissionIndexing(version.getId()); From cf030718c6ff6c7a1dd18c13930a1960d03c7815 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Wed, 2 Apr 2025 08:55:54 -0400 Subject: [PATCH 59/83] try cache increases --- .../dataverse/search/SolrIndexServiceBean.java | 2 +- src/main/resources/META-INF/persistence.xml | 16 ++++++++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java index 0f78f726c42..9c0bd5b2d37 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java @@ -383,7 +383,7 @@ public IndexResponse 
indexPermissionsOnSelfAndChildren(DvObject definitionPoint) Set datasetVersions = datasetVersionsToBuildCardsFor(dataset); for (DatasetVersion version : versionsToReIndexPermissionsFor(dataset)) { boolean isDraft = version.isDraft(); - if(version.getFileMetadatas().size()>0) { + if(version.getFileMetadatas().size()>3000) { // For large datasets, use a more efficient SQL query instead of loading all file metadata objects List fileInfoList = datasetVersionService.getDataFileInfoForPermissionIndexing(version.getId()); diff --git a/src/main/resources/META-INF/persistence.xml b/src/main/resources/META-INF/persistence.xml index e6224dcdf01..02f114f5d73 100644 --- a/src/main/resources/META-INF/persistence.xml +++ b/src/main/resources/META-INF/persistence.xml @@ -18,6 +18,22 @@ + + + + + + + + + + + + + + + + From 946035b163ff5c3c463956f19e0b2aef861b4d88 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Wed, 2 Apr 2025 09:07:40 -0400 Subject: [PATCH 60/83] remove coord protocol --- src/main/resources/META-INF/persistence.xml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/main/resources/META-INF/persistence.xml b/src/main/resources/META-INF/persistence.xml index 02f114f5d73..5aba84ae866 100644 --- a/src/main/resources/META-INF/persistence.xml +++ b/src/main/resources/META-INF/persistence.xml @@ -31,9 +31,7 @@ - - - + From afeeb3952851688f966fdabe78cebdb153916c66 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Wed, 2 Apr 2025 09:16:12 -0400 Subject: [PATCH 61/83] limit at 1K --- .../edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java index 9c0bd5b2d37..12c1363b698 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java @@ -383,7 
+383,7 @@ public IndexResponse indexPermissionsOnSelfAndChildren(DvObject definitionPoint) Set datasetVersions = datasetVersionsToBuildCardsFor(dataset); for (DatasetVersion version : versionsToReIndexPermissionsFor(dataset)) { boolean isDraft = version.isDraft(); - if(version.getFileMetadatas().size()>3000) { + if(version.getFileMetadatas().size()>1000) { // For large datasets, use a more efficient SQL query instead of loading all file metadata objects List fileInfoList = datasetVersionService.getDataFileInfoForPermissionIndexing(version.getId()); From b92489f5050b30db25ee9cacac5a8524b3d47f9e Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Wed, 2 Apr 2025 10:00:33 -0400 Subject: [PATCH 62/83] try weak on files/md --- src/main/resources/META-INF/persistence.xml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/main/resources/META-INF/persistence.xml b/src/main/resources/META-INF/persistence.xml index 5aba84ae866..c809c7616f7 100644 --- a/src/main/resources/META-INF/persistence.xml +++ b/src/main/resources/META-INF/persistence.xml @@ -21,6 +21,8 @@ + + From 0fcd06443be1bce7bf0502e7d54038d751df5706 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Wed, 2 Apr 2025 14:25:41 -0400 Subject: [PATCH 63/83] add file proxy --- .../dataverse/DatasetVersionServiceBean.java | 10 - .../search/SolrIndexServiceBean.java | 279 +++++++++++------- 2 files changed, 176 insertions(+), 113 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetVersionServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/DatasetVersionServiceBean.java index 59838cdf17f..7e9b778c6f3 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetVersionServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetVersionServiceBean.java @@ -1260,14 +1260,4 @@ public List getUnarchivedDatasetVersions(){ return null; } } // end getUnarchivedDatasetVersions - - public List getDataFileInfoForPermissionIndexing(Long id) { - String query = "SELECT fm.label, df.id, df.restricted, 
dvo.publicationDate " + - "FROM filemetadata fm " + - "JOIN datafile df ON fm.datafile_id = df.id " + - "JOIN dvobject dvo ON df.id = dvo.id " + - "WHERE fm.datasetversion_id = ?"; - return em.createNativeQuery(query).setParameter(1, id).getResultList(); - } - } // end class diff --git a/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java index 12c1363b698..aac20dc7687 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java @@ -26,8 +26,10 @@ import jakarta.inject.Named; import jakarta.json.Json; import jakarta.json.JsonObjectBuilder; +import jakarta.persistence.EntityManager; +import jakarta.persistence.PersistenceContext; + import org.apache.solr.client.solrj.SolrServerException; -import org.apache.solr.client.solrj.response.UpdateResponse; import org.apache.solr.common.SolrInputDocument; @Named @@ -50,6 +52,9 @@ public class SolrIndexServiceBean { DataverseRoleServiceBean rolesSvc; @EJB SolrClientIndexService solrClientService; + + @PersistenceContext(unitName = "VDCNet-ejbPU") + private EntityManager em; public static String numRowsClearedByClearAllIndexTimes = "numRowsClearedByClearAllIndexTimes"; public static String messageString = "message"; @@ -71,10 +76,13 @@ public List determineSolrDocs(DvObject dvObject) { DataFile datafile = (DataFile) dvObject; Map desiredCards = searchPermissionsService.getDesiredCards(datafile.getOwner()); Set datasetVersions = datasetVersionsToBuildCardsFor(datafile.getOwner()); - for (DatasetVersion datasetVersion : datasetVersions) { - if(desiredCards.containsKey(datasetVersion.getVersionState()) && desiredCards.get(datasetVersion.getVersionState()) && datafile.isInDatasetVersion(datasetVersion)) { - - DvObjectSolrDoc fileSolrDoc = constructDatafileSolrDoc(datafile, datasetVersion, permStringByDatasetVersion); + for (DatasetVersion 
version : datasetVersions) { + if(desiredCards.containsKey(version.getVersionState()) && desiredCards.get(version.getVersionState()) && datafile.isInDatasetVersion(version)) { + List cachedPerms = searchPermissionsService.findDatasetVersionPerms(version); + String solrIdEnd = getDatasetOrDataFileSolrEnding(version.getVersionState()); + Long versionId = version.getId(); + boolean isDraft = version.isDraft(); + DvObjectSolrDoc fileSolrDoc = constructDatafileSolrDoc(new DataFileProxy(datafile.getFileMetadata()), cachedPerms, versionId, solrIdEnd); solrDocs.add(fileSolrDoc); } } @@ -128,28 +136,20 @@ private List constructDatasetSolrDocs(Dataset dataset) { return solrDocs; } - private DvObjectSolrDoc constructDatafileSolrDoc(DataFile dataFile, DatasetVersion version, Map> permStringByDatasetVersion) { - - String solrIdStart = IndexServiceBean.solrDocIdentifierFile + dataFile.getId(); - String solrIdEnd = getDatasetOrDataFileSolrEnding(version.getVersionState()); + private DvObjectSolrDoc constructDatafileSolrDoc(DataFileProxy fileProxy, List cachedPerms, long versionId, String solrIdEnd) { + List ftperms = new ArrayList<>(); + if (fileProxy.isRestricted()) { + ftperms = searchPermissionsService.findDvObjectPerms(fileProxy.getMinimalDataFile()); + } + String solrIdStart = IndexServiceBean.solrDocIdentifierFile + fileProxy.getFileId(); String solrId = solrIdStart + solrIdEnd; List perms = new ArrayList<>(); - - List cachedPerms = permStringByDatasetVersion.get(version.getId()); + assert(cachedPerms != null); if (cachedPerms != null) { - logger.finest("reusing cached perms for file " + dataFile.getId()); + logger.finest("reusing cached perms for file " + fileProxy.getFileId()); perms = cachedPerms; - } else if (version.isReleased()) { - logger.finest("no cached perms, file is public/discoverable/searchable for file " + dataFile.getId()); - perms.add(IndexServiceBean.getPublicGroupString()); - } else { - // go to the well (slow) - logger.info("no cached perms, file is not 
public, finding perms for file " + dataFile.getId()); - perms = searchPermissionsService.findDatasetVersionPerms(version); } - //Temporary kludge to test performance - String name = dataFile.getDisplayName() == null? dataFile.getProvEntityName() : dataFile.getDisplayName(); - return new DvObjectSolrDoc(dataFile.getId().toString(), solrId, version.getId(), name, perms, ftperms); + return new DvObjectSolrDoc(fileProxy.getFileId().toString(), solrId, versionId, fileProxy.getName(), perms, ftperms); } private List constructDatafileSolrDocsFromDataset(Dataset dataset) { @@ -170,7 +170,11 @@ private List constructDatafileSolrDocsFromDataset(Dataset datas String solrIdStart = IndexServiceBean.solrDocIdentifierFile + fileId; String solrIdEnd = getDatasetOrDataFileSolrEnding(datasetVersionFileIsAttachedTo.getVersionState()); String solrId = solrIdStart + solrIdEnd; - DvObjectSolrDoc dataFileSolrDoc = new DvObjectSolrDoc(fileId.toString(), solrId, datasetVersionFileIsAttachedTo.getId(), fileMetadata.getLabel(), perms); + List ftperms = new ArrayList<>(); + if (fileMetadata.getDataFile().isRestricted()) { + ftperms = searchPermissionsService.findDvObjectPerms(fileMetadata.getDataFile()); + } + DvObjectSolrDoc dataFileSolrDoc = new DvObjectSolrDoc(fileId.toString(), solrId, datasetVersionFileIsAttachedTo.getId(), fileMetadata.getLabel(), perms, ftperms); logger.finest("adding fileid " + fileId); datafileSolrDocs.add(dataFileSolrDoc); } @@ -327,7 +331,7 @@ public IndexResponse indexPermissionsOnSelfAndChildren(DvObject definitionPoint) return null; } - List filesToReindexAsBatch = new ArrayList<>(); + List filesToReindexAsBatch = new ArrayList<>(); /** * @todo Re-indexing the definition point itself seems to be necessary * for revoke but not necessarily grant. 
@@ -348,31 +352,35 @@ public IndexResponse indexPermissionsOnSelfAndChildren(DvObject definitionPoint) for (Dataset dataset : directChildDatasetsOfDvDefPoint) { indexPermissionsForOneDvObject(dataset); numObjects++; - + Map desiredCards = searchPermissionsService.getDesiredCards(dataset); Set datasetVersions = datasetVersionsToBuildCardsFor(dataset); long startTime = System.currentTimeMillis(); for (DatasetVersion version : versionsToReIndexPermissionsFor(dataset)) { - boolean isDraft = version.isDraft(); - for (FileMetadata fmd : version.getFileMetadatas()) { - DataFile file = fmd.getDataFile(); - //Since reindexFilesInBatches() re-indexes a file in all versions needed, we should not send a file already in the released version twice - if (!isDraft || !file.isReleased()) { - filesToReindexAsBatch.add(file); + if (desiredCards.get(version.getVersionState())) { + List cachedPerms = searchPermissionsService.findDatasetVersionPerms(version); + String solrIdEnd = getDatasetOrDataFileSolrEnding(version.getVersionState()); + Long versionId = version.getId(); + boolean isDraft = version.isDraft(); + for (FileMetadata fmd : version.getFileMetadatas()) { + DataFileProxy fileProxy = new DataFileProxy(fmd); + // Since reindexFilesInBatches() re-indexes a file in all versions needed, we should not send a file already in the released version twice + filesToReindexAsBatch.add(fileProxy); counter[0]++; + if (counter[0] % 100 == 0) { + reindexFilesInBatches(filesToReindexAsBatch, cachedPerms, versionId, solrIdEnd); + filesToReindexAsBatch.clear(); + } + if (counter[0] % 1000 == 0) { + logger.info("Progress: " + counter[0] + "files permissions reindexed"); + } } - if (counter[0] % 100 == 0) { - reindexFilesInBatches(filesToReindexAsBatch, desiredCards, datasetVersions); - filesToReindexAsBatch.clear(); - } - if (counter[0] % 1000 == 0) { - logger.info("Progress: " + counter[0] + "files permissions reindexed"); - } + + // Re-index any remaining files in the datasetversion (so that 
versionId, etc. remain constants for all files in the batch) + reindexFilesInBatches(filesToReindexAsBatch, cachedPerms, versionId, solrIdEnd); + logger.info("Progress : dataset " + dataset.getId() + " permissions reindexed in " + (System.currentTimeMillis() - startTime) + " ms"); } } - //Re-index any remaining files in the dataset (so that desiredCards and datasetVersions remain constants for all files in the batch) - reindexFilesInBatches(filesToReindexAsBatch, desiredCards, datasetVersions); - logger.fine("Progress : dataset " + dataset.getId() + " permissions reindexed in " + (System.currentTimeMillis() - startTime) + " ms"); } } else if (definitionPoint.isInstanceofDataset()) { indexPermissionsForOneDvObject(definitionPoint); @@ -382,54 +390,46 @@ public IndexResponse indexPermissionsOnSelfAndChildren(DvObject definitionPoint) Map desiredCards = searchPermissionsService.getDesiredCards(dataset); Set datasetVersions = datasetVersionsToBuildCardsFor(dataset); for (DatasetVersion version : versionsToReIndexPermissionsFor(dataset)) { - boolean isDraft = version.isDraft(); - if(version.getFileMetadatas().size()>1000) { - // For large datasets, use a more efficient SQL query instead of loading all file metadata objects - List fileInfoList = datasetVersionService.getDataFileInfoForPermissionIndexing(version.getId()); - - for (Object[] fileInfo : fileInfoList) { - String label = (String) fileInfo[0]; - Long fileId = ((Number) fileInfo[1]).longValue(); - boolean restricted = (boolean) fileInfo[2]; - boolean isReleased = fileInfo[3] != null; - // Since reindexFilesInBatches() re-indexes a file in all versions needed, - // we should not send a file already in the released version twice - if (!isDraft || !isReleased) { - DataFile file = new DataFile(); - file.setId(fileId); - file.setRestricted(restricted); - file.setProvEntityName(label); - filesToReindexAsBatch.add(file); + if (desiredCards.get(version.getVersionState())) { + List cachedPerms = 
searchPermissionsService.findDatasetVersionPerms(version); + String solrIdEnd = getDatasetOrDataFileSolrEnding(version.getVersionState()); + Long versionId = version.getId(); + boolean isDraft = version.isDraft(); + if (version.getFileMetadatas().size() > 1000) { + // For large datasets, use a more efficient SQL query instead of loading all file metadata objects + List fileInfoList = getDataFileInfoForPermissionIndexing(version.getId()); + + for (DataFileProxy fileInfo : fileInfoList) { + filesToReindexAsBatch.add(fileInfo); counter[0]++; - } - - if (counter[0] % 100 == 0) { - long startTime = System.currentTimeMillis(); - reindexFilesInBatches(filesToReindexAsBatch, desiredCards, datasetVersions); - filesToReindexAsBatch.clear(); - logger.info("Progress: 100 file permissions at " + counter[0] + " files reindexed in " + (System.currentTimeMillis() - startTime) + " ms"); - } - } - } else { - version.getFileMetadatas().stream() - .forEach(fmd -> { - DataFile file = fmd.getDataFile(); - // Since reindexFilesInBatches() re-indexes a file in all versions needed, we should not send a file already in the released version twice - if (!isDraft || !file.isReleased()) { - filesToReindexAsBatch.add(file); - counter[0]++; - } + if (counter[0] % 100 == 0) { long startTime = System.currentTimeMillis(); - reindexFilesInBatches(filesToReindexAsBatch, desiredCards, datasetVersions); + reindexFilesInBatches(filesToReindexAsBatch, cachedPerms, versionId, solrIdEnd); filesToReindexAsBatch.clear(); - logger.info("Progress: 100 file permissions at " + counter[0] + "files reindexed in " + (System.currentTimeMillis() - startTime) + " ms"); + logger.info("Progress: 100 file permissions at " + counter[0] + " files reindexed in " + (System.currentTimeMillis() - startTime) + " ms"); } - }); + } + } else { + version.getFileMetadatas().stream() + .forEach(fmd -> { + DataFileProxy fileProxy = new DataFileProxy(fmd); + filesToReindexAsBatch.add(fileProxy); + counter[0]++; + if (counter[0] % 100 == 
0) { + long startTime = System.currentTimeMillis(); + reindexFilesInBatches(filesToReindexAsBatch, cachedPerms, versionId, solrIdEnd); + filesToReindexAsBatch.clear(); + logger.info("Progress: 100 file permissions at " + counter[0] + "files reindexed in " + (System.currentTimeMillis() - startTime) + " ms"); + } + }); + } + // Re-index any remaining files in the dataset version (versionId, etc. remain constants for all files in the batch) + reindexFilesInBatches(filesToReindexAsBatch, cachedPerms, versionId, solrIdEnd); + filesToReindexAsBatch.clear(); } - }; - //Re-index any remaining files in the dataset (so that desiredCards and datasetVersions remain constants for all files in the batch) - reindexFilesInBatches(filesToReindexAsBatch, desiredCards, datasetVersions); + + } } else { indexPermissionsForOneDvObject(definitionPoint); numObjects++; @@ -438,45 +438,33 @@ public IndexResponse indexPermissionsOnSelfAndChildren(DvObject definitionPoint) /** * @todo Error handling? What to do with response? 
* - * @todo Should update timestamps, probably, even thought these are - * files, see https://github.com/IQSS/dataverse/issues/2421 + * @todo Should update timestamps, probably, even thought these are files, see + * https://github.com/IQSS/dataverse/issues/2421 */ logger.fine("Reindexed permissions for " + counter[0] + " files and " + numObjects + "datasets/collections in " + (System.currentTimeMillis() - globalStartTime) + " ms"); return new IndexResponse("Number of dvObject permissions indexed for " + definitionPoint + ": " + numObjects); } - private String reindexFilesInBatches(List filesToReindexPermissionsFor, - Map desiredCards, - Set datasetVersions) { + private String reindexFilesInBatches(List filesToReindexAsBatch, List cachedPerms, Long versionId, String solrIdEnd) { List docs = new ArrayList<>(); Map> permStringByDatasetVersion = new HashMap<>(); try { // Assume all files have the same owner - if (filesToReindexPermissionsFor.isEmpty()) { + if (filesToReindexAsBatch.isEmpty()) { return "No files to reindex"; } - - for (DatasetVersion datasetVersion : datasetVersions) { - if (desiredCards.get(datasetVersion.getVersionState())) { - if (datasetVersion.getId() != null) { - permStringByDatasetVersion.put(datasetVersion.getId(), searchPermissionsService.findDatasetVersionPerms(datasetVersion)); - } - for (DataFile file : filesToReindexPermissionsFor) { - if (file.isInDatasetVersion(datasetVersion)) { + for (DataFileProxy file : filesToReindexAsBatch) { - DvObjectSolrDoc fileSolrDoc = constructDatafileSolrDoc(file, datasetVersion, permStringByDatasetVersion); + DvObjectSolrDoc fileSolrDoc = constructDatafileSolrDoc(file, cachedPerms, versionId, solrIdEnd); SolrInputDocument solrDoc = SearchUtil.createSolrDoc(fileSolrDoc); docs.add(solrDoc); - } } - } - } persistToSolr(docs); - return " " + filesToReindexPermissionsFor.size() + " files indexed across " + docs.size() + " Solr documents "; + return " " + filesToReindexAsBatch.size() + " files indexed across " + 
docs.size() + " Solr documents "; } catch (SolrServerException | IOException ex) { - return " tried to reindex " + filesToReindexPermissionsFor.size() + " files indexed across " + docs.size() + " Solr documents but caught exception: " + ex; + return " tried to reindex " + filesToReindexAsBatch.size() + " files indexed across " + docs.size() + " Solr documents but caught exception: " + ex; } } @@ -540,4 +528,89 @@ public List findPermissionsInDatabaseButStaleInOrMissingFromSolr() { return indexingRequired; } + public List getDataFileInfoForPermissionIndexing(Long id) { + String query = "SELECT fm.label, df.id, df.restricted, dvo.publicationDate " + + "FROM filemetadata fm " + + "JOIN datafile df ON fm.datafile_id = df.id " + + "JOIN dvobject dvo ON df.id = dvo.id " + + "WHERE fm.datasetversion_id = ?"; + return em.createNativeQuery(query).setParameter(1, id).getResultList().stream().map(o-> DataFileProxy.fromDatabaseResult((Object[])o)).toList(); + } + + /** + * A lightweight proxy for DataFile objects used during permission indexing. This class avoids loading the full DataFile entity from the database when only basic properties are needed for indexing, + * improving performance for large datasets. + */ + static class DataFileProxy { + + private final Long fileId; + private final String name; + private final boolean restricted; + private final boolean released; + + /** + * Creates a new DataFileProxy with the specified properties. 
+ * + * @param fileId + * The ID of the data file + * @param label + * The label/name of the data file + * @param restricted + * Whether the file is restricted + * @param released + * Whether the file is released + */ + public DataFileProxy(FileMetadata fmd) { + DataFile df = fmd.getDataFile(); + this.fileId = df.getId(); + this.name = fmd.getLabel(); + this.restricted = df.isRestricted(); + this.released = df.isReleased(); + } + + public DataFileProxy(String label, Long fileId, boolean restricted, boolean released) { + this.fileId = fileId; + this.name = label; + this.restricted = restricted; + this.released = released; + } + + /** + * Creates a DataFileProxy from database query results. + * + * @param fileInfo + * Array of objects from database query containing file information + * @return A new DataFileProxy instance + */ + public static DataFileProxy fromDatabaseResult(Object[] fileInfo) { + String label = (String) fileInfo[0]; + Long fileId = ((Number) fileInfo[1]).longValue(); + boolean restricted = (boolean) fileInfo[2]; + boolean released = fileInfo[3] != null; + + return new DataFileProxy(label, fileId, restricted, released); + } + + public boolean isRestricted() { + return restricted; + } + + public boolean isReleased() { + return released; + } + + public Long getFileId() { + return fileId; + } + + public String getName() { + return name; + } + + public DataFile getMinimalDataFile() { + DataFile df = new DataFile(); + df.setId(fileId); + return df; + } + } } From 93c4e69797e74f87a2c4aa57005739fd6363a832 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Wed, 2 Apr 2025 14:54:04 -0400 Subject: [PATCH 64/83] stream, cleanup feature flag --- .../search/SolrIndexServiceBean.java | 27 ++++++++----------- .../iq/dataverse/settings/JvmSettings.java | 3 ++- 2 files changed, 13 insertions(+), 17 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java 
index aac20dc7687..bfd71f284ba 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java @@ -11,6 +11,8 @@ import edu.harvard.iq.dataverse.DvObject; import edu.harvard.iq.dataverse.DvObjectServiceBean; import edu.harvard.iq.dataverse.FileMetadata; +import edu.harvard.iq.dataverse.settings.JvmSettings; + import java.io.IOException; import java.util.ArrayList; import java.util.Collection; @@ -21,6 +23,8 @@ import java.util.Set; import java.util.logging.Level; import java.util.logging.Logger; +import java.util.stream.Stream; + import jakarta.ejb.EJB; import jakarta.ejb.Stateless; import jakarta.inject.Named; @@ -72,7 +76,6 @@ public List determineSolrDocs(DvObject dvObject) { List datasetSolrDocs = constructDatasetSolrDocs((Dataset) dvObject); solrDocs.addAll(datasetSolrDocs); } else if (dvObject.isInstanceofDataFile()) { - Map> permStringByDatasetVersion = new HashMap<>(); DataFile datafile = (DataFile) dvObject; Map desiredCards = searchPermissionsService.getDesiredCards(datafile.getOwner()); Set datasetVersions = datasetVersionsToBuildCardsFor(datafile.getOwner()); @@ -81,8 +84,7 @@ public List determineSolrDocs(DvObject dvObject) { List cachedPerms = searchPermissionsService.findDatasetVersionPerms(version); String solrIdEnd = getDatasetOrDataFileSolrEnding(version.getVersionState()); Long versionId = version.getId(); - boolean isDraft = version.isDraft(); - DvObjectSolrDoc fileSolrDoc = constructDatafileSolrDoc(new DataFileProxy(datafile.getFileMetadata()), cachedPerms, versionId, solrIdEnd); + DvObjectSolrDoc fileSolrDoc = constructDatafileSolrDoc(new DataFileProxy(datafile.getFileMetadata()), cachedPerms, versionId, solrIdEnd); solrDocs.add(fileSolrDoc); } } @@ -330,7 +332,7 @@ public IndexResponse indexPermissionsOnSelfAndChildren(DvObject definitionPoint) logger.log(Level.WARNING, "Cannot perform indexPermissionsOnSelfAndChildren with a definitionPoint 
null"); return null; } - + int fileQueryMin= JvmSettings.MAX_FILES_BEFORE_USING_PERMISSION_INDEX_QUERY.lookup(Integer.class); List filesToReindexAsBatch = new ArrayList<>(); /** * @todo Re-indexing the definition point itself seems to be necessary @@ -354,14 +356,12 @@ public IndexResponse indexPermissionsOnSelfAndChildren(DvObject definitionPoint) numObjects++; Map desiredCards = searchPermissionsService.getDesiredCards(dataset); - Set datasetVersions = datasetVersionsToBuildCardsFor(dataset); long startTime = System.currentTimeMillis(); for (DatasetVersion version : versionsToReIndexPermissionsFor(dataset)) { if (desiredCards.get(version.getVersionState())) { List cachedPerms = searchPermissionsService.findDatasetVersionPerms(version); String solrIdEnd = getDatasetOrDataFileSolrEnding(version.getVersionState()); Long versionId = version.getId(); - boolean isDraft = version.isDraft(); for (FileMetadata fmd : version.getFileMetadatas()) { DataFileProxy fileProxy = new DataFileProxy(fmd); // Since reindexFilesInBatches() re-indexes a file in all versions needed, we should not send a file already in the released version twice @@ -388,18 +388,14 @@ public IndexResponse indexPermissionsOnSelfAndChildren(DvObject definitionPoint) // index files Dataset dataset = (Dataset) definitionPoint; Map desiredCards = searchPermissionsService.getDesiredCards(dataset); - Set datasetVersions = datasetVersionsToBuildCardsFor(dataset); for (DatasetVersion version : versionsToReIndexPermissionsFor(dataset)) { if (desiredCards.get(version.getVersionState())) { List cachedPerms = searchPermissionsService.findDatasetVersionPerms(version); String solrIdEnd = getDatasetOrDataFileSolrEnding(version.getVersionState()); Long versionId = version.getId(); - boolean isDraft = version.isDraft(); - if (version.getFileMetadatas().size() > 1000) { + if (version.getFileMetadatas().size() > fileQueryMin) { // For large datasets, use a more efficient SQL query instead of loading all file metadata 
objects - List fileInfoList = getDataFileInfoForPermissionIndexing(version.getId()); - - for (DataFileProxy fileInfo : fileInfoList) { + getDataFileInfoForPermissionIndexing(version.getId()).forEach(fileInfo -> { filesToReindexAsBatch.add(fileInfo); counter[0]++; @@ -409,7 +405,7 @@ public IndexResponse indexPermissionsOnSelfAndChildren(DvObject definitionPoint) filesToReindexAsBatch.clear(); logger.info("Progress: 100 file permissions at " + counter[0] + " files reindexed in " + (System.currentTimeMillis() - startTime) + " ms"); } - } + }); } else { version.getFileMetadatas().stream() .forEach(fmd -> { @@ -448,7 +444,6 @@ public IndexResponse indexPermissionsOnSelfAndChildren(DvObject definitionPoint) private String reindexFilesInBatches(List filesToReindexAsBatch, List cachedPerms, Long versionId, String solrIdEnd) { List docs = new ArrayList<>(); - Map> permStringByDatasetVersion = new HashMap<>(); try { // Assume all files have the same owner if (filesToReindexAsBatch.isEmpty()) { @@ -528,13 +523,13 @@ public List findPermissionsInDatabaseButStaleInOrMissingFromSolr() { return indexingRequired; } - public List getDataFileInfoForPermissionIndexing(Long id) { + public Stream getDataFileInfoForPermissionIndexing(Long id) { String query = "SELECT fm.label, df.id, df.restricted, dvo.publicationDate " + "FROM filemetadata fm " + "JOIN datafile df ON fm.datafile_id = df.id " + "JOIN dvobject dvo ON df.id = dvo.id " + "WHERE fm.datasetversion_id = ?"; - return em.createNativeQuery(query).setParameter(1, id).getResultList().stream().map(o-> DataFileProxy.fromDatabaseResult((Object[])o)).toList(); + return em.createNativeQuery(query).setParameter(1, id).getResultList().stream().map(o-> DataFileProxy.fromDatabaseResult((Object[])o)); } /** diff --git a/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java b/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java index bc32e250be5..c6b35148ec2 100644 --- 
a/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java +++ b/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java @@ -71,7 +71,8 @@ public enum JvmSettings { // INDEX CONCURENCY SCOPE_SOLR_CONCURENCY(SCOPE_SOLR, "concurrency"), MAX_ASYNC_INDEXES(SCOPE_SOLR_CONCURENCY, "max-async-indexes"), - + MAX_FILES_BEFORE_USING_PERMISSION_INDEX_QUERY(SCOPE_SOLR_CONCURENCY, "max-files-before-using-permission-index-query"), + // RSERVE CONNECTION SCOPE_RSERVE(PREFIX, "rserve"), RSERVE_HOST(SCOPE_RSERVE, "host"), From 7c817b90c046431d83d802f5a9a000e6937a1bc9 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Wed, 2 Apr 2025 16:35:33 -0400 Subject: [PATCH 65/83] make the jvm option optional --- .../edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java index bfd71f284ba..9a5284e2c16 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java @@ -332,7 +332,7 @@ public IndexResponse indexPermissionsOnSelfAndChildren(DvObject definitionPoint) logger.log(Level.WARNING, "Cannot perform indexPermissionsOnSelfAndChildren with a definitionPoint null"); return null; } - int fileQueryMin= JvmSettings.MAX_FILES_BEFORE_USING_PERMISSION_INDEX_QUERY.lookup(Integer.class); + int fileQueryMin= JvmSettings.MAX_FILES_BEFORE_USING_PERMISSION_INDEX_QUERY.lookupOptional(Integer.class).orElse(Integer.MAX_VALUE); List filesToReindexAsBatch = new ArrayList<>(); /** * @todo Re-indexing the definition point itself seems to be necessary From 2a6e9f353997509d3ae0f15aa16ef2fe069c4332 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Wed, 2 Apr 2025 17:16:40 -0400 Subject: [PATCH 66/83] merge fix --- .../search/SearchPermissionsServiceBean.java | 21 ++++++++++++------- 1 file 
changed, 13 insertions(+), 8 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/search/SearchPermissionsServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/SearchPermissionsServiceBean.java index a5d4fcbce0e..d1fb6b75345 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/SearchPermissionsServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/SearchPermissionsServiceBean.java @@ -1,5 +1,6 @@ package edu.harvard.iq.dataverse.search; +import edu.harvard.iq.dataverse.DataFile; import edu.harvard.iq.dataverse.Dataset; import edu.harvard.iq.dataverse.DatasetVersion; import edu.harvard.iq.dataverse.Dataverse; @@ -20,7 +21,6 @@ import java.util.LinkedHashMap; import java.util.List; import java.util.Map; -import java.util.Set; import java.util.logging.Logger; import jakarta.ejb.EJB; import jakarta.ejb.Stateless; @@ -93,17 +93,22 @@ public List findDatasetVersionPerms(DatasetVersion version) { /* public List findDvObjectPerms(DvObject dvObject) { List permStrings = new ArrayList<>(); - resetRoleAssigneeCache(); - Set roleAssignments = rolesSvc.rolesAssignments(dvObject); + + Set roleAssignments = null; + if (!((dvObject instanceof DataFile) && !((DataFile) dvObject).isRestricted())) { + roleAssignments = rolesSvc.rolesAssignments(dvObject); + } else { + roleAssignments = rolesSvc.rolesAssignments(dvObject.getOwner()); + } + // Use a set to avoid duplicates - taking size and load factor from original ra cache + Set assigneeIdStrings = new HashSet(100, 0.7f); + for (RoleAssignment roleAssignment : roleAssignments) { logger.fine("role assignment on dvObject " + dvObject.getId() + ": " + roleAssignment.getAssigneeIdentifier()); if (roleAssignment.getRole().permissions().contains(getRequiredSearchPermission(dvObject))) { - RoleAssignee userOrGroup = getRoleAssignee(roleAssignment.getAssigneeIdentifier()); - String indexableUserOrGroupPermissionString = getIndexableStringForUserOrGroup(userOrGroup); - if 
(indexableUserOrGroupPermissionString != null) { - permStrings.add(indexableUserOrGroupPermissionString); - } + assigneeIdStrings.add(roleAssignment.getAssigneeIdentifier()); } + } for (String id : assigneeIdStrings) { // Don't need to cache RoleAssignees since each is unique RoleAssignee userOrGroup = roleAssigneeService.getRoleAssignee(id); From 2f874154073abd062686566313a2f9ca954d2b72 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Wed, 2 Apr 2025 17:24:34 -0400 Subject: [PATCH 67/83] DvObj missed changes --- .../harvard/iq/dataverse/search/DvObjectSolrDoc.java | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/search/DvObjectSolrDoc.java b/src/main/java/edu/harvard/iq/dataverse/search/DvObjectSolrDoc.java index 588ec21459f..cf8730dfa01 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/DvObjectSolrDoc.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/DvObjectSolrDoc.java @@ -27,13 +27,19 @@ public class DvObjectSolrDoc { * Document. 
*/ private final List permissions; + private final List ftpermissions; - public DvObjectSolrDoc(String dvObjectId, String solrId, Long datasetVersionId, String nameOrTitle, List permissions) { + public DvObjectSolrDoc(String dvObjectId, String solrId, Long datasetVersionId, String nameOrTitle, List permissions, List ftpermissions) { this.dvObjectId = dvObjectId; this.solrId = solrId; this.datasetVersionId = datasetVersionId; this.nameOrTitle = nameOrTitle; this.permissions = permissions; + this.ftpermissions = ftpermissions; + } + + public DvObjectSolrDoc(String dvObjectId, String solrId, Long datasetVersionId, String nameOrTitle, List permissions) { + this(dvObjectId, solrId, datasetVersionId, nameOrTitle, permissions, null); } // this could be a Long @@ -56,5 +62,9 @@ public String getNameOrTitle() { public List getPermissions() { return permissions; } + + public List getFTPermissions() { + return ftpermissions; + } } From 1bfe78d88ab5f1b170a45f86cffda266e1c3fafd Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Wed, 2 Apr 2025 17:55:58 -0400 Subject: [PATCH 68/83] cleanup --- .../iq/dataverse/FileVersionDifference.java | 186 ++++++++++-------- .../iq/dataverse/RoleAssigneeServiceBean.java | 5 - .../search/SearchPermissionsServiceBean.java | 157 +-------------- .../settings/SettingsServiceBean.java | 21 -- 4 files changed, 103 insertions(+), 266 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/FileVersionDifference.java b/src/main/java/edu/harvard/iq/dataverse/FileVersionDifference.java index f2b7b37605c..9a73cad7877 100644 --- a/src/main/java/edu/harvard/iq/dataverse/FileVersionDifference.java +++ b/src/main/java/edu/harvard/iq/dataverse/FileVersionDifference.java @@ -16,21 +16,23 @@ * @author skraffmi */ public final class FileVersionDifference { - - private FileMetadata newFileMetadata; - private FileMetadata originalFileMetadata; + + private FileMetadata newFileMetadata; + private FileMetadata originalFileMetadata; private boolean details = 
false; private boolean same = false; - private List differenceSummaryGroups = new ArrayList<>(); - private List differenceDetailItems = new ArrayList<>(); - - public FileVersionDifference(FileMetadata newFileMetadata, FileMetadata originalFileMetadata) { - this(newFileMetadata, originalFileMetadata, false); - } + private List differenceSummaryGroups = new ArrayList<>(); + private List differenceDetailItems = new ArrayList<>(); + + public FileVersionDifference(FileMetadata newFileMetadata, FileMetadata originalFileMetadata) { + this(newFileMetadata, originalFileMetadata, false); + + } + public FileVersionDifference(FileMetadata newFileMetadata, FileMetadata originalFileMetadata, boolean details) { this.newFileMetadata = newFileMetadata; @@ -38,41 +40,41 @@ public FileVersionDifference(FileMetadata newFileMetadata, FileMetadata original this.details = details; this.same = compareMetadata(newFileMetadata, originalFileMetadata); - // Compare versions - File Metadata first - - } + //Compare versions - File Metadata first + } + + public boolean compareMetadata(FileMetadata newFileMetadata, FileMetadata originalFileMetadata) { /* - * This method both determines if there has been a change in file metadata - * between the two versions supplied and it updates the FileVersionDifference - * object which is used to display the differences on the dataset versions tab. - * The return value is used by the index service bean to mark whether a file - * needs to be re-indexed in the context of a dataset update. When there are - * changes (after v4.19)to the file metadata data model this method must be - * updated. retVal of True means metadatas are equal. - */ - + This method both determines if there has been a change in file metadata between the two versions supplied + and it updates the FileVersionDifference object which is used to display the differences on the dataset versions tab. 
+ The return value is used by the index service bean tomark whether a file needs to be re-indexed in the context of a dataset update. + When there are changes (after v4.19)to the file metadata data model this method must be updated. + retVal of True means metadatas are equal. + */ + boolean retVal = true; - if (newFileMetadata.getDataFile() == null && originalFileMetadata == null) { - // File in neither version - // Don't add any groups + if (newFileMetadata.getDataFile() == null && originalFileMetadata == null){ + //File in neither version + //Don't add any groups return true; } - - if (newFileMetadata.getDataFile() == null && originalFileMetadata != null) { - // File Deleted + + if (newFileMetadata.getDataFile() == null && originalFileMetadata != null){ + //File Deleted if (details) { updateDifferenceSummary("", BundleUtil.getStringFromBundle("file.versionDifferences.fileGroupTitle"), 0, 0, 1, 0); } return false; } - if (this.originalFileMetadata == null && this.newFileMetadata.getDataFile() != null) { - // File Added - if (!details) + if (this.originalFileMetadata == null && this.newFileMetadata.getDataFile() != null){ + //File Added + if (!details) { return false; + } retVal = false; updateDifferenceSummary("", BundleUtil.getStringFromBundle("file.versionDifferences.fileGroupTitle"), 1, 0, 0, 0); } @@ -100,7 +102,7 @@ public boolean compareMetadata(FileMetadata newFileMetadata, FileMetadata origin if (!newFileMetadata.getLabel().equals(originalFileMetadata.getLabel())) { if (details) { differenceDetailItems.add(new FileDifferenceDetailItem(BundleUtil.getStringFromBundle("file.versionDifferences.fileNameDetailTitle"), originalFileMetadata.getLabel(), newFileMetadata.getLabel())); - } else { + } else{ return false; } updateDifferenceSummary(BundleUtil.getStringFromBundle("file.versionDifferences.fileMetadataGroupTitle"), @@ -108,7 +110,7 @@ public boolean compareMetadata(FileMetadata newFileMetadata, FileMetadata origin retVal = false; } - // Description 
differences + //Description differences if (newFileMetadata.getDescription() != null && originalFileMetadata.getDescription() != null && !newFileMetadata.getDescription().equals(originalFileMetadata.getDescription())) { @@ -122,7 +124,8 @@ public boolean compareMetadata(FileMetadata newFileMetadata, FileMetadata origin retVal = false; } if (newFileMetadata.getDescription() != null - && originalFileMetadata.getDescription() == null) { + && originalFileMetadata.getDescription() == null + ) { if (details) { differenceDetailItems.add(new FileDifferenceDetailItem(BundleUtil.getStringFromBundle("file.versionDifferences.descriptionDetailTitle"), "", newFileMetadata.getDescription())); } else { @@ -133,9 +136,10 @@ public boolean compareMetadata(FileMetadata newFileMetadata, FileMetadata origin retVal = false; } if (newFileMetadata.getDescription() == null - && originalFileMetadata.getDescription() != null) { + && originalFileMetadata.getDescription() != null + ) { if (details) { - differenceDetailItems.add(new FileDifferenceDetailItem(BundleUtil.getStringFromBundle("file.versionDifferences.descriptionDetailTitle"), originalFileMetadata.getDescription(), "")); + differenceDetailItems.add(new FileDifferenceDetailItem(BundleUtil.getStringFromBundle("file.versionDifferences.descriptionDetailTitle"), originalFileMetadata.getDescription(), "" )); } else { return false; } @@ -143,7 +147,7 @@ public boolean compareMetadata(FileMetadata newFileMetadata, FileMetadata origin BundleUtil.getStringFromBundle("file.versionDifferences.descriptionDetailTitle"), 0, 0, 1, 0); retVal = false; } - // Provenance Description differences + //Provenance Description differences if ((newFileMetadata.getProvFreeForm() != null && !newFileMetadata.getProvFreeForm().isEmpty()) && (originalFileMetadata.getProvFreeForm() != null && !originalFileMetadata.getProvFreeForm().isEmpty()) && !newFileMetadata.getProvFreeForm().equals(originalFileMetadata.getProvFreeForm())) { @@ -157,7 +161,8 @@ public boolean 
compareMetadata(FileMetadata newFileMetadata, FileMetadata origin retVal = false; } if ((newFileMetadata.getProvFreeForm() != null && !newFileMetadata.getProvFreeForm().isEmpty()) - && (originalFileMetadata.getProvFreeForm() == null || originalFileMetadata.getProvFreeForm().isEmpty())) { + && (originalFileMetadata.getProvFreeForm() == null || originalFileMetadata.getProvFreeForm().isEmpty()) + ) { if (details) { differenceDetailItems.add(new FileDifferenceDetailItem(BundleUtil.getStringFromBundle("file.versionDifferences.provenanceDetailTitle"), "", newFileMetadata.getProvFreeForm())); } else { @@ -168,9 +173,10 @@ public boolean compareMetadata(FileMetadata newFileMetadata, FileMetadata origin retVal = false; } if ((newFileMetadata.getProvFreeForm() == null || newFileMetadata.getProvFreeForm().isEmpty()) - && (originalFileMetadata.getProvFreeForm() != null && !originalFileMetadata.getProvFreeForm().isEmpty())) { + && (originalFileMetadata.getProvFreeForm() != null && !originalFileMetadata.getProvFreeForm().isEmpty()) + ) { if (details) { - differenceDetailItems.add(new FileDifferenceDetailItem(BundleUtil.getStringFromBundle("file.versionDifferences.provenanceDetailTitle"), originalFileMetadata.getProvFreeForm(), "")); + differenceDetailItems.add(new FileDifferenceDetailItem(BundleUtil.getStringFromBundle("file.versionDifferences.provenanceDetailTitle"), originalFileMetadata.getProvFreeForm(), "" )); } else { return false; } @@ -179,8 +185,8 @@ public boolean compareMetadata(FileMetadata newFileMetadata, FileMetadata origin retVal = false; } /* - * get Tags differences - */ + get Tags differences + */ String value1 = originalFileMetadata.getCategoriesByName().toString(); String value2 = newFileMetadata.getCategoriesByName().toString(); if (value1 == null || value1.isEmpty() || value1.equals(" ")) { @@ -191,38 +197,39 @@ public boolean compareMetadata(FileMetadata newFileMetadata, FileMetadata origin } if (!value1.equals(value2)) { - if (!details) + if (!details) { 
return false; + } int added = 0; int deleted = 0; - + added = newFileMetadata.getCategoriesByName().stream().map((tag) -> { boolean found = false; - for (String tagOld : originalFileMetadata.getCategoriesByName()) { - if (tag.equals(tagOld)) { + for (String tagOld : originalFileMetadata.getCategoriesByName() ){ + if (tag.equals(tagOld)){ found = true; break; } } return found; }).filter((found) -> (!found)).map((_item) -> 1).reduce(added, Integer::sum); - + for (String tag : originalFileMetadata.getCategoriesByName()) { boolean found = false; - for (String tagNew : newFileMetadata.getCategoriesByName()) { - if (tag.equals(tagNew)) { + for (String tagNew : newFileMetadata.getCategoriesByName() ){ + if (tag.equals(tagNew)){ found = true; break; } } - if (!found) { + if (!found){ deleted++; } } - if (added > 0) { + if (added > 0){ updateDifferenceSummary(BundleUtil.getStringFromBundle("file.versionDifferences.fileTagsGroupTitle"), "", added, 0, 0, 0, true); } - if (deleted > 0) { + if (deleted > 0){ updateDifferenceSummary(BundleUtil.getStringFromBundle("file.versionDifferences.fileTagsGroupTitle"), "", 0, 0, deleted, 0, true); } retVal = false; @@ -231,16 +238,17 @@ public boolean compareMetadata(FileMetadata newFileMetadata, FileMetadata origin } return retVal; } - - private void updateDifferenceSummary(String groupLabel, String itemLabel, int added, int changed, int deleted, int replaced) { + + private void updateDifferenceSummary(String groupLabel, String itemLabel, int added, int changed, int deleted, int replaced) { updateDifferenceSummary(groupLabel, itemLabel, added, changed, deleted, replaced, false); } - + + private void updateDifferenceSummary(String groupLabel, String itemLabel, int added, int changed, int deleted, int replaced, boolean multiple) { FileDifferenceSummaryGroup summaryGroup = new FileDifferenceSummaryGroup(groupLabel); FileDifferenceSummaryItem summaryItem = new FileDifferenceSummaryItem(itemLabel, added, changed, deleted, replaced, multiple); 
- - if (!this.differenceSummaryGroups.contains(summaryGroup)) { + + if (!this.differenceSummaryGroups.contains(summaryGroup)) { summaryGroup.getFileDifferenceSummaryItems().add(summaryItem); this.differenceSummaryGroups.add(summaryGroup); } else { @@ -249,23 +257,23 @@ private void updateDifferenceSummary(String groupLabel, String itemLabel, int ad }); } } - - public FileMetadata getNewFileMetadata() { + + public FileMetadata getNewFileMetadata(){ return this.newFileMetadata; } - - public void setNewFileMetadata(FileMetadata in) { - this.newFileMetadata = in; + + public void setNewFileMetadata(FileMetadata in){ + this.newFileMetadata = in; } - public FileMetadata getOriginalFileMetadata() { + public FileMetadata getOriginalFileMetadata() { return originalFileMetadata; } public void setOriginalFileMetadata(FileMetadata originalFileMetadata) { this.originalFileMetadata = originalFileMetadata; } - + public boolean isSame() { return same; } @@ -273,7 +281,8 @@ public boolean isSame() { public void setSame(boolean same) { this.same = same; } - + + public List getDifferenceSummaryGroups() { return differenceSummaryGroups; } @@ -282,17 +291,19 @@ public void setDifferenceSummaryGroups(List differen this.differenceSummaryGroups = differenceSummaryGroups; } - public class FileDifferenceSummaryGroup { + public class FileDifferenceSummaryGroup { + + private String name; private List fileDifferenceSummaryItems; - + public FileDifferenceSummaryGroup(String name) { this.name = name; this.fileDifferenceSummaryItems = new ArrayList<>(); - + } - + public String getName() { return name; } @@ -308,22 +319,23 @@ public List getFileDifferenceSummaryItems() { public void setFileDifferenceSummaryItems(List fileDifferenceSummaryItems) { this.fileDifferenceSummaryItems = fileDifferenceSummaryItems; } - + @Override public String toString() { - + String retval = getName(); - if (!retval.isEmpty()) { + if (!retval.isEmpty()){ retval += ": "; } - - for (FileDifferenceSummaryItem item : 
this.fileDifferenceSummaryItems) { + + for (FileDifferenceSummaryItem item : this.fileDifferenceSummaryItems){ retval += " " + item.toString(); } - + return retval; } - + + @Override public int hashCode() { int hash = 5; @@ -346,8 +358,8 @@ public boolean equals(Object obj) { return Objects.equals(this.name, other.name); } } - - public final class FileDifferenceDetailItem { + + public final class FileDifferenceDetailItem{ private String displayName; private String originalValue; private String newValue; @@ -357,7 +369,9 @@ public FileDifferenceDetailItem(String displayName, String originalValue, String this.originalValue = originalValue; this.newValue = newValue; } - + + + public String getDisplayName() { return displayName; } @@ -383,8 +397,11 @@ public void setNewValue(String newValue) { } } + + + + public class FileDifferenceSummaryItem{ - public class FileDifferenceSummaryItem { private String name; private int added; @@ -392,7 +409,7 @@ public class FileDifferenceSummaryItem { private int deleted; private int replaced; private boolean multiple; - + public FileDifferenceSummaryItem(String name, int added, int changed, int deleted, int replaced, boolean multiple) { this.name = name; this.added = added; @@ -401,7 +418,7 @@ public FileDifferenceSummaryItem(String name, int added, int changed, int delete this.replaced = replaced; this.multiple = multiple; } - + public String getName() { return name; } @@ -449,7 +466,8 @@ public boolean isMultiple() { public void setMultiple(boolean multiple) { this.multiple = multiple; } - - } - + + + } + } diff --git a/src/main/java/edu/harvard/iq/dataverse/RoleAssigneeServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/RoleAssigneeServiceBean.java index 042379f36bb..0f19dfc105f 100644 --- a/src/main/java/edu/harvard/iq/dataverse/RoleAssigneeServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/RoleAssigneeServiceBean.java @@ -402,11 +402,6 @@ public List filterRoleAssignees(String query, DvObject dvObject, L public 
List findAssigneesWithPermissionOnDvObject(Long objectId, Permission permission) { - if(permission.ordinal()==4) { - List authenticatedUsersList = new ArrayList<>(); - authenticatedUsersList.add(":authenticated-users"); - return authenticatedUsersList; - } int bitpos = 63 - permission.ordinal(); return em.createNamedQuery("RoleAssignment.findAssigneesWithPermissionOnDvObject", String.class) .setParameter(1, bitpos) diff --git a/src/main/java/edu/harvard/iq/dataverse/search/SearchPermissionsServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/SearchPermissionsServiceBean.java index d1fb6b75345..efc85b5e3ea 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/SearchPermissionsServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/SearchPermissionsServiceBean.java @@ -52,9 +52,6 @@ public class SearchPermissionsServiceBean { @EJB SettingsServiceBean settingsService; - LinkedHashMap roleAssigneeCache = new LinkedHashMap<>(100, 0.7f, true); - private static final int MAX_CACHE_SIZE = 2000; - /** * @todo Should we make a PermStrings object? Probably. 
* @@ -65,8 +62,6 @@ public List findDataversePerms(Dataverse dataverse) { if (hasBeenPublished(dataverse)) { permStrings.add(IndexServiceBean.getPublicGroupString()); } -// permStrings.addAll(findDirectAssignments(dataverse)); -// permStrings.addAll(findImplicitAssignments(dataverse)); permStrings.addAll(findDvObjectPerms(dataverse)); return permStrings; } @@ -84,71 +79,17 @@ public List findDatasetVersionPerms(DatasetVersion version) { if (version.isReleased()) { perms.add(IndexServiceBean.getPublicGroupString()); } -// perms.addAll(findDirectAssignments(version.getDataset())); -// perms.addAll(findImplicitAssignments(version.getDataset())); + perms.addAll(findDvObjectPerms(version.getDataset())); return perms; } - /* - public List findDvObjectPerms(DvObject dvObject) { - List permStrings = new ArrayList<>(); - - Set roleAssignments = null; - if (!((dvObject instanceof DataFile) && !((DataFile) dvObject).isRestricted())) { - roleAssignments = rolesSvc.rolesAssignments(dvObject); - } else { - roleAssignments = rolesSvc.rolesAssignments(dvObject.getOwner()); - } - // Use a set to avoid duplicates - taking size and load factor from original ra cache - Set assigneeIdStrings = new HashSet(100, 0.7f); - - for (RoleAssignment roleAssignment : roleAssignments) { - logger.fine("role assignment on dvObject " + dvObject.getId() + ": " + roleAssignment.getAssigneeIdentifier()); - if (roleAssignment.getRole().permissions().contains(getRequiredSearchPermission(dvObject))) { - assigneeIdStrings.add(roleAssignment.getAssigneeIdentifier()); - } - } - for (String id : assigneeIdStrings) { - // Don't need to cache RoleAssignees since each is unique - RoleAssignee userOrGroup = roleAssigneeService.getRoleAssignee(id); - String indexableUserOrGroupPermissionString = getIndexableStringForUserOrGroup(userOrGroup); - if (indexableUserOrGroupPermissionString != null) { - permStrings.add(indexableUserOrGroupPermissionString); - } - } - return permStrings; - } -*/ public List 
findDvObjectPerms(DvObject dvObject) { List permStrings = new ArrayList<>(); Permission p = getRequiredSearchPermission(dvObject); List assigneeIdStrings = null; -/* if(dvObject instanceof DataFile) { - Long[] downloadRole = new Long[1]; - downloadRole[0] =2L; - assigneeIdStrings = roleAssigneeService.findAssigneesWithRoleOnDvObject(dvObject.getId(), downloadRole); - - } else { - */ assigneeIdStrings = roleAssigneeService.findAssigneesWithPermissionOnDvObject(dvObject.getId(), p); -// } - for (String id : assigneeIdStrings) { - // Don't need to cache RoleAssignees since each is unique - RoleAssignee userOrGroup = roleAssigneeService.getRoleAssignee(id); - String indexableUserOrGroupPermissionString = getIndexableStringForUserOrGroup(userOrGroup); - if (indexableUserOrGroupPermissionString != null) { - permStrings.add(indexableUserOrGroupPermissionString); - } - } - return permStrings; - } - - /* public List findDvObjectPerms(DvObject dvObject, List dataverseRoleIds) { - List permStrings = new ArrayList<>(); - - List assigneeIdStrings = roleAssigneeService.findAssigneesWithPermissionOnDvObject(dvObject.getId(), dataverseRoleIds); for (String id : assigneeIdStrings) { // Don't need to cache RoleAssignees since each is unique RoleAssignee userOrGroup = roleAssigneeService.getRoleAssignee(id); @@ -159,95 +100,6 @@ public List findDvObjectPerms(DvObject dvObject) { } return permStrings; } -*/ - /* - public List findRestrictedDatafilePerms(long fileId) { - List permStrings = new ArrayList<>(); - - List assigneeIdStrings = roleAssigneeService.findFileDownloaders(fileId); - for (String id : assigneeIdStrings) { - // Don't need to cache RoleAssignees since each is unique - RoleAssignee userOrGroup = roleAssigneeService.getRoleAssignee(id); - String indexableUserOrGroupPermissionString = getIndexableStringForUserOrGroup(userOrGroup); - if (indexableUserOrGroupPermissionString != null) { - permStrings.add(indexableUserOrGroupPermissionString); - } - } - return permStrings; - 
} -*/ - @Deprecated - private void resetRoleAssigneeCache() { - roleAssigneeCache.clear(); - } - - @Deprecated - private RoleAssignee getRoleAssignee(String idtf) { - RoleAssignee ra = roleAssigneeCache.get(idtf); - if (ra != null) { - return ra; - } - ra = roleAssigneeService.getRoleAssignee(idtf); - roleAssigneeCache.put(idtf, ra); - if (roleAssigneeCache.size() > MAX_CACHE_SIZE) { - roleAssigneeCache.remove(roleAssigneeCache.keySet().iterator().next()); - } - return ra; - } - - @Deprecated - private List findDirectAssignments(DvObject dvObject) { - List permStrings = new ArrayList<>(); - List roleAssignees = findWhoHasDirectAssignments(dvObject); - for (RoleAssignee roleAssignee : roleAssignees) { - logger.fine("user or group (findDirectAssignments): " + roleAssignee.getIdentifier()); - String indexableUserOrGroupPermissionString = getIndexableStringForUserOrGroup(roleAssignee); - if (indexableUserOrGroupPermissionString != null) { - permStrings.add(indexableUserOrGroupPermissionString); - } - } - return permStrings; - } - - @Deprecated - private List findWhoHasDirectAssignments(DvObject dvObject) { - List emptyList = new ArrayList<>(); - List peopleWhoCanSearch = emptyList; - resetRoleAssigneeCache(); - - List assignmentsOn = permissionService.assignmentsOn(dvObject); - for (RoleAssignment roleAssignment : assignmentsOn) { - if (roleAssignment.getRole().permissions().contains(getRequiredSearchPermission(dvObject))) { - RoleAssignee userOrGroup = getRoleAssignee(roleAssignment.getAssigneeIdentifier()); - if (userOrGroup != null) { - peopleWhoCanSearch.add(userOrGroup); - } - } - } - resetRoleAssigneeCache(); - return peopleWhoCanSearch; - } - - @Deprecated - private List findImplicitAssignments(DvObject dvObject) { - List permStrings = new ArrayList<>(); - DvObject parent = dvObject.getOwner(); - while (parent != null) { - if (respectPermissionRoot()) { - if (parent.isEffectivelyPermissionRoot()) { - return permStrings; - } - } - if 
(parent.isInstanceofDataverse()) { - permStrings.addAll(findDirectAssignments(parent)); - } else if (parent.isInstanceofDataset()) { - // files get discoverability from their parent dataset - permStrings.addAll(findDirectAssignments(parent)); - } - parent = parent.getOwner(); - } - return permStrings; - } public Map getDesiredCards(Dataset dataset) { Map desiredCards = new LinkedHashMap<>(); @@ -310,13 +162,6 @@ private Permission getRequiredSearchPermission(DvObject dvObject) { } - @Deprecated - private boolean respectPermissionRoot() { - boolean safeDefaultIfKeyNotFound = true; - // see javadoc of the key - return settingsService.isTrueForKey(SettingsServiceBean.Key.SearchRespectPermissionRoot, safeDefaultIfKeyNotFound); - } - /** * From a Solr perspective we can't just index any string when we go to do * the JOIN to enforce security. (Maybe putting quotes around the string at diff --git a/src/main/java/edu/harvard/iq/dataverse/settings/SettingsServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/settings/SettingsServiceBean.java index 5b0a178969b..6d96ad4abf6 100644 --- a/src/main/java/edu/harvard/iq/dataverse/settings/SettingsServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/settings/SettingsServiceBean.java @@ -168,27 +168,6 @@ public enum Key { * to from the footer. */ ApplicationPrivacyPolicyUrl, - /** - * A boolean defining if indexing and search should respect the concept - * of "permission root". - * - *

- * - * If we ignore permissionRoot at index time, we should blindly give - * search ("discoverability") access to people and group who have access - * defined in a parent dataverse, all the way back to the root. - * - *

- * - * If we respect permissionRoot, this means that the dataverse being - * indexed is an island of permissions all by itself. We should not look - * to its parent to see if more people and groups might be able to - * search the DvObjects within it. We would assume no implicit - * inheritance of permissions. In this mode, all permissions must be - * explicitly defined on DvObjects. No implied inheritance. - * - */ - SearchRespectPermissionRoot, /** * Solr hostname and port, such as "localhost:8983". * @deprecated New installations should not use this database setting, but use {@link JvmSettings#SOLR_HOST} From 36b4efbb75356ea8509dca127deb39e638445eca Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Thu, 3 Apr 2025 12:46:08 -0400 Subject: [PATCH 69/83] cleanup, remove restricted ft code from QDR --- .../edu/harvard/iq/dataverse/DataTable.java | 3 +- .../iq/dataverse/DatasetServiceBean.java | 2 +- .../iq/dataverse/RoleAssigneeServiceBean.java | 2 - .../edu/harvard/iq/dataverse/api/Index.java | 1 - .../dataverse/authorization/Permission.java | 3 +- .../iq/dataverse/search/DvObjectSolrDoc.java | 12 +- .../iq/dataverse/search/IndexServiceBean.java | 138 +++++++++--------- .../search/SearchPermissionsServiceBean.java | 13 +- .../search/SolrIndexServiceBean.java | 34 ++--- 9 files changed, 86 insertions(+), 122 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/DataTable.java b/src/main/java/edu/harvard/iq/dataverse/DataTable.java index acf70360edf..95f3aed0f40 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DataTable.java +++ b/src/main/java/edu/harvard/iq/dataverse/DataTable.java @@ -10,7 +10,6 @@ import java.util.List; import jakarta.persistence.CascadeType; import jakarta.persistence.Entity; -import jakarta.persistence.FetchType; import jakarta.persistence.GeneratedValue; import jakarta.persistence.GenerationType; import jakarta.persistence.Id; @@ -84,7 +83,7 @@ public DataTable() { /* * DataVariables in this DataTable: */ - @OneToMany 
(mappedBy="dataTable",cascade={ CascadeType.REMOVE, CascadeType.MERGE,CascadeType.PERSIST}) + @OneToMany (mappedBy="dataTable", cascade={ CascadeType.REMOVE, CascadeType.MERGE,CascadeType.PERSIST}) @OrderBy ("fileOrder") private List dataVariables; diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/DatasetServiceBean.java index 21463bcc897..9a8c43668cb 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetServiceBean.java @@ -284,7 +284,7 @@ public List findAllOrSubsetOrderByFilesOwned(boolean skipIndexed) { "left join dvobject f on f.owner_id = o.id where o.dtype = 'Dataset' " + skipClause + " group by o.id " - + "ORDER BY count(f.id) desc, o.id"); + + "ORDER BY count(f.id) asc, o.id"); List queryResults; queryResults = query.getResultList(); diff --git a/src/main/java/edu/harvard/iq/dataverse/RoleAssigneeServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/RoleAssigneeServiceBean.java index 0f19dfc105f..46d420d3bba 100644 --- a/src/main/java/edu/harvard/iq/dataverse/RoleAssigneeServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/RoleAssigneeServiceBean.java @@ -41,8 +41,6 @@ @Named public class RoleAssigneeServiceBean { - - private static final Logger logger = Logger.getLogger(RoleAssigneeServiceBean.class.getName()); @PersistenceContext(unitName = "VDCNet-ejbPU") private EntityManager em; diff --git a/src/main/java/edu/harvard/iq/dataverse/api/Index.java b/src/main/java/edu/harvard/iq/dataverse/api/Index.java index 1a95c55ea0c..bc9a8ae692b 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/Index.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/Index.java @@ -303,7 +303,6 @@ public Response indexDatasetByPersistentId(@QueryParam("persistentId") String pe Dataset dataset = null; try { dataset = datasetService.findByGlobalId(persistentId); - dataset = datasetService.findDeep(dataset.getId()); } catch 
(Exception ex) { return error(Status.BAD_REQUEST, "Problem looking up dataset with persistent id \"" + persistentId + "\". Error: " + ex.getMessage()); } diff --git a/src/main/java/edu/harvard/iq/dataverse/authorization/Permission.java b/src/main/java/edu/harvard/iq/dataverse/authorization/Permission.java index 2ad8881155f..32937098118 100644 --- a/src/main/java/edu/harvard/iq/dataverse/authorization/Permission.java +++ b/src/main/java/edu/harvard/iq/dataverse/authorization/Permission.java @@ -5,7 +5,6 @@ import edu.harvard.iq.dataverse.Dataverse; import edu.harvard.iq.dataverse.DvObject; import java.util.Arrays; -import java.util.Enumeration; import java.util.HashSet; import java.util.Set; import edu.harvard.iq.dataverse.util.BundleUtil; @@ -101,4 +100,6 @@ public boolean appliesTo(Class aClass) { public boolean requiresAuthenticatedUser() { return requiresAuthenticatedUser; } + + } diff --git a/src/main/java/edu/harvard/iq/dataverse/search/DvObjectSolrDoc.java b/src/main/java/edu/harvard/iq/dataverse/search/DvObjectSolrDoc.java index cf8730dfa01..588ec21459f 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/DvObjectSolrDoc.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/DvObjectSolrDoc.java @@ -27,19 +27,13 @@ public class DvObjectSolrDoc { * Document. 
*/ private final List permissions; - private final List ftpermissions; - public DvObjectSolrDoc(String dvObjectId, String solrId, Long datasetVersionId, String nameOrTitle, List permissions, List ftpermissions) { + public DvObjectSolrDoc(String dvObjectId, String solrId, Long datasetVersionId, String nameOrTitle, List permissions) { this.dvObjectId = dvObjectId; this.solrId = solrId; this.datasetVersionId = datasetVersionId; this.nameOrTitle = nameOrTitle; this.permissions = permissions; - this.ftpermissions = ftpermissions; - } - - public DvObjectSolrDoc(String dvObjectId, String solrId, Long datasetVersionId, String nameOrTitle, List permissions) { - this(dvObjectId, solrId, datasetVersionId, nameOrTitle, permissions, null); } // this could be a Long @@ -62,9 +56,5 @@ public String getNameOrTitle() { public List getPermissions() { return permissions; } - - public List getFTPermissions() { - return ftpermissions; - } } diff --git a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java index 58b3639cd30..b2baa6de503 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java @@ -93,7 +93,6 @@ import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; -import org.apache.logging.log4j.util.Strings; import org.apache.solr.client.solrj.SolrQuery; import org.apache.solr.client.solrj.SolrQuery.SortClause; import org.apache.solr.client.solrj.SolrServerException; @@ -432,8 +431,7 @@ public void asyncIndexDataset(Dataset dataset, boolean doNormalSolrDocCleanUp) { @Asynchronous public void asyncIndexDataset(Long datasetId, boolean doNormalSolrDocCleanUp) { //Initialize dataset here for logging (LoggingUtil) purposes - Dataset dataset = new Dataset(); - dataset.setId(datasetId); + Dataset dataset = null; try { acquirePermitFromSemaphore(); dataset = 
datasetService.find(datasetId); @@ -441,6 +439,10 @@ public void asyncIndexDataset(Long datasetId, boolean doNormalSolrDocCleanUp) { } catch (InterruptedException e) { String failureLogText = "Indexing failed: interrupted. You can kickoff a re-index of this dataset with: \r\n curl http://localhost:8080/api/admin/index/datasets/" + datasetId.toString(); failureLogText += "\r\n" + e.getLocalizedMessage(); + if(dataset==null) { + dataset = new Dataset(); + dataset.setId(datasetId); + } LoggingUtil.writeOnSuccessFailureLog(null, failureLogText, dataset); } finally { ASYNC_INDEX_SEMAPHORE.release(); @@ -937,7 +939,7 @@ private String addOrUpdateDataset(IndexableDataset indexableDataset) throws Sol return result; } - public SolrInputDocuments toSolrDocs(IndexableDataset indexableDataset, Set datafilesInDraftVersion) throws SolrServerException, IOException { + public SolrInputDocuments toSolrDocs(IndexableDataset indexableDataset, Set datafilesInDraftVersion) throws SolrServerException, IOException { IndexableDataset.DatasetState state = indexableDataset.getDatasetState(); Dataset dataset = indexableDataset.getDatasetVersion().getDataset(); logger.fine("adding or updating Solr document for dataset id " + dataset.getId()); @@ -959,7 +961,7 @@ public SolrInputDocuments toSolrDocs(IndexableDataset indexableDataset, Set cvocMap = datasetFieldService.getCVocConf(true); Map> cvocManagedFieldMap = new HashMap<>(); for (Map.Entry cvocEntry : cvocMap.entrySet()) { - if (cvocEntry.getValue().containsKey("managed-fields")) { + if(cvocEntry.getValue().containsKey("managed-fields")) { JsonObject managedFields = cvocEntry.getValue().getJsonObject("managed-fields"); Set managedFieldValues = new HashSet<>(); for (String s : managedFields.keySet()) { @@ -1066,6 +1068,8 @@ public SolrInputDocuments toSolrDocs(IndexableDataset indexableDataset, Set metadataBlocksWithValue = new HashSet<>(); for (DatasetField dsf : datasetVersion.getFlatDatasetFields()) { @@ -1134,7 +1138,7 @@ public 
SolrInputDocuments toSolrDocs(IndexableDataset indexableDataset, Set vals = dsf.getValues_nondisplay(); Set searchStrings = new HashSet<>(); - for (String val : vals) { + for (String val: vals) { searchStrings.add(val); // Try to get string values from externalvocabularyvalue using val as termUri searchStrings.addAll(datasetFieldService.getIndexableStringsByTermUri(val, cvocMap.get(dsfType.getId()), dsfType.getName())); - if (dsfType.getParentDatasetFieldType() != null) { + if(dsfType.getParentDatasetFieldType()!=null) { List childDatasetFields = dsf.getParentDatasetFieldCompoundValue().getChildDatasetFields(); for (DatasetField df : childDatasetFields) { - if (cvocManagedFieldMap.containsKey(dsfType.getId()) && cvocManagedFieldMap.get(dsfType.getId()).contains(df.getDatasetFieldType().getName())) { + if(cvocManagedFieldMap.containsKey(dsfType.getId()) && cvocManagedFieldMap.get(dsfType.getId()).contains(df.getDatasetFieldType().getName())) { String solrManagedFieldSearchable = df.getDatasetFieldType().getSolrField().getNameSearchable(); // Try to get string values from externalvocabularyvalue but for a managed fields of the CVOCConf Set stringsForManagedField = datasetFieldService.getIndexableStringsByTermUri(val, cvocMap.get(dsfType.getId()), df.getDatasetFieldType().getName()); logger.fine(solrManagedFieldSearchable + " filled with externalvocabularyvalue : " + stringsForManagedField); - // .addField works as addition of value not a replace of value + //.addField works as addition of value not a replace of value // it allows to add mapped values by CVOCConf before or after indexing real DatasetField value(s) of solrManagedFieldSearchable solrInputDocument.addField(solrManagedFieldSearchable, stringsForManagedField); } @@ -1252,7 +1256,7 @@ public SolrInputDocuments toSolrDocs(IndexableDataset indexableDataset, Set Float.parseFloat(westLon)) { - minWestLon = westLon; + //Find the overall bounding box that includes all bounding boxes + if(minWestLon==null || 
Float.parseFloat(minWestLon) > Float.parseFloat(westLon)) { + minWestLon=westLon; } - if (maxEastLon == null || Float.parseFloat(maxEastLon) < Float.parseFloat(eastLon)) { - maxEastLon = eastLon; + if(maxEastLon==null || Float.parseFloat(maxEastLon) < Float.parseFloat(eastLon)) { + maxEastLon=eastLon; } - if (minSouthLat == null || Float.parseFloat(minSouthLat) > Float.parseFloat(southLat)) { - minSouthLat = southLat; + if(minSouthLat==null || Float.parseFloat(minSouthLat) > Float.parseFloat(southLat)) { + minSouthLat=southLat; } - if (maxNorthLat == null || Float.parseFloat(maxNorthLat) < Float.parseFloat(northLat)) { - maxNorthLat = northLat; + if(maxNorthLat==null || Float.parseFloat(maxNorthLat) < Float.parseFloat(northLat)) { + maxNorthLat=northLat; } if (DatasetFieldValueValidator.validateBoundingBox(westLon, eastLon, northLat, southLat)) { - // W, E, N, S + //W, E, N, S solrInputDocument.addField(SearchFields.GEOLOCATION, "ENVELOPE(" + westLon + "," + eastLon + "," + northLat + "," + southLat + ")"); } } } - // Only one bbox per dataset - // W, E, N, S + //Only one bbox per dataset + //W, E, N, S if (DatasetFieldValueValidator.validateBoundingBox(minWestLon, maxEastLon, maxNorthLat, minSouthLat) && (minWestLon != null || maxEastLon != null) && (maxNorthLat != null || minSouthLat != null)) { solrInputDocument.addField(SearchFields.BOUNDING_BOX, "ENVELOPE(" + minWestLon + "," + maxEastLon + "," + maxNorthLat + "," + minSouthLat + ")"); @@ -1356,12 +1360,12 @@ public SolrInputDocuments toSolrDocs(IndexableDataset indexableDataset, Set dataversePaths = retrieveDVOPaths(dataset); + + List dataversePaths = retrieveDVOPaths(dataset); solrInputDocument.addField(SearchFields.SUBTREE, dataversePaths); // solrInputDocument.addField(SearchFields.HOST_DATAVERSE, // dataset.getOwner().getName()); @@ -1637,10 +1641,10 @@ public SolrInputDocuments toSolrDocs(IndexableDataset indexableDataset, Set datafilesInDraftVersion) throws SolrServerException, IOException { final 
SolrInputDocuments docs = toSolrDocs(indexableDataset, datafilesInDraftVersion); diff --git a/src/main/java/edu/harvard/iq/dataverse/search/SearchPermissionsServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/SearchPermissionsServiceBean.java index efc85b5e3ea..c25a462efab 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/SearchPermissionsServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/SearchPermissionsServiceBean.java @@ -65,14 +65,6 @@ public List findDataversePerms(Dataverse dataverse) { permStrings.addAll(findDvObjectPerms(dataverse)); return permStrings; } - - public List findDataFilePermsforDatasetVersion(DataFile dataFile, DatasetVersion version) { - if (dataFile.isRestricted()) { - return(findDvObjectPerms(dataFile)); - } else { - return findDatasetVersionPerms(version); - } - } public List findDatasetVersionPerms(DatasetVersion version) { List perms = new ArrayList<>(); @@ -91,7 +83,6 @@ public List findDvObjectPerms(DvObject dvObject) { List assigneeIdStrings = null; assigneeIdStrings = roleAssigneeService.findAssigneesWithPermissionOnDvObject(dvObject.getId(), p); for (String id : assigneeIdStrings) { - // Don't need to cache RoleAssignees since each is unique RoleAssignee userOrGroup = roleAssigneeService.getRoleAssignee(id); String indexableUserOrGroupPermissionString = getIndexableStringForUserOrGroup(userOrGroup); if (indexableUserOrGroupPermissionString != null) { @@ -154,10 +145,8 @@ private boolean hasBeenPublished(Dataverse dataverse) { private Permission getRequiredSearchPermission(DvObject dvObject) { if (dvObject.isInstanceofDataverse()) { return Permission.ViewUnpublishedDataverse; - } else if(dvObject.isInstanceofDataset()) { - return Permission.ViewUnpublishedDataset; } else { - return Permission.DownloadFile; + return Permission.ViewUnpublishedDataset; } } diff --git a/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java 
b/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java index 9a5284e2c16..603fb0b292e 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java @@ -139,10 +139,6 @@ private List constructDatasetSolrDocs(Dataset dataset) { } private DvObjectSolrDoc constructDatafileSolrDoc(DataFileProxy fileProxy, List cachedPerms, long versionId, String solrIdEnd) { - List ftperms = new ArrayList<>(); - if (fileProxy.isRestricted()) { - ftperms = searchPermissionsService.findDvObjectPerms(fileProxy.getMinimalDataFile()); - } String solrIdStart = IndexServiceBean.solrDocIdentifierFile + fileProxy.getFileId(); String solrId = solrIdStart + solrIdEnd; List perms = new ArrayList<>(); @@ -151,7 +147,7 @@ private DvObjectSolrDoc constructDatafileSolrDoc(DataFileProxy fileProxy, List constructDatafileSolrDocsFromDataset(Dataset dataset) { @@ -172,11 +168,7 @@ private List constructDatafileSolrDocsFromDataset(Dataset datas String solrIdStart = IndexServiceBean.solrDocIdentifierFile + fileId; String solrIdEnd = getDatasetOrDataFileSolrEnding(datasetVersionFileIsAttachedTo.getVersionState()); String solrId = solrIdStart + solrIdEnd; - List ftperms = new ArrayList<>(); - if (fileMetadata.getDataFile().isRestricted()) { - ftperms = searchPermissionsService.findDvObjectPerms(fileMetadata.getDataFile()); - } - DvObjectSolrDoc dataFileSolrDoc = new DvObjectSolrDoc(fileId.toString(), solrId, datasetVersionFileIsAttachedTo.getId(), fileMetadata.getLabel(), perms, ftperms); + DvObjectSolrDoc dataFileSolrDoc = new DvObjectSolrDoc(fileId.toString(), solrId, datasetVersionFileIsAttachedTo.getId(), fileMetadata.getLabel(), perms); logger.finest("adding fileid " + fileId); datafileSolrDocs.add(dataFileSolrDoc); } @@ -372,7 +364,7 @@ public IndexResponse indexPermissionsOnSelfAndChildren(DvObject definitionPoint) filesToReindexAsBatch.clear(); } if (counter[0] % 1000 == 0) { - 
logger.info("Progress: " + counter[0] + "files permissions reindexed"); + logger.fine("Progress: " + counter[0] + "files permissions reindexed"); } } @@ -403,7 +395,7 @@ public IndexResponse indexPermissionsOnSelfAndChildren(DvObject definitionPoint) long startTime = System.currentTimeMillis(); reindexFilesInBatches(filesToReindexAsBatch, cachedPerms, versionId, solrIdEnd); filesToReindexAsBatch.clear(); - logger.info("Progress: 100 file permissions at " + counter[0] + " files reindexed in " + (System.currentTimeMillis() - startTime) + " ms"); + logger.fine("Progress: 100 file permissions at " + counter[0] + " files reindexed in " + (System.currentTimeMillis() - startTime) + " ms"); } }); } else { @@ -416,7 +408,7 @@ public IndexResponse indexPermissionsOnSelfAndChildren(DvObject definitionPoint) long startTime = System.currentTimeMillis(); reindexFilesInBatches(filesToReindexAsBatch, cachedPerms, versionId, solrIdEnd); filesToReindexAsBatch.clear(); - logger.info("Progress: 100 file permissions at " + counter[0] + "files reindexed in " + (System.currentTimeMillis() - startTime) + " ms"); + logger.fine("Progress: 100 file permissions at " + counter[0] + "files reindexed in " + (System.currentTimeMillis() - startTime) + " ms"); } }); } @@ -524,7 +516,7 @@ public List findPermissionsInDatabaseButStaleInOrMissingFromSolr() { } public Stream getDataFileInfoForPermissionIndexing(Long id) { - String query = "SELECT fm.label, df.id, df.restricted, dvo.publicationDate " + + String query = "SELECT fm.label, df.id, dvo.publicationDate " + "FROM filemetadata fm " + "JOIN datafile df ON fm.datafile_id = df.id " + "JOIN dvobject dvo ON df.id = dvo.id " + @@ -540,7 +532,6 @@ static class DataFileProxy { private final Long fileId; private final String name; - private final boolean restricted; private final boolean released; /** @@ -559,14 +550,12 @@ public DataFileProxy(FileMetadata fmd) { DataFile df = fmd.getDataFile(); this.fileId = df.getId(); this.name = fmd.getLabel(); - 
this.restricted = df.isRestricted(); this.released = df.isReleased(); } - public DataFileProxy(String label, Long fileId, boolean restricted, boolean released) { + public DataFileProxy(String label, Long fileId, boolean released) { this.fileId = fileId; this.name = label; - this.restricted = restricted; this.released = released; } @@ -580,14 +569,9 @@ public DataFileProxy(String label, Long fileId, boolean restricted, boolean rele public static DataFileProxy fromDatabaseResult(Object[] fileInfo) { String label = (String) fileInfo[0]; Long fileId = ((Number) fileInfo[1]).longValue(); - boolean restricted = (boolean) fileInfo[2]; - boolean released = fileInfo[3] != null; - - return new DataFileProxy(label, fileId, restricted, released); - } + boolean released = fileInfo[2] != null; - public boolean isRestricted() { - return restricted; + return new DataFileProxy(label, fileId, released); } public boolean isReleased() { From c508ec6de8d0473178de221af21468807640c591 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Thu, 3 Apr 2025 13:04:17 -0400 Subject: [PATCH 70/83] make named queries --- .../edu/harvard/iq/dataverse/DataFile.java | 22 ++++++++++++++ .../harvard/iq/dataverse/FileMetadata.java | 30 +++++++++++++++++++ .../iq/dataverse/search/IndexServiceBean.java | 28 +---------------- .../search/SolrIndexServiceBean.java | 26 +++------------- 4 files changed, 57 insertions(+), 49 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/DataFile.java b/src/main/java/edu/harvard/iq/dataverse/DataFile.java index 01c1a48e117..45604a5472b 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DataFile.java +++ b/src/main/java/edu/harvard/iq/dataverse/DataFile.java @@ -13,6 +13,7 @@ import edu.harvard.iq.dataverse.datasetutility.FileSizeChecker; import edu.harvard.iq.dataverse.ingest.IngestReport; import edu.harvard.iq.dataverse.ingest.IngestRequest; +import edu.harvard.iq.dataverse.search.SolrIndexServiceBean; import edu.harvard.iq.dataverse.util.BundleUtil; import 
edu.harvard.iq.dataverse.util.FileUtil; import edu.harvard.iq.dataverse.util.ShapefileHandler; @@ -23,6 +24,7 @@ import java.util.Objects; import java.text.SimpleDateFormat; import java.util.Arrays; +import java.util.Date; import java.util.HashMap; import java.util.Map; import java.util.Set; @@ -50,6 +52,26 @@ @NamedQuery(name="DataFile.findDataFileThatReplacedId", query="SELECT s.id FROM DataFile s WHERE s.previousDataFileId=:identifier") }) +@NamedNativeQuery( + name = "DataFile.getDataFileInfoForPermissionIndexing", + query = "SELECT fm.label, df.id, dvo.publicationDate " + + "FROM filemetadata fm " + + "JOIN datafile df ON fm.datafile_id = df.id " + + "JOIN dvobject dvo ON df.id = dvo.id " + + "WHERE fm.datasetversion_id = ?", + resultSetMapping = "DataFileInfoMapping" + ) + @SqlResultSetMapping( + name = "DataFileInfoMapping", + classes = @ConstructorResult( + targetClass = SolrIndexServiceBean.DataFileProxy.class, + columns = { + @ColumnResult(name = "label", type = String.class), + @ColumnResult(name = "id", type = Long.class), + @ColumnResult(name = "publicationDate", type = Date.class) + } + ) + ) @Entity @Table(indexes = {@Index(columnList="ingeststatus") , @Index(columnList="checksumvalue") diff --git a/src/main/java/edu/harvard/iq/dataverse/FileMetadata.java b/src/main/java/edu/harvard/iq/dataverse/FileMetadata.java index 461c8b14e46..d00edf6f6ce 100644 --- a/src/main/java/edu/harvard/iq/dataverse/FileMetadata.java +++ b/src/main/java/edu/harvard/iq/dataverse/FileMetadata.java @@ -35,6 +35,7 @@ import jakarta.persistence.JoinTable; import jakarta.persistence.ManyToMany; import jakarta.persistence.ManyToOne; +import jakarta.persistence.NamedNativeQuery; import jakarta.persistence.OneToMany; import jakarta.persistence.OrderBy; import jakarta.persistence.Table; @@ -62,6 +63,35 @@ * @author skraffmiller */ @Table(indexes = {@Index(columnList="datafile_id"), @Index(columnList="datasetversion_id")} ) +@NamedNativeQuery( + name = 
"FileMetadata.compareFileMetadata", + query = "WITH fm_categories AS (" + + " SELECT fmd.filemetadatas_id, " + + " STRING_AGG(dfc.name, ',' ORDER BY dfc.name) AS categories " + + " FROM FileMetadata_DataFileCategory fmd " + + " JOIN DataFileCategory dfc ON fmd.filecategories_id = dfc.id " + + " GROUP BY fmd.filemetadatas_id " + + ") " + + "SELECT fm1.id " + + "FROM FileMetadata fm1 " + + "LEFT JOIN FileMetadata fm2 ON fm1.datafile_id = fm2.datafile_id " + + " AND fm2.datasetversion_id = ?1 " + + "LEFT JOIN fm_categories fc1 ON fc1.filemetadatas_id = fm1.id " + + "LEFT JOIN fm_categories fc2 ON fc2.filemetadatas_id = fm2.id " + + "WHERE fm1.datasetversion_id = ?2 " + + " AND (fm2.id IS NULL " + + " OR (fm1.datafile_id = fm2.datafile_id " + + " AND (fm2.description IS DISTINCT FROM fm1.description " + + " OR fm2.directoryLabel IS DISTINCT FROM fm1.directoryLabel " + + " OR fm2.label != fm1.label " + + " OR fm2.restricted IS DISTINCT FROM fm1.restricted " + + " OR fm2.prov_freeform IS DISTINCT FROM fm1.prov_freeform " + + " OR fc1.categories IS DISTINCT FROM fc2.categories " + + " ) " + + " ) " + + " )", + resultClass = Long.class + ) @Entity public class FileMetadata implements Serializable { private static final long serialVersionUID = 1L; diff --git a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java index b2baa6de503..2816cf611dd 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java @@ -1403,33 +1403,7 @@ public SolrInputDocuments toSolrDocs(IndexableDataset indexableDataset, Set findPermissionsInDatabaseButStaleInOrMissingFromSolr() { } public Stream getDataFileInfoForPermissionIndexing(Long id) { - String query = "SELECT fm.label, df.id, dvo.publicationDate " + - "FROM filemetadata fm " + - "JOIN datafile df ON fm.datafile_id = df.id " + - "JOIN dvobject dvo ON df.id = dvo.id 
" + - "WHERE fm.datasetversion_id = ?"; - return em.createNativeQuery(query).setParameter(1, id).getResultList().stream().map(o-> DataFileProxy.fromDatabaseResult((Object[])o)); + return em.createNamedQuery("DataFile.getDataFileInfoForPermissionIndexing", DataFileProxy.class) + .setParameter(1, id) + .getResultStream(); } /** * A lightweight proxy for DataFile objects used during permission indexing. This class avoids loading the full DataFile entity from the database when only basic properties are needed for indexing, * improving performance for large datasets. */ - static class DataFileProxy { + public static class DataFileProxy { private final Long fileId; private final String name; @@ -559,21 +556,6 @@ public DataFileProxy(String label, Long fileId, boolean released) { this.released = released; } - /** - * Creates a DataFileProxy from database query results. - * - * @param fileInfo - * Array of objects from database query containing file information - * @return A new DataFileProxy instance - */ - public static DataFileProxy fromDatabaseResult(Object[] fileInfo) { - String label = (String) fileInfo[0]; - Long fileId = ((Number) fileInfo[1]).longValue(); - boolean released = fileInfo[2] != null; - - return new DataFileProxy(label, fileId, released); - } - public boolean isReleased() { return released; } From 474c3b2a82b78cb308dec68a08d561823457c3e8 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Thu, 3 Apr 2025 13:31:18 -0400 Subject: [PATCH 71/83] try stream, remove sync blocks from parallel test --- .../iq/dataverse/search/IndexServiceBean.java | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java index 2816cf611dd..a38a8679209 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java @@ -1459,7 +1459,7 @@ 
public SolrInputDocuments toSolrDocs(IndexableDataset indexableDataset, Set { DataFile datafile = fileMetadata.getDataFile(); Embargo emb = datafile.getEmbargo(); LocalDate end = null; @@ -1777,14 +1777,10 @@ public SolrInputDocuments toSolrDocs(IndexableDataset indexableDataset, Set Date: Thu, 3 Apr 2025 13:31:33 -0400 Subject: [PATCH 72/83] docs and setting updates --- doc/release-notes/11374-indexing-improvement.md | 4 +++- doc/sphinx-guides/source/installation/config.rst | 11 +++++++++++ .../iq/dataverse/search/SolrIndexServiceBean.java | 2 +- .../harvard/iq/dataverse/settings/JvmSettings.java | 3 ++- 4 files changed, 17 insertions(+), 3 deletions(-) diff --git a/doc/release-notes/11374-indexing-improvement.md b/doc/release-notes/11374-indexing-improvement.md index 5547a498987..b852146d110 100644 --- a/doc/release-notes/11374-indexing-improvement.md +++ b/doc/release-notes/11374-indexing-improvement.md @@ -1,3 +1,5 @@ ### Solr Indexing speed improved -The performance if Solr indexing for files has been improved by ~30-40% +The performance of Solr indexing has been significantly improved, particularly for datasets with many files. + +A new dataverse.solr.min-files-to-use-proxy microprofile setting can be used to further improve performance/lower memory requirements for datasets with many files (e.g. 500+) (defaults to Integer.MAX, disabling use of the new functionality) diff --git a/doc/sphinx-guides/source/installation/config.rst b/doc/sphinx-guides/source/installation/config.rst index 6aa5f5c8ff6..6784ecab172 100644 --- a/doc/sphinx-guides/source/installation/config.rst +++ b/doc/sphinx-guides/source/installation/config.rst @@ -2689,6 +2689,17 @@ when using it to configure your core name! Can also be set via *MicroProfile Config API* sources, e.g. the environment variable ``DATAVERSE_SOLR_PATH``. 
+dataverse.solr.min-files-to-use-proxy ++++++++++++++++++++++++++++++++++++++ + +Specifies when to use a smaller datafile proxy object for the purposes of dataset indexing. This can lower memory requirements +and improve performance when reindexing large datasets (e.g. those with hundreds or thousands of files). (Creating the proxy may slightly slow indexing datasets with only a few files.) + +This setting represents the number of files for which the datafile proxy should be used. By default, this is set to Integer.MAX_VALUE which disables using the proxy. +A recommended value would be ~1000 but the optimal value may vary depending on details of your installation. + +Can also be set via *MicroProfile Config API* sources, e.g. the environment variable ``DATAVERSE_SOLR_MIN_FILES_TO_USE_PROXY``. + dataverse.solr.concurrency.max-async-indexes ++++++++++++++++++++++++++++++++++++++++++++ diff --git a/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java index 2f56a36d33f..cb9a6a74844 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java @@ -324,7 +324,7 @@ public IndexResponse indexPermissionsOnSelfAndChildren(DvObject definitionPoint) logger.log(Level.WARNING, "Cannot perform indexPermissionsOnSelfAndChildren with a definitionPoint null"); return null; } - int fileQueryMin= JvmSettings.MAX_FILES_BEFORE_USING_PERMISSION_INDEX_QUERY.lookupOptional(Integer.class).orElse(Integer.MAX_VALUE); + int fileQueryMin= JvmSettings.MIN_FILES_TO_USE_PROXY.lookupOptional(Integer.class).orElse(Integer.MAX_VALUE); List filesToReindexAsBatch = new ArrayList<>(); /** * @todo Re-indexing the definition point itself seems to be necessary diff --git a/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java b/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java index
c6b35148ec2..afc698b418b 100644 --- a/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java +++ b/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java @@ -67,11 +67,12 @@ public enum JvmSettings { SOLR_PROT(SCOPE_SOLR, "protocol"), SOLR_CORE(SCOPE_SOLR, "core"), SOLR_PATH(SCOPE_SOLR, "path"), + MIN_FILES_TO_USE_PROXY(SCOPE_SOLR, "min-files-to-use-proxy"), + // INDEX CONCURENCY SCOPE_SOLR_CONCURENCY(SCOPE_SOLR, "concurrency"), MAX_ASYNC_INDEXES(SCOPE_SOLR_CONCURENCY, "max-async-indexes"), - MAX_FILES_BEFORE_USING_PERMISSION_INDEX_QUERY(SCOPE_SOLR_CONCURENCY, "max-files-before-using-permission-index-query"), // RSERVE CONNECTION SCOPE_RSERVE(PREFIX, "rserve"), From da1b631e99de1b0e76e08c77aa762b790b5ec98e Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Thu, 3 Apr 2025 13:51:56 -0400 Subject: [PATCH 73/83] sync query mapping and constructor --- .../dataverse/search/SolrIndexServiceBean.java | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java index cb9a6a74844..64679b05beb 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java @@ -16,6 +16,7 @@ import java.io.IOException; import java.util.ArrayList; import java.util.Collection; +import java.util.Date; import java.util.HashMap; import java.util.HashSet; import java.util.List; @@ -527,14 +528,14 @@ public Stream getDataFileInfoForPermissionIndexing(Long id) { */ public static class DataFileProxy { - private final Long fileId; + private final Long id; private final String name; private final boolean released; /** * Creates a new DataFileProxy with the specified properties. 
* - * @param fileId + * @param id * The ID of the data file * @param label * The label/name of the data file @@ -545,15 +546,15 @@ public static class DataFileProxy { */ public DataFileProxy(FileMetadata fmd) { DataFile df = fmd.getDataFile(); - this.fileId = df.getId(); + this.id = df.getId(); this.name = fmd.getLabel(); this.released = df.isReleased(); } - public DataFileProxy(String label, Long fileId, boolean released) { - this.fileId = fileId; + public DataFileProxy(String label, Long id, Date publicationDate) { + this.id = id; this.name = label; - this.released = released; + this.released = publicationDate != null; } public boolean isReleased() { @@ -561,7 +562,7 @@ public boolean isReleased() { } public Long getFileId() { - return fileId; + return id; } public String getName() { @@ -570,7 +571,7 @@ public String getName() { public DataFile getMinimalDataFile() { DataFile df = new DataFile(); - df.setId(fileId); + df.setId(id); return df; } } From 2db16253cd8886f309943c8b5227e0549863d362 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Thu, 3 Apr 2025 14:14:14 -0400 Subject: [PATCH 74/83] named query, back to asc order --- .../edu/harvard/iq/dataverse/Dataset.java | 20 +++++++++++++ .../iq/dataverse/DatasetServiceBean.java | 29 ++----------------- 2 files changed, 23 insertions(+), 26 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/Dataset.java b/src/main/java/edu/harvard/iq/dataverse/Dataset.java index 79c64d03d60..a0f8ca4d939 100644 --- a/src/main/java/edu/harvard/iq/dataverse/Dataset.java +++ b/src/main/java/edu/harvard/iq/dataverse/Dataset.java @@ -20,10 +20,12 @@ import java.util.Objects; import java.util.Set; import jakarta.persistence.CascadeType; +import jakarta.persistence.ColumnResult; import jakarta.persistence.Entity; import jakarta.persistence.Index; import jakarta.persistence.JoinColumn; import jakarta.persistence.ManyToOne; +import jakarta.persistence.NamedNativeQuery; import jakarta.persistence.NamedQueries; import 
jakarta.persistence.NamedQuery; import jakarta.persistence.NamedStoredProcedureQuery; @@ -31,6 +33,7 @@ import jakarta.persistence.OneToOne; import jakarta.persistence.OrderBy; import jakarta.persistence.ParameterMode; +import jakarta.persistence.SqlResultSetMapping; import jakarta.persistence.StoredProcedureParameter; import jakarta.persistence.Table; import jakarta.persistence.Temporal; @@ -71,6 +74,23 @@ @NamedQuery(name = "Dataset.countAll", query = "SELECT COUNT(ds) FROM Dataset ds") }) +@NamedNativeQuery( + name = "Dataset.findAllOrSubsetOrderByFilesOwned", + query = "SELECT DISTINCT CAST(o.id AS BIGINT) as id, COUNT(f.id) as numFiles " + + "FROM dvobject o " + + "LEFT JOIN dvobject f ON f.owner_id = o.id " + + "WHERE o.dtype = 'Dataset' " + + "AND (? = false OR o.indexTime IS NULL) " + + "GROUP BY o.id " + + "ORDER BY COUNT(f.id) ASC, o.id", + resultSetMapping = "DatasetIdMapping" + ) +@SqlResultSetMapping( + name = "DatasetIdMapping", + columns = { + @ColumnResult(name = "id", type = Long.class) + } +) /* Below is the database stored procedure for getting a string dataset id. diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/DatasetServiceBean.java index 9a8c43668cb..202800d027b 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetServiceBean.java @@ -279,32 +279,9 @@ public List findAllOrSubsetOrderByFilesOwned(boolean skipIndexed) { SEK - 11/09/2021 */ - String skipClause = skipIndexed ? 
"AND o.indexTime is null " : ""; - Query query = em.createNativeQuery(" Select distinct(o.id), count(f.id) as numFiles FROM dvobject o " + - "left join dvobject f on f.owner_id = o.id where o.dtype = 'Dataset' " - + skipClause - + " group by o.id " - + "ORDER BY count(f.id) asc, o.id"); - - List queryResults; - queryResults = query.getResultList(); - - List retVal = new ArrayList(); - for (Object[] result : queryResults) { - Long dsId; - if (result[0] != null) { - try { - dsId = Long.parseLong(result[0].toString()) ; - } catch (Exception ex) { - dsId = null; - } - if (dsId == null) { - continue; - } - retVal.add(dsId); - } - } - return retVal; + return em.createNamedQuery("Dataset.findAllOrSubsetOrderByFilesOwned", Long.class) + .setParameter(1, skipIndexed) + .getResultList(); } /** From db8791e5722fa3b63e4e097400669e73f991fe13 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Thu, 3 Apr 2025 14:44:37 -0400 Subject: [PATCH 75/83] query fix --- src/main/java/edu/harvard/iq/dataverse/Dataset.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/Dataset.java b/src/main/java/edu/harvard/iq/dataverse/Dataset.java index a0f8ca4d939..fd3f8333768 100644 --- a/src/main/java/edu/harvard/iq/dataverse/Dataset.java +++ b/src/main/java/edu/harvard/iq/dataverse/Dataset.java @@ -82,7 +82,7 @@ "WHERE o.dtype = 'Dataset' " + "AND (? 
= false OR o.indexTime IS NULL) " + "GROUP BY o.id " + - "ORDER BY COUNT(f.id) ASC, o.id", + "ORDER BY numfiles ASC, id", resultSetMapping = "DatasetIdMapping" ) @SqlResultSetMapping( From 7cf09a647fec439b8390707384cc2f9f80212474 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Thu, 3 Apr 2025 14:51:10 -0400 Subject: [PATCH 76/83] lengthen hard commit time --- conf/solr/solrconfig.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/solr/solrconfig.xml b/conf/solr/solrconfig.xml index 97965bd77d7..003b71c85c1 100644 --- a/conf/solr/solrconfig.xml +++ b/conf/solr/solrconfig.xml @@ -238,7 +238,7 @@ have some sort of hard autoCommit to limit the log size. --> - ${solr.autoCommit.maxTime:30000} + ${solr.autoCommit.maxTime:300000} false From 750974f3670242374bf9f702876ac28ecc25870d Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Thu, 3 Apr 2025 15:54:43 -0400 Subject: [PATCH 77/83] remove unused query --- .../edu/harvard/iq/dataverse/RoleAssigneeServiceBean.java | 7 ------- src/main/java/edu/harvard/iq/dataverse/RoleAssignment.java | 6 ------ 2 files changed, 13 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/RoleAssigneeServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/RoleAssigneeServiceBean.java index 46d420d3bba..8ac2aabdfa4 100644 --- a/src/main/java/edu/harvard/iq/dataverse/RoleAssigneeServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/RoleAssigneeServiceBean.java @@ -406,13 +406,6 @@ public List findAssigneesWithPermissionOnDvObject(Long objectId, Permiss .setParameter(2, objectId) .getResultList(); } - - public List findAssigneesWithRoleOnDvObject(Long objectId, Long[] downloadRole) { - return em.createNamedQuery("RoleAssignment.findAssigneesWithRoleOnDvObject", String.class) - .setParameter(1, downloadRole) - .setParameter(2, objectId) - .getResultList(); - } private void msg(String s) { //System.out.println(s); diff --git a/src/main/java/edu/harvard/iq/dataverse/RoleAssignment.java 
b/src/main/java/edu/harvard/iq/dataverse/RoleAssignment.java index c8a5e7658e9..7172e28f18a 100644 --- a/src/main/java/edu/harvard/iq/dataverse/RoleAssignment.java +++ b/src/main/java/edu/harvard/iq/dataverse/RoleAssignment.java @@ -57,12 +57,6 @@ query = "DELETE FROM RoleAssignment r WHERE r.assigneeIdentifier=:assigneeIdentifier AND r.role.id=:roleId and r.definitionPoint.id=:definitionPointId") }) @NamedNativeQueries({ - @NamedNativeQuery( - name = "RoleAssignment.findAssigneesWithRoleOnDvObject", - query = "SELECT DISTINCT ra.assigneeidentifier FROM roleassignment ra " + - "WHERE ra.role_id = ANY(?1) " + - "AND ra.definitionpoint_id = ?2", - resultSetMapping = "AssigneeIdentifierMapping"), @NamedNativeQuery( name = "RoleAssignment.findAssigneesWithPermissionOnDvObject", query = "SELECT DISTINCT ra.assigneeidentifier FROM roleassignment ra " + From 2dc56e1cac0940a699a0f3c87d1d007d976d6547 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Thu, 3 Apr 2025 17:17:08 -0400 Subject: [PATCH 78/83] revert hard commit change --- conf/solr/solrconfig.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/solr/solrconfig.xml b/conf/solr/solrconfig.xml index 003b71c85c1..97965bd77d7 100644 --- a/conf/solr/solrconfig.xml +++ b/conf/solr/solrconfig.xml @@ -238,7 +238,7 @@ have some sort of hard autoCommit to limit the log size. --> - ${solr.autoCommit.maxTime:300000} + ${solr.autoCommit.maxTime:30000} false From 862197dfa6fa7f725d661bc4e3c83d25de92aada Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Fri, 4 Apr 2025 07:55:41 -0400 Subject: [PATCH 79/83] remove shared cache from persistence.xml Causes edit locks to remain (in memory only!) 
after dataset changes --- src/main/resources/META-INF/persistence.xml | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/main/resources/META-INF/persistence.xml b/src/main/resources/META-INF/persistence.xml index c809c7616f7..151410c04c2 100644 --- a/src/main/resources/META-INF/persistence.xml +++ b/src/main/resources/META-INF/persistence.xml @@ -31,9 +31,6 @@ - - - From ac32815f9821738fc7d44f55f3f4153a92a46f74 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Fri, 4 Apr 2025 12:36:14 -0400 Subject: [PATCH 80/83] Revert "revert hard commit change" This reverts commit 2dc56e1cac0940a699a0f3c87d1d007d976d6547. --- conf/solr/solrconfig.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/solr/solrconfig.xml b/conf/solr/solrconfig.xml index 97965bd77d7..003b71c85c1 100644 --- a/conf/solr/solrconfig.xml +++ b/conf/solr/solrconfig.xml @@ -238,7 +238,7 @@ have some sort of hard autoCommit to limit the log size. --> - ${solr.autoCommit.maxTime:30000} + ${solr.autoCommit.maxTime:300000} false From cff9848a3ee03550fa02792159e56b654396712a Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Fri, 4 Apr 2025 12:43:20 -0400 Subject: [PATCH 81/83] update query to recurse to permissionroot --- .../harvard/iq/dataverse/RoleAssignment.java | 22 ++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/RoleAssignment.java b/src/main/java/edu/harvard/iq/dataverse/RoleAssignment.java index 7172e28f18a..7ac8c1a7993 100644 --- a/src/main/java/edu/harvard/iq/dataverse/RoleAssignment.java +++ b/src/main/java/edu/harvard/iq/dataverse/RoleAssignment.java @@ -59,12 +59,24 @@ @NamedNativeQueries({ @NamedNativeQuery( name = "RoleAssignment.findAssigneesWithPermissionOnDvObject", - query = "SELECT DISTINCT ra.assigneeidentifier FROM roleassignment ra " + + query = "WITH RECURSIVE owner_hierarchy(id, owner_id, permissionroot) AS ( " + + " SELECT dvo.id, dvo.owner_id, COALESCE(dv.permissionroot, false) " + + " FROM 
dvobject dvo " + + " LEFT JOIN dataverse dv ON dvo.id = dv.id " + + " WHERE dvo.id = ?2 " + + " UNION ALL " + + " SELECT dvo.id, dvo.owner_id, dv.permissionroot " + + " FROM dvobject dvo " + + " LEFT JOIN dataverse dv ON dvo.id = dv.id " + + " JOIN owner_hierarchy oh ON dvo.owner_id = oh.id " + + " WHERE NOT oh.permissionroot " + + ") " + + "SELECT DISTINCT ra.assigneeidentifier " + + "FROM roleassignment ra " + "JOIN dataverserole dr ON ra.role_id = dr.id " + - "JOIN dvobject dob ON ra.definitionpoint_id = dob.id " + - "WHERE get_bit(dr.permissionbits::bit(64), ?1) = '1' " + - "AND dob.id = ?2", - resultSetMapping = "AssigneeIdentifierMapping" + "JOIN owner_hierarchy oh ON ra.definitionpoint_id = oh.id " + + "WHERE get_bit(dr.permissionbits::bit(64), ?1) = '1'", + resultSetMapping = "AssigneeIdentifierMapping" ) }) @SqlResultSetMapping( From e4e39d43c1fa24946f4066effe3831712a898f75 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Fri, 4 Apr 2025 15:54:40 -0400 Subject: [PATCH 82/83] fix mapping to long --- src/main/java/edu/harvard/iq/dataverse/FileMetadata.java | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/FileMetadata.java b/src/main/java/edu/harvard/iq/dataverse/FileMetadata.java index d00edf6f6ce..ca3e2d67263 100644 --- a/src/main/java/edu/harvard/iq/dataverse/FileMetadata.java +++ b/src/main/java/edu/harvard/iq/dataverse/FileMetadata.java @@ -25,6 +25,7 @@ import jakarta.json.Json; import jakarta.json.JsonArrayBuilder; import jakarta.persistence.Column; +import jakarta.persistence.ColumnResult; import jakarta.persistence.Entity; import jakarta.persistence.GeneratedValue; import jakarta.persistence.GenerationType; @@ -38,6 +39,7 @@ import jakarta.persistence.NamedNativeQuery; import jakarta.persistence.OneToMany; import jakarta.persistence.OrderBy; +import jakarta.persistence.SqlResultSetMapping; import jakarta.persistence.Table; import jakarta.persistence.Transient; import 
jakarta.persistence.Version; @@ -90,7 +92,11 @@ " ) " + " ) " + " )", - resultClass = Long.class + resultSetMapping = "IdToLongMapping" + ) +@SqlResultSetMapping( + name = "IdToLongMapping", + columns = @ColumnResult(name = "id", type = Long.class) ) @Entity public class FileMetadata implements Serializable { From 9c36fcf6bb29bc2823512542fb36bad47ec6e1a2 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Sat, 5 Apr 2025 10:41:47 -0400 Subject: [PATCH 83/83] flip recursion --- src/main/java/edu/harvard/iq/dataverse/RoleAssignment.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/RoleAssignment.java b/src/main/java/edu/harvard/iq/dataverse/RoleAssignment.java index 7ac8c1a7993..eed7fe07637 100644 --- a/src/main/java/edu/harvard/iq/dataverse/RoleAssignment.java +++ b/src/main/java/edu/harvard/iq/dataverse/RoleAssignment.java @@ -68,7 +68,7 @@ " SELECT dvo.id, dvo.owner_id, dv.permissionroot " + " FROM dvobject dvo " + " LEFT JOIN dataverse dv ON dvo.id = dv.id " + - " JOIN owner_hierarchy oh ON dvo.owner_id = oh.id " + + " JOIN owner_hierarchy oh ON dvo.id = oh.owner_id " + " WHERE NOT oh.permissionroot " + ") " + "SELECT DISTINCT ra.assigneeidentifier " +