From d82c730b9f3ef0b1ba570878ea1814ea51dc073e Mon Sep 17 00:00:00 2001 From: Steven Winship Date: Thu, 15 Feb 2024 11:00:39 -0500 Subject: [PATCH 01/15] adding harvesting feature to handle missing controlled values --- ...92-harvest-metadata-values-not-in-cvv-list | 6 ++++ .../settings/SettingsServiceBean.java | 7 +++- .../iq/dataverse/util/SystemConfig.java | 7 ++++ .../iq/dataverse/util/json/JsonParser.java | 34 +++++++++++-------- .../iq/dataverse/api/HarvestingClientsIT.java | 31 ++++++++++++++--- 5 files changed, 64 insertions(+), 21 deletions(-) create mode 100644 doc/release-notes/9992-harvest-metadata-values-not-in-cvv-list diff --git a/doc/release-notes/9992-harvest-metadata-values-not-in-cvv-list b/doc/release-notes/9992-harvest-metadata-values-not-in-cvv-list new file mode 100644 index 00000000000..64ea2e1166a --- /dev/null +++ b/doc/release-notes/9992-harvest-metadata-values-not-in-cvv-list @@ -0,0 +1,6 @@ + +`AllowHarvestingMissingCVV` setting to enable/disable allowing datasets to be harvested with Controlled Vocabulary Values that existed in the originating Dataverse Project but are not in the harvesting Dataverse Project. +The default value of this setting is false/no which will cause the harvesting of the dataset to fail. +By activating this feature (true/yes) the value in question will be removed from the list of values and the dataset will be harvested without the missing value. + +`curl http://localhost:8080/api/admin/settings/:AllowHarvestingMissingCVV -X PUT -d yes` diff --git a/src/main/java/edu/harvard/iq/dataverse/settings/SettingsServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/settings/SettingsServiceBean.java index 3b7632f3d9e..6ed17d93ee3 100644 --- a/src/main/java/edu/harvard/iq/dataverse/settings/SettingsServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/settings/SettingsServiceBean.java @@ -603,7 +603,12 @@ Whether Harvesting (OAI) service is enabled * When ingesting tabular data files, store the generated tab-delimited * files *with* the variable names line up top. */ - StoreIngestedTabularFilesWithVarHeaders + StoreIngestedTabularFilesWithVarHeaders, + + /** + * Should we ignore missing controlled vocabulary values when harvesting + */ + AllowHarvestingMissingCVV ; @Override diff --git a/src/main/java/edu/harvard/iq/dataverse/util/SystemConfig.java b/src/main/java/edu/harvard/iq/dataverse/util/SystemConfig.java index ded394833f1..b2127cc263d 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/SystemConfig.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/SystemConfig.java @@ -1181,4 +1181,11 @@ public Long getTestStorageQuotaLimit() { public boolean isStoringIngestedFilesWithHeaders() { return settingsService.isTrueForKey(SettingsServiceBean.Key.StoreIngestedTabularFilesWithVarHeaders, false); } + + /** + * Should we ignore missing controlled vocabulary values when harvesting + */ + public boolean allowHarvestingMissingCVV() { + return settingsService.isTrueForKey(SettingsServiceBean.Key.AllowHarvestingMissingCVV, false); + } } diff --git a/src/main/java/edu/harvard/iq/dataverse/util/json/JsonParser.java b/src/main/java/edu/harvard/iq/dataverse/util/json/JsonParser.java index 984c607aac7..cd93f4719cd 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/json/JsonParser.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/json/JsonParser.java @@ -38,7 +38,6 @@ import java.text.ParseException; import java.util.ArrayList; import java.util.Arrays; -import java.util.Collections; import java.util.Date; import java.util.HashMap; import java.util.HashSet; @@ -69,7 +68,8 @@ public class JsonParser { MetadataBlockServiceBean blockService; SettingsServiceBean settingsService; LicenseServiceBean licenseService; - HarvestingClient harvestingClient = null; + HarvestingClient harvestingClient = null; + boolean allowHarvestingMissingCVV = false; /** * if lenient, we will accept alternate spellings for controlled vocabulary values @@ -93,6 +93,7 @@ public JsonParser(DatasetFieldServiceBean datasetFieldSvc, MetadataBlockServiceB this.settingsService = settingsService; this.licenseService = licenseService; this.harvestingClient = harvestingClient; + this.allowHarvestingMissingCVV = (harvestingClient != null && settingsService.isTrueForKey(SettingsServiceBean.Key.AllowHarvestingMissingCVV, false)); } public JsonParser() { @@ -931,30 +932,30 @@ private String jsonValueToString(JsonValue jv) { } public List parseControlledVocabularyValue(DatasetFieldType cvvType, JsonObject json) throws JsonParseException { + List vals = new LinkedList<>(); try { if (cvvType.isAllowMultiples()) { try { json.getJsonArray("value").getValuesAs(JsonObject.class); } catch (ClassCastException cce) { throw new JsonParseException("Invalid values submitted for " + cvvType.getName() + ". It should be an array of values."); - } - List vals = new LinkedList<>(); + } for (JsonString strVal : json.getJsonArray("value").getValuesAs(JsonString.class)) { String strValue = strVal.getString(); ControlledVocabularyValue cvv = datasetFieldSvc.findControlledVocabularyValueByDatasetFieldTypeAndStrValue(cvvType, strValue, lenient); - if (cvv == null) { + if (cvv == null && !allowHarvestingMissingCVV) { throw new ControlledVocabularyException("Value '" + strValue + "' does not exist in type '" + cvvType.getName() + "'", cvvType, strValue); } - // Only add value to the list if it is not a duplicate - if (strValue.equals("Other")) { - System.out.println("vals = " + vals + ", contains: " + vals.contains(cvv)); - } - if (!vals.contains(cvv)) { - vals.add(cvv); + if (cvv != null) { + // Only add value to the list if it is not a duplicate + if (strValue.equals("Other")) { + System.out.println("vals = " + vals + ", contains: " + vals.contains(cvv)); + } + if (!vals.contains(cvv)) { + vals.add(cvv); + } } } - return vals; - } else { try { json.getString("value"); @@ -963,11 +964,14 @@ public List parseControlledVocabularyValue(DatasetFie } String strValue = json.getString("value", ""); ControlledVocabularyValue cvv = datasetFieldSvc.findControlledVocabularyValueByDatasetFieldTypeAndStrValue(cvvType, strValue, lenient); - if (cvv == null) { + if (cvv == null && !allowHarvestingMissingCVV) { throw new ControlledVocabularyException("Value '" + strValue + "' does not exist in type '" + cvvType.getName() + "'", cvvType, strValue); } - return Collections.singletonList(cvv); + if (cvv != null) { + vals.add(cvv); + } } + return vals; } catch (ClassCastException cce) { throw new JsonParseException("Invalid values submitted for " + cvvType.getName()); } diff --git a/src/test/java/edu/harvard/iq/dataverse/api/HarvestingClientsIT.java b/src/test/java/edu/harvard/iq/dataverse/api/HarvestingClientsIT.java index d5388e510d2..36ef947e105 100644 --- a/src/test/java/edu/harvard/iq/dataverse/api/HarvestingClientsIT.java +++ b/src/test/java/edu/harvard/iq/dataverse/api/HarvestingClientsIT.java @@ -2,6 +2,8 @@ import java.util.logging.Logger; +import edu.harvard.iq.dataverse.settings.SettingsServiceBean; +import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.Test; import io.restassured.RestAssured; @@ -37,8 +39,8 @@ public class HarvestingClientsIT { private static final String ARCHIVE_URL = "https://demo.dataverse.org"; private static final String HARVEST_METADATA_FORMAT = "oai_dc"; private static final String ARCHIVE_DESCRIPTION = "RestAssured harvesting client test"; - private static final String CONTROL_OAI_SET = "controlTestSet"; - private static final int DATASETS_IN_CONTROL_SET = 7; + private static final String CONTROL_OAI_SET = "controlTestSet2"; + private static final int DATASETS_IN_CONTROL_SET = 8; private static String normalUserAPIKey; private static String adminUserAPIKey; private static String harvestCollectionAlias; @@ -54,6 +56,10 @@ public static void setUpClass() { setupCollection(); } + @AfterEach + public void cleanup() { + UtilIT.deleteSetting(SettingsServiceBean.Key.AllowHarvestingMissingCVV); + } private static void setupUsers() { Response cu0 = UtilIT.createRandomUser(); @@ -157,9 +163,24 @@ public void testCreateEditDeleteClient() throws InterruptedException { logger.info("rDelete.getStatusCode(): " + rDelete.getStatusCode()); assertEquals(OK.getStatusCode(), rDelete.getStatusCode()); } - + + @Test + public void testHarvestingClientRun_AllowHarvestingMissingCVV_True() throws InterruptedException { + harvestingClientRun(true); + } @Test - public void testHarvestingClientRun() throws InterruptedException { + public void testHarvestingClientRun_AllowHarvestingMissingCVV_False() throws InterruptedException { + harvestingClientRun(false); + } + + private void harvestingClientRun(boolean allowHarvestingMissingCVV) throws InterruptedException { + int expectedNumberOfSetsHarvested = allowHarvestingMissingCVV ? DATASETS_IN_CONTROL_SET : DATASETS_IN_CONTROL_SET - 1; + if (allowHarvestingMissingCVV) { + UtilIT.enableSetting(SettingsServiceBean.Key.AllowHarvestingMissingCVV); + } else { + UtilIT.deleteSetting(SettingsServiceBean.Key.AllowHarvestingMissingCVV); + } + // This test will create a client and attempt to perform an actual // harvest and validate the resulting harvested content. @@ -242,7 +263,7 @@ public void testHarvestingClientRun() throws InterruptedException { assertEquals(harvestTimeStamp, responseJsonPath.getString("data.lastNonEmpty")); // d) Confirm that the correct number of datasets have been harvested: - assertEquals(DATASETS_IN_CONTROL_SET, responseJsonPath.getInt("data.lastDatasetsHarvested")); + assertEquals(expectedNumberOfSetsHarvested, responseJsonPath.getInt("data.lastDatasetsHarvested")); // ok, it looks like the harvest has completed successfully. break; From 34d7802622f6d38fec9debcd6ee88798c20bd358 Mon Sep 17 00:00:00 2001 From: Steven Winship Date: Thu, 15 Feb 2024 11:42:11 -0500 Subject: [PATCH 02/15] add .md to release notes file --- ...n-cvv-list => 9992-harvest-metadata-values-not-in-cvv-list.md} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename doc/release-notes/{9992-harvest-metadata-values-not-in-cvv-list => 9992-harvest-metadata-values-not-in-cvv-list.md} (100%) diff --git a/doc/release-notes/9992-harvest-metadata-values-not-in-cvv-list b/doc/release-notes/9992-harvest-metadata-values-not-in-cvv-list.md similarity index 100% rename from doc/release-notes/9992-harvest-metadata-values-not-in-cvv-list rename to doc/release-notes/9992-harvest-metadata-values-not-in-cvv-list.md From 32d2fa4fa8a26783f6055d1715cb667a3b3ae4d1 Mon Sep 17 00:00:00 2001 From: Steven Winship Date: Fri, 16 Feb 2024 11:06:18 -0500 Subject: [PATCH 03/15] modify for comments --- .../iq/dataverse/util/json/JsonParser.java | 11 +++----- .../iq/dataverse/api/HarvestingClientsIT.java | 25 ++++++++++--------- 2 files changed, 16 insertions(+), 20 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/json/JsonParser.java b/src/main/java/edu/harvard/iq/dataverse/util/json/JsonParser.java index cd93f4719cd..bd756fffdbf 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/json/JsonParser.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/json/JsonParser.java @@ -946,14 +946,9 @@ public List parseControlledVocabularyValue(DatasetFie if (cvv == null && !allowHarvestingMissingCVV) { throw new ControlledVocabularyException("Value '" + strValue + "' does not exist in type '" + cvvType.getName() + "'", cvvType, strValue); } - if (cvv != null) { - // Only add value to the list if it is not a duplicate - if (strValue.equals("Other")) { - System.out.println("vals = " + vals + ", contains: " + vals.contains(cvv)); - } - if (!vals.contains(cvv)) { - vals.add(cvv); - } + // Only add value to the list if it is not a duplicate + if (cvv != null && !vals.contains(cvv)) { + vals.add(cvv); } } } else { diff --git a/src/test/java/edu/harvard/iq/dataverse/api/HarvestingClientsIT.java b/src/test/java/edu/harvard/iq/dataverse/api/HarvestingClientsIT.java index 36ef947e105..71d4fc14ad5 100644 --- a/src/test/java/edu/harvard/iq/dataverse/api/HarvestingClientsIT.java +++ b/src/test/java/edu/harvard/iq/dataverse/api/HarvestingClientsIT.java @@ -43,7 +43,8 @@ public class HarvestingClientsIT { private static final int DATASETS_IN_CONTROL_SET = 8; private static String normalUserAPIKey; private static String adminUserAPIKey; - private static String harvestCollectionAlias; + private static String harvestCollectionAlias; + String clientApiPath = null; @BeforeAll public static void setUpClass() { @@ -59,6 +60,15 @@ public static void setUpClass() { @AfterEach public void cleanup() { UtilIT.deleteSetting(SettingsServiceBean.Key.AllowHarvestingMissingCVV); + // Cleanup: delete the client + if (clientApiPath != null) { + Response deleteResponse = given() + .header(UtilIT.API_TOKEN_HTTP_HEADER, adminUserAPIKey) + .delete(clientApiPath); + System.out.println("deleteResponse.getStatusCode(): " + deleteResponse.getStatusCode()); + assertEquals(OK.getStatusCode(), deleteResponse.getStatusCode()); + clientApiPath = null; + } } private static void setupUsers() { @@ -191,7 +201,7 @@ private void harvestingClientRun(boolean allowHarvestingMissingCVV) throws Inte String nickName = "h" + UtilIT.getRandomString(6); - String clientApiPath = String.format(HARVEST_CLIENTS_API+"%s", nickName); + clientApiPath = String.format(HARVEST_CLIENTS_API+"%s", nickName); String clientJson = String.format("{\"dataverseAlias\":\"%s\"," + "\"type\":\"oai\"," + "\"harvestUrl\":\"%s\"," @@ -279,15 +289,6 @@ private void harvestingClientRun(boolean allowHarvestingMissingCVV) throws Inte // datasets have been harvested. This may or may not be necessary, seeing // how we have already confirmed the number of successfully harvested // datasets from the control set; somewhat hard to imagine a practical - // situation where that would not be enough (?). - - // Cleanup: delete the client - - Response deleteResponse = given() - .header(UtilIT.API_TOKEN_HTTP_HEADER, adminUserAPIKey) - .delete(clientApiPath); - System.out.println("deleteResponse.getStatusCode(): " + deleteResponse.getStatusCode()); - assertEquals(OK.getStatusCode(), deleteResponse.getStatusCode()); - + // situation where that would not be enough (?). } } From 97678807454cd3834f5dc4a59c50599f326e14dd Mon Sep 17 00:00:00 2001 From: Steven Winship Date: Fri, 16 Feb 2024 13:09:54 -0500 Subject: [PATCH 04/15] addressing review comments --- .../iq/dataverse/util/json/JsonParser.java | 29 +++++++++++-------- .../iq/dataverse/api/HarvestingClientsIT.java | 10 +++---- 2 files changed, 21 insertions(+), 18 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/json/JsonParser.java b/src/main/java/edu/harvard/iq/dataverse/util/json/JsonParser.java index bd756fffdbf..ac7b6bb4067 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/json/JsonParser.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/json/JsonParser.java @@ -38,6 +38,7 @@ import java.text.ParseException; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collections; import java.util.Date; import java.util.HashMap; import java.util.HashSet; @@ -69,7 +70,6 @@ public class JsonParser { SettingsServiceBean settingsService; LicenseServiceBean licenseService; HarvestingClient harvestingClient = null; - boolean allowHarvestingMissingCVV = false; /** * if lenient, we will accept alternate spellings for controlled vocabulary values @@ -93,7 +93,6 @@ public JsonParser(DatasetFieldServiceBean datasetFieldSvc, MetadataBlockServiceB this.settingsService = settingsService; this.licenseService = licenseService; this.harvestingClient = harvestingClient; - this.allowHarvestingMissingCVV = (harvestingClient != null && settingsService.isTrueForKey(SettingsServiceBean.Key.AllowHarvestingMissingCVV, false)); } public JsonParser() { @@ -738,7 +737,14 @@ public DatasetField parseField(JsonObject json, Boolean testType) throws JsonPar ret.setDatasetFieldType(type); - + + // If Harvesting, CVV values may differ between the Dataverse installations, so we won't enforce them + if (harvestingClient != null && type.isControlledVocabulary() && + settingsService.isTrueForKey(SettingsServiceBean.Key.AllowHarvestingMissingCVV, false)) { + type.setAllowControlledVocabulary(false); + logger.warning("Harvesting: Skipping Controlled Vocabulary. Treating values as primitives"); + } + if (type.isCompound()) { List vals = parseCompoundValue(type, json, testType); for (DatasetFieldCompoundValue dsfcv : vals) { @@ -930,9 +936,8 @@ private String jsonValueToString(JsonValue jv) { default: return jv.toString(); } } - + public List parseControlledVocabularyValue(DatasetFieldType cvvType, JsonObject json) throws JsonParseException { - List vals = new LinkedList<>(); try { if (cvvType.isAllowMultiples()) { try { @@ -940,17 +945,20 @@ public List parseControlledVocabularyValue(DatasetFie } catch (ClassCastException cce) { throw new JsonParseException("Invalid values submitted for " + cvvType.getName() + ". It should be an array of values."); } + List vals = new LinkedList<>(); for (JsonString strVal : json.getJsonArray("value").getValuesAs(JsonString.class)) { String strValue = strVal.getString(); ControlledVocabularyValue cvv = datasetFieldSvc.findControlledVocabularyValueByDatasetFieldTypeAndStrValue(cvvType, strValue, lenient); - if (cvv == null && !allowHarvestingMissingCVV) { + if (cvv == null) { throw new ControlledVocabularyException("Value '" + strValue + "' does not exist in type '" + cvvType.getName() + "'", cvvType, strValue); } // Only add value to the list if it is not a duplicate - if (cvv != null && !vals.contains(cvv)) { + if (!vals.contains(cvv)) { vals.add(cvv); } } + return vals; + } else { try { json.getString("value"); @@ -959,14 +967,11 @@ public List parseControlledVocabularyValue(DatasetFie } String strValue = json.getString("value", ""); ControlledVocabularyValue cvv = datasetFieldSvc.findControlledVocabularyValueByDatasetFieldTypeAndStrValue(cvvType, strValue, lenient); - if (cvv == null && !allowHarvestingMissingCVV) { + if (cvv == null) { throw new ControlledVocabularyException("Value '" + strValue + "' does not exist in type '" + cvvType.getName() + "'", cvvType, strValue); } - if (cvv != null) { - vals.add(cvv); - } + return Collections.singletonList(cvv); } - return vals; } catch (ClassCastException cce) { throw new JsonParseException("Invalid values submitted for " + cvvType.getName()); } diff --git a/src/test/java/edu/harvard/iq/dataverse/api/HarvestingClientsIT.java b/src/test/java/edu/harvard/iq/dataverse/api/HarvestingClientsIT.java index 71d4fc14ad5..9b83c4c1c9a 100644 --- a/src/test/java/edu/harvard/iq/dataverse/api/HarvestingClientsIT.java +++ b/src/test/java/edu/harvard/iq/dataverse/api/HarvestingClientsIT.java @@ -60,14 +60,12 @@ public static void setUpClass() { @AfterEach public void cleanup() { UtilIT.deleteSetting(SettingsServiceBean.Key.AllowHarvestingMissingCVV); - // Cleanup: delete the client if (clientApiPath != null) { Response deleteResponse = given() .header(UtilIT.API_TOKEN_HTTP_HEADER, adminUserAPIKey) .delete(clientApiPath); - System.out.println("deleteResponse.getStatusCode(): " + deleteResponse.getStatusCode()); - assertEquals(OK.getStatusCode(), deleteResponse.getStatusCode()); clientApiPath = null; + System.out.println("deleteResponse.getStatusCode(): " + deleteResponse.getStatusCode()); } } @@ -175,11 +173,11 @@ public void testCreateEditDeleteClient() throws InterruptedException { } @Test - public void testHarvestingClientRun_AllowHarvestingMissingCVV_True() throws InterruptedException { - harvestingClientRun(true); + public void testHarvestingClientRun_AllowHarvestingMissingCVV_False() throws InterruptedException { + harvestingClientRun(false); } @Test - public void testHarvestingClientRun_AllowHarvestingMissingCVV_False() throws InterruptedException { + public void testHarvestingClientRun_AllowHarvestingMissingCVV_True() throws InterruptedException { harvestingClientRun(false); } From 7ff5d6a35647cb76d9fe98279dd9021952a9849b Mon Sep 17 00:00:00 2001 From: Steven Winship Date: Tue, 20 Feb 2024 08:49:25 -0500 Subject: [PATCH 05/15] adding to test --- .../edu/harvard/iq/dataverse/util/json/JsonParser.java | 8 +++++--- .../edu/harvard/iq/dataverse/api/HarvestingClientsIT.java | 7 +++---- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/json/JsonParser.java b/src/main/java/edu/harvard/iq/dataverse/util/json/JsonParser.java index ac7b6bb4067..4287cab069b 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/json/JsonParser.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/json/JsonParser.java @@ -70,6 +70,7 @@ public class JsonParser { SettingsServiceBean settingsService; LicenseServiceBean licenseService; HarvestingClient harvestingClient = null; + boolean allowHarvestingMissingCVV = false; /** * if lenient, we will accept alternate spellings for controlled vocabulary values @@ -93,6 +94,8 @@ public JsonParser(DatasetFieldServiceBean datasetFieldSvc, MetadataBlockServiceB this.settingsService = settingsService; this.licenseService = licenseService; this.harvestingClient = harvestingClient; + this.allowHarvestingMissingCVV = harvestingClient != null && + settingsService.isTrueForKey(SettingsServiceBean.Key.AllowHarvestingMissingCVV, false); } public JsonParser() { @@ -739,10 +742,9 @@ public DatasetField parseField(JsonObject json, Boolean testType) throws JsonPar ret.setDatasetFieldType(type); // If Harvesting, CVV values may differ between the Dataverse installations, so we won't enforce them - if (harvestingClient != null && type.isControlledVocabulary() && - settingsService.isTrueForKey(SettingsServiceBean.Key.AllowHarvestingMissingCVV, false)) { + if (allowHarvestingMissingCVV && type.isControlledVocabulary()) { type.setAllowControlledVocabulary(false); - logger.warning("Harvesting: Skipping Controlled Vocabulary. Treating values as primitives"); + logger.info("Harvesting: Skipping Controlled Vocabulary. Treating values as primitives"); } if (type.isCompound()) { diff --git a/src/test/java/edu/harvard/iq/dataverse/api/HarvestingClientsIT.java b/src/test/java/edu/harvard/iq/dataverse/api/HarvestingClientsIT.java index 9b83c4c1c9a..375eb92a6ab 100644 --- a/src/test/java/edu/harvard/iq/dataverse/api/HarvestingClientsIT.java +++ b/src/test/java/edu/harvard/iq/dataverse/api/HarvestingClientsIT.java @@ -3,8 +3,7 @@ import java.util.logging.Logger; import edu.harvard.iq.dataverse.settings.SettingsServiceBean; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.*; import io.restassured.RestAssured; import static io.restassured.RestAssured.given; @@ -19,7 +18,6 @@ import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.*; import static org.junit.jupiter.api.Assertions.assertTrue; -import org.junit.jupiter.api.BeforeAll; /** * This class tests Harvesting Client functionality. @@ -29,6 +27,7 @@ * /api/harvest/clients/ api to run an actual harvest of a control set and * then validate the resulting harvested content. */ +@TestMethodOrder(MethodOrderer.MethodName.class) public class HarvestingClientsIT { private static final Logger logger = Logger.getLogger(HarvestingClientsIT.class.getCanonicalName()); @@ -178,7 +177,7 @@ public void testHarvestingClientRun_AllowHarvestingMissingCVV_False() throws In } @Test public void testHarvestingClientRun_AllowHarvestingMissingCVV_True() throws InterruptedException { - harvestingClientRun(false); + harvestingClientRun(true); } private void harvestingClientRun(boolean allowHarvestingMissingCVV) throws InterruptedException { From 2e9d4144cffcf97d7890b10f14f438c950f8ab89 Mon Sep 17 00:00:00 2001 From: Steven Winship Date: Tue, 20 Feb 2024 10:16:46 -0500 Subject: [PATCH 06/15] fixing CCV datafieldtypes from getting overwritten in database --- .../edu/harvard/iq/dataverse/util/json/JsonParser.java | 8 +------- .../edu/harvard/iq/dataverse/api/HarvestingClientsIT.java | 5 +++-- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/json/JsonParser.java b/src/main/java/edu/harvard/iq/dataverse/util/json/JsonParser.java index 4287cab069b..16cffb92c8c 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/json/JsonParser.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/json/JsonParser.java @@ -741,12 +741,6 @@ public DatasetField parseField(JsonObject json, Boolean testType) throws JsonPar ret.setDatasetFieldType(type); - // If Harvesting, CVV values may differ between the Dataverse installations, so we won't enforce them - if (allowHarvestingMissingCVV && type.isControlledVocabulary()) { - type.setAllowControlledVocabulary(false); - logger.info("Harvesting: Skipping Controlled Vocabulary. Treating values as primitives"); - } - if (type.isCompound()) { List vals = parseCompoundValue(type, json, testType); for (DatasetFieldCompoundValue dsfcv : vals) { @@ -754,7 +748,7 @@ public DatasetField parseField(JsonObject json, Boolean testType) throws JsonPar } ret.setDatasetFieldCompoundValues(vals); - } else if (type.isControlledVocabulary()) { + } else if (type.isControlledVocabulary() && !allowHarvestingMissingCVV) { // if allowing missing CVV then fall through to 'primitive' List vals = parseControlledVocabularyValue(type, json); for (ControlledVocabularyValue cvv : vals) { cvv.setDatasetFieldType(type); diff --git a/src/test/java/edu/harvard/iq/dataverse/api/HarvestingClientsIT.java b/src/test/java/edu/harvard/iq/dataverse/api/HarvestingClientsIT.java index 375eb92a6ab..1de219e765b 100644 --- a/src/test/java/edu/harvard/iq/dataverse/api/HarvestingClientsIT.java +++ b/src/test/java/edu/harvard/iq/dataverse/api/HarvestingClientsIT.java @@ -3,12 +3,14 @@ import java.util.logging.Logger; import edu.harvard.iq.dataverse.settings.SettingsServiceBean; -import org.junit.jupiter.api.*; import io.restassured.RestAssured; import static io.restassured.RestAssured.given; import io.restassured.path.json.JsonPath; import io.restassured.response.Response; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; import static jakarta.ws.rs.core.Response.Status.CREATED; import static jakarta.ws.rs.core.Response.Status.UNAUTHORIZED; @@ -27,7 +29,6 @@ * /api/harvest/clients/ api to run an actual harvest of a control set and * then validate the resulting harvested content. */ -@TestMethodOrder(MethodOrderer.MethodName.class) public class HarvestingClientsIT { private static final Logger logger = Logger.getLogger(HarvestingClientsIT.class.getCanonicalName()); From 6a04efbae285a363acbe3dcb7408c7c1da95ae21 Mon Sep 17 00:00:00 2001 From: Steven Winship Date: Mon, 26 Feb 2024 10:21:30 -0500 Subject: [PATCH 07/15] moved flag from settings to harvesting client and fixed indexing --- .../harvard/iq/dataverse/DatasetField.java | 3 +- .../harvest/client/HarvestingClient.java | 6 ++ .../iq/dataverse/search/IndexServiceBean.java | 41 ++++++--- .../settings/SettingsServiceBean.java | 7 +- .../iq/dataverse/util/SystemConfig.java | 7 -- .../iq/dataverse/util/json/JsonParser.java | 86 +++++++++---------- ...re-to-handle-missing-controlled-values.sql | 2 + .../iq/dataverse/api/HarvestingClientsIT.java | 11 +-- 8 files changed, 85 insertions(+), 78 deletions(-) create mode 100644 src/main/resources/db/migration/V6.1.0.4__10023-harvesting-feature-to-handle-missing-controlled-values.sql diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetField.java b/src/main/java/edu/harvard/iq/dataverse/DatasetField.java index c836a20893f..31e7758c7d5 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetField.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetField.java @@ -595,7 +595,8 @@ public boolean removeBlankDatasetFieldValues() { return true; } } else { // controlled vocab - if (this.getControlledVocabularyValues().isEmpty()) { + // during harvesting some CVV are put in getDatasetFieldValues. we don't want to remove those + if (this.getControlledVocabularyValues().isEmpty() && this.getDatasetFieldValues().isEmpty()) { return true; } } diff --git a/src/main/java/edu/harvard/iq/dataverse/harvest/client/HarvestingClient.java b/src/main/java/edu/harvard/iq/dataverse/harvest/client/HarvestingClient.java index 40db55f2a0c..16f46bcab06 100644 --- a/src/main/java/edu/harvard/iq/dataverse/harvest/client/HarvestingClient.java +++ b/src/main/java/edu/harvard/iq/dataverse/harvest/client/HarvestingClient.java @@ -243,6 +243,12 @@ public String getCustomHttpHeaders() { public void setCustomHttpHeaders(String customHttpHeaders) { this.customHttpHeaders = customHttpHeaders; } + + private boolean allowHarvestingMissingCVV; + public boolean getAllowHarvestingMissingCVV() { return allowHarvestingMissingCVV; } + public void setAllowHarvestingMissingCVV(boolean allowHarvestingMissingCVV) { + this.allowHarvestingMissingCVV = allowHarvestingMissingCVV; + } // TODO: do we need "orphanRemoval=true"? -- L.A. 4.4 // TODO: should it be @OrderBy("startTime")? -- L.A. 4.4 diff --git a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java index d631e2829c1..bfee506b53f 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java @@ -956,22 +956,39 @@ public SolrInputDocuments toSolrDocs(IndexableDataset indexableDataset, SetFeature Request/Idea: Harvest metadata values that aren't from a list of controlled values #9992 + */ + if (dsf.getControlledVocabularyValues().isEmpty()) { + for (DatasetFieldValue dfv : dsf.getDatasetFieldValues()) { + if (dfv.getValue().equals(DatasetField.NA_VALUE)) { + continue; + } + solrInputDocument.addField(solrFieldSearchable, dfv.getValue()); - // Index in all used languages (display and metadata languages - if (!dsfType.isAllowMultiples() || langs.isEmpty()) { - solrInputDocument.addField(solrFieldSearchable, controlledVocabularyValue.getStrValue()); - } else { - for(String locale: langs) { - solrInputDocument.addField(solrFieldSearchable, controlledVocabularyValue.getLocaleStrValue(locale)); + if (dsfType.getSolrField().isFacetable()) { + solrInputDocument.addField(solrFieldFacetable, dfv.getValue()); } } + } else { + for (ControlledVocabularyValue controlledVocabularyValue : dsf.getControlledVocabularyValues()) { + if (controlledVocabularyValue.getStrValue().equals(DatasetField.NA_VALUE)) { + continue; + } - if (dsfType.getSolrField().isFacetable()) { - solrInputDocument.addField(solrFieldFacetable, controlledVocabularyValue.getStrValue()); + // Index in all used languages (display and metadata languages + if (!dsfType.isAllowMultiples() || langs.isEmpty()) { + solrInputDocument.addField(solrFieldSearchable, controlledVocabularyValue.getStrValue()); + } else { + for(String locale: langs) { + solrInputDocument.addField(solrFieldSearchable, controlledVocabularyValue.getLocaleStrValue(locale)); + } + } + + if (dsfType.getSolrField().isFacetable()) { + solrInputDocument.addField(solrFieldFacetable, controlledVocabularyValue.getStrValue()); + } } } } else if (dsfType.getFieldType().equals(DatasetFieldType.FieldType.TEXTBOX)) { diff --git a/src/main/java/edu/harvard/iq/dataverse/settings/SettingsServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/settings/SettingsServiceBean.java index 6ed17d93ee3..3b7632f3d9e 100644 --- a/src/main/java/edu/harvard/iq/dataverse/settings/SettingsServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/settings/SettingsServiceBean.java @@ -603,12 +603,7 @@ Whether Harvesting (OAI) service is enabled * When ingesting tabular data files, store the generated tab-delimited * files *with* the variable names line up top. */ - StoreIngestedTabularFilesWithVarHeaders, - - /** - * Should we ignore missing controlled vocabulary values when harvesting - */ - AllowHarvestingMissingCVV + StoreIngestedTabularFilesWithVarHeaders ; @Override diff --git a/src/main/java/edu/harvard/iq/dataverse/util/SystemConfig.java b/src/main/java/edu/harvard/iq/dataverse/util/SystemConfig.java index b2127cc263d..ded394833f1 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/SystemConfig.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/SystemConfig.java @@ -1181,11 +1181,4 @@ public Long getTestStorageQuotaLimit() { public boolean isStoringIngestedFilesWithHeaders() { return settingsService.isTrueForKey(SettingsServiceBean.Key.StoreIngestedTabularFilesWithVarHeaders, false); } - - /** - * Should we ignore missing controlled vocabulary values when harvesting - */ - public boolean allowHarvestingMissingCVV() { - return settingsService.isTrueForKey(SettingsServiceBean.Key.AllowHarvestingMissingCVV, false); - } } diff --git a/src/main/java/edu/harvard/iq/dataverse/util/json/JsonParser.java b/src/main/java/edu/harvard/iq/dataverse/util/json/JsonParser.java index 16cffb92c8c..f5affc5586e 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/json/JsonParser.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/json/JsonParser.java @@ -38,7 +38,6 @@ import java.text.ParseException; import java.util.ArrayList; import java.util.Arrays; -import java.util.Collections; import java.util.Date; import java.util.HashMap; import java.util.HashSet; @@ -94,8 +93,7 @@ public JsonParser(DatasetFieldServiceBean datasetFieldSvc, MetadataBlockServiceB this.settingsService = settingsService; this.licenseService = licenseService; this.harvestingClient = harvestingClient; - this.allowHarvestingMissingCVV = harvestingClient != null && - settingsService.isTrueForKey(SettingsServiceBean.Key.AllowHarvestingMissingCVV, false); + this.allowHarvestingMissingCVV = harvestingClient != null && harvestingClient.getAllowHarvestingMissingCVV(); } public JsonParser() { @@ -742,37 +740,21 @@ public DatasetField parseField(JsonObject json, Boolean testType) throws JsonPar ret.setDatasetFieldType(type); if (type.isCompound()) { - List vals = parseCompoundValue(type, json, testType); - for (DatasetFieldCompoundValue dsfcv : vals) { - dsfcv.setParentDatasetField(ret); - } - ret.setDatasetFieldCompoundValues(vals); - - } else if (type.isControlledVocabulary() && !allowHarvestingMissingCVV) { // if allowing missing CVV then fall through to 'primitive' - List vals = parseControlledVocabularyValue(type, json); - for (ControlledVocabularyValue cvv : vals) { - cvv.setDatasetFieldType(type); - } - ret.setControlledVocabularyValues(vals); - + parseCompoundValue(ret, type, json, testType); + } else if (type.isControlledVocabulary()) { + parseControlledVocabularyValue(ret, type, json); } else { - // primitive - - List values = parsePrimitiveValue(type, json); - for (DatasetFieldValue val : values) { - val.setDatasetField(ret); - } - ret.setDatasetFieldValues(values); - } + parsePrimitiveValue(ret, type, json); + } return ret; } - public List parseCompoundValue(DatasetFieldType compoundType, JsonObject json) throws JsonParseException { - return parseCompoundValue(compoundType, json, true); + public void parseCompoundValue(DatasetField dsf, DatasetFieldType compoundType, JsonObject json) throws JsonParseException { + parseCompoundValue(dsf, compoundType, json, true); } - public List parseCompoundValue(DatasetFieldType compoundType, JsonObject json, Boolean testType) throws JsonParseException { + public void parseCompoundValue(DatasetField dsf, DatasetFieldType compoundType, JsonObject json, Boolean testType) throws JsonParseException { List vocabExceptions = new ArrayList<>(); List vals = new LinkedList<>(); if (compoundType.isAllowMultiples()) { @@ -839,18 +821,17 @@ public List parseCompoundValue(DatasetFieldType compo if (!vocabExceptions.isEmpty()) { throw new CompoundVocabularyException( "Invalid controlled vocabulary in compound field ", vocabExceptions, vals); } - return vals; + + for (DatasetFieldCompoundValue dsfcv : vals) { + dsfcv.setParentDatasetField(dsf); + } + dsf.setDatasetFieldCompoundValues(vals); } - public List parsePrimitiveValue(DatasetFieldType dft , JsonObject json) throws JsonParseException { + public void parsePrimitiveValue(DatasetField dsf, DatasetFieldType dft , JsonObject json) throws JsonParseException { Map cvocMap = datasetFieldSvc.getCVocConf(true); - boolean extVocab=false; - if(cvocMap.containsKey(dft.getId())) { - extVocab=true; - } - - + boolean extVocab = cvocMap.containsKey(dft.getId()); List vals = new LinkedList<>(); if (dft.isAllowMultiples()) { try { @@ -859,7 +840,7 @@ public List parsePrimitiveValue(DatasetFieldType dft , JsonOb throw new JsonParseException("Invalid values submitted for " + dft.getName() + ". It should be an array of values."); } for (JsonString val : json.getJsonArray("value").getValuesAs(JsonString.class)) { - DatasetFieldValue datasetFieldValue = new DatasetFieldValue(); + DatasetFieldValue datasetFieldValue = new DatasetFieldValue(dsf); datasetFieldValue.setDisplayOrder(vals.size() - 1); datasetFieldValue.setValue(val.getString().trim()); if(extVocab) { @@ -878,6 +859,7 @@ public List parsePrimitiveValue(DatasetFieldType dft , JsonOb } DatasetFieldValue datasetFieldValue = new DatasetFieldValue(); datasetFieldValue.setValue(json.getString("value", "").trim()); + datasetFieldValue.setDatasetField(dsf); if(extVocab) { if(!datasetFieldSvc.isValidCVocValue(dft, datasetFieldValue.getValue())) { throw new JsonParseException("Invalid values submitted for " + dft.getName() + " which is limited to specific vocabularies."); @@ -887,7 +869,7 @@ public List parsePrimitiveValue(DatasetFieldType dft , JsonOb vals.add(datasetFieldValue); } - return vals; + dsf.setDatasetFieldValues(vals); } public Workflow parseWorkflow(JsonObject json) throws JsonParseException { @@ -933,7 +915,8 @@ private String jsonValueToString(JsonValue jv) { } } - public List parseControlledVocabularyValue(DatasetFieldType cvvType, JsonObject json) throws JsonParseException { + public void parseControlledVocabularyValue(DatasetField dsf, DatasetFieldType cvvType, JsonObject json) throws JsonParseException { + List vals = new LinkedList<>(); try { if (cvvType.isAllowMultiples()) { try { @@ -941,19 +924,25 @@ public List parseControlledVocabularyValue(DatasetFie } catch (ClassCastException cce) { throw new JsonParseException("Invalid values submitted for " + cvvType.getName() + ". It should be an array of values."); } - List vals = new LinkedList<>(); for (JsonString strVal : json.getJsonArray("value").getValuesAs(JsonString.class)) { String strValue = strVal.getString(); ControlledVocabularyValue cvv = datasetFieldSvc.findControlledVocabularyValueByDatasetFieldTypeAndStrValue(cvvType, strValue, lenient); if (cvv == null) { - throw new ControlledVocabularyException("Value '" + strValue + "' does not exist in type '" + cvvType.getName() + "'", cvvType, strValue); + if (allowHarvestingMissingCVV) { + // we need to process these as primitive values + logger.warning("Value '" + strValue + "' does not exist in type '" + cvvType.getName() + "'. Processing as primitive per setting override."); + parsePrimitiveValue(dsf, cvvType, json); + return; + } else { + throw new ControlledVocabularyException("Value '" + strValue + "' does not exist in type '" + cvvType.getName() + "'", cvvType, strValue); + } } + cvv.setDatasetFieldType(cvvType); // Only add value to the list if it is not a duplicate if (!vals.contains(cvv)) { vals.add(cvv); } } - return vals; } else { try { @@ -964,13 +953,23 @@ public List parseControlledVocabularyValue(DatasetFie String strValue = json.getString("value", ""); ControlledVocabularyValue cvv = datasetFieldSvc.findControlledVocabularyValueByDatasetFieldTypeAndStrValue(cvvType, strValue, lenient); if (cvv == null) { - throw new ControlledVocabularyException("Value '" + strValue + "' does not exist in type '" + cvvType.getName() + "'", cvvType, strValue); + if (allowHarvestingMissingCVV) { + // we need to process this as a primitive value + logger.warning(">>>> Value '" + strValue + "' does not exist in type '" + cvvType.getName() + "'. Processing as primitive per setting override."); + parsePrimitiveValue(dsf, cvvType , json); + return; + } else { + throw new ControlledVocabularyException("Value '" + strValue + "' does not exist in type '" + cvvType.getName() + "'", cvvType, strValue); + } } - return Collections.singletonList(cvv); + cvv.setDatasetFieldType(cvvType); + vals.add(cvv); } } catch (ClassCastException cce) { throw new JsonParseException("Invalid values submitted for " + cvvType.getName()); } + + dsf.setControlledVocabularyValues(vals); } Date parseDate(String str) throws ParseException { @@ -1001,6 +1000,7 @@ public String parseHarvestingClient(JsonObject obj, HarvestingClient harvestingC harvestingClient.setMetadataPrefix(obj.getString("metadataFormat",null)); harvestingClient.setHarvestingSet(obj.getString("set",null)); harvestingClient.setCustomHttpHeaders(obj.getString("customHeaders", null)); + harvestingClient.setAllowHarvestingMissingCVV(obj.getBoolean("allowHarvestingMissingCVV", false)); return dataverseAlias; } diff --git a/src/main/resources/db/migration/V6.1.0.4__10023-harvesting-feature-to-handle-missing-controlled-values.sql b/src/main/resources/db/migration/V6.1.0.4__10023-harvesting-feature-to-handle-missing-controlled-values.sql new file mode 100644 index 00000000000..c9942fb8480 --- /dev/null +++ b/src/main/resources/db/migration/V6.1.0.4__10023-harvesting-feature-to-handle-missing-controlled-values.sql @@ -0,0 +1,2 @@ +-- Add flag to allow harvesting client to handle missing CVV values +ALTER TABLE harvestingclient ADD COLUMN IF NOT EXISTS allowharvestingmissingcvv BOOLEAN; diff --git a/src/test/java/edu/harvard/iq/dataverse/api/HarvestingClientsIT.java b/src/test/java/edu/harvard/iq/dataverse/api/HarvestingClientsIT.java index 1de219e765b..4466182b435 100644 --- a/src/test/java/edu/harvard/iq/dataverse/api/HarvestingClientsIT.java +++ b/src/test/java/edu/harvard/iq/dataverse/api/HarvestingClientsIT.java @@ -2,8 +2,6 @@ import java.util.logging.Logger; -import edu.harvard.iq.dataverse.settings.SettingsServiceBean; - import io.restassured.RestAssured; import static io.restassured.RestAssured.given; import io.restassured.path.json.JsonPath; @@ -59,7 +57,6 @@ public static void setUpClass() { } @AfterEach public void cleanup() { - UtilIT.deleteSetting(SettingsServiceBean.Key.AllowHarvestingMissingCVV); if (clientApiPath != null) { Response deleteResponse = given() .header(UtilIT.API_TOKEN_HTTP_HEADER, adminUserAPIKey) @@ -183,11 +180,6 @@ public void testHarvestingClientRun_AllowHarvestingMissingCVV_True() throws Int private void harvestingClientRun(boolean allowHarvestingMissingCVV) throws InterruptedException { int expectedNumberOfSetsHarvested = allowHarvestingMissingCVV ? DATASETS_IN_CONTROL_SET : DATASETS_IN_CONTROL_SET - 1; - if (allowHarvestingMissingCVV) { - UtilIT.enableSetting(SettingsServiceBean.Key.AllowHarvestingMissingCVV); - } else { - UtilIT.deleteSetting(SettingsServiceBean.Key.AllowHarvestingMissingCVV); - } // This test will create a client and attempt to perform an actual // harvest and validate the resulting harvested content. @@ -205,8 +197,9 @@ private void harvestingClientRun(boolean allowHarvestingMissingCVV) throws Inte + "\"harvestUrl\":\"%s\"," + "\"archiveUrl\":\"%s\"," + "\"set\":\"%s\"," + + "\"allowHarvestingMissingCVV\":%s," + "\"metadataFormat\":\"%s\"}", - harvestCollectionAlias, HARVEST_URL, ARCHIVE_URL, CONTROL_OAI_SET, HARVEST_METADATA_FORMAT); + harvestCollectionAlias, HARVEST_URL, ARCHIVE_URL, CONTROL_OAI_SET, allowHarvestingMissingCVV, HARVEST_METADATA_FORMAT); Response createResponse = given() .header(UtilIT.API_TOKEN_HTTP_HEADER, adminUserAPIKey) From 00a33310052787c87030a9389b0d3cb057d9ec71 Mon Sep 17 00:00:00 2001 From: Steven Winship Date: Mon, 26 Feb 2024 10:24:38 -0500 Subject: [PATCH 08/15] moved flag from settings to harvesting client and fixed indexing --- .../harvard/iq/dataverse/harvest/client/HarvestingClient.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/harvest/client/HarvestingClient.java b/src/main/java/edu/harvard/iq/dataverse/harvest/client/HarvestingClient.java index 16f46bcab06..0667f5594ce 100644 --- a/src/main/java/edu/harvard/iq/dataverse/harvest/client/HarvestingClient.java +++ b/src/main/java/edu/harvard/iq/dataverse/harvest/client/HarvestingClient.java @@ -245,7 +245,9 @@ public void setCustomHttpHeaders(String customHttpHeaders) { } private boolean allowHarvestingMissingCVV; - public boolean getAllowHarvestingMissingCVV() { return allowHarvestingMissingCVV; } + public boolean getAllowHarvestingMissingCVV() { + return allowHarvestingMissingCVV; + } public void setAllowHarvestingMissingCVV(boolean allowHarvestingMissingCVV) { this.allowHarvestingMissingCVV = allowHarvestingMissingCVV; } From 58e765a60bda3e80fbbbbfc8c0416c353c1439fe Mon Sep 17 00:00:00 2001 From: Steven Winship Date: Mon, 26 Feb 2024 11:29:05 -0500 Subject: [PATCH 09/15] updating docs --- .../9992-harvest-metadata-values-not-in-cvv-list.md | 7 ++----- doc/sphinx-guides/source/api/native-api.rst | 6 ++++-- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/doc/release-notes/9992-harvest-metadata-values-not-in-cvv-list.md b/doc/release-notes/9992-harvest-metadata-values-not-in-cvv-list.md index 64ea2e1166a..f77f03d9be5 100644 --- a/doc/release-notes/9992-harvest-metadata-values-not-in-cvv-list.md +++ b/doc/release-notes/9992-harvest-metadata-values-not-in-cvv-list.md @@ -1,6 +1,3 @@ +The API endpoint `api/harvest/clients/{harvestingClientNickname}` has been extended to include the following fields: -`AllowHarvestingMissingCVV` setting to enable/disable allowing datasets to be harvested with Controlled Vocabulary Values that existed in the originating Dataverse Project but are not in the harvesting Dataverse Project. -The default value of this setting is false/no which will cause the harvesting of the dataset to fail. -By activating this feature (true/yes) the value in question will be removed from the list of values and the dataset will be harvested without the missing value. - -`curl http://localhost:8080/api/admin/settings/:AllowHarvestingMissingCVV -X PUT -d yes` +- `allowHarvestingMissingCVV`: enable/disable allowing datasets to be harvested with Controlled Vocabulary Values that existed in the originating Dataverse Project but are not in the harvesting Dataverse Project. Default is false. diff --git a/doc/sphinx-guides/source/api/native-api.rst b/doc/sphinx-guides/source/api/native-api.rst index dbe769e2fd1..d544aa1b8d8 100644 --- a/doc/sphinx-guides/source/api/native-api.rst +++ b/doc/sphinx-guides/source/api/native-api.rst @@ -4423,7 +4423,8 @@ The following optional fields are supported: - set: The OAI set on the remote server. If not supplied, will default to none, i.e., "harvest everything". - style: Defaults to "default" - a generic OAI archive. (Make sure to use "dataverse" when configuring harvesting from another Dataverse installation). - customHeaders: This can be used to configure this client with a specific HTTP header that will be added to every OAI request. This is to accommodate a use case where the remote server requires this header to supply some form of a token in order to offer some content not available to other clients. See the example below. Multiple headers can be supplied separated by `\\n` - actual "backslash" and "n" characters, not a single "new line" character. - +- allowHarvestingMissingCVV: flag to allow datasets to be harvested with Controlled Vocabulary Values that existed in the originating Dataverse Project but are not in the harvesting Dataverse Project. (Default is false) + Generally, the API will accept the output of the GET version of the API for an existing client as valid input, but some fields will be ignored. For example, as of writing this there is no way to configure a harvesting schedule via this API. An example JSON file would look like this:: @@ -4436,7 +4437,8 @@ An example JSON file would look like this:: "archiveDescription": "Moissonné depuis la collection LMOPS de l'entrepôt Zenodo. En cliquant sur ce jeu de données, vous serez redirigé vers Zenodo.", "metadataFormat": "oai_dc", "customHeaders": "x-oai-api-key: xxxyyyzzz", - "set": "user-lmops" + "set": "user-lmops", + "allowHarvestingMissingCVV":true } Something important to keep in mind about this API is that, unlike the harvesting clients GUI, it will create a client with the values supplied without making any attempts to validate them in real time. In other words, for the `harvestUrl` it will accept anything that looks like a well-formed url, without making any OAI calls to verify that the name of the set and/or the metadata format entered are supported by it. This is by design, to give an admin an option to still be able to create a client, in a rare case when it cannot be done via the GUI because of some real time failures in an exchange with an otherwise valid OAI server. This however puts the responsibility on the admin to supply the values already confirmed to be valid. From fe87f9e6b7be8a24ec0872e420c60c3558e2ad0c Mon Sep 17 00:00:00 2001 From: Steven Winship Date: Tue, 27 Feb 2024 13:36:24 -0500 Subject: [PATCH 10/15] adding fix per review comment --- .../java/edu/harvard/iq/dataverse/util/json/JsonPrinter.java | 1 + 1 file changed, 1 insertion(+) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/json/JsonPrinter.java b/src/main/java/edu/harvard/iq/dataverse/util/json/JsonPrinter.java index 2eaf6b64579..3e5106d3ff2 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/json/JsonPrinter.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/json/JsonPrinter.java @@ -871,6 +871,7 @@ public static JsonObjectBuilder json(HarvestingClient harvestingClient) { add("schedule", harvestingClient.isScheduled() ? harvestingClient.getScheduleDescription() : "none"). add("status", harvestingClient.isHarvestingNow() ? "inProgress" : "inActive"). add("customHeaders", harvestingClient.getCustomHttpHeaders()). + add("allowHarvestingMissingCVV", harvestingClient.getAllowHarvestingMissingCVV()). add("lastHarvest", harvestingClient.getLastHarvestTime() == null ? null : harvestingClient.getLastHarvestTime().toString()). add("lastResult", harvestingClient.getLastResult()). add("lastSuccessful", harvestingClient.getLastSuccessfulHarvestTime() == null ? null : harvestingClient.getLastSuccessfulHarvestTime().toString()). From 70f0d50f8c70117fb1b7ed12a08d505d0898c0ee Mon Sep 17 00:00:00 2001 From: Steven Winship Date: Thu, 29 Feb 2024 14:27:35 -0500 Subject: [PATCH 11/15] rename sql to unique --- ...23-harvesting-feature-to-handle-missing-controlled-values.sql} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename src/main/resources/db/migration/{V6.1.0.4__10023-harvesting-feature-to-handle-missing-controlled-values.sql => V6.1.0.5__10023-harvesting-feature-to-handle-missing-controlled-values.sql} (100%) diff --git a/src/main/resources/db/migration/V6.1.0.4__10023-harvesting-feature-to-handle-missing-controlled-values.sql b/src/main/resources/db/migration/V6.1.0.5__10023-harvesting-feature-to-handle-missing-controlled-values.sql similarity index 100% rename from src/main/resources/db/migration/V6.1.0.4__10023-harvesting-feature-to-handle-missing-controlled-values.sql rename to src/main/resources/db/migration/V6.1.0.5__10023-harvesting-feature-to-handle-missing-controlled-values.sql From 5ffd72765460e07a862c237fd833b52e25e0cc74 Mon Sep 17 00:00:00 2001 From: Steven Winship Date: Thu, 29 Feb 2024 16:26:40 -0500 Subject: [PATCH 12/15] rename sql to unique --- ...23-harvesting-feature-to-handle-missing-controlled-values.sql} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename src/main/resources/db/migration/{V6.1.0.5__10023-harvesting-feature-to-handle-missing-controlled-values.sql => V6.1.0.6__10023-harvesting-feature-to-handle-missing-controlled-values.sql} (100%) diff --git a/src/main/resources/db/migration/V6.1.0.5__10023-harvesting-feature-to-handle-missing-controlled-values.sql b/src/main/resources/db/migration/V6.1.0.6__10023-harvesting-feature-to-handle-missing-controlled-values.sql similarity index 100% rename from src/main/resources/db/migration/V6.1.0.5__10023-harvesting-feature-to-handle-missing-controlled-values.sql rename to src/main/resources/db/migration/V6.1.0.6__10023-harvesting-feature-to-handle-missing-controlled-values.sql From 113bf7daa8947823019c3656f7e0e2f806167654 Mon Sep 17 00:00:00 2001 From: Steven Winship Date: Fri, 1 Mar 2024 09:59:04 -0500 Subject: [PATCH 13/15] sql name change --- ...23-harvesting-feature-to-handle-missing-controlled-values.sql} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename src/main/resources/db/migration/{V6.1.0.6__10023-harvesting-feature-to-handle-missing-controlled-values.sql => V6.1.0.5__10023-harvesting-feature-to-handle-missing-controlled-values.sql} (100%) diff --git a/src/main/resources/db/migration/V6.1.0.6__10023-harvesting-feature-to-handle-missing-controlled-values.sql b/src/main/resources/db/migration/V6.1.0.5__10023-harvesting-feature-to-handle-missing-controlled-values.sql similarity index 100% rename from src/main/resources/db/migration/V6.1.0.6__10023-harvesting-feature-to-handle-missing-controlled-values.sql rename to src/main/resources/db/migration/V6.1.0.5__10023-harvesting-feature-to-handle-missing-controlled-values.sql From b12e6ff8207fab8c44b1aa1e8542ab116269f09f Mon Sep 17 00:00:00 2001 From: Steven Winship Date: Wed, 6 Mar 2024 13:00:47 -0500 Subject: [PATCH 14/15] rename sql file --- ...23-harvesting-feature-to-handle-missing-controlled-values.sql} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename src/main/resources/db/migration/{V6.1.0.5__10023-harvesting-feature-to-handle-missing-controlled-values.sql => V6.1.0.6__10023-harvesting-feature-to-handle-missing-controlled-values.sql} (100%) diff --git a/src/main/resources/db/migration/V6.1.0.5__10023-harvesting-feature-to-handle-missing-controlled-values.sql b/src/main/resources/db/migration/V6.1.0.6__10023-harvesting-feature-to-handle-missing-controlled-values.sql similarity index 100% rename from src/main/resources/db/migration/V6.1.0.5__10023-harvesting-feature-to-handle-missing-controlled-values.sql rename to src/main/resources/db/migration/V6.1.0.6__10023-harvesting-feature-to-handle-missing-controlled-values.sql From 5bb189b934b7539a1cf501803815d5c890902236 Mon Sep 17 00:00:00 2001 From: Steven Winship Date: Mon, 18 Mar 2024 11:16:46 -0400 Subject: [PATCH 15/15] add to docs and rename sql --- .../9992-harvest-metadata-values-not-in-cvv-list.md | 1 + doc/sphinx-guides/source/api/native-api.rst | 2 +- ...ure-to-handle-missing-controlled-values.sql => V6.1.0.6.sql} | 0 3 files changed, 2 insertions(+), 1 deletion(-) rename src/main/resources/db/migration/{V6.1.0.6__10023-harvesting-feature-to-handle-missing-controlled-values.sql => V6.1.0.6.sql} (100%) diff --git a/doc/release-notes/9992-harvest-metadata-values-not-in-cvv-list.md b/doc/release-notes/9992-harvest-metadata-values-not-in-cvv-list.md index f77f03d9be5..88ca6cf0e79 100644 --- a/doc/release-notes/9992-harvest-metadata-values-not-in-cvv-list.md +++ b/doc/release-notes/9992-harvest-metadata-values-not-in-cvv-list.md @@ -1,3 +1,4 @@ The API endpoint `api/harvest/clients/{harvestingClientNickname}` has been extended to include the following fields: - `allowHarvestingMissingCVV`: enable/disable allowing datasets to be harvested with Controlled Vocabulary Values that existed in the originating Dataverse Project but are not in the harvesting Dataverse Project. Default is false. +Note: This setting is only available to the API and not currently accessible/settable via the UI \ No newline at end of file diff --git a/doc/sphinx-guides/source/api/native-api.rst b/doc/sphinx-guides/source/api/native-api.rst index d544aa1b8d8..b2ce1d44977 100644 --- a/doc/sphinx-guides/source/api/native-api.rst +++ b/doc/sphinx-guides/source/api/native-api.rst @@ -4423,7 +4423,7 @@ The following optional fields are supported: - set: The OAI set on the remote server. If not supplied, will default to none, i.e., "harvest everything". - style: Defaults to "default" - a generic OAI archive. (Make sure to use "dataverse" when configuring harvesting from another Dataverse installation). - customHeaders: This can be used to configure this client with a specific HTTP header that will be added to every OAI request. This is to accommodate a use case where the remote server requires this header to supply some form of a token in order to offer some content not available to other clients. See the example below. Multiple headers can be supplied separated by `\\n` - actual "backslash" and "n" characters, not a single "new line" character. -- allowHarvestingMissingCVV: flag to allow datasets to be harvested with Controlled Vocabulary Values that existed in the originating Dataverse Project but are not in the harvesting Dataverse Project. (Default is false) +- allowHarvestingMissingCVV: Flag to allow datasets to be harvested with Controlled Vocabulary Values that existed in the originating Dataverse Project but are not in the harvesting Dataverse Project. (Default is false). Currently only settable using API. Generally, the API will accept the output of the GET version of the API for an existing client as valid input, but some fields will be ignored. For example, as of writing this there is no way to configure a harvesting schedule via this API. diff --git a/src/main/resources/db/migration/V6.1.0.6__10023-harvesting-feature-to-handle-missing-controlled-values.sql b/src/main/resources/db/migration/V6.1.0.6.sql similarity index 100% rename from src/main/resources/db/migration/V6.1.0.6__10023-harvesting-feature-to-handle-missing-controlled-values.sql rename to src/main/resources/db/migration/V6.1.0.6.sql