Skip to content

Commit

Permalink
Merge branch 'IQSS:develop' into develop
Browse files Browse the repository at this point in the history
  • Loading branch information
BenediktMeierUIT committed Mar 19, 2024
2 parents dae2988 + 4f46d15 commit 42db500
Show file tree
Hide file tree
Showing 9 changed files with 132 additions and 84 deletions.
@@ -0,0 +1,4 @@
The API endpoint `api/harvest/clients/{harvestingClientNickname}` has been extended to include the following fields:

- `allowHarvestingMissingCVV`: enable/disable allowing datasets to be harvested with Controlled Vocabulary Values that exist in the originating Dataverse Project but not in the harvesting Dataverse Project. Default is false.
Note: This setting is only available via the API and is not currently accessible or settable via the UI.
6 changes: 4 additions & 2 deletions doc/sphinx-guides/source/api/native-api.rst
Expand Up @@ -4693,7 +4693,8 @@ The following optional fields are supported:
- set: The OAI set on the remote server. If not supplied, will default to none, i.e., "harvest everything".
- style: Defaults to "default" - a generic OAI archive. (Make sure to use "dataverse" when configuring harvesting from another Dataverse installation).
- customHeaders: This can be used to configure this client with a specific HTTP header that will be added to every OAI request. This is to accommodate a use case where the remote server requires this header to supply some form of a token in order to offer some content not available to other clients. See the example below. Multiple headers can be supplied separated by `\\n` - actual "backslash" and "n" characters, not a single "new line" character.
- allowHarvestingMissingCVV: Flag to allow datasets to be harvested with Controlled Vocabulary Values that exist in the originating Dataverse Project but not in the harvesting Dataverse Project. (Default is false.) Currently settable only via the API.
Generally, the API will accept the output of the GET version of the API for an existing client as valid input, but some fields will be ignored. For example, as of writing this there is no way to configure a harvesting schedule via this API.
An example JSON file would look like this::
Expand All @@ -4706,7 +4707,8 @@ An example JSON file would look like this::
"archiveDescription": "Moissonné depuis la collection LMOPS de l'entrepôt Zenodo. En cliquant sur ce jeu de données, vous serez redirigé vers Zenodo.",
"metadataFormat": "oai_dc",
"customHeaders": "x-oai-api-key: xxxyyyzzz",
"set": "user-lmops"
"set": "user-lmops",
"allowHarvestingMissingCVV":true
}
Something important to keep in mind about this API is that, unlike the harvesting clients GUI, it will create a client with the values supplied without making any attempts to validate them in real time. In other words, for the `harvestUrl` it will accept anything that looks like a well-formed URL, without making any OAI calls to verify that the name of the set and/or the metadata format entered are supported by it. This is by design, to give an admin an option to still be able to create a client in the rare case when it cannot be done via the GUI because of some real-time failures in an exchange with an otherwise valid OAI server. This, however, puts the responsibility on the admin to supply values already confirmed to be valid.
Expand Down
3 changes: 2 additions & 1 deletion src/main/java/edu/harvard/iq/dataverse/DatasetField.java
Expand Up @@ -595,7 +595,8 @@ public boolean removeBlankDatasetFieldValues() {
return true;
}
} else { // controlled vocab
if (this.getControlledVocabularyValues().isEmpty()) {
// During harvesting, some CVVs are put into getDatasetFieldValues; we don't want to remove those.
if (this.getControlledVocabularyValues().isEmpty() && this.getDatasetFieldValues().isEmpty()) {
return true;
}
}
Expand Down
Expand Up @@ -243,6 +243,14 @@ public String getCustomHttpHeaders() {
/**
 * Sets the custom HTTP headers to be added to every OAI request made by this client.
 * Multiple headers are separated by literal backslash-n ("\n") sequences
 * (per the native API guide), not actual newline characters.
 *
 * @param customHttpHeaders the header string, or null for none
 */
public void setCustomHttpHeaders(String customHttpHeaders) {
this.customHttpHeaders = customHttpHeaders;
}

/**
 * Flag that allows datasets to be harvested even when they contain
 * Controlled Vocabulary Values unknown to this installation.
 *
 * Declared as the wrapper type {@code Boolean} rather than the primitive:
 * the backing column (allowharvestingmissingcvv) is added by migration as a
 * nullable BOOLEAN with no default, so rows created before the migration
 * hold NULL, which a primitive field cannot represent. NULL is reported as
 * false, the documented default.
 */
private Boolean allowHarvestingMissingCVV;
/**
 * @return true if harvesting of unknown Controlled Vocabulary Values is
 *         allowed; false otherwise (including when the column is NULL)
 */
public boolean getAllowHarvestingMissingCVV() {
    // Treat a NULL column value (pre-migration rows) as the default: false.
    return allowHarvestingMissingCVV != null && allowHarvestingMissingCVV;
}
public void setAllowHarvestingMissingCVV(boolean allowHarvestingMissingCVV) {
    this.allowHarvestingMissingCVV = allowHarvestingMissingCVV;
}

// TODO: do we need "orphanRemoval=true"? -- L.A. 4.4
// TODO: should it be @OrderBy("startTime")? -- L.A. 4.4
Expand Down
41 changes: 29 additions & 12 deletions src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java
Expand Up @@ -958,22 +958,39 @@ public SolrInputDocuments toSolrDocs(IndexableDataset indexableDataset, Set<Long
}
}
if (dsfType.isControlledVocabulary()) {
for (ControlledVocabularyValue controlledVocabularyValue : dsf.getControlledVocabularyValues()) {
if (controlledVocabularyValue.getStrValue().equals(DatasetField.NA_VALUE)) {
continue;
}
/** If the cvv list is empty but the dfv list is not then it is assumed this was harvested
* from an installation that has controlled vocabulary entries that don't exist in this database
* @see <a href="https://github.com/IQSS/dataverse/issues/9992">Feature Request/Idea: Harvest metadata values that aren't from a list of controlled values #9992</a>
*/
if (dsf.getControlledVocabularyValues().isEmpty()) {
for (DatasetFieldValue dfv : dsf.getDatasetFieldValues()) {
if (dfv.getValue().equals(DatasetField.NA_VALUE)) {
continue;
}
solrInputDocument.addField(solrFieldSearchable, dfv.getValue());

// Index in all used languages (display and metadata languages)
if (!dsfType.isAllowMultiples() || langs.isEmpty()) {
solrInputDocument.addField(solrFieldSearchable, controlledVocabularyValue.getStrValue());
} else {
for(String locale: langs) {
solrInputDocument.addField(solrFieldSearchable, controlledVocabularyValue.getLocaleStrValue(locale));
if (dsfType.getSolrField().isFacetable()) {
solrInputDocument.addField(solrFieldFacetable, dfv.getValue());
}
}
} else {
for (ControlledVocabularyValue controlledVocabularyValue : dsf.getControlledVocabularyValues()) {
if (controlledVocabularyValue.getStrValue().equals(DatasetField.NA_VALUE)) {
continue;
}

if (dsfType.getSolrField().isFacetable()) {
solrInputDocument.addField(solrFieldFacetable, controlledVocabularyValue.getStrValue());
// Index in all used languages (display and metadata languages)
if (!dsfType.isAllowMultiples() || langs.isEmpty()) {
solrInputDocument.addField(solrFieldSearchable, controlledVocabularyValue.getStrValue());
} else {
for(String locale: langs) {
solrInputDocument.addField(solrFieldSearchable, controlledVocabularyValue.getLocaleStrValue(locale));
}
}

if (dsfType.getSolrField().isFacetable()) {
solrInputDocument.addField(solrFieldFacetable, controlledVocabularyValue.getStrValue());
}
}
}
} else if (dsfType.getFieldType().equals(DatasetFieldType.FieldType.TEXTBOX)) {
Expand Down
96 changes: 48 additions & 48 deletions src/main/java/edu/harvard/iq/dataverse/util/json/JsonParser.java
Expand Up @@ -38,7 +38,6 @@
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
Expand Down Expand Up @@ -69,7 +68,8 @@ public class JsonParser {
MetadataBlockServiceBean blockService;
SettingsServiceBean settingsService;
LicenseServiceBean licenseService;
HarvestingClient harvestingClient = null;
HarvestingClient harvestingClient = null;
boolean allowHarvestingMissingCVV = false;

/**
* if lenient, we will accept alternate spellings for controlled vocabulary values
Expand All @@ -93,6 +93,7 @@ public JsonParser(DatasetFieldServiceBean datasetFieldSvc, MetadataBlockServiceB
this.settingsService = settingsService;
this.licenseService = licenseService;
this.harvestingClient = harvestingClient;
this.allowHarvestingMissingCVV = harvestingClient != null && harvestingClient.getAllowHarvestingMissingCVV();
}

public JsonParser() {
Expand Down Expand Up @@ -737,39 +738,23 @@ public DatasetField parseField(JsonObject json, Boolean testType) throws JsonPar


ret.setDatasetFieldType(type);

if (type.isCompound()) {
List<DatasetFieldCompoundValue> vals = parseCompoundValue(type, json, testType);
for (DatasetFieldCompoundValue dsfcv : vals) {
dsfcv.setParentDatasetField(ret);
}
ret.setDatasetFieldCompoundValues(vals);

if (type.isCompound()) {
parseCompoundValue(ret, type, json, testType);
} else if (type.isControlledVocabulary()) {
List<ControlledVocabularyValue> vals = parseControlledVocabularyValue(type, json);
for (ControlledVocabularyValue cvv : vals) {
cvv.setDatasetFieldType(type);
}
ret.setControlledVocabularyValues(vals);

parseControlledVocabularyValue(ret, type, json);
} else {
// primitive

List<DatasetFieldValue> values = parsePrimitiveValue(type, json);
for (DatasetFieldValue val : values) {
val.setDatasetField(ret);
}
ret.setDatasetFieldValues(values);
}
parsePrimitiveValue(ret, type, json);
}

return ret;
}

public List<DatasetFieldCompoundValue> parseCompoundValue(DatasetFieldType compoundType, JsonObject json) throws JsonParseException {
return parseCompoundValue(compoundType, json, true);
/**
 * Parses the compound (multi-subfield) value(s) from the given JSON and attaches
 * them to the supplied DatasetField. Convenience overload that delegates to the
 * four-argument variant with type checking enabled (testType = true).
 *
 * @param dsf          the parent DatasetField the parsed compound values are attached to
 * @param compoundType the compound DatasetFieldType being parsed
 * @param json         the JSON object holding the field's value(s)
 * @throws JsonParseException if the JSON does not match the expected structure
 */
public void parseCompoundValue(DatasetField dsf, DatasetFieldType compoundType, JsonObject json) throws JsonParseException {
parseCompoundValue(dsf, compoundType, json, true);
}

public List<DatasetFieldCompoundValue> parseCompoundValue(DatasetFieldType compoundType, JsonObject json, Boolean testType) throws JsonParseException {
public void parseCompoundValue(DatasetField dsf, DatasetFieldType compoundType, JsonObject json, Boolean testType) throws JsonParseException {
List<ControlledVocabularyException> vocabExceptions = new ArrayList<>();
List<DatasetFieldCompoundValue> vals = new LinkedList<>();
if (compoundType.isAllowMultiples()) {
Expand Down Expand Up @@ -836,18 +821,17 @@ public List<DatasetFieldCompoundValue> parseCompoundValue(DatasetFieldType compo
if (!vocabExceptions.isEmpty()) {
throw new CompoundVocabularyException( "Invalid controlled vocabulary in compound field ", vocabExceptions, vals);
}
return vals;

for (DatasetFieldCompoundValue dsfcv : vals) {
dsfcv.setParentDatasetField(dsf);
}
dsf.setDatasetFieldCompoundValues(vals);
}

public List<DatasetFieldValue> parsePrimitiveValue(DatasetFieldType dft , JsonObject json) throws JsonParseException {
public void parsePrimitiveValue(DatasetField dsf, DatasetFieldType dft , JsonObject json) throws JsonParseException {

Map<Long, JsonObject> cvocMap = datasetFieldSvc.getCVocConf(true);
boolean extVocab=false;
if(cvocMap.containsKey(dft.getId())) {
extVocab=true;
}


boolean extVocab = cvocMap.containsKey(dft.getId());
List<DatasetFieldValue> vals = new LinkedList<>();
if (dft.isAllowMultiples()) {
try {
Expand All @@ -856,7 +840,7 @@ public List<DatasetFieldValue> parsePrimitiveValue(DatasetFieldType dft , JsonOb
throw new JsonParseException("Invalid values submitted for " + dft.getName() + ". It should be an array of values.");
}
for (JsonString val : json.getJsonArray("value").getValuesAs(JsonString.class)) {
DatasetFieldValue datasetFieldValue = new DatasetFieldValue();
DatasetFieldValue datasetFieldValue = new DatasetFieldValue(dsf);
datasetFieldValue.setDisplayOrder(vals.size() - 1);
datasetFieldValue.setValue(val.getString().trim());
if(extVocab) {
Expand All @@ -875,6 +859,7 @@ public List<DatasetFieldValue> parsePrimitiveValue(DatasetFieldType dft , JsonOb
}
DatasetFieldValue datasetFieldValue = new DatasetFieldValue();
datasetFieldValue.setValue(json.getString("value", "").trim());
datasetFieldValue.setDatasetField(dsf);
if(extVocab) {
if(!datasetFieldSvc.isValidCVocValue(dft, datasetFieldValue.getValue())) {
throw new JsonParseException("Invalid values submitted for " + dft.getName() + " which is limited to specific vocabularies.");
Expand All @@ -884,7 +869,7 @@ public List<DatasetFieldValue> parsePrimitiveValue(DatasetFieldType dft , JsonOb
vals.add(datasetFieldValue);
}

return vals;
dsf.setDatasetFieldValues(vals);
}

public Workflow parseWorkflow(JsonObject json) throws JsonParseException {
Expand Down Expand Up @@ -929,31 +914,35 @@ private String jsonValueToString(JsonValue jv) {
default: return jv.toString();
}
}

public List<ControlledVocabularyValue> parseControlledVocabularyValue(DatasetFieldType cvvType, JsonObject json) throws JsonParseException {

public void parseControlledVocabularyValue(DatasetField dsf, DatasetFieldType cvvType, JsonObject json) throws JsonParseException {
List<ControlledVocabularyValue> vals = new LinkedList<>();
try {
if (cvvType.isAllowMultiples()) {
try {
json.getJsonArray("value").getValuesAs(JsonObject.class);
} catch (ClassCastException cce) {
throw new JsonParseException("Invalid values submitted for " + cvvType.getName() + ". It should be an array of values.");
}
List<ControlledVocabularyValue> vals = new LinkedList<>();
}
for (JsonString strVal : json.getJsonArray("value").getValuesAs(JsonString.class)) {
String strValue = strVal.getString();
ControlledVocabularyValue cvv = datasetFieldSvc.findControlledVocabularyValueByDatasetFieldTypeAndStrValue(cvvType, strValue, lenient);
if (cvv == null) {
throw new ControlledVocabularyException("Value '" + strValue + "' does not exist in type '" + cvvType.getName() + "'", cvvType, strValue);
}
// Only add value to the list if it is not a duplicate
if (strValue.equals("Other")) {
System.out.println("vals = " + vals + ", contains: " + vals.contains(cvv));
if (allowHarvestingMissingCVV) {
// we need to process these as primitive values
logger.warning("Value '" + strValue + "' does not exist in type '" + cvvType.getName() + "'. Processing as primitive per setting override.");
parsePrimitiveValue(dsf, cvvType, json);
return;
} else {
throw new ControlledVocabularyException("Value '" + strValue + "' does not exist in type '" + cvvType.getName() + "'", cvvType, strValue);
}
}
cvv.setDatasetFieldType(cvvType);
// Only add value to the list if it is not a duplicate
if (!vals.contains(cvv)) {
vals.add(cvv);
}
}
return vals;

} else {
try {
Expand All @@ -964,13 +953,23 @@ public List<ControlledVocabularyValue> parseControlledVocabularyValue(DatasetFie
String strValue = json.getString("value", "");
ControlledVocabularyValue cvv = datasetFieldSvc.findControlledVocabularyValueByDatasetFieldTypeAndStrValue(cvvType, strValue, lenient);
if (cvv == null) {
throw new ControlledVocabularyException("Value '" + strValue + "' does not exist in type '" + cvvType.getName() + "'", cvvType, strValue);
if (allowHarvestingMissingCVV) {
// we need to process this as a primitive value
logger.warning(">>>> Value '" + strValue + "' does not exist in type '" + cvvType.getName() + "'. Processing as primitive per setting override.");
parsePrimitiveValue(dsf, cvvType , json);
return;
} else {
throw new ControlledVocabularyException("Value '" + strValue + "' does not exist in type '" + cvvType.getName() + "'", cvvType, strValue);
}
}
return Collections.singletonList(cvv);
cvv.setDatasetFieldType(cvvType);
vals.add(cvv);
}
} catch (ClassCastException cce) {
throw new JsonParseException("Invalid values submitted for " + cvvType.getName());
}

dsf.setControlledVocabularyValues(vals);
}

Date parseDate(String str) throws ParseException {
Expand Down Expand Up @@ -1001,6 +1000,7 @@ public String parseHarvestingClient(JsonObject obj, HarvestingClient harvestingC
harvestingClient.setMetadataPrefix(obj.getString("metadataFormat",null));
harvestingClient.setHarvestingSet(obj.getString("set",null));
harvestingClient.setCustomHttpHeaders(obj.getString("customHeaders", null));
harvestingClient.setAllowHarvestingMissingCVV(obj.getBoolean("allowHarvestingMissingCVV", false));

return dataverseAlias;
}
Expand Down
Expand Up @@ -949,6 +949,7 @@ public static JsonObjectBuilder json(HarvestingClient harvestingClient) {
add("schedule", harvestingClient.isScheduled() ? harvestingClient.getScheduleDescription() : "none").
add("status", harvestingClient.isHarvestingNow() ? "inProgress" : "inActive").
add("customHeaders", harvestingClient.getCustomHttpHeaders()).
add("allowHarvestingMissingCVV", harvestingClient.getAllowHarvestingMissingCVV()).
add("lastHarvest", harvestingClient.getLastHarvestTime() == null ? null : harvestingClient.getLastHarvestTime().toString()).
add("lastResult", harvestingClient.getLastResult()).
add("lastSuccessful", harvestingClient.getLastSuccessfulHarvestTime() == null ? null : harvestingClient.getLastSuccessfulHarvestTime().toString()).
Expand Down
2 changes: 2 additions & 0 deletions src/main/resources/db/migration/V6.1.0.6.sql
@@ -0,0 +1,2 @@
-- Add flag to allow a harvesting client to handle missing CVV values.
-- NOT NULL with DEFAULT FALSE so that rows for pre-existing harvesting
-- clients are backfilled with the documented default (false) instead of
-- NULL, which the entity's boolean accessor could not represent.
ALTER TABLE harvestingclient ADD COLUMN IF NOT EXISTS allowharvestingmissingcvv BOOLEAN NOT NULL DEFAULT FALSE;

0 comments on commit 42db500

Please sign in to comment.