diff --git a/scripts/api/data/metadatablocks/language_ISO_639.tsv b/scripts/api/data/metadatablocks/language_ISO_639.tsv new file mode 100644 index 00000000000..e780fbbd564 --- /dev/null +++ b/scripts/api/data/metadatablocks/language_ISO_639.tsv @@ -0,0 +1,486 @@ +#controlledVocabularyExt DatasetField Value identifier displayOrder alpha-3 (bibliographic) code alpha-3 (terminologic) code alpha-2 code + language Afar aar aar aa + language Abkhazian abk abk ab + language Achinese ace ace + language Acoli ach ach + language Adangme ada ada + language Adyghe, Adygei ady ady + language Afro-Asiatic languages afa afa + language Afrihili afh afh + language Afrikaans afr afr af + language Ainu ain ain + language Akan aka aka ak + language Akkadian akk akk + language Albanian alb alb sqi sq + language Aleut ale ale + language Algonquian languages alg alg + language Southern Altai alt alt + language Amharic amh amh am + language English, Old (ca.450-1100) ang ang + language Angika anp anp + language Apache languages apa apa + language Arabic ara ara ar + language Official Aramaic (700-300 BCE), Imperial Aramaic (700-300 BCE) arc arc + language Aragonese arg arg an + language Armenian arm arm hye hy + language Mapudungun, Mapuche arn arn + language Arapaho arp arp + language Artificial languages art art + language Arawak arw arw + language Assamese asm asm as + language Asturian, Bable, Leonese, Asturleonese ast ast + language Athapascan languages ath ath + language Australian languages aus aus + language Avaric ava ava av + language Avestan ave ave ae + language Awadhi awa awa + language Aymara aym aym ay + language Azerbaijani aze aze az + language Banda languages bad bad + language Bamileke languages bai bai + language Bashkir bak bak ba + language Baluchi bal bal + language Bambara bam bam bm + language Balinese ban ban + language Basque baq baq eus eu + language Basa bas bas + language Baltic languages bat bat + language Beja, Bedawiyet bej bej + language Belarusian bel 
bel be + language Bemba bem bem + language Bengali ben ben bn + language Berber languages ber ber + language Bhojpuri bho bho + language Bihari languages bih bih bh + language Bikol bik bik + language Bini, Edo bin bin + language Bislama bis bis bi + language Siksika bla bla + language Bantu languages bnt bnt + language Bosnian bos bos bs + language Braj bra bra + language Breton bre bre br + language Batak languages btk btk + language Buriat bua bua + language Buginese bug bug + language Bulgarian bul bul bg + language Burmese bur bur mya my + language Blin, Bilin byn byn + language Caddo cad cad + language Central American Indian languages cai cai + language Galibi Carib car car + language Catalan, Valencian cat cat ca + language Caucasian languages cau cau + language Cebuano ceb ceb + language Celtic languages cel cel + language Chamorro cha cha ch + language Chibcha chb chb + language Chechen che che ce + language Chagatai chg chg + language Chinese chi chi zho zh + language Chuukese chk chk + language Mari chm chm + language Chinook jargon chn chn + language Choctaw cho cho + language Chipewyan, Dene Suline chp chp + language Cherokee chr chr + language Church Slavic, Old Slavonic, Church Slavonic, Old Bulgarian, Old Church Slavonic chu chu cu + language Chuvash chv chv cv + language Cheyenne chy chy + language Chamic languages cmc cmc + language Montenegrin cnr cnr + language Coptic cop cop + language Cornish cor cor kw + language Corsican cos cos co + language Creoles and pidgins, English based cpe cpe + language Creoles and pidgins, French-based cpf cpf + language Creoles and pidgins, Portuguese-based cpp cpp + language Cree cre cre cr + language Crimean Tatar, Crimean Turkish crh crh + language Creoles and pidgins crp crp + language Kashubian csb csb + language Cushitic languages cus cus + language Czech cze cze ces cs + language Dakota dak dak + language Danish dan dan da + language Dargwa dar dar + language Land Dayak languages day day + language 
Delaware del del + language Slave (Athapascan) den den + language Dogrib dgr dgr + language Dinka din din + language Divehi, Dhivehi, Maldivian div div dv + language Dogri doi doi + language Dravidian languages dra dra + language Lower Sorbian dsb dsb + language Duala dua dua + language Dutch, Middle (ca.1050-1350) dum dum + language Dutch, Flemish dut dut nld nl + language Dyula dyu dyu + language Dzongkha dzo dzo dz + language Efik efi efi + language Egyptian (Ancient) egy egy + language Ekajuk eka eka + language Elamite elx elx + language English eng eng en + language English, Middle (1100-1500) enm enm + language Esperanto epo epo eo + language Estonian est est et + language Ewe ewe ewe ee + language Ewondo ewo ewo + language Fang fan fan + language Faroese fao fao fo + language Fanti fat fat + language Fijian fij fij fj + language Filipino, Pilipino fil fil + language Finnish fin fin fi + language Finno-Ugrian languages fiu fiu + language Fon fon fon + language French fre fre fra fr + language French, Middle (ca.1400-1600) frm frm + language French, Old (842-ca.1400) fro fro + language Northern Frisian frr frr + language Eastern Frisian frs frs + language Western Frisian fry fry fy + language Fulah ful ful ff + language Friulian fur fur + language Ga gaa gaa + language Gayo gay gay + language Gbaya gba gba + language Germanic languages gem gem + language Georgian geo geo kat ka + language German ger ger deu de + language Geez gez gez + language Gilbertese gil gil + language Gaelic, Scottish Gaelic gla gla gd + language Irish gle gle ga + language Galician glg glg gl + language Manx glv glv gv + language German, Middle High (ca.1050-1500) gmh gmh + language German, Old High (ca.750-1050) goh goh + language Gondi gon gon + language Gorontalo gor gor + language Gothic got got + language Grebo grb grb + language Greek, Ancient (to 1453) grc grc + language Greek, Modern (1453-) gre gre ell el + language Guarani grn grn gn + language Swiss German, Alemannic, 
Alsatian gsw gsw + language Gujarati guj guj gu + language Gwich'in gwi gwi + language Haida hai hai + language Haitian, Haitian Creole hat hat ht + language Hausa hau hau ha + language Hawaiian haw haw + language Hebrew heb heb he + language Herero her her hz + language Hiligaynon hil hil + language Himachali languages, Western Pahari languages him him + language Hindi hin hin hi + language Hittite hit hit + language Hmong, Mong hmn hmn + language Hiri Motu hmo hmo ho + language Croatian hrv hrv hr + language Upper Sorbian hsb hsb + language Hungarian hun hun hu + language Hupa hup hup + language Iban iba iba + language Igbo ibo ibo ig + language Icelandic ice ice isl is + language Ido ido ido io + language Sichuan Yi, Nuosu iii iii ii + language Ijo languages ijo ijo + language Inuktitut iku iku iu + language Interlingue, Occidental ile ile ie + language Iloko ilo ilo + language Interlingua (International Auxiliary Language Association) ina ina ia + language Indic languages inc inc + language Indonesian ind ind id + language Indo-European languages ine ine + language Ingush inh inh + language Inupiaq ipk ipk ik + language Iranian languages ira ira + language Iroquoian languages iro iro + language Italian ita ita it + language Javanese jav jav jv + language Lojban jbo jbo + language Japanese jpn jpn ja + language Judeo-Persian jpr jpr + language Judeo-Arabic jrb jrb + language Kara-Kalpak kaa kaa + language Kabyle kab kab + language Kachin, Jingpho kac kac + language Kalaallisut, Greenlandic kal kal kl + language Kamba kam kam + language Kannada kan kan kn + language Karen languages kar kar + language Kashmiri kas kas ks + language Kanuri kau kau kr + language Kawi kaw kaw + language Kazakh kaz kaz kk + language Kabardian kbd kbd + language Khasi kha kha + language Khoisan languages khi khi + language Central Khmer khm khm km + language Khotanese, Sakan kho kho + language Kikuyu, Gikuyu kik kik ki + language Kinyarwanda kin kin rw + language Kirghiz, Kyrgyz kir 
kir ky + language Kimbundu kmb kmb + language Konkani kok kok + language Komi kom kom kv + language Kongo kon kon kg + language Korean kor kor ko + language Kosraean kos kos + language Kpelle kpe kpe + language Karachay-Balkar krc krc + language Karelian krl krl + language Kru languages kro kro + language Kurukh kru kru + language Kuanyama, Kwanyama kua kua kj + language Kumyk kum kum + language Kurdish kur kur ku + language Kutenai kut kut + language Ladino lad lad + language Lahnda lah lah + language Lamba lam lam + language Lao lao lao lo + language Latin lat lat la + language Latvian lav lav lv + language Lezghian lez lez + language Limburgan, Limburger, Limburgish lim lim li + language Lingala lin lin ln + language Lithuanian lit lit lt + language Mongo lol lol + language Lozi loz loz + language Luxembourgish, Letzeburgesch ltz ltz lb + language Luba-Lulua lua lua + language Luba-Katanga lub lub lu + language Ganda lug lug lg + language Luiseno lui lui + language Lunda lun lun + language Luo (Kenya and Tanzania) luo luo + language Lushai lus lus + language Macedonian mac mac mkd mk + language Madurese mad mad + language Magahi mag mag + language Marshallese mah mah mh + language Maithili mai mai + language Makasar mak mak + language Malayalam mal mal ml + language Mandingo man man + language Maori mao mao mri mi + language Austronesian languages map map + language Marathi mar mar mr + language Masai mas mas + language Malay may may msa ms + language Moksha mdf mdf + language Mandar mdr mdr + language Mende men men + language Irish, Middle (900-1200) mga mga + language Mi'kmaq, Micmac mic mic + language Minangkabau min min + language Uncoded languages mis mis + language Mon-Khmer languages mkh mkh + language Malagasy mlg mlg mg + language Maltese mlt mlt mt + language Manchu mnc mnc + language Manipuri mni mni + language Manobo languages mno mno + language Mohawk moh moh + language Mongolian mon mon mn + language Mossi mos mos + language Multiple languages mul 
mul + language Munda languages mun mun + language Creek mus mus + language Mirandese mwl mwl + language Marwari mwr mwr + language Mayan languages myn myn + language Erzya myv myv + language Nahuatl languages nah nah + language North American Indian languages nai nai + language Neapolitan nap nap + language Nauru nau nau na + language Navajo, Navaho nav nav nv + language Ndebele, South, South Ndebele nbl nbl nr + language Ndebele, North, North Ndebele nde nde nd + language Ndonga ndo ndo ng + language Low German, Low Saxon, German, Low, Saxon, Low nds nds + language Nepali nep nep ne + language Nepal Bhasa, Newari new new + language Nias nia nia + language Niger-Kordofanian languages nic nic + language Niuean niu niu + language Norwegian Nynorsk, Nynorsk, Norwegian nno nno nn + language Bokmål, Norwegian, Norwegian Bokmål nob nob nb + language Nogai nog nog + language Norse, Old non non + language Norwegian nor nor no + language N'Ko nqo nqo + language Pedi, Sepedi, Northern Sotho nso nso + language Nubian languages nub nub + language Classical Newari, Old Newari, Classical Nepal Bhasa nwc nwc + language Chichewa, Chewa, Nyanja nya nya ny + language Nyamwezi nym nym + language Nyankole nyn nyn + language Nyoro nyo nyo + language Nzima nzi nzi + language Occitan (post 1500) oci oci oc + language Ojibwa oji oji oj + language Oriya ori ori or + language Oromo orm orm om + language Osage osa osa + language Ossetian, Ossetic oss oss os + language Turkish, Ottoman (1500-1928) ota ota + language Otomian languages oto oto + language Papuan languages paa paa + language Pangasinan pag pag + language Pahlavi pal pal + language Pampanga, Kapampangan pam pam + language Panjabi, Punjabi pan pan pa + language Papiamento pap pap + language Palauan pau pau + language Persian, Old (ca.600-400 B.C.) 
peo peo + language Persian per per fas fa + language Philippine languages phi phi + language Phoenician phn phn + language Pali pli pli pi + language Polish pol pol pl + language Pohnpeian pon pon + language Portuguese por por pt + language Prakrit languages pra pra + language Provençal, Old (to 1500), Occitan, Old (to 1500) pro pro + language Pushto, Pashto pus pus ps + language Quechua que que qu + language Rajasthani raj raj + language Rapanui rap rap + language Rarotongan, Cook Islands Maori rar rar + language Romance languages roa roa + language Romansh roh roh rm + language Romany rom rom + language Romanian, Moldavian, Moldovan rum rum ron ro + language Rundi run run rn + language Aromanian, Arumanian, Macedo-Romanian rup rup + language Russian rus rus ru + language Sandawe sad sad + language Sango sag sag sg + language Yakut sah sah + language South American Indian languages sai sai + language Salishan languages sal sal + language Samaritan Aramaic sam sam + language Sanskrit san san sa + language Sasak sas sas + language Santali sat sat + language Sicilian scn scn + language Scots sco sco + language Selkup sel sel + language Semitic languages sem sem + language Irish, Old (to 900) sga sga + language Sign Languages sgn sgn + language Shan shn shn + language Sidamo sid sid + language Sinhala, Sinhalese sin sin si + language Siouan languages sio sio + language Sino-Tibetan languages sit sit + language Slavic languages sla sla + language Slovak slo slo slk sk + language Slovenian slv slv sl + language Southern Sami sma sma + language Northern Sami sme sme se + language Sami languages smi smi + language Lule Sami smj smj + language Inari Sami smn smn + language Samoan smo smo sm + language Skolt Sami sms sms + language Shona sna sna sn + language Sindhi snd snd sd + language Soninke snk snk + language Sogdian sog sog + language Somali som som so + language Songhai languages son son + language Sotho, Southern sot sot st + language Spanish, Castilian spa spa es 
+ language Sardinian srd srd sc + language Sranan Tongo srn srn + language Serbian srp srp sr + language Serer srr srr + language Nilo-Saharan languages ssa ssa + language Swati ssw ssw ss + language Sukuma suk suk + language Sundanese sun sun su + language Susu sus sus + language Sumerian sux sux + language Swahili swa swa sw + language Swedish swe swe sv + language Classical Syriac syc syc + language Syriac syr syr + language Tahitian tah tah ty + language Tai languages tai tai + language Tamil tam tam ta + language Tatar tat tat tt + language Telugu tel tel te + language Timne tem tem + language Tereno ter ter + language Tetum tet tet + language Tajik tgk tgk tg + language Tagalog tgl tgl tl + language Thai tha tha th + language Tibetan tib tib bod bo + language Tigre tig tig + language Tigrinya tir tir ti + language Tiv tiv tiv + language Tokelau tkl tkl + language Klingon, tlhIngan-Hol tlh tlh + language Tlingit tli tli + language Tamashek tmh tmh + language Tonga (Nyasa) tog tog + language Tonga (Tonga Islands) ton ton to + language Tok Pisin tpi tpi + language Tsimshian tsi tsi + language Tswana tsn tsn tn + language Tsonga tso tso ts + language Turkmen tuk tuk tk + language Tumbuka tum tum + language Tupi languages tup tup + language Turkish tur tur tr + language Altaic languages tut tut + language Tuvalu tvl tvl + language Twi twi twi tw + language Tuvinian tyv tyv + language Udmurt udm udm + language Ugaritic uga uga + language Uighur, Uyghur uig uig ug + language Ukrainian ukr ukr uk + language Umbundu umb umb + language Undetermined und und + language Urdu urd urd ur + language Uzbek uzb uzb uz + language Vai vai vai + language Venda ven ven ve + language Vietnamese vie vie vi + language Volapük vol vol vo + language Votic vot vot + language Wakashan languages wak wak + language Wolaitta, Wolaytta wal wal + language Waray war war + language Washo was was + language Welsh wel wel cym cy + language Sorbian languages wen wen + language Walloon wln wln wa + 
language Wolof wol wol wo + language Kalmyk, Oirat xal xal + language Xhosa xho xho xh + language Yao yao yao + language Yapese yap yap + language Yiddish yid yid yi + language Yoruba yor yor yo + language Yupik languages ypk ypk + language Zapotec zap zap + language Blissymbols, Blissymbolics, Bliss zbl zbl + language Zenaga zen zen + language Standard Moroccan Tamazight zgh zgh + language Zhuang, Chuang zha zha za + language Zande languages znd znd + language Zulu zul zul zu + language Zuni zun zun + language Zaza, Dimili, Dimli, Kirdki, Kirmanjki, Zazaki zza zza \ No newline at end of file diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetFieldServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/DatasetFieldServiceBean.java index f6a566ae65f..306006ffe0b 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetFieldServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetFieldServiceBean.java @@ -37,6 +37,7 @@ import jakarta.persistence.NonUniqueResultException; import jakarta.persistence.PersistenceContext; import jakarta.persistence.PersistenceException; +import jakarta.persistence.Query; import jakarta.persistence.TypedQuery; import org.apache.commons.codec.digest.DigestUtils; @@ -201,7 +202,7 @@ public ControlledVocabularyValue findControlledVocabularyValueByDatasetFieldType } } } - + public ControlledVocabAlternate findControlledVocabAlternateByControlledVocabularyValueAndStrValue(ControlledVocabularyValue cvv, String strValue){ TypedQuery typedQuery = em.createQuery("SELECT OBJECT(o) FROM ControlledVocabAlternate AS o WHERE o.strValue = :strvalue AND o.controlledVocabularyValue = :cvv", ControlledVocabAlternate.class); typedQuery.setParameter("strvalue", strValue); @@ -237,13 +238,35 @@ public ControlledVocabularyValue findControlledVocabularyValueByDatasetFieldType } } - // return singleton NA Controled Vocabulary Value + // return singleton NA Controlled Vocabulary Value public ControlledVocabularyValue 
findNAControlledVocabularyValue() {
         TypedQuery<ControlledVocabularyValue> typedQuery = em.createQuery("SELECT OBJECT(o) FROM ControlledVocabularyValue AS o WHERE o.datasetFieldType is null AND o.strValue = :strvalue", ControlledVocabularyValue.class);
         typedQuery.setParameter("strvalue", DatasetField.NA_VALUE);
         return typedQuery.getSingleResult();
     }
 
+    /**
+     * Renumbers the {@code displayOrder} of every controlled vocabulary value that
+     * belongs to the given dataset field type. Values are ordered alphabetically by
+     * {@code strValue}; values without an identifier (e.g. "Not Applicable") are
+     * placed after all values that have one. A value whose update fails is logged
+     * and skipped, the remaining values are still processed.
+     *
+     * @param dsft the dataset field type whose values are re-sorted; when
+     *             {@code null} a warning is logged and nothing is changed
+     */
+    public void reorderControlledVocabularyValueByDatasetFieldType(DatasetFieldType dsft) {
+        if (dsft == null) {
+            // NOTE(review): callers pass the result of findByName(), which presumably is null for an unknown name
+            logger.warning("Cannot re-sort controlled vocabulary values: no dataset field type given.");
+            return;
+        }
+        // Only the ids are needed. The field type id is bound as a parameter rather than
+        // concatenated into the SQL, and NULL identifiers are treated like empty ones.
+        Query query = em.createNativeQuery(
+                "SELECT o.id FROM ControlledVocabularyValue AS o WHERE o.datasetFieldType_id = ?1"
+                + " ORDER BY CASE WHEN COALESCE(o.identifier, '') = '' THEN 1 ELSE 0 END, o.strValue ASC");
+        query.setParameter(1, dsft.getId());
+        Query updateQuery = em.createQuery("UPDATE ControlledVocabularyValue o set o.displayOrder = :displayOrder WHERE o.id = :id ");
+
+        int order = 0;
+        // a native query that selects a single column yields the bare column value per row
+        for (Object id : query.getResultList()) {
+            if (id == null) {
+                continue;
+            }
+            try {
+                updateQuery.setParameter("displayOrder", order++);
+                updateQuery.setParameter("id", ((Number) id).longValue());
+                updateQuery.executeUpdate();
+            } catch (Exception e) {
+                logger.warning("Failed to re-sort the display order for " + dsft.getName() + ". " + e.getMessage());
+            }
+        }
+    }
     public DatasetFieldType save(DatasetFieldType dsfType) {
         return em.merge(dsfType);
     }
diff --git a/src/main/java/edu/harvard/iq/dataverse/api/DatasetFieldServiceApi.java b/src/main/java/edu/harvard/iq/dataverse/api/DatasetFieldServiceApi.java
index 00b7dfa6e36..89fb8516c31 100644
--- a/src/main/java/edu/harvard/iq/dataverse/api/DatasetFieldServiceApi.java
+++ b/src/main/java/edu/harvard/iq/dataverse/api/DatasetFieldServiceApi.java
@@ -19,12 +19,13 @@
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.Collections;
+import java.util.Enumeration;
 import java.util.List;
 import jakarta.ejb.EJB;
 import jakarta.ejb.EJBException;
 import jakarta.json.Json;
 import jakarta.json.JsonArrayBuilder;
-import jakarta.validation.ConstraintViolation;
 import jakarta.validation.ConstraintViolationException;
 import jakarta.ws.rs.Consumes;
 import jakarta.ws.rs.GET;
@@ -50,7 +51,7 @@
 import java.io.InputStream;
 import java.nio.file.Files;
 import java.nio.file.Paths;
-import java.util.Enumeration;
+import java.util.stream.Collectors;
 import java.util.zip.ZipEntry;
 import java.util.zip.ZipFile;
 
@@ -240,12 +241,15 @@ public Response loadDatasetFields(File file) {
         HeaderType header = null;
         JsonArrayBuilder responseArr = Json.createArrayBuilder();
         String[] values = null;
+        boolean extended = false;
+        List<String> reorderTypes = new ArrayList<>(); // used to re-sort merged datasetFieldTypes
         try {
             br = new BufferedReader(new FileReader("/" + file));
             while ((line = br.readLine()) != null) {
                 lineNumber++;
                 values = line.split(splitBy);
                 if (values[0].startsWith("#")) { // Header row
+                    extended = false;
                     switch (values[0]) {
                         case "#metadataBlock":
                             header = HeaderType.METADATABLOCK;
@@ -256,6 +260,10 @@ public Response loadDatasetFields(File file) {
                         case "#controlledVocabulary":
                             header = HeaderType.CONTROLLEDVOCABULARY;
                             break;
+                        case "#controlledVocabularyExt":
+                            header = HeaderType.CONTROLLEDVOCABULARY;
+                            extended = true;
+                            break;
default:
                             throw new IOException("Encountered unknown #header type at line lineNumber " + lineNumber);
                     }
@@ -275,8 +283,11 @@ public Response loadDatasetFields(File file) {
 
                         case CONTROLLEDVOCABULARY:
                             responseArr.add( Json.createObjectBuilder()
-                                    .add("name", parseControlledVocabulary(values))
+                                    .add("name", parseControlledVocabulary(values, extended))
                                     .add("type", "Controlled Vocabulary") );
+                            if (extended && !reorderTypes.contains(values[1].trim())) {
+                                reorderTypes.add(values[1].trim()); // same trimmed name the row was parsed with
+                            }
                             break;
 
                         default:
@@ -285,6 +296,11 @@ public Response loadDatasetFields(File file) {
                     }
                 }
             }
+            // If we just merged data then we may need to re-order the list(s)
+            reorderTypes.forEach(datasetField -> {
+                datasetFieldService.reorderControlledVocabularyValueByDatasetFieldType(datasetFieldService.findByName(datasetField));
+            });
+
         } catch (FileNotFoundException e) {
             alr.setActionResult(ActionLogRecord.Result.BadRequest);
             alr.setInfo( alr.getInfo() + "// file not found");
@@ -433,21 +449,54 @@ private String parseDatasetField(String[] values) {
         return dsf.getName();
     }
 
-    private String parseControlledVocabulary(String[] values) {
-        
-        DatasetFieldType dsv = datasetFieldService.findByName(values[1]);
+    /**
+     * Creates or updates one controlled vocabulary value from a row of a metadata block tsv file.
+     *
+     * Column 1 names the dataset field type, column 2 is the display name, column 3 the
+     * identifier and column 4 the display order (a blank order becomes Integer.MAX_VALUE);
+     * all further columns are alternate values. An existing value is looked up by display
+     * name and by identifier, and a match on the identifier takes precedence.
+     *
+     * For an extended row a value may also be matched through one of its alternates, in which
+     * case the row's name is merged into the display name of the matched value. Extended rows
+     * only ever add alternates and leave the other properties of an existing value alone. For
+     * all other rows the file is authoritative: display name, identifier, display order and
+     * the alternates of an existing value are overwritten.
+     *
+     * @param values   the columns of the row; at least five are required
+     * @param extended {@code true} for rows below a {@code #controlledVocabularyExt} header
+     * @return the display name of the saved controlled vocabulary value
+     * @throws IOException if the row has fewer than five columns
+     */
+    private String parseControlledVocabulary(String[] values, boolean extended) throws IOException {
+        if (values.length < 5) { // index 0-4 must be present (even if they are empty strings)
+            // Arrays.toString prints the columns; concatenating the array itself only prints its type and hash
+            throw new IOException("Not enough values in cvv line: " + Arrays.toString(values));
+        }
+        String datasetField = values[1].trim();
+        String name = values[2].trim();
+        String identifier = values[3].trim();
+        Integer displayOrder = values[4].isBlank() ? Integer.MAX_VALUE : Integer.valueOf(values[4].trim());
+        List<String> altValues = values.length > 5 ?
+                List.of(Arrays.copyOfRange(values, 5, values.length)).stream()
+                        .filter(item-> item != null && !item.trim().isEmpty())
+                        .map(String::trim)
+                        .collect(Collectors.toList()) :
+                Collections.emptyList();
+
+        DatasetFieldType dsv = datasetFieldService.findByName(datasetField);
         //See if it already exists
         /*
          Matching relies on assumption that only one cv value will exist for a given identifier or display value
         If the lookup queries return multiple matches then retval is null 
         */
         //First see if cvv exists based on display name
-        ControlledVocabularyValue cvv = datasetFieldService.findControlledVocabularyValueByDatasetFieldTypeAndStrValue(dsv, values[2], true);
+        ControlledVocabularyValue cvv = datasetFieldService.findControlledVocabularyValueByDatasetFieldTypeAndStrValue(dsv, name, true);
 
         //then see if there's a match on identifier
         ControlledVocabularyValue cvvi = null;
-        if (values[3] != null && !values[3].trim().isEmpty()){
-            cvvi = datasetFieldService.findControlledVocabularyValueByDatasetFieldTypeAndIdentifier(dsv, values[3]);
+        if (!identifier.isEmpty()) {
+            cvvi = datasetFieldService.findControlledVocabularyValueByDatasetFieldTypeAndIdentifier(dsv, identifier);
         }
 
         //if there's a match on identifier use it
@@ -455,30 +504,64 @@ private String parseControlledVocabulary(String[] values) {
             cvv = cvvi;
         }
 
+        // if extending and no match on display name or identifier then check alternate values
+        if (extended && cvv == null) {
+            int idx = altValues.size();
+            while (cvv == null && idx > 0) {
+                cvv = datasetFieldService.findControlledVocabularyValueByDatasetFieldTypeAndStrValue(dsv, altValues.get(--idx), true);
+            }
+            // if we found a match on altValue then add this "name" to the list of names in the existing cvv
+            if (cvv != null) {
+                // split csv and merge without duplicates
+                List<String> origNames = Arrays.asList(cvv.getStrValue().split("[,;]+")).stream()
+                        .map(String::trim)
+                        .collect(Collectors.toCollection(ArrayList::new)); // must be mutable, names are appended below
+                List<String> newNames = Arrays.asList(name.split("[,;]+")).stream()
+                        .map(String::trim)
+                        .collect(Collectors.toList());
+                newNames.forEach(n -> {
+                    if (!origNames.contains(n)) {
+                        origNames.add(n);
+                    }
+                });
+                logger.info("Merging new language (" + name + ") to: " + cvv.getStrValue());
+                cvv.setStrValue(StringUtils.join(origNames, ", "));
+            }
+        }
+
         //if there's no match create a new one
         if (cvv == null) {
             cvv = new ControlledVocabularyValue();
             cvv.setDatasetFieldType(dsv);
+            cvv.setStrValue(name);
+            cvv.setIdentifier(identifier);
+            cvv.setDisplayOrder(displayOrder);
+        } else if (!extended) {
+            // a plain tsv row keeps overwriting an existing value, as it did before extended loading was added
+            cvv.setStrValue(name);
+            cvv.setIdentifier(identifier);
+            cvv.setDisplayOrder(displayOrder);
         }
 
         // Alternate variants for this controlled vocab. value:
 
-        // Note that these are overwritten every time:
-        cvv.getControlledVocabAlternates().clear();
+        // Note that these are overwritten every time for tsv files.
+        // For ISO_639 and other extended tsv files we add to the list of alternates since we don't want to remove any values added via a tsv file.
+        if (!extended) {
+            cvv.getControlledVocabAlternates().clear();
+        }
         // - meaning, if an alternate has been removed from the tsv file,
         // it will be removed from the database! -- L.A. 5.4
-        
-        for (int i = 5; i < values.length; i++) {
-            ControlledVocabAlternate alt = new ControlledVocabAlternate();
-            alt.setDatasetFieldType(dsv);
-            alt.setControlledVocabularyValue(cvv);
-            alt.setStrValue(values[i]);
-            cvv.getControlledVocabAlternates().add(alt);
+
+        for (String val : altValues) {
+            if (cvv.getControlledVocabAlternates().stream().noneMatch(v -> val.equals(v.getStrValue()))) {
+                ControlledVocabAlternate alt = new ControlledVocabAlternate();
+                alt.setDatasetFieldType(dsv);
+                alt.setControlledVocabularyValue(cvv);
+                alt.setStrValue(val);
+                cvv.getControlledVocabAlternates().add(alt);
+            }
         }
-        
-        cvv.setStrValue(values[2]);
-        cvv.setIdentifier(values[3]);
-        cvv.setDisplayOrder(Integer.parseInt(values[4]));
         datasetFieldService.save(cvv);
         return cvv.getStrValue();
     }
diff --git a/src/test/java/edu/harvard/iq/dataverse/api/AdminIT.java b/src/test/java/edu/harvard/iq/dataverse/api/AdminIT.java
index 44f062e8254..e5758b47883 100644
--- a/src/test/java/edu/harvard/iq/dataverse/api/AdminIT.java
+++ b/src/test/java/edu/harvard/iq/dataverse/api/AdminIT.java
@@ -31,10 +31,9 @@
 import static jakarta.ws.rs.core.Response.Status.INTERNAL_SERVER_ERROR;
 import static jakarta.ws.rs.core.Response.Status.OK;
 import static jakarta.ws.rs.core.Response.Status.UNAUTHORIZED;
-import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.hamcrest.CoreMatchers.equalTo;
 import static org.hamcrest.CoreMatchers.notNullValue;
-import static org.junit.jupiter.api.Assertions.assertTrue;
+import static org.junit.jupiter.api.Assertions.*;
 
 public class AdminIT {
 
@@ -890,7 +889,42 @@ public void testDownloadTmpFile() throws IOException {
                 .body("status", equalTo("ERROR"))
                 .body("message", equalTo("Path must begin with '/tmp' but after normalization was '/etc/passwd'."));
     }
+    @Test
+    public void testLoadISO_639_controlledVocabularyExt() {
+        Response createUser = UtilIT.createRandomUser();
+        String apiToken = UtilIT.getApiTokenFromResponse(createUser);
+        byte[] updatedContent
= null;
+        try
+        {
+            updatedContent = Files.readAllBytes(Paths.get("scripts/api/data/metadatablocks/language_ISO_639.tsv"));
+        } catch(IOException e) {
+            fail(e.getMessage());
+        }
+
+        Response response = UtilIT.loadMetadataBlock(apiToken, updatedContent);
+        response.prettyPrint();
+        assertEquals(200,response.getStatusCode());
+        response.then().assertThat().statusCode(OK.getStatusCode());
+
+        String body = response.getBody().asString();
+        String status = JsonPath.from(body).getString("status");
+
+        assertEquals("OK",status);
+
+        // Issue #8578: some language codes were still not handled, in the cases encountered frm (Middle French) and fro (Old French).
+        // Both are included in language_ISO_639.tsv below the extension header #controlledVocabularyExt
+        assertTrue(body.contains("French, Middle (ca.1400-1600)"));
+        assertTrue(body.contains("French, Old (842-ca.1400)"));
+        // "Uighur; Uyghur" is merged with "Uyghur, Uighur" and the name remains "Uyghur, Uighur",
+        // since only the order differs and both ';' and ',' are allowed as separators
+        assertTrue(body.contains("Uyghur, Uighur"));
+        // "Mapudungun, Mapuche" is added as a new value
+        assertTrue(body.contains("Mapudungun, Mapuche"));
+        // "Flemish" is merged into "Dutch", giving the display name "Dutch, Flemish", because both share the alternates "nld,dut,nl"
+        assertTrue(body.contains("Dutch, Flemish"));
+
+    }
 
     private String createTestNonSuperuserApiToken() {
         Response createUserResponse = UtilIT.createRandomUser();
         createUserResponse.then().assertThat().statusCode(OK.getStatusCode());