diff --git a/.ci/pull-request-check/selenium-test.js b/.ci/pull-request-check/selenium-test.js index 2076a96ec2..37da3a04c9 100755 --- a/.ci/pull-request-check/selenium-test.js +++ b/.ci/pull-request-check/selenium-test.js @@ -113,10 +113,10 @@ var allPassed = false; // No API to retrieve extension ID. Hacks, sigh. await driver.get("chrome://system/"); - await driver.wait(until.elementLocated({id: 'extensions-value-btn'}), 60*1000); + await driver.wait(until.elementLocated({id: 'btn-extensions-value'}), 60*1000); // Chrome 89+ has the extension list expanded by default try { - let extBtn = await driver.findElement({css: '#extensions-value-btn'}); + let extBtn = await driver.findElement({css: '#btn-extensions-value'}); await extBtn.click(); } catch (e) {} let contentElem = await driver.findElement({css: '#content'}); diff --git a/BibTeX.js b/BibTeX.js index 8d15b6a727..7aef0de5d4 100644 --- a/BibTeX.js +++ b/BibTeX.js @@ -18,7 +18,7 @@ }, "inRepository": true, "translatorType": 3, - "lastUpdated": "2022-10-31 23:11:08" + "lastUpdated": "2023-04-09 18:35:07" } /* @@ -684,6 +684,7 @@ function unescapeBibTeX(value) { value = value.replace(mapped, unicode); } } + value = value.replace(/\$([^$]+)\$/g, '$1') // kill braces value = value.replace(/([^\\])[{}]+/g, "$1"); @@ -1109,15 +1110,23 @@ function mapHTMLmarkup(characters){ return characters; } - +function xcase(prefix, cased, tag, tex) { + return (prefix ? 
`$${prefix}$` : '') + (reversemappingTable[`$${tex}{${cased}}$`] || `<${tag}>${cased}`) +} +function sup(match, prefix, cased) { + return xcase(prefix, cased, 'sup', '^'); +} +function sub(match, prefix, cased) { + return xcase(prefix, cased, 'sub', '_'); +} function mapTeXmarkup(tex){ //reverse of the above - converts tex mark-up into html mark-up permitted by Zotero //italics and bold tex = tex.replace(/\\textit\{([^\}]+\})/g, "$1").replace(/\\textbf\{([^\}]+\})/g, "$1"); //two versions of subscript the .* after $ is necessary because people m - tex = tex.replace(/\$[^\{\$]*_\{([^\}]+\})\$/g, "$1").replace(/\$[^\{]*_\{\\textrm\{([^\}]+\}\})/g, "$1"); + tex = tex.replace(/\$([^\{\$]*)_\{([^\}]+)\}\$/g, sub).replace(/\$([^\{\$]*)_\{\\textrm\{([^\}\$]+)\}\}\$/g, sub); //two version of superscript - tex = tex.replace(/\$[^\{]*\^\{([^\}]+\}\$)/g, "$1").replace(/\$[^\{]*\^\{\\textrm\{([^\}]+\}\})/g, "$1"); + tex = tex.replace(/\$([^\{\$]*)\^\{([^\}]+)\}\$/g, sup).replace(/\$([^\{\$]*)\^\{\\textrm\{([^\}]+)\}\}\$/g, sup); //small caps tex = tex.replace(/\\textsc\{([^\}]+)/g, "$1"); return tex; @@ -3326,7 +3335,7 @@ var testCases = [ "items": [ { "itemType": "journalArticle", - "title": "Test of markupconversion: Italics, bold, superscript, subscript, and small caps: Mitochondrial DNA2$ sequences suggest unexpected phylogenetic position of Corso-Sardinian grass snakes (Natrix cetti) and do not support their species status, with notes on phylogeography and subspecies delineation of grass snakes.", + "title": "Test of markupconversion: Italics, bold, superscript, subscript, and small caps: Mitochondrial DNA₂ sequences suggest unexpected phylogenetic position of Corso-Sardinian grass snakes (Natrix cetti) and do not support their species status, with notes on phylogeography and subspecies delineation of grass snakes.", "creators": [ { "firstName": "U.", @@ -3348,7 +3357,7 @@ var testCases = [ "DOI": "10.1007/s13127-011-0069-8", "itemID": "Frit2", "pages": "71-80", - 
"publicationTitle": "Actes du ème$ Congrès Français d'Acoustique", + "publicationTitle": "Actes du 4ème Congrès Français d'Acoustique", "volume": "12", "attachments": [], "tags": [], @@ -4159,6 +4168,40 @@ var testCases = [ "seeAlso": [] } ] + }, + { + "type": "import", + "input": "@article{Borissov:2855446,\r\n author = \"Borissov, Alexander and Solokhin, Sergei\",\r\n collaboration = \"ALICE\",\r\n title = \"{Production of $\\Sigma^{0}$ Hyperon and Search of\r\n $\\Sigma^{0}$ Hypernuclei at LHC with ALICE}\",\r\n journal = \"Phys. At. Nucl.\",\r\n volume = \"85\",\r\n number = \"6\",\r\n pages = \"970-975\",\r\n year = \"2023\",\r\n url = \"https://cds.cern.ch/record/2855446\",\r\n doi = \"10.1134/S1063778823010131\",\r\n }", + "items": [ + { + "itemType": "journalArticle", + "title": "Production of Σ⁰ Hyperon and Search of Σ⁰ Hypernuclei at LHC with ALICE", + "creators": [ + { + "firstName": "Alexander", + "lastName": "Borissov", + "creatorType": "author" + }, + { + "firstName": "Sergei", + "lastName": "Solokhin", + "creatorType": "author" + } + ], + "date": "2023", + "DOI": "10.1134/S1063778823010131", + "issue": "6", + "itemID": "Borissov:2855446", + "pages": "970-975", + "publicationTitle": "Phys. At. 
Nucl.", + "url": "https://cds.cern.ch/record/2855446", + "volume": "85", + "attachments": [], + "tags": [], + "notes": [], + "seeAlso": [] + } + ] } ] /** END TEST CASES **/ diff --git a/FAO Publications.js b/FAO Publications.js index 75c5249e24..b5e7a45686 100644 --- a/FAO Publications.js +++ b/FAO Publications.js @@ -2,14 +2,14 @@ "translatorID": "4883f662-29df-44ad-959e-27c9d036d165", "label": "FAO Publications", "creator": "Bin Liu ", - "target": "^https?://www\\.fao\\.org/(documents|publications)/", - "minVersion": "3.0", + "target": "^https?://www\\.fao\\.org/(publications|documents)/", + "minVersion": "5.0", "maxVersion": "", "priority": 100, "inRepository": true, "translatorType": 4, "browserSupport": "gcsibv", - "lastUpdated": "2021-08-31 04:00:00" + "lastUpdated": "2023-04-01 16:44:37" } /* @@ -30,11 +30,17 @@ */ function detectWeb(doc, url) { // Just differentiate single and multiple. - // Identify item type (book or conferencePaper) based on "fdr_label" class. - if (url.includes('card')) { + if (url.includes('/card/')) { let isConferencePaper = false; let confMetaName = ['اسم الاجتماع', '会议名称', 'Meeting Name', 'Nom de la réunion', 'Название мероприятия', 'Nombre de la reunión']; - let labelArray = doc.querySelectorAll('.fdr_label'); + let labelArray = []; + if (url.includes('/publications/')) { + labelArray = doc.querySelectorAll('.fdr_label'); // Identify item type (book or conferencePaper) based on "fdr_label" class. + } + else if (url.includes('/documents/')) { + labelArray = doc.querySelectorAll('.fw-bold'); // Identify item type (book or conferencePaper) based on "fw-bold" class. + // Page layout for meeting documents is not functioning properly at "documents" pages (e.g. https://www.fao.org/documents/card/en/c/ND423EN/ and http://www.fao.org/documents/card/zh/c/mw246ZH/ ). Keep the code for now because it doesn't interfere with books and meeting documents are very few. 
+ } for (let i = 0; i < labelArray.length; i++) { for (let j = 0; j < confMetaName.length; j++) { isConferencePaper = labelArray[i].innerText.includes(confMetaName[j]); @@ -62,21 +68,85 @@ function detectWeb(doc, url) { return false; } -function cleanMeta(str) { - // clean meta fields obtained from page +function cleanMetaPub(str) { + // clean meta fields obtained from page for "publications" pages if (str.includes(';') === false) { return str.slice(str.indexOf(':') + 2); } else { - var strArray = str.slice(str.indexOf(':') + 2).split(';'); + let strArray = str.slice(str.indexOf(':') + 2).split(';'); + return strArray; + } +} + +function cleanMetaDoc(str) { + // clean meta fields obtained from page for "documents" pages + if (str.includes(';') === false) { + return str; + } + else { + let strArray = str.split(';').filter(String); // split by semicolon and remove empty elements return strArray; } } +function getLang(str) { + // language: 2 or 3 letters following ISO 639 + // indicated by the last 1-3 letters in PDF file name (langCode) + // One good example is the various language versions of http://www.fao.org/publications/card/en/c/I2801E + let langCode = '', lang = ''; // langCode must start as '' (as the replaced code did): when the file name does not end in letters + ".pdf" there is no match, and reading .length of undefined would throw + let matches = str.match(/([a-z]+)\.pdf$/i); + if (matches) { + langCode = matches[1]; + } + // In the new PDF naming scheme, langCode follows ISO 639. + if (langCode.length > 1) { + lang = langCode.toLowerCase(); + } + // In the old PDF naming scheme, langCode is one lower/upper case letter and only differentiates between the 6 UN languages. 
+ else if ((langCode == 'a') || (langCode == 'A')) { + lang = 'ar'; + } + else if ((langCode == 'c') || (langCode == 'C')) { + lang = 'zh'; + } + else if ((langCode == 'e') || (langCode == 'E')) { + lang = 'en'; + } + else if ((langCode == 'f') || (langCode == 'F')) { + lang = 'fr'; + } + else if ((langCode == 'r') || (langCode == 'R')) { + lang = 'ru'; + } + else if ((langCode == 's') || (langCode == 'S')) { + lang = 'es'; + } + else { // Other languages are usually designated 'o'. Using 'else' just to be safe. + lang = 'other'; + } + return lang; +} + function scrape(doc, url) { var newItem = new Z.Item(); + var abs, existingMeta = {}; + var textVariable = { // declarations for metadata names as appeared in document pages in different languages + date: ['سنة النشر', '出版年份', 'Year of publication', 'Année de publication', 'Год издания', 'Fecha de publicación'], + publisher: ['الناشر', '出版方', 'Publisher', 'Éditeur', 'Издатель', 'Editor'], + place: ['مكان النشر', '出版地点', 'Place of publication', 'Lieu de publication', 'Место публикации', 'Lugar de publicacion'], + pages: ['الصفحات', '页数', 'Pages', 'Страницы', 'Páginas'], + ISBN: ['الرقم الدولي الموحد للكتاب', 'ISBN'], + author: ['الكاتب', '作者', 'Author', 'Auteur', 'Автор', 'Autor'], + seriesTitle: ['العنوان التسلسي', '系列标题', 'Serial Title', 'Titre de la série', 'Название серии', 'Título de la serie'], + seriesNumber: ['رقم المسلسل', '系列号码', 'Series number', 'Numéro de série', 'Серийный номер', 'Número de serie'], + conference: ['اسم الاجتماع', '会议名称', 'Meeting Name', 'Nom de la réunion', 'Название мероприятия', 'Nombre de la reunión'] + }; + var metaText = []; + var DOIMatch, pdfUrl, mainTitle, subTitle, metaResult, conferenceWeb = ''; + var DOILead = 'https://doi.org/'; - if (url.includes('card')) { + if (url.includes('/card/')) { // attach document card URL and snapshot // TEMP: Disable at least until we have post-JS snapshots /* newItem.attachments.push({ @@ -85,235 +155,365 @@ function scrape(doc, url) { mimeType: 
'text/html', snapshot: true }); */ + if (url.includes('/publications/')) { + //* ********* Begin fixed-location variables ********** - //* ********* Begin fixed-location variables ********** - - // Some variables always appear and appear at the same location in all document pages. + // Some variables always appear and appear at the same location in all document pages. - // abstract - var abs = doc.getElementById("mainContentN0"); - // The childrens of `abs` are the label "Abstract:" in a strong-tag, - // the abstract in several p-tags or text nodes directly, and possibly - // a note about other languages which begins also with a strong-tag. - if (abs) { - var children = abs.childNodes; - var abstractFound = false; - for (let child of children) { - if (child.tagName == "STRONG" || (child.nodeType == 1 && ZU.xpathText(child, './/strong'))) { - if (abstractFound) { - break; // stop when another strong tag is found + // abstract + abs = doc.getElementById("mainContentN0"); + // The childrens of `abs` are the label "Abstract:" in a strong-tag, + // the abstract in several p-tags or text nodes directly, and possibly + // a note about other languages which begins also with a strong-tag. 
+ if (abs) { + let children = abs.childNodes; + let abstractFound = false; + for (let child of children) { + if (child.tagName == "STRONG" || (child.nodeType == Node.ELEMENT_NODE && text(child, 'strong'))) { + if (abstractFound) { + break; // stop when another strong tag is found + } + else { + abstractFound = true; + continue; // exclude the label "Abstract" + } } - else { - abstractFound = true; - continue; // exclude the label "Abstract" + if (newItem.abstractNote) { + if (newItem.abstractNote.slice(-1) !== "\n") { + newItem.abstractNote += "\n\n"; + } + newItem.abstractNote += child.textContent; } - } - if (newItem.abstractNote) { - if (newItem.abstractNote.slice(-1) !== "\n") { - newItem.abstractNote += "\n\n"; + else { + newItem.abstractNote = child.textContent; } - newItem.abstractNote += child.textContent; } - else { - newItem.abstractNote = child.textContent; + // DOI: Some docs contain DOI as a separate paragraph in abs field + if (abs.innerText.includes(DOILead)) { + DOIMatch = abs.innerText.match(/https:\/\/doi\.org\/(.+)/i); + newItem.DOI = DOIMatch[1]; } } - // DOI: Some docs contain DOI as a separate paragraph in abs field - var DOILead = 'https://doi.org/'; - if (abs.innerText.includes(DOILead)) { - var DOIMatch = abs.innerText.match(/https:\/\/doi\.org\/(.+)/i); - newItem.DOI = DOIMatch[1]; + + // attach PDF: PDF link in innerHTML of "dynafef_det" class. 
+ pdfUrl = attr(doc, '.dynafef_det a[href$=".pdf"]', 'href'); + newItem.attachments.push({ + url: pdfUrl, + title: 'Full Text PDF', + mimeType: 'application/pdf' + }); + + // url + newItem.url = url; + + //language + newItem.language = getLang(pdfUrl); + + // title: use colon to connect main title and subtitle (if subtitle exists) + mainTitle = text(doc, '#headerN0 > h1'); + subTitle = text(doc, 'h4.csc-firstHeader'); + if (!subTitle) { + newItem.title = mainTitle; + } + else if ((newItem.language == 'zh') || (newItem.language == 'ja')) { + newItem.title = mainTitle + ':' + subTitle; + } + else { + newItem.title = mainTitle + ': ' + subTitle; } - } - // attach PDF - var pdfUrl = ZU.xpath(doc, '//*[@id="mainRightN0"]/div[2]/a')[0].href; - newItem.attachments.push({ - url: pdfUrl, - title: 'Full Text PDF', - mimeType: 'application/pdf' - }); - // url - newItem.url = url; - // language: 2 or 3 letters following ISO 639 - // indicated by the last 1-3 letters in PDF file name (langCode) - // One good example is the various language versions of http://www.fao.org/publications/card/en/c/I2801E - var langCode = ''; - var matches = pdfUrl.match(/([a-z]+)\.pdf$/i); - if (matches) { - langCode = matches[1]; - } - // In the new PDF naming scheme, langCode follows ISO 639. - if (langCode.length > 1) { - newItem.language = langCode.toLowerCase(); - } - // In the old PDF naming scheme, langCode is one lower/upper case letter and only differentiates between the 6 UN languages. 
- else if ((langCode == 'a') || (langCode == 'A')) { - newItem.language = 'ar'; - } - else if ((langCode == 'c') || (langCode == 'C')) { - newItem.language = 'zh'; - } - else if ((langCode == 'e') || (langCode == 'E')) { - newItem.language = 'en'; - } - else if ((langCode == 'f') || (langCode == 'F')) { - newItem.language = 'fr'; - } - else if ((langCode == 'r') || (langCode == 'R')) { - newItem.language = 'ru'; - } - else if ((langCode == 's') || (langCode == 'S')) { - newItem.language = 'es'; - } - else { // Other languages are usually designated 'o'. Using 'else' just to be safe. - newItem.language = 'other'; - } - // title: use colon to connect main title and subtitle (if subtitle exists) - var mainTitle = ZU.xpathText(doc, '//*[@id="headerN0"]/h1'); - var subTitle = ZU.xpathText(doc, '//h4[@class="csc-firstHeader h1"]'); - if (!subTitle) { - newItem.title = mainTitle; - } - else if ((newItem.language == 'zh') || (newItem.language == 'ja')) { - newItem.title = mainTitle + ':' + subTitle; - } - else { - newItem.title = mainTitle + ': ' + subTitle; - } - //* ********* End fixed-location variables ********** - - - //* ********* Begin dynamic-location variables ********** - - // Variables that appear neither in all document pages nor at same positions in the pages. - var metaText = ZU.xpath(doc, '//*[@id="mainN0"]')[0].innerText.split('\n'); // scrape text of meta area and split into an array based on line breaks. 
- // get what variables are listed in the page, save to object existingMeta - var textVariable = { // declarations for metadata names as appeared in document pages in different languages - date: ['سنة النشر', '出版年份', 'Year of publication', 'Année de publication', 'Год издания', 'Fecha de publicación'], - publisher: ['الناشر', '出版方', 'Publisher', 'Éditeur', 'Издатель', 'Editor'], - place: ['مكان النشر', '出版地点', 'Place of publication', 'Lieu de publication', 'Место публикации', 'Lugar de publicacion'], - pages: ['الصفحات', '页数', 'Pages', 'Страницы', 'Páginas'], - ISBN: ['الرقم الدولي الموحد للكتاب', 'ISBN'], - author: ['الكاتب', '作者', 'Author', 'Auteur', 'Автор', 'Autor'], - seriesTitle: ['العنوان التسلسي', '系列标题', 'Serial Title', 'Titre de la série', 'Название серии', 'Título de la serie'], - seriesNumber: ['رقم المسلسل', '系列号码', 'Series number', 'Numéro de série', 'Серийный номер', 'Número de serie'], - conference: ['اسم الاجتماع', '会议名称', 'Meeting Name', 'Nom de la réunion', 'Название мероприятия', 'Nombre de la reunión'], - tags: ['المعجم الكلمات الموضوع', 'AGROVOC', 'Agrovoc', 'АГРОВОК'] - }; - var existingMeta = {}; - for (let i = 0; i < metaText.length; i++) { - for (let key in textVariable) { - for (let j = 0; j < textVariable[key].length; j++) { - if (metaText[i].includes(textVariable[key][j])) { - existingMeta[key] = metaText[i]; + //* ********* End fixed-location variables ********** + + + //* ********* Begin dynamic-location variables ********** + + // Variables that appear neither in all document pages nor at same positions in the pages. + // scrape text of meta area and split into an array based on line breaks. 
+ metaText = text(doc, '#fdr_label').split('\n'); + // get what variables are listed in the page, save to object existingMeta + for (let i = 0; i < metaText.length; i++) { + for (let key in textVariable) { + for (let j = 0; j < textVariable[key].length; j++) { + if (metaText[i].includes(textVariable[key][j])) { + existingMeta[key] = metaText[i]; + } } } } + + for (let key in existingMeta) { + metaResult = cleanMetaPub(existingMeta[key]); + + // date + if (key.includes('date')) { + newItem.date = metaResult; + } + // publisher + if (key.includes('publisher')) { + newItem.publisher = metaResult; + } + // place + if (key.includes('place')) { + newItem.place = metaResult; + } + // number of pages + if (key.includes('pages')) { + newItem.numPages = metaResult.match(/\d+/)[0]; + } + // ISBN + if (key.includes('ISBN')) { + newItem.ISBN = ZU.cleanISBN(metaResult, false); + } + // author(s): whether there is one or more authors; whether last and first name are separated by ',' (if not, use single-field mode). + if (key.includes('author')) { + if (Array.isArray(metaResult)) { // If there are more than 1 authors, metaResult returns an array. 
+ for (let i = 0; i < metaResult.length; i++) { + if (metaResult[i].includes(',')) { + newItem.creators.push(ZU.cleanAuthor(metaResult[i], 'author', true)); + } + else { + newItem.creators.push({ + lastName: metaResult[i], + creatorType: 'author', + fieldMode: 1 + }); + } + } + } + else if (metaResult.includes(',')) { + newItem.creators.push(ZU.cleanAuthor(metaResult, 'author', true)); + } + else { + newItem.creators.push({ + lastName: metaResult, + creatorType: 'author', + fieldMode: 1 + }); + } + } + // tag (Agrovoc) + if (key.includes('tags')) { + for (var i = 0; i < metaResult.length; i++) { + newItem.tags[i] = metaResult[i].trim(); + } + } + // seriesTitle + if (key.includes('seriesTitle')) { + newItem.series = metaResult; + } + // seriesNumber + if (key.includes('seriesNumber')) { + newItem.seriesNumber = metaResult; + } + // conferenceName: save for later conditions. + if (key.includes('conference')) { + conferenceWeb = metaResult[0]; + newItem.conferenceName = conferenceWeb; + } + } + + // If there's no publisher, use 'FAO' as publisher. + if (!newItem.publisher) { + newItem.publisher = 'FAO'; + } + // If there's no place, use 'Rome, Italy' as place. + if (!newItem.place) { + newItem.place = 'Rome, Italy'; + } + // If there's no author, use 'FAO' as author. + if (!newItem.creators.length) { + newItem.creators.push({ + lastName: 'FAO', + creatorType: 'author', + fieldMode: 1 + }); + } + // If conference exists in document page, the itemType is 'conferencePaper'; otherwise it's 'book'. + if (conferenceWeb) { + newItem.itemType = 'conferencePaper'; + } + else { + newItem.itemType = 'book'; + } + //* ********* End dynamic-location variables ********** } + else if (url.includes('/documents/')) { // exclusive with the "/publications/" branch; same path-segment test as detectWeb, so a stray "documents" elsewhere in the URL cannot trigger a second scrape pass + //* ********* Begin fixed-location variables ********** + + // Some variables always appear and appear at the same location in all document pages. 
- for (let key in existingMeta) { - var metaResult = cleanMeta(existingMeta[key]); + // abstract + abs = doc.getElementsByClassName("_card-body-info-center")[0]; + // abstractNote should be all text before the class "others-info". See example: https://www.fao.org/documents/card/en/c/ca8466en + var otherInfo = abs.querySelectorAll(".others-info")[0]; + var keywords = abs.querySelectorAll(".tags-list")[0]; // "KEYWORDS:" + tags + // Either element may be absent; strip its text only when it exists so a page lacking one does not abort the scrape + newItem.abstractNote = (abs.innerText.replace(otherInfo ? otherInfo.innerText : '', '').replace(keywords ? keywords.innerText : '', '')).trim(); - // date - if (key.includes('date')) { - newItem.date = metaResult; + // tags: class="badge" within abs + var tags = abs.querySelectorAll(".badge"); + for (let i = 0; i < tags.length; i++) { + newItem.tags[i] = tags[i].innerText.trim(); } - // publisher - if (key.includes('publisher')) { - newItem.publisher = metaResult; + + // attach PDF: PDF link in innerHTML of "_card-buttons-downloads" class. + pdfUrl = (doc.getElementsByClassName("_card-buttons-downloads")[0].innerHTML).match(/http\S*\.pdf/gi)[0]; + newItem.attachments.push({ + url: pdfUrl, + title: 'Full Text PDF', + mimeType: 'application/pdf' + }); + + // url + newItem.url = url; + + // language: 2 or 3 letters following ISO 639 + newItem.language = getLang(pdfUrl); + + // title: use colon to connect main title and subtitle (if subtitle exists) + mainTitle = doc.getElementsByClassName("page-title")[0].innerText; + var subTitleElement = doc.getElementsByClassName("sub-title"); + if (subTitleElement.length === 0) { // No element with the "sub-title" class: getElementsByClassName() returned an empty HTMLCollection, whose length is the number 0. 
+ newItem.title = mainTitle; } - // place - if (key.includes('place')) { - newItem.place = metaResult; + else if ((newItem.language == 'zh') || (newItem.language == 'ja')) { + newItem.title = mainTitle + ':' + subTitleElement[0].innerText; } - // number of pages - if (key.includes('pages')) { - newItem.numPages = metaResult.match(/\d+/)[0]; + else { + newItem.title = mainTitle + ': ' + subTitleElement[0].innerText; } - // ISBN - if (key.includes('ISBN')) { - newItem.ISBN = ZU.cleanISBN(metaResult, false); + + //* ********* End fixed-location variables ********** + + + //* ********* Begin dynamic-location variables ********** + + // Variables that appear neither in all document pages nor at same positions in the pages. + metaText = doc.getElementsByClassName("_card-body-info-left")[0].innerText; + + // DOI + if (metaText.includes(DOILead)) { + DOIMatch = metaText.match(/https:\/\/doi\.org\/(.+)/i); + newItem.DOI = DOIMatch[1]; } - // author(s): whether there is one or more authors; whether last and first name are separated by ',' (if not, use single-field mode). - if (key.includes('author')) { - if (Array.isArray(metaResult)) { // If there are more than 1 authors, metaResult returns an array. - for (let i = 0; i < metaResult.length; i++) { - if (metaResult[i].includes(',')) { - newItem.creators.push(ZU.cleanAuthor(metaResult[i], 'author', true)); + + // scrape text of meta area and split into an array based on line breaks. + var metaTextArr = metaText.split('\n'); + // get what variables are listed in the page, save to object existingMeta + for (let i = 0; i < metaTextArr.length; i++) { + for (let key in textVariable) { + for (let j = 0; j < textVariable[key].length; j++) { + if (metaTextArr[i].includes(textVariable[key][j])) { + existingMeta[key] = metaTextArr[i + 1]; // In metaTextArr, the value of a meta field always appears at the next element of the meta. 
} - else { - newItem.creators.push({ - lastName: metaResult[i], - creatorType: 'author', - fieldMode: 1 - }); + } + } + } + + for (let key in existingMeta) { + metaResult = cleanMetaDoc(existingMeta[key]); + + // date + if (key.includes('date')) { + newItem.date = metaResult; + } + // publisher + if (key.includes('publisher')) { + if (Array.isArray(metaResult)) { // differentiate between multiple (array) and single (string) + newItem.publisher = metaResult.join(', '); + } + else { + newItem.publisher = metaResult; + } + } + // place: must be stored in newItem.place (not newItem.publisher), otherwise the place overwrites the publisher and the 'Rome, Italy' fallback is always used + if (key.includes('place')) { // differentiate between multiple (array) and single (string) + if (Array.isArray(metaResult)) { + newItem.place = metaResult.join(', '); + } + else { + newItem.place = metaResult; + } + } + // number of pages + if (key.includes('pages')) { + newItem.numPages = metaResult.match(/\d+/)[0]; + } + // ISBN + if (key.includes('ISBN')) { + newItem.ISBN = ZU.cleanISBN(metaResult, false); + } + // author(s): whether there is one or more authors; whether last and first name are separated by ',' (if not, use single-field mode). + if (key.includes('author')) { + if (Array.isArray(metaResult)) { // If there are more than 1 authors, metaResult returns an array. 
+ for (let i = 0; i < metaResult.length; i++) { + if (metaResult[i].includes(',')) { + newItem.creators.push(ZU.cleanAuthor(metaResult[i], 'author', true)); + } + else { + newItem.creators.push({ + lastName: metaResult[i], + creatorType: 'author', + fieldMode: 1 + }); + } } } + else if (metaResult.includes(',')) { + newItem.creators.push(ZU.cleanAuthor(metaResult, 'author', true)); + } + else { + newItem.creators.push({ + lastName: metaResult, + creatorType: 'author', + fieldMode: 1 + }); + } } - else if (metaResult.includes(',')) { - newItem.creators.push(ZU.cleanAuthor(metaResult, 'author', true)); + // seriesTitle + if (key.includes('seriesTitle')) { + newItem.series = metaResult; } - else { - newItem.creators.push({ - lastName: metaResult, - creatorType: 'author', - fieldMode: 1 - }); + // seriesNumber + if (key.includes('seriesNumber')) { + newItem.seriesNumber = metaResult; } - } - // tag (Agrovoc) - if (key.includes('tags')) { - for (var i = 0; i < metaResult.length; i++) { - newItem.tags[i] = metaResult[i].trim(); + // conferenceName + if (key.includes('conference')) { + newItem.conferenceName = metaResult[0]; } } - // seriesTitle - if (key.includes('seriesTitle')) { - newItem.series = metaResult; + // If there's no publisher, use 'FAO' as publisher. + if (!newItem.publisher) { + newItem.publisher = 'FAO'; } - // seriesNumber: extract the number. - if (key.includes('seriesNumber')) { - newItem.seriesNumber = (metaResult.match(/\d+/) || [])[0]; + // If there's no place, use 'Rome, Italy' as place. + if (!newItem.place) { + newItem.place = 'Rome, Italy'; } - // conferenceName: save for later conditions. - if (key.includes('conference')) { - var conferenceWeb = metaResult[0]; - newItem.conferenceName = conferenceWeb; + // If there's no author, use 'FAO' as author. 
+ if (!newItem.creators.length) { + newItem.creators.push({ + lastName: 'FAO', + creatorType: 'author', + fieldMode: 1 + }); } + // If conference exists in document page, the itemType is 'conferencePaper'; otherwise it's 'book'. + if (newItem.conferenceName) { + newItem.itemType = 'conferencePaper'; + } + else { + newItem.itemType = 'book'; + } + //* ********* End dynamic-location variables ********** } - - // If there's no publisher, use 'FAO' as publisher. - if (!newItem.publisher) { - newItem.publisher = 'FAO'; - } - // If there's no place, use 'Rome, Italy' as place. - if (!newItem.place) { - newItem.place = 'Rome, Italy'; - } - // If there's no author, use 'FAO' as author. - if (!newItem.creators.length) { - newItem.creators.push({ - lastName: 'FAO', - creatorType: 'author', - fieldMode: 1 - }); - } - // If conference exists in document page, the itemType is 'conferencePaper'; otherwise it's 'book'. - if (conferenceWeb) { - newItem.itemType = 'conferencePaper'; - } - else { - newItem.itemType = 'book'; - } - //* ********* End dynamic-location variables ********** } newItem.complete(); } - // get items from a multiple-item page -function getSearchResults(doc, checkOnly) { +// Multiple-item searching is no longer provided. +/*function getSearchResults(doc, checkOnly) { var items = {}; var found = false; var rows = ZU.xpath(doc, '//*[@class="item-image"]'); @@ -326,24 +526,23 @@ function getSearchResults(doc, checkOnly) { items[href] = title; } return found ? 
items : false; -} +}*/ function doWeb(doc, url) { - if (detectWeb(doc, url) == "multiple") { - Z.selectItems(getSearchResults(doc, false), function (items) { - if (!items) { - return; - } - var articles = []; - for (var i in items) { - articles.push(i); - } - ZU.processDocuments(articles, scrape); - }); - } - else { - scrape(doc, url); - } + // if (detectWeb(doc, url) == "multiple") { + // Z.selectItems(getSearchResults(doc, false), function (items) { + // if (!items) { + // return; + // } + // var articles = []; + // for (var i in items) {// articles.push(i); + // } + // ZU.processDocuments(articles, scrape); + // }); + // } + // else { + scrape(doc, url); + // } } // Note on test cases: Because the pages use dynamic elements (which is also why the translator doesn't work for multiple item pages), automatic test in Scaffold doesn't work. Every time a test is needed, use "New Web" to manually add it. @@ -352,12 +551,12 @@ function doWeb(doc, url) { var testCases = [ { "type": "web", - "url": "http://www.fao.org/documents/card/en/c/ca8466en", + "url": "https://www.fao.org/documents/card/en?details=cc0461en", "defer": true, "items": [ { "itemType": "book", - "title": "Responding to the impact of the COVID-19 outbreak on food value chains through efficient logistics", + "title": "The State of World Fisheries and Aquaculture 2022: Towards Blue Transformation", "creators": [ { "lastName": "FAO", @@ -365,15 +564,18 @@ var testCases = [ "fieldMode": 1 } ], - "date": "2020", - "ISBN": "9789251323717", - "abstractNote": "Measures implemented around the world to contain the COVID-19 pandemic have entailed a severe reduction not only in the transportation of goods and services that rely on transport, but also in the migration of labour domestically and internationally. Workers are less available reflecting both disruptions in transportation systems and restrictions to stop the transmission of the disease, within and across borders. 
\n\nThe Food and Agriculture Organization of the United Nations (FAO) urges countries to maintain functioning food value chains to avoid food shortages, following practices that are being proven to work. This note summarizes some practices that could be useful for governments and the private sector to maintain critical logistical elements in food value chain.\n\nRevised 26 April 2020.\n\nSee the full list of policy briefs related to COVID-19\n\n.", + "date": "2022", + "ISBN": "9789251363645", + "abstractNote": "The 2022 edition of The State of World Fisheries and Aquaculture coincides with the launch of the Decade of Action to deliver the Global Goals, the United Nations Decade of Ocean Science for Sustainable Development and the United Nations Decade on Ecosystem Restoration. It presents how these and other equally important United Nations events, such as the International Year of Artisanal Fisheries and Aquaculture (IYAFA 2022), are being integrated and supported through Blue Transformation, a priority area of FAO’s new Strategic Framework 2022–2031 designed to accelerate achievement of the 2030 Agenda for Sustainable Development in food and agriculture.\n\nThe concept of Blue Transformation emerged from the Thirty-fourth Session of the FAO Committee on Fisheries in February 2021, and in particular the Declaration for Sustainable Fisheries and Aquaculture, which was negotiated and endorsed by all FAO Members. 
The Declaration calls for support for “an evolving and positive vision for fisheries and aquaculture in the twenty first century, where the sector is fully recognized for its contribution to fighting poverty, hunger and malnutrition.” In this context, Part 1 of this edition of The State of World Fisheries and Aquaculture reviews the world status of fisheries and aquaculture, while Parts 2 and 3 are devoted to Blue Transformation and its pillars on intensifying and expanding aquaculture, improving fisheries management and innovating fisheries and aquaculture value chains. Blue Transformation emphasizes the need for forward-looking and bold actions to be launched or accelerated in coming years to achieve the objectives of the Declaration and in support of the 2030 Agenda. Part 4 covers current and high-impact emerging issues – COVID-19, climate change and gender equality – that require thorough consideration for transformative steps and preparedness to secure sustainable, efficient and equitable fisheries and aquaculture, and finally draws some outlook on future trends based on projections.\n\nThe State of World Fisheries and Aquaculture aims to provide objective, reliable and up-to-date information to a wide audience – policymakers, managers, scientists, stakeholders and indeed everyone interested in the fisheries and aquaculture sector.", "language": "en", "libraryCatalog": "FAO Publications", - "numPages": "4", + "numPages": "266", "place": "Rome, Italy", "publisher": "FAO", - "url": "http://www.fao.org/documents/card/en/c/ca8466en", + "series": "The State of World Fisheries and Aquaculture (SOFIA)", + "seriesNumber": "2022", + "shortTitle": "The State of World Fisheries and Aquaculture 2022", + "url": "https://www.fao.org/documents/card/en?details=cc0461en", "attachments": [ { "title": "Full Text PDF", @@ -382,16 +584,28 @@ var testCases = [ ], "tags": [ { - "tag": "Coronavirus" + "tag": "aquaculture production" }, { - "tag": "agrifood sector" + "tag": "climate 
change adaptation" }, { - "tag": "infectious diseases" + "tag": "fish trade" }, { - "tag": "logistics" + "tag": "fishery management" + }, + { + "tag": "fishery production" + }, + { + "tag": "fishery resources" + }, + { + "tag": "gender equality" + }, + { + "tag": "sustainable fisheries" }, { "tag": "value chains" @@ -404,35 +618,95 @@ var testCases = [ }, { "type": "web", - "url": "http://www.fao.org/documents/card/en/c/ca8751en/", + "url": "https://www.fao.org/publications/card/en?details=cc0461en", "defer": true, "items": [ { "itemType": "book", - "title": "Blockchain application in seafood value chains", + "title": "The State of World Fisheries and Aquaculture 2022: Towards Blue Transformation", "creators": [ { - "firstName": "F.", - "lastName": "Blaha", - "creatorType": "author" + "lastName": "FAO", + "creatorType": "author", + "fieldMode": 1 + } + ], + "date": "2022", + "ISBN": "9789251363645", + "abstractNote": "The 2022 edition of The State of World Fisheries and Aquaculture coincides with the launch of the Decade of Action to deliver the Global Goals, the United Nations Decade of Ocean Science for Sustainable Development and the United Nations Decade on Ecosystem Restoration. It presents how these and other equally important United Nations events, such as the International Year of Artisanal Fisheries and Aquaculture (IYAFA 2022), are being integrated and supported through Blue Transformation, a priority area of FAO’s new Strategic Framework 2022–2031 designed to accelerate achievement of the 2030 Agenda for Sustainable Development in food and agriculture. \n\nThe concept of Blue Transformation emerged from the Thirty-fourth Session of the FAO Committee on Fisheries in February 2021, and in particular the Declaration for Sustainable Fisheries and Aquaculture, which was negotiated and endorsed by all FAO Members. 
The Declaration calls for support for “an evolving and positive vision for fisheries and aquaculture in the twenty first century, where the sector is fully recognized for its contribution to fighting poverty, hunger and malnutrition.” In this context, Part 1 of this edition of The State of World Fisheries and Aquaculture reviews the world status of fisheries and aquaculture, while Parts 2 and 3 are devoted to Blue Transformation and its pillars on intensifying and expanding aquaculture, improving fisheries management and innovating fisheries and aquaculture value chains. Blue Transformation emphasizes the need for forward-looking and bold actions to be launched or accelerated in coming years to achieve the objectives of the Declaration and in support of the 2030 Agenda. Part 4 covers current and high-impact emerging issues – COVID-19, climate change and gender equality – that require thorough consideration for transformative steps and preparedness to secure sustainable, efficient and equitable fisheries and aquaculture, and finally draws some outlook on future trends based on projections. 
\n\nThe State of World Fisheries and Aquaculture aims to provide objective, reliable and up-to-date information to a wide audience – policymakers, managers, scientists, stakeholders and indeed everyone interested in the fisheries and aquaculture sector.\n\nThe following complementary information is available:\n\nRead online the full digital reportSee the interactive storyRead the In Brief\n\nHelp us improve your reading experience\n\nLast updated date 19/08/2022", + "language": "other", + "libraryCatalog": "FAO Publications", + "numPages": "266", + "place": "Rome, Italy", + "publisher": "FAO", + "series": "The State of World Fisheries and Aquaculture (SOFIA)", + "seriesNumber": "2022", + "shortTitle": "The State of World Fisheries and Aquaculture 2022", + "url": "https://www.fao.org/publications/card/en?details=cc0461en", + "attachments": [ + { + "title": "Full Text PDF", + "mimeType": "application/pdf" + } + ], + "tags": [ + { + "tag": "aquaculture production" }, { - "firstName": "K.", - "lastName": "Katafono", - "creatorType": "author" + "tag": "climate change adaptation" + }, + { + "tag": "fish trade" + }, + { + "tag": "fishery management" + }, + { + "tag": "fishery production" + }, + { + "tag": "fishery resources" + }, + { + "tag": "gender equality" + }, + { + "tag": "sustainable fisheries" + }, + { + "tag": "value chains" + } + ], + "notes": [], + "seeAlso": [] + } + ] + }, + { + "type": "web", + "url": "https://www.fao.org/documents/card/en/c/ca8466en", + "defer": true, + "items": [ + { + "itemType": "book", + "title": "Responding to the impact of the COVID-19 outbreak on food value chains through efficient logistics", + "creators": [ + { + "lastName": "FAO", + "creatorType": "author", + "fieldMode": 1 } ], "date": "2020", - "ISBN": "9789251324530", - "abstractNote": "Innovation through information and communication technologies is a key enabler in transforming food systems and holds great potential to achieve the Sustainable Development Goals. 
Recent developments, such as mobile technologies, smart networks, drones, remote-sensing, distributed computing, as well as disruptive technologies, such as blockchain, the Internet of things and artificial intelligence, are serving as the premise for a “digital revolution” whereby management of resources can potentially be highly optimized, intelligent and anticipatory. This publication establishes chain traceability as the substrate over which digital solutions need to operate. It provides a comprehensive introduction to blockchain, and covers smart contracts, explores how they relate to blockchain with an example of their use in seafood value chains, and then examines major development and operational considerations for blockchain applications. The publication also analyses the seafood supply chain with considerations on flag, coastal, port, processing and market States. It identifies general control elements (critical tracking events and corresponding key data elements) that form the basis for traceability monitoring and acquisition, and summarizes suitability for blockchain. It also investigates considerations for legality, transparency, species fraud and food safety.", + "ISBN": "9789251323717", + "abstractNote": "Measures implemented around the world to contain the COVID-19 pandemic have entailed a severe reduction not only in the transportation of goods and services that rely on transport, but also in the migration of labour domestically and internationally. Workers are less available reflecting both disruptions in transportation systems and restrictions to stop the transmission of the disease, within and across borders.\n\nThe Food and Agriculture Organization of the United Nations (FAO) urges countries to maintain functioning food value chains to avoid food shortages, following practices that are being proven to work. 
This note summarizes some practices that could be useful for governments and the private sector to maintain critical logistical elements in food value chain.", "language": "en", "libraryCatalog": "FAO Publications", - "numPages": "56", + "numPages": "4", "place": "Rome, Italy", "publisher": "FAO", - "series": "FAO Fisheries and Aquaculture Circular", - "seriesNumber": "1207", - "url": "http://www.fao.org/documents/card/en/c/ca8751en/", + "url": "https://www.fao.org/documents/card/en/c/ca8466en", "attachments": [ { "title": "Full Text PDF", @@ -441,22 +715,19 @@ var testCases = [ ], "tags": [ { - "tag": "analysis" - }, - { - "tag": "blockchain technology" + "tag": "Coronavirus" }, { - "tag": "fisheries" + "tag": "agrifood sector" }, { - "tag": "food production" + "tag": "infectious diseases" }, { - "tag": "food systems" + "tag": "logistics" }, { - "tag": "traceability" + "tag": "value chains" } ], "notes": [], @@ -466,33 +737,35 @@ var testCases = [ }, { "type": "web", - "url": "http://www.fao.org/documents/card/en/c/I9069EN", + "url": "https://www.fao.org/documents/card/en/c/ca8751en/", "defer": true, "items": [ { "itemType": "book", - "title": "Republic of Moldova Value Chain Gap Analysis", + "title": "Blockchain application in seafood value chains", "creators": [ { - "firstName": "J.", - "lastName": "O'Connell", + "firstName": "F.", + "lastName": "Blaha", "creatorType": "author" }, { - "firstName": "P.", - "lastName": "Kiparisov", + "firstName": "K.", + "lastName": "Katafono", "creatorType": "author" } ], - "date": "2018", - "ISBN": "9789251304839", - "abstractNote": "Agriculture and food industry sectors have a major importance for the Moldovan economy. 
The Republic of Moldova has one of the highest share of rural population among the countries in Europe and Central Asia, and its agriculture sector significantly contributes to the country’s gross domestic product.\n\nThis work is a part of a series of studies on the value chain development gaps and the environment for doing business for farmers. The goal of this study is to try to consolidate the information on countrywide value chain development gathered from various open sources and based on materials developed in a field mission by FAO officers with an emphasis on the plum and berry value chains. The authors did not aim at close examination of the selected value chains; rather, this paper is a general overview that will be a reference point for future field work in the country.\n\nTo get the results, the authors analysed the legislative history related to value chains, collected materials and statistics from open sources, conducted a field mission and interviewed stakeholders.\n\nThe first part of the report observes the overall situation in the Republic of Moldova with a focus on the agriculture sector, reviewing related legislation, the environment for doing business for farmers, and trade. The paper examines existing support measures for agriculture and covers the banking sector and trade policy. The second part examines value chain actors and overviews the selected value chains of plums and berries. The final part provides recommendations.", + "date": "2020", + "ISBN": "9789251324530", + "abstractNote": "Innovation through information and communication technologies is a key enabler in transforming food systems and holds great potential to achieve the Sustainable Development Goals. 
Recent developments, such as mobile technologies, smart networks, drones, remote-sensing, distributed computing, as well as disruptive technologies, such as blockchain, the Internet of things and artificial intelligence, are serving as the premise for a “digital revolution” whereby management of resources can potentially be highly optimized, intelligent and anticipatory. This publication establishes chain traceability as the substrate over which digital solutions need to operate. It provides a comprehensive introduction to blockchain, and covers smart contracts, explores how they relate to blockchain with an example of their use in seafood value chains, and then examines major development and operational considerations for blockchain applications. The publication also analyses the seafood supply chain with considerations on flag, coastal, port, processing and market States. It identifies general control elements (critical tracking events and corresponding key data elements) that form the basis for traceability monitoring and acquisition, and summarizes suitability for blockchain. It also investigates considerations for legality, transparency, species fraud and food safety.", "language": "en", "libraryCatalog": "FAO Publications", - "numPages": "65", - "place": "Budapest, Hungary", + "numPages": "56", + "place": "Rome, Italy", "publisher": "FAO", - "url": "http://www.fao.org/documents/card/en/c/I9069EN", + "series": "FAO Fisheries and Aquaculture Circular", + "seriesNumber": "No. 
1207", + "url": "https://www.fao.org/documents/card/en/c/ca8751en/", "attachments": [ { "title": "Full Text PDF", @@ -501,28 +774,22 @@ var testCases = [ ], "tags": [ { - "tag": "Republic of Moldova" - }, - { - "tag": "agricultural sector" - }, - { - "tag": "data analysis" + "tag": "analysis" }, { - "tag": "economic analysis" + "tag": "blockchain technology" }, { - "tag": "economic infrastructure" + "tag": "fisheries" }, { - "tag": "economic situation" + "tag": "food production" }, { - "tag": "research" + "tag": "food systems" }, { - "tag": "supply chain" + "tag": "traceability" } ], "notes": [], @@ -532,7 +799,7 @@ var testCases = [ }, { "type": "web", - "url": "http://www.fao.org/documents/card/en/c/ca7988en/", + "url": "https://www.fao.org/documents/card/en/c/ca7988en/", "defer": true, "items": [ { @@ -554,7 +821,7 @@ var testCases = [ "place": "Rome, Italy", "publisher": "FAO", "shortTitle": "FAO publications catalogue 2020", - "url": "http://www.fao.org/documents/card/en/c/ca7988en/", + "url": "https://www.fao.org/documents/card/en/c/ca7988en/", "attachments": [ { "title": "Full Text PDF", @@ -582,7 +849,7 @@ var testCases = [ }, { "type": "web", - "url": "http://www.fao.org/publications/card/fr/c/77dbd058-8dd4-4295-af77-23f6b28cc683/", + "url": "https://www.fao.org/documents/card/fr/c/77dbd058-8dd4-4295-af77-23f6b28cc683/", "defer": true, "items": [ { @@ -602,15 +869,15 @@ var testCases = [ ], "date": "2016", "ISBN": "9789252094890", - "abstractNote": "Ce livre nous emmène au cœur des zones de forêts denses et sahéliennes de l’Afrique centrale, un écosystème précieux et essentiel à la vie quotidienne de ses habitants, représentant l’un des trois principaux ensembles boisés tropicaux de la planète. Dix pays (Burundi, Cameroun, Congo, Gabon, Guinée Equatoriale, République Centrafricaine, République Démocratique du Congo, Rwanda, Sao Tomé & Principe, Tchad) abritent ces forêts et savanes, riches d’importantes ressources naturelles. 
Ils ont en com mun une longue histoire liée à la colonisation, suivie d'une expérience de coopération multiforme depuis les indépendances qui évolue incontestablement vers une intégration économique et monétaire. De nos jours, alors que les équilibres séculaires entre l’homme et la nature semblent ébranlés, que la sécurité alimentaire, la lutte contre la pauvreté et la préservation de la biodiversité et des ressources forestières sont devenus des enjeux mondiaux ; à l’heure où la croissance démographique non m aîtrisée fragilise le maintien des écosystèmes forestiers tout en accentuant les conflits liés à la recherche d’espace vital, le phénomène des changements climatiques vient davantage sonder le génie créateur des populations forestières dans la préservation et la gestion durable de la forêt et des produits forestiers non ligneux (PFNL) qui en sont issus. Cette publication est l’œuvre du personnel technique de la FAO, avec la contribution des partenaires internationaux et locaux engagés dans l’évo lution des PFNL. Elle est un document précieux consacré au développement des peuples par la promotion des PFNL en Afrique centrale en vue du renforcement de la sécurité alimentaire et la lutte contre la pauvreté. \n\n Voir aussi la sommaire en version anglais", + "abstractNote": "Ce livre nous emmène au cœur des zones de forêts denses et sahéliennes de l’Afrique centrale, un écosystème précieux et essentiel à la vie quotidienne de ses habitants, représentant l’un des trois principaux ensembles boisés tropicaux de la planète. Dix pays (Burundi, Cameroun, Congo, Gabon, Guinée Equatoriale, République Centrafricaine, République Démocratique du Congo, Rwanda, Sao Tomé & Principe, Tchad) abritent ces forêts et savanes, riches d’importantes ressources naturelles. Ils ont en com mun une longue histoire liée à la colonisation, suivie d'une expérience de coopération multiforme depuis les indépendances qui évolue incontestablement vers une intégration économique et monétaire. 
De nos jours, alors que les équilibres séculaires entre l’homme et la nature semblent ébranlés, que la sécurité alimentaire, la lutte contre la pauvreté et la préservation de la biodiversité et des ressources forestières sont devenus des enjeux mondiaux ; à l’heure où la croissance démographique non m aîtrisée fragilise le maintien des écosystèmes forestiers tout en accentuant les conflits liés à la recherche d’espace vital, le phénomène des changements climatiques vient davantage sonder le génie créateur des populations forestières dans la préservation et la gestion durable de la forêt et des produits forestiers non ligneux (PFNL) qui en sont issus. Cette publication est l’œuvre du personnel technique de la FAO, avec la contribution des partenaires internationaux et locaux engagés dans l’évo lution des PFNL. Elle est un document précieux consacré au développement des peuples par la promotion des PFNL en Afrique centrale en vue du renforcement de la sécurité alimentaire et la lutte contre la pauvreté.\n\nVoir aussi la sommaire en version anglais", "language": "fr", "libraryCatalog": "FAO Publications", "numPages": "251", "place": "Rome, Italy", "publisher": "FAO", "series": "Produits Forestiers Non-Ligneux", - "seriesNumber": "21", - "url": "http://www.fao.org/publications/card/fr/c/77dbd058-8dd4-4295-af77-23f6b28cc683/", + "seriesNumber": "No. 
21", + "url": "https://www.fao.org/documents/card/fr/c/77dbd058-8dd4-4295-af77-23f6b28cc683/", "attachments": [ { "title": "Full Text PDF", @@ -662,7 +929,7 @@ var testCases = [ }, { "type": "web", - "url": "http://www.fao.org/publications/card/zh/c/mw246ZH/", + "url": "https://www.fao.org/publications/card/zh/c/mw246ZH/", "defer": true, "items": [ { @@ -682,7 +949,7 @@ var testCases = [ "libraryCatalog": "FAO Publications", "place": "Rome, Italy", "publisher": "FAO", - "url": "http://www.fao.org/publications/card/zh/c/mw246ZH/", + "url": "https://www.fao.org/publications/card/zh/c/mw246ZH/", "attachments": [ { "title": "Full Text PDF", @@ -749,12 +1016,12 @@ var testCases = [ }, { "type": "web", - "url": "http://www.fao.org/publications/card/en/c/5014f143-be17-4b58-b90e-f1c6bef344a0/", + "url": "https://www.fao.org/documents/card/ar/c/c6c2c8d7-3683-53a7-ab58-ce480c65f36c/", "defer": true, "items": [ { "itemType": "book", - "title": "Climate-Smart Agriculture: A Call for Action: Synthesis of the Asia-Pacific Regional Workshop Bangkok, Thailand, 18 to 20 June 2015", + "title": "الخطوط التوجيهية الطوعية بشأن الحوكمة المسؤولة لحيازة الأراضي ومصايد الأسماك والغابات في سياق الأمن الغذائي الوطني", "creators": [ { "lastName": "FAO", @@ -762,17 +1029,15 @@ var testCases = [ "fieldMode": 1 } ], - "date": "2015", - "ISBN": "9789251088630", - "abstractNote": "This publication is a summary of the workshop held in Bangkok, Thailand from 18 to 20 June 2015 to promote the mainstreaming and up-scaling of Climate-Smart Agriculture in the region. Included in the report are successful case studies that agriculturists have been practicing as a means to address food security under adverse circumstances.", - "language": "en", + "date": "2012", + "ISBN": "9789256072771", + "abstractNote": "هذه الخطوط التوجيهية هي أول صكّ عالمي شامل خاص بالحيازات وإدارتها يُعدّ من خلال مفاوضات حكومية دولية. 
وتضع هذه الخطوط التوجيهية مبادئ ومعايير مقبولة دولياً للممارسات المسؤولة لاستخدام الأراضي ومصايد الأسماك والغابات وللتحكّم بها. وهي تعطي توجيهات لتحسين الأطر القانونية والتنظيمية والمتصلة بالسياسات التي تنظّم حقوق الحيازة ولزيادة شفافية نظم الحيازة وإدارتها ولتعزيز القدرات والإجراءات التي تتخذها الأجهزة العامة ومؤسسات القطاع الخاص ومنظمات المجتمع المدني وجميع المعنيين بالحيازات وإد ارتها. وتُدرج هذه الخطوط التوجيهية إدارة الحيازات ضمن السياق الوطني للأمن الغذائي وهي تسعى إلى المساهمة في الإعمال المطرد للحق في غذاء كافٍ والقضاء على الفقر وحماية البيئة وتحقيق التنمية الاجتماعية والاقتصادية المستدامة.", + "language": "ar", "libraryCatalog": "FAO Publications", - "numPages": "106", + "numPages": "40", "place": "Rome, Italy", - "publisher": "FAO Regional Office for Asia and the Pacific", - "series": "RAP Publication", - "shortTitle": "Climate-Smart Agriculture", - "url": "http://www.fao.org/publications/card/en/c/5014f143-be17-4b58-b90e-f1c6bef344a0/", + "publisher": "FAO", + "url": "https://www.fao.org/documents/card/ar/c/c6c2c8d7-3683-53a7-ab58-ce480c65f36c/", "attachments": [ { "title": "Full Text PDF", @@ -781,28 +1046,25 @@ var testCases = [ ], "tags": [ { - "tag": "climate-smart agriculture" - }, - { - "tag": "forestry" + "tag": "guidelines" }, { - "tag": "market gardens" + "tag": "أمن غذائي" }, { - "tag": "meetings" + "tag": "إقتصاديات الغابة" }, { - "tag": "sustainable agriculture" + "tag": "اقتصاد الصيد" }, { - "tag": "sustainable development" + "tag": "الحكم" }, { - "tag": "urban farmers" + "tag": "النوع الاجتماعي" }, { - "tag": "water harvesting" + "tag": "حيازة الأراضي" } ], "notes": [], @@ -812,12 +1074,12 @@ var testCases = [ }, { "type": "web", - "url": "http://www.fao.org/publications/card/ar/c/c6c2c8d7-3683-53a7-ab58-ce480c65f36c/", + "url": "https://www.fao.org/documents/card/en/c/5014f143-be17-4b58-b90e-f1c6bef344a0/", "defer": true, "items": [ { "itemType": "book", - "title": "الخطوط التوجيهية الطوعية بشأن الحوكمة المسؤولة لحيازة الأراضي ومصايد 
الأسماك والغابات في سياق الأمن الغذائي الوطني", + "title": "Climate-Smart Agriculture: A Call for Action: Synthesis of the Asia-Pacific Regional Workshop Bangkok, Thailand, 18 to 20 June 2015", "creators": [ { "lastName": "FAO", @@ -825,14 +1087,17 @@ var testCases = [ "fieldMode": 1 } ], - "date": "2012", - "abstractNote": "هذه الخطوط التوجيهية هي أول صكّ عالمي شامل خاص بالحيازات وإدارتها يُعدّ من خلال مفاوضات حكومية دولية. وتضع هذه الخطوط التوجيهية مبادئ ومعايير مقبولة دولياً للممارسات المسؤولة لاستخدام الأراضي ومصايد الأسماك والغابات وللتحكّم بها. وهي تعطي توجيهات لتحسين الأطر القانونية والتنظيمية والمتصلة بالسياسات التي تنظّم حقوق الحيازة ولزيادة شفافية نظم الحيازة وإدارتها ولتعزيز القدرات والإجراءات التي تتخذها الأجهزة العامة ومؤسسات القطاع الخاص ومنظمات المجتمع المدني وجميع المعنيين بالحيازات وإد ارتها. وتُدرج هذه الخطوط التوجيهية إدارة الحيازات ضمن السياق الوطني للأمن الغذائي وهي تسعى إلى المساهمة في الإعمال المطرد للحق في غذاء كافٍ والقضاء على الفقر وحماية البيئة وتحقيق التنمية الاجتماعية والاقتصادية المستدامة.", - "language": "ar", + "date": "2015", + "ISBN": "9789251088630", + "abstractNote": "This publication is a summary of the workshop held in Bangkok, Thailand from 18 to 20 June 2015 to promote the mainstreaming and up-scaling of Climate-Smart Agriculture in the region. 
Included in the report are successful case studies that agriculturists have been practicing as a means to address food security under adverse circumstances.", + "language": "en", "libraryCatalog": "FAO Publications", - "numPages": "40", + "numPages": "106", "place": "Rome, Italy", - "publisher": "FAO", - "url": "http://www.fao.org/publications/card/ar/c/c6c2c8d7-3683-53a7-ab58-ce480c65f36c/", + "publisher": "FAO Regional Office for Asia and the Pacific", + "series": "RAP Publication", + "shortTitle": "Climate-Smart Agriculture", + "url": "https://www.fao.org/documents/card/en/c/5014f143-be17-4b58-b90e-f1c6bef344a0/", "attachments": [ { "title": "Full Text PDF", @@ -841,25 +1106,28 @@ var testCases = [ ], "tags": [ { - "tag": "null" + "tag": "climate-smart agriculture" + }, + { + "tag": "forestry" }, { - "tag": "null" + "tag": "market gardens" }, { - "tag": "null" + "tag": "meetings" }, { - "tag": "null" + "tag": "sustainable agriculture" }, { - "tag": "أمن غذائي" + "tag": "sustainable development" }, { - "tag": "الحكم" + "tag": "urban farmers" }, { - "tag": "حيازة الأراضي" + "tag": "water harvesting" } ], "notes": [], diff --git a/INSPIRE.js b/INSPIRE.js index b7ac6984bc..a281abef57 100644 --- a/INSPIRE.js +++ b/INSPIRE.js @@ -9,7 +9,7 @@ "inRepository": true, "translatorType": 4, "browserSupport": "gcsibv", - "lastUpdated": "2021-07-19 16:22:40" + "lastUpdated": "2023-04-06 18:53:02" } /* @@ -88,7 +88,7 @@ function scrape(doc, url) { item.tags.push({ tag: tag.textContent.trim() }); } - for (let action of doc.querySelectorAll('.__ListItemAction__ a')) { + for (let action of doc.querySelectorAll('.__UserAction__ a')) { if (/\bpdf\b/i.test(action.textContent)) { item.attachments.push({ title: 'Full Text PDF', @@ -115,6 +115,7 @@ var testCases = [ { "type": "web", "url": "https://inspirehep.net/literature/1284987", + "defer": true, "items": [ { "itemType": "journalArticle", @@ -186,6 +187,7 @@ var testCases = [ { "type": "web", "url": 
"https://inspirehep.net/literature/1282171", + "defer": true, "items": [ { "itemType": "journalArticle", diff --git a/Lapham's Quarterly.js b/Lapham's Quarterly.js new file mode 100644 index 0000000000..884d1cdd5c --- /dev/null +++ b/Lapham's Quarterly.js @@ -0,0 +1,1062 @@ +{ + "translatorID": "e329ec79-397e-4aa5-a06e-1aa32f10a138", + "label": "Lapham's Quarterly", + "creator": "Zoë C. Ma", + "target": "^https?://www\\.laphamsquarterly\\.org/", + "minVersion": "5.0", + "maxVersion": "", + "priority": 100, + "inRepository": true, + "translatorType": 4, + "browserSupport": "gcsibv", + "lastUpdated": "2023-04-11 10:35:51" +} + +/* + ***** BEGIN LICENSE BLOCK ***** + Copyright © 2023 Zoë C. Ma + + This file is part of Zotero. + Zotero is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + Zotero is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + You should have received a copy of the GNU Affero General Public License + along with Zotero. If not, see . + ***** END LICENSE BLOCK ***** +*/ + +function detectWeb(doc, url) { + const urlObj = new URL(url); + // About pages, legal notes, content listings, event notices... or + // content pages without identifiable author, or fragmentary quotations of + // historical materials. + const skipPath = /^\/(about|legal|issues|archive|contributors|conversations|lq-interactives|outreach|programs|events|world-in-time|deja-vu)/; + if (urlObj.pathname.match(skipPath)) { + return false; + } + + // Also skip pages from magazine sections that has no usable author + // info. (Maps, Miscellany, charts and graphs, etc.). 
+ const skipSection = /^\/.+\/(maps|miscellany|charts-graphs)\/.+/; + if (urlObj.pathname.match(skipSection)) { + return false; + } + + // Also skip the individual issue pages. This can only be done by + // inspecting the document. + if (doc.querySelector("body.node-type-issue")) { + return false; + } + + if (urlObj.pathname.match(/^\/search\/node\/.+/)) { + // Search results. + if (getSearchResults(doc, true)) { + return "multiple"; + } + else { + return false; + } + } + + if (doc.querySelector("body.node-type-podcast")) { + return "podcast"; + } + + if (doc.querySelector("body.section-roundtable")) { + return "blogPost"; + } + + return "magazineArticle"; +} + +async function doWeb(doc, url) { + if (detectWeb(doc, url) == 'multiple') { + const items = await Z.selectItems(getSearchResults(doc, false)); + if (!items) return; + for (const url of Object.keys(items)) { + await scrape(await requestDocument(url)); + } + } + else { + await scrape(doc, url); + } +} + +function getSearchResults(doc, checkOnly = false) { + const resultElems = doc.querySelectorAll(".search-results .search-result"); // Lovely semantics! + if (!resultElems.length) return false; + + // Title string -> array of URLs. + const titleMap = new Map(); + // While collecting title -> URL mapping for possible duplicate titles, + // check for duplicate URLs too, though this is unlikely. If it does + // happen, the first one in the document order takes precedence. + const hrefsSeen = new Set(); + for (const elem of resultElems) { + const href = attr(elem, "h3 > a", "href"); + const title = text(elem, "h3 > a"); + + if (href && !hrefsSeen.has(href) && title) { + if (checkOnly) return true; + + hrefsSeen.add(href); + // Title may contain duplicates even if the links are + // unique. In other words, one title may be associated with + // multiple (i.e. an array of) URLs. 
+ if (!titleMap.has(title)) { + titleMap.set(title, []); + } + titleMap.get(title).push(href); + } + } + + // If the same title text is associated with multiple URLs, add a + // parenthesized number showing the order of the title's appearance in the + // search results. + const items = {}; + // Map conveniently maintains insertion order. + for (const [title, hrefArray] of titleMap) { + const hasDup = hrefArray.length > 1; + for (const [i, href] of hrefArray.entries()) { + items[href] = !hasDup + ? title + : `${title} (${i + 1}, URL: ${(new URL(href)).pathname})`; + } + } + + return hrefsSeen.size && items; +} + +async function scrape(doc, url = doc.location.href) { + const type = detectWeb(doc, url); + if (!type) { + // This could happen if the user selects an item from the + // multiple, but that item happens to be something we cannot + // exclude based on URL/title alone. + Z.debug(`scrape function encountered mismatched type ${type} for ${url}`); + return; + } + + const item = new Z.Item(type); + item.url = url; + item.language = attr(doc, "html", "lang"); + item.attachments = []; + + switch (type) { + case "magazineArticle": + await applyMagazine(doc, item); + break; + case "blogPost": + applyBlog(doc, item); + break; + case "podcast": + applyPodcast(doc, item); + break; + } + + item.attachments.push({ + document: doc, + title: "Snapshot", + mimeType: "text/html" + }); + + item.complete(); +} + +// Magazine articles. This will always be async even if the async task is not +// performed, in the (unlikely) case when the issue-info URL to be scraped is +// not found on the article page. 
// Fills in the fixed journal fields, title, authors and excerpt on `item`,
// adds the extra "Voices in Time" fields when the page is of that kind, and
// finally hands over to setIssueDate() for the volume/issue/date taken from
// the linked issue page. Resolves to undefined in every case.
async function applyMagazine(doc, item) {
	item.ISSN = "1935-7494";
	item.publicationTitle = "Lapham’s Quarterly";
	item.title = text(doc, "#page-title");
	item.creators = parseAuthors(getArticleAuthorText(doc));

	const abstractText = text(doc, ".excerpt");
	if (abstractText) {
		item.abstractNote = abstractText;
	}

	if (doc.querySelector("body.node-type-voices-in-time")) {
		// Date of the original (historical) text.
		const originalDate = ZU.trimInternal(text(doc, ".title .date"));
		if (originalDate) {
			item.originalDate = originalDate;
		}

		// Rights statement and translators; either may be absent.
		const rightsInfo = getVITRightsTrans(doc);
		if (rightsInfo) {
			const { rights, translators } = rightsInfo;
			if (rights) {
				item.rights = rights;
			}
			if (translators) {
				item.creators.push(...translators);
			}
		}

		// "About the text" or a brief bio of the author.
		const aboutText = getVITAboutText(doc);
		if (aboutText) {
			item.notes = [aboutText];
		}
	}

	const issueHref = attr(doc, ".sticky-content > a", "href");
	if (issueHref) {
		// The link is relative; resolve it against the article's location.
		const issueURL = new URL(issueHref, doc.location);
		return setIssueDate(issueURL.href, item);
	}

	Z.debug(`Article at ${item.url} missing the link to its issue.`);
	return undefined;
}

// Cache for the issue info. Keys are the permalinks to the issue-page URLs,
// and values are the corresponding issue info returned by
// fetchIssueDateInfo(). This is to avoid repeated network requests for the
// same document when saving multiple items.
const _issueCache = new Map();

// Copies the issue info (volume/issue/date) of the issue page at `url` onto
// `item`. Successful lookups are memoized in _issueCache; failed lookups are
// not stored, so a later item from the same issue triggers another attempt.
// When no info can be obtained, `item` is left untouched.
async function setIssueDate(url, item) {
	// Only truthy values are ever stored, so a falsy get() means "not cached".
	let info = _issueCache.get(url);
	if (!info) {
		info = await fetchIssueDateInfo(url);
		if (info) {
			_issueCache.set(url, info);
		}
	}
	// Object.assign ignores a null/undefined source.
	Object.assign(item, info);
}

// Requests the issue page at `url` and parses its date line, which is expected
// to look like
// "Volume XIV, Number 4 | [season/month-range] 2022".
//
// Resolves to an object holding whichever of `volume`, `issue` and `date`
// (the year) could be recognized, or to null when the page cannot be
// requested, carries no date line, or none of the three parts is usable.
// Parts that cannot be parsed are omitted rather than reported as NaN, so
// that garbage never reaches the item or the cache.
async function fetchIssueDateInfo(url) {
	let doc;
	try {
		doc = await requestDocument(url);
	}
	catch (err) {
		Z.debug(`Failed to request ${url} for issue/date info.`);
		return null;
	}

	const dateText = ZU.trimInternal(text(doc, "p.date") || "");
	if (!dateText) {
		Z.debug(`Issue/date info unexpectedly missing at ${url}`);
		return null;
	}

	// Segments are separated by "," or "|": "Volume XIV" / "Number 4" /
	// "Fall 2022". Anything after the second separator is searched for the
	// year, so a stray extra separator does not hide it.
	const [volumePart = "", issuePart = "", ...dateParts] = dateText
		.split(/[,|]/)
		.map(part => part.trim());

	const info = {};

	// The value is the second word of the segment ("Volume XIV" -> "XIV").
	const volumeToken = volumePart.split(" ")[1];
	if (volumeToken) {
		// romanToInt() lives elsewhere in this file; its result is stored
		// as-is (presumably an integer -- confirm against its definition).
		const volume = romanToInt(volumeToken);
		if (volume != null && !Number.isNaN(volume)) {
			info.volume = volume;
		}
	}

	const issue = Number.parseInt(issuePart.split(" ")[1], 10);
	if (!Number.isNaN(issue)) {
		info.issue = issue;
	}

	// The season/month label may span several words ("September – November
	// 2008"), so look for the first 4-digit run instead of the second word.
	const yearMatch = dateParts.join(" ").match(/\b(\d{4})\b/);
	if (yearMatch) {
		info.date = Number.parseInt(yearMatch[1], 10);
	}

	if (!Object.keys(info).length) {
		Z.debug(`Unrecognized issue/date info "${dateText}" at ${url}`);
		return null;
	}
	return info;
}

// Get the rights and translator info for Voices in Time if any.
//
// Only the last paragraph under ".content-wrapper" is inspected. Returns
// false when there is no such paragraph or it is empty; otherwise an object
// that may carry `rights` (string) and/or `translators` (array produced by
// parseAuthors). The object is empty when neither pattern matches.
function getVITRightsTrans(doc) {
	const paragraphs = doc.querySelectorAll(".content-wrapper > p");
	if (!paragraphs.length) {
		return false;
	}

	const lastParagraph = paragraphs.item(paragraphs.length - 1);
	const credit = ZU.trimInternal(lastParagraph.textContent);
	if (!credit) {
		return false;
	}

	const info = {};

	// . [optional words ](C) yyyy[ by name] ... (full stop)
	const rightsMatch = credit.match(/(?:^|\.\s+)((?:\w+\s+)*©\s+\d+.+?\.)/im);
	if (rightsMatch) {
		info.rights = rightsMatch[1];
	}

	// Translator. "Translated by ... [stop or semicolon]"
	const translatorMatch = credit.match(/(?:^|\.\s+)translated by (.+?)[.;]/i);
	if (translatorMatch) {
		const translators = parseAuthors(translatorMatch[1], "translator");
		if (translators.length) {
			info.translators = translators;
		}
	}

	return info;
}

// Get the text block under "About this text" for Voices in Time. The block is
// present even if the text has no identifiable author.
+function getVITAboutText(doc) { + const paragraphs = doc.querySelectorAll(".bio-block > p"); + if (!paragraphs.length) { + return ""; + } + + const output = []; + for (const paragraph of paragraphs.values()) { + output.push(ZU.trimInternal(paragraph.textContent.trim())); + } + // Re-inserting paragraph-ending line breaks and add extra line break + // between paragraphs. + return output.join("\n\n"); +} + +// Blog articles. +function applyBlog(doc, item) { + // Blog-article title proper + item.title = text(doc, ".title > h2"); + item.creators = parseAuthors(getArticleAuthorText(doc)); + item.date = getBlogPostDate(doc); + // blogTitle refers to the name of the blog hosted by Lapham's. + item.blogTitle = text(doc, "#page-title"); +} + +function getBlogPostDate(doc) { + const dateText = text(doc, ".pub-date"); + return !!dateText && (new Date(dateText)).toISOString(); +} + +// Returns the author string (for magazine article or blog post). +function getArticleAuthorText(doc) { + // Take the author's byline from the "Contributor" block, which is more + // cumbersome but also more reliable than the byline at ".title .author". + let byline = text(doc, + '.banner-block a[href^="/contributors/"]' // usual place + + ', .bio-heading a[href^="/contributors/"]' // "voices in time" + ); + + if (!byline) { + // Just in case the above didn't work, try this more obvious but less + // generic one. + byline = text(doc, ".title .author"); // Could be p or h2 element. + + // NOTE: failure mode: None of the selectors can locate the element. + if (!byline) return ""; + + // Remove any initial "By ..." + byline = byline.replace(/^(By\s+)?/i, ""); + } + + const authorText = ZU.trimInternal(byline); + + if (authorText === "Lapham’s Quarterly") { + // Skip adding author info when the "author" is the same as the + // publisher. 
+ return ""; + } + + return authorText; +} + +// Podcasts +function applyPodcast(doc, item) { + const podPublication = text(doc, ".title > h1"); + item.seriesTitle = podPublication; + // Date text uses the same DOM element as it is on blog articles. + item.date = getBlogPostDate(doc); + let t = getPodDuration(doc); + if (!Number.isNaN(t)) { + item.runningTime = t; + } + + const mainAudioSelector = ".top-image-block audio > source"; + const epURL = attr(doc, mainAudioSelector, "src"); + item.audioFileType = attr(doc, mainAudioSelector, "type"); + + item.abstractNote = attr(doc, "meta[name='description']", "content"); + + const headingText = text(doc, ".title > h2"); + if (podPublication.toLowerCase() === "the world in time") { + // The EiC's own podcast. + item.creators = [ZU.cleanAuthor("Lewis H. Lapham", "author")]; + item.title = headingText; + // Extract episode number + const epMatch = epURL.match(/episode-(\d+)-/i); + if (epMatch) { + item.episodeNumber = parseInt(epMatch[1]); + } + + const guestName = inferEiCPodGuest(doc, headingText, epURL); + if (guestName) { + item.creators.push(ZU.cleanAuthor(guestName, "guest")); + } + } + else if (podPublication.toLowerCase() === "lq podcast") { + // The metadata for "LQ Podcast" is more difficult to obtain. The + // naming scheme is more diverse, and even if we're tempted to parse + // the audio filename for author info, see this for how it may not + // work: https://www.laphamsquarterly.org/content/poes-terror-soul + const [, ep, title] = headingText.match(/#(\d+)\s+(.+)/i); + item.episodeNumber = parseInt(ep); + item.title = title; + } + + item.attachments.push({ + title: "Audio", + mimeType: item.audioFileType, + url: epURL, + }); +} + +// Get the duration of episode as a string. This can return NaN if the duration +// cannot be scraped from the doc. +function getPodDuration(doc) { + const currTime = text(doc, ".jp-current-time"); + const remainTime = text(doc, ".jp-duration"); // Negative value. 
+ return timeToDuration(parseTime(currTime) - parseTime(remainTime)); +} + +// Parse mm:ss time duration string as number of seconds. Returns NaN if the +// input string does not match the expected format. +function parseTime(str) { + const strTimeMatch = str.match(/(-?\d+):(\d+)/); + if (!strTimeMatch) { + return NaN; + } + let [, m, s] = strTimeMatch; + s = parseInt(s); + let t = parseInt(m) * 60; + t += t > 0 ? s : -s; + return t; +} + +// Convert number of seconds to duration string in h:mm:ss format. +function timeToDuration(s) { + let h = 0; + let m = Math.floor(s / 60); + s %= 60; + if (m > 59) { + h = Math.floor(m / 60); + m %= 60; + } + m = zeroPad(m, 2); + s = zeroPad(s, 2); + return h > 0 ? `${h}:${m}:${s}` : `${m}:${s}`; +} + +// Zero-pad an integer up to length. +function zeroPad(num, length) { + return ZU.lpad(`${num}`, "0", length); +} + +// Find the name of the guest in Lewis Lapham's podcast episode. +// NOTE: Usually the title is the guest's name, but not always. +// See: https://www.laphamsquarterly.org/content/vicars-christ, where the title +// is "Vicars of Christ". +// Therefore we try to infer the name using heuristics: +// 1. "The name often appears in the main-content paragraphs containing the +// words '[Lewis H.] Lapham (verb, speaks/talks) with [title] NAME [punct or +// 'about', but also possibly more noisy words]" +// 2. "It is very likely to be in the title." +// 3. "But if not 2, the name may also appear in the audio source file's name +// in the URL." +function inferEiCPodGuest(doc, headingText, epURL) { + // Take the basename of the episode audio without the last file extension. + // .../url/path/to/(basename).ext?query#frag + let [, epSource] = epURL.match(/^(?:.+\/)(.+)(?:\..+)$/); + + // Try to find name candidate by parsing the paragraph text. 
+ const paragraphs = doc.querySelectorAll(".jp-jplayer ~ p"); + + let nameCandidate; + if (paragraphs) { + const textString = Array.from(paragraphs) + .map(x => ZU.trimInternal(x.textContent.trim())) + .join(" "); + // [Lewis[ H.] ]Lapham [verb] with (noisy name candidate)[, or about ] + // Note here "noisy name candidate" matches leniently, but + // non-greedily. Otherwise the group will match all the way to the last + // comma punct or "about". + const nameMatch = textString + .match(/(?:Lewis(?: H\.)? )Lapham \S+ with ([\S ]+?)(?:,| about)/); + if (nameMatch) { + nameCandidate = nameMatch[1]; + } + } + + // If no useful name candidate is extracted, here's our last ditch effort: + // fall back to using the episode file name alone (this is the case for + // some old episodes). + if (!nameCandidate) { + return epSource.replace(/_/g, " "); + } + + if (nameCandidate.toLowerCase().includes(headingText.toLowerCase())) { + // Name candidate (possibly surrounded by noise) is in title: This + // means the title can be taken to be the guest's name with high + // confidence. + return headingText; + } + else { + // Name candidate found but not in title. In this case, use to the + // audio file's name as a filter to clean up name candidate. + // So we need to case-normalize the episode filename. + epSource = epSource.toLowerCase(); + + // Generate "token stream" from name candidate, e.g. + // "Johann Sebastian Bach" -> ["Johann", "Sebastian", "Bach"] + // "Vita Sackerville-West" -> ["Vita", "Sackerville-West"] + const tokens = nameCandidate.split(" "); + const filteredTokens = []; // output + for (const token of tokens) { + // Token may contain punct such as period or comma as "noise" + // around the word, and apostrophe as internal "noise", but careful + // not to overgeneralize (TODO: replace any dash with one single + // minus-sign-hyphen (0x2D)?). Also, need to normalize and remove + // the diacritics. 
+ const cleanToken + = token + .normalize("NFD") + .replace(/[\u0300-\u036F]/g, "") // Most of diacritics + .toLowerCase() + .split("") // to remove noisy puncts + .filter(x => !(x === "." || x === "," + || x === "'" || x === "’")) + .join(""); + + // Note that we use clean token for logic but original token + // for output. + if (epSource.includes(cleanToken)) { + filteredTokens.push(token); + } + } + return filteredTokens.join(" "); + } +} + +// Utility functions + +// Process author. Parse it as "[possibly Oxford] comma-separated, possibly +// with the word 'and'". +function parseAuthors(str, authorType = "author") { + return str.split(/(?:,|\s+and\s+)/) + .map(s => s.trim()) + .filter(Boolean) + .map(s => ZU.cleanAuthor(s, authorType)); +} + +// Convert from Roman numeral to integer. Note that the function assumes a +// correctly formed Roman numeral using letters up to C. +const ROMAN_NUMERAL = { + I: 1, + V: 5, + X: 10, + L: 50, + C: 100 // "D" and "M" unlikely to be encountered any time soon. +}; + +function romanToInt(str) { + return str.split("") + .map(i => ROMAN_NUMERAL[i.toUpperCase()]) + .reduce((sum, curValue, cur, arr) => { + const prev = cur - 1; + const trySum = sum + curValue; + if (cur > 0 && arr[prev] < curValue) { + // Should subtract instead of add. 
+ return trySum - arr[prev] * 2; + } + return trySum; + }, 0); +} + +/** BEGIN TEST CASES **/ +var testCases = [ + { + "type": "web", + "url": "https://www.laphamsquarterly.org/search/node/mesopotamian", + "items": "multiple" + }, + { + "type": "web", + "url": "https://www.laphamsquarterly.org/education/schoolboy-where-are-you-going", + "items": [ + { + "itemType": "magazineArticle", + "title": "Schoolboy, Where Are You Going?", + "creators": [ + { + "firstName": "Moudhy", + "lastName": "Al-Rashid", + "creatorType": "author" + } + ], + "date": 2022, + "ISSN": "1935-7494", + "abstractNote": "Scribal education in the ancient Mesopotamian tablet house.", + "issue": 4, + "language": "en", + "libraryCatalog": "Lapham's Quarterly", + "publicationTitle": "Lapham’s Quarterly", + "url": "https://www.laphamsquarterly.org/education/schoolboy-where-are-you-going", + "volume": 14, + "attachments": [ + { + "title": "Snapshot", + "mimeType": "text/html" + } + ], + "tags": [], + "notes": [], + "seeAlso": [] + } + ] + }, + { + "type": "web", + "url": "https://www.laphamsquarterly.org/roundtable/double-vision", + "items": [ + { + "itemType": "blogPost", + "title": "Double Vision", + "creators": [ + { + "firstName": "Frank", + "lastName": "Gonzalez-Crussi", + "creatorType": "author" + } + ], + "date": "2023-03-14T16:00:00.000Z", + "blogTitle": "Roundtable", + "language": "en", + "url": "https://www.laphamsquarterly.org/roundtable/double-vision", + "attachments": [ + { + "title": "Snapshot", + "mimeType": "text/html" + } + ], + "tags": [], + "notes": [], + "seeAlso": [] + } + ] + }, + { + "type": "web", + "url": "https://www.laphamsquarterly.org/roundtable/ancient-mesopotamian-tablet-cookbook", + "items": [ + { + "itemType": "blogPost", + "title": "The Ancient Mesopotamian Tablet as Cookbook", + "creators": [ + { + "firstName": "Gojko", + "lastName": "Barjamovic", + "creatorType": "author" + }, + { + "firstName": "Patricia Jurado", + "lastName": "Gonzalez", + "creatorType": "author" + 
}, + { + "firstName": "Chelsea A.", + "lastName": "Graham", + "creatorType": "author" + }, + { + "firstName": "Agnete W.", + "lastName": "Lassen", + "creatorType": "author" + }, + { + "firstName": "Nawal", + "lastName": "Nasrallah", + "creatorType": "author" + }, + { + "firstName": "Pia M.", + "lastName": "Sörensen", + "creatorType": "author" + } + ], + "date": "2019-06-10T16:00:00.000Z", + "blogTitle": "Roundtable", + "language": "en", + "url": "https://www.laphamsquarterly.org/roundtable/ancient-mesopotamian-tablet-cookbook", + "attachments": [ + { + "title": "Snapshot", + "mimeType": "text/html" + } + ], + "tags": [], + "notes": [], + "seeAlso": [] + } + ] + }, + { + "type": "web", + "url": "https://www.laphamsquarterly.org/content/peter-s-goodman", + "items": [ + { + "itemType": "podcast", + "title": "Peter S. Goodman", + "creators": [ + { + "firstName": "Lewis H.", + "lastName": "Lapham", + "creatorType": "author" + }, + { + "firstName": "Peter S.", + "lastName": "Goodman", + "creatorType": "guest" + } + ], + "abstractNote": "“Davos Man’s domination of the gains of globalization,” journalist Peter S. 
Goodman writes in Davos Man: How the Billionaires Devoured the World, “is how the United States found itself led by a patently unqualified casino developer as it grappled with a public health emergency that killed more Americans than those who died in World War I, World War II, and the Vietnam War", + "audioFileType": "audio/mpeg", + "episodeNumber": 87, + "language": "en", + "runningTime": "35:51", + "seriesTitle": "The World in Time", + "url": "https://www.laphamsquarterly.org/content/peter-s-goodman", + "attachments": [ + { + "title": "Audio", + "mimeType": "audio/mpeg" + }, + { + "title": "Snapshot", + "mimeType": "text/html" + } + ], + "tags": [], + "notes": [], + "seeAlso": [] + } + ] + }, + { + "type": "web", + "url": "https://www.laphamsquarterly.org/content/soviets-spies", + "items": [ + { + "itemType": "podcast", + "title": "Soviets & Spies", + "creators": [], + "abstractNote": "Did an Englishman assist in the murder of Rasputin? Did a man knowns as the “Ace of Spies” almost carry off the assassination of the entire Bolshevik power structure? Did British agents really use semen as an invisible ink? 
Giles Milton, author of Russian Roulette: How British Spies Thwarted Lenin's Plot for Global Revolution, has the answers.", + "audioFileType": "audio/mpeg", + "episodeNumber": 61, + "language": "en", + "runningTime": "43:42", + "seriesTitle": "LQ Podcast", + "url": "https://www.laphamsquarterly.org/content/soviets-spies", + "attachments": [ + { + "title": "Audio", + "mimeType": "audio/mpeg" + }, + { + "title": "Snapshot", + "mimeType": "text/html" + } + ], + "tags": [], + "notes": [], + "seeAlso": [] + } + ] + }, + { + "type": "web", + "url": "https://www.laphamsquarterly.org/content/paradise-city", + "items": [ + { + "itemType": "podcast", + "title": "To the Paradise City", + "creators": [ + { + "firstName": "Lewis H.", + "lastName": "Lapham", + "creatorType": "author" + }, + { + "firstName": "Brook", + "lastName": "Wilensky-Lanford", + "creatorType": "guest" + } + ], + "abstractNote": "Lewis Lapham talks with author Brook Wilensky-Lanford about the search for Adam and Eve’s hometown.", + "audioFileType": "audio/mpeg", + "language": "en", + "runningTime": "16:27", + "seriesTitle": "The World in Time", + "url": "https://www.laphamsquarterly.org/content/paradise-city", + "attachments": [ + { + "title": "Audio", + "mimeType": "audio/mpeg" + }, + { + "title": "Snapshot", + "mimeType": "text/html" + } + ], + "tags": [], + "notes": [], + "seeAlso": [] + } + ] + }, + { + "type": "web", + "url": "https://www.laphamsquarterly.org/content/death-nothing-us", + "items": [ + { + "itemType": "podcast", + "title": "Death Is Nothing to Us", + "creators": [ + { + "firstName": "Lewis H.", + "lastName": "Lapham", + "creatorType": "author" + }, + { + "firstName": "Stephen", + "lastName": "Greenblatt", + "creatorType": "guest" + } + ], + "abstractNote": "Historian Stephen Greenblatt writes of “the concentrated force of the buried past” in The Swerve, his 2011 National Book Award winner in nonfiction.", + "audioFileType": "audio/mpeg", + "language": "en", + "runningTime": "20:05", + 
"seriesTitle": "The World in Time", + "url": "https://www.laphamsquarterly.org/content/death-nothing-us", + "attachments": [ + { + "title": "Audio", + "mimeType": "audio/mpeg" + }, + { + "title": "Snapshot", + "mimeType": "text/html" + } + ], + "tags": [], + "notes": [], + "seeAlso": [] + } + ] + }, + { + "type": "web", + "url": "https://www.laphamsquarterly.org/content/roosevelt-montas", + "items": [ + { + "itemType": "podcast", + "title": "Roosevelt Montás", + "creators": [ + { + "firstName": "Lewis H.", + "lastName": "Lapham", + "creatorType": "author" + }, + { + "firstName": "Roosevelt", + "lastName": "Montás", + "creatorType": "guest" + } + ], + "abstractNote": "“In my sophomore year of high school, I came upon a remarkable book in a garbage pile next to the house where we rented an apartment in Queens,” scholar Roosevelt Montás writes at the beginning of Rescuing Socrates: How the Great Books Changed My Life and Why They Matter for a New Generation. “It was the second volume of the pretentiously bound Harvard Classics series, and it", + "audioFileType": "audio/mpeg", + "episodeNumber": 85, + "language": "en", + "runningTime": "32:19", + "seriesTitle": "The World in Time", + "url": "https://www.laphamsquarterly.org/content/roosevelt-montas", + "attachments": [ + { + "title": "Audio", + "mimeType": "audio/mpeg" + }, + { + "title": "Snapshot", + "mimeType": "text/html" + } + ], + "tags": [], + "notes": [], + "seeAlso": [] + } + ] + }, + { + "type": "web", + "url": "https://www.laphamsquarterly.org/content/andrew-j-oshaughnessy", + "items": [ + { + "itemType": "podcast", + "title": "Andrew J. O’Shaughnessy", + "creators": [ + { + "firstName": "Lewis H.", + "lastName": "Lapham", + "creatorType": "author" + }, + { + "firstName": "Andrew J.", + "lastName": "O’Shaughnessy", + "creatorType": "guest" + } + ], + "abstractNote": "“Existing biographies of Thomas Jefferson,” the historian Andrew J. 
O’Shaughnessy writes in The Illimitable Freedom of the Human Mind: Thomas Jefferson’s Idea of a University, treat the retired president’s singular founding of a university “as merely an epilogue, while institutional histories give little consideration to the biographical context…Beginning at the age of", + "audioFileType": "audio/mpeg", + "episodeNumber": 84, + "language": "en", + "runningTime": "32:41", + "seriesTitle": "The World in Time", + "url": "https://www.laphamsquarterly.org/content/andrew-j-oshaughnessy", + "attachments": [ + { + "title": "Audio", + "mimeType": "audio/mpeg" + }, + { + "title": "Snapshot", + "mimeType": "text/html" + } + ], + "tags": [], + "notes": [], + "seeAlso": [] + } + ] + }, + { + "type": "web", + "url": "https://www.laphamsquarterly.org/freedom/andrey-kurkov-picks-his-pen", + "items": [ + { + "itemType": "magazineArticle", + "title": "Andrey Kurkov Picks Up His Pen", + "creators": [ + { + "firstName": "Andrey", + "lastName": "Kurkov", + "creatorType": "author" + } + ], + "date": 2023, + "ISSN": "1935-7494", + "abstractNote": "On the freedom to write in Ukraine.", + "issue": 1, + "language": "en", + "libraryCatalog": "Lapham's Quarterly", + "publicationTitle": "Lapham’s Quarterly", + "rights": "Copyright © 2022 by Andrey Kurkov.", + "url": "https://www.laphamsquarterly.org/freedom/andrey-kurkov-picks-his-pen", + "volume": 15, + "attachments": [ + { + "title": "Snapshot", + "mimeType": "text/html" + } + ], + "tags": [], + "notes": [ + "From a speech delivered at the PEN World Voices Festival. The son of a doctor and a pilot, Kurkov trained as a Japanese translator and began writing novels while serving as a prison guard in Odesa. His novel Grey Bees, which he wrote after meeting refugees in Kyiv who made regular trips to the Donbas to deliver medicine, depicts the 2014 war through the perspective of a beekeeper. “For Ukrainians, freedom is more important than stability,” Kurkov said in a March 2022 interview. 
“For Russians, it is the opposite. Ukrainians change their presidents at each election, Russians keep their tsar until the tsar is dead.”" + ], + "seeAlso": [] + } + ] + }, + { + "type": "web", + "url": "https://www.laphamsquarterly.org/freedom/we-refuse-logic", + "items": [ + { + "itemType": "magazineArticle", + "title": "We Refuse This Logic", + "creators": [ + { + "firstName": "Arlen", + "lastName": "Austin", + "creatorType": "translator" + } + ], + "date": 2023, + "ISSN": "1935-7494", + "abstractNote": "The problem is not abortion.", + "issue": 1, + "language": "en", + "libraryCatalog": "Lapham's Quarterly", + "publicationTitle": "Lapham’s Quarterly", + "rights": "Translation copyright © 2022 by Arlen Austin.", + "url": "https://www.laphamsquarterly.org/freedom/we-refuse-logic", + "volume": 15, + "attachments": [ + { + "title": "Snapshot", + "mimeType": "text/html" + } + ], + "tags": [], + "notes": [ + "Movimento di Lotta Femminile di Padova, from “Pregnancy and Abortion.” In June 1971 Mariarosa Dalla Costa, who had been active in the Italian workers’ movement, convened a meeting in Padua to discuss demanding wages for housework. The meeting led to the formation of what came to be called Lotta Femminista, which produced pamphlets, conducted studies, and documented its militant activity. This manifesto was later published in Dalla Costa and Selma James’ book The Power of Women and the Subversion of the Community." 
+ ], + "seeAlso": [] + } + ] + }, + { + "type": "web", + "url": "https://www.laphamsquarterly.org/youth/sweet-and-cold", + "items": [ + { + "itemType": "magazineArticle", + "title": "Sweet and Cold", + "creators": [ + { + "firstName": "Xu", + "lastName": "Wei", + "creatorType": "author" + }, + { + "firstName": "Jonathan", + "lastName": "Chaves", + "creatorType": "translator" + } + ], + "date": 2014, + "ISSN": "1935-7494", + "abstractNote": "What a shame that I have carried a boy, as he ate some candy, to his death.", + "issue": 3, + "language": "en", + "libraryCatalog": "Lapham's Quarterly", + "publicationTitle": "Lapham’s Quarterly", + "rights": "© 1986, Columbia University Press.", + "url": "https://www.laphamsquarterly.org/youth/sweet-and-cold", + "volume": 7, + "attachments": [ + { + "title": "Snapshot", + "mimeType": "text/html" + } + ], + "tags": [], + "notes": [ + "“A Kite.” After failing the civil-service examination on eight occasions, Xu became the personal secretary to a military commander in 1558 and assisted in defending his hometown from the attacks of Japanese pirates. After his patron’s downfall and death, he was faced with serious professional difficulties and, either insane or faking it effectively, attempted suicide by pushing an awl through his ear and pounding his testicles with a hammer. Later, he killed his third wife and went to prison but won release after seven years." 
+ ], + "seeAlso": [] + } + ] + } +] +/** END TEST CASES **/ diff --git a/Lexis+.js b/Lexis+.js new file mode 100644 index 0000000000..f7357fffcf --- /dev/null +++ b/Lexis+.js @@ -0,0 +1,264 @@ +{ + "translatorID": "419638d9-9049-44ad-ba08-fa54ed24b5e6", + "label": "Lexis+", + "creator": "bfahrenfort", + "target": "^https?://plus\\.lexis\\..*/", + "minVersion": "5.0", + "maxVersion": "", + "priority": 100, + "inRepository": true, + "translatorType": 4, + "browserSupport": "gcsibv", + "lastUpdated": "2023-04-10 03:15:48" +} + +/* + ***** BEGIN LICENSE BLOCK ***** + + Copyright © 2023 Brandon Fahrenfort + + This file is part of Zotero. + + Zotero is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Zotero is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with Zotero. If not, see . + + ***** END LICENSE BLOCK ***** +*/ + +function detectWeb(doc, _url) { + if (doc.title.includes("results")) { + return "multiple"; + } + else if (/[a-zA-Z. ]+\s§\s\d+/.test(doc.title) + || /act/i.test(doc.title) + || /p\.l\./i.test(doc.title)) { // Match: ... Tex. Bus. & Com. Code § 26.01 ... + return "statute"; + } + else if (/\d+\s[a-zA-Z0-9. ]+\s\d+/.test(doc.title)) { // Match: ... 5 U.S. 137 ... 
+ return "case"; + } + // TODO secondary sources + + return false; +} + +function getSearchResults(doc, url) { + var items = {}; + var nextTitle; + + if (detectWeb(doc, url) == "multiple") { + // TODO check what type of element it is (currently only working for 'cases' searches) + let titles = doc.querySelectorAll('a.titleLink'); + let dates = doc.querySelectorAll('span.metaDataItem'); // Not technically only dates, but that's all I use it for atm + var nextDate; + var dateOffset = 1; + + // dates[0] is first court name + nextDate = dates[dateOffset]; + dateOffset += 3; + // dates[2] is first citation + for (var i = 0; i < titles.length; i++) { + nextTitle = titles[i]; + items[nextTitle.href] = nextTitle.textContent + "(" + nextDate.textContent + ")"; + + // dates[0] is court name + nextDate = dates[dateOffset]; + + // dates[2] is a citation + } + + return items; + } + + return false; +} + +async function doWeb(doc, url) { + if (detectWeb(doc, url) == 'multiple') { + let items = await Zotero.selectItems(getSearchResults(doc, url)); + if (!items) return; + for (let url of Object.keys(items)) { + await scrape(await requestDocument(url)); + } + } + else { + await scrape(doc, url); + } +} + +async function scrape(doc, url) { + var title = text(doc, 'h1#SS_DocumentTitle'); + + if (detectWeb(doc, url) == "case") { + var newCase = new Zotero.Item("case"); + //newCase.url = doc.location.href; // Disabled for style reasons + + newCase.title = title; + + newCase.notes.push({ note: "Snapshot: " + newCase.title + doc.getElementById('document-content').innerHTML }); + + let citation = text(doc, 'span.active-reporter'); + newCase.reporterVolume = citation.substring(0, citation.indexOf(' ')); + newCase.reporter = citation.substring(citation.indexOf(' ') + 1, citation.lastIndexOf(' ')); + newCase.firstPage = citation.substring(citation.lastIndexOf(' ') + 1); + + newCase.court = text(doc, 'p.SS_DocumentInfo', 0); + + newCase.dateDecided = text(doc, 'span.date'); + + let docket = 
text(doc, 'p.SS_DocumentInfo', 2); + if (/^no\./i.test(docket) + || /^\d+/.test(docket) + || /^case no\./i.test(docket)) { + newCase.docketNumber = docket; // This won't be in perfect cite form, shouldn't be a hassle unless you're citing dozens of memorandum opinions + } + + newCase.complete(); + } + else if (detectWeb(doc, url) == "statute") { + var newStatute = new Zotero.Item("statute"); + + //newStatute.url = doc.location.href; // Disabled for style reasons + + newStatute.title = title; + + newStatute.notes.push({ note: "Snapshot: " + newStatute.title + doc.getElementById('document-content').innerHTML }); + + let info = text(doc, 'p.SS_DocumentInfo'); + + let isolation = info.substring(info.search( + /\b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)/i + )); // isolate date on the frontend + newStatute.dateEnacted = isolation.substring(0, isolation.search(/[1-2][0-9][0-9][0-9]/) + 4); + + if (/act/i.test(title) + || /of\s[1-2][0-9][0-9][0-9]/i.test(title)) { // Session law, or act, not codified statute + // BB 21st ed. requires parallel cite to Pub. L. No. and Stat. for session laws + + // Title formatting + // TODO ZU's capitalizer is good, but doesn't work with closed-up abbreviations like P.L. or U.S. + // This could break titles in future. I do remove P.L. below though. + if (title === title.toUpperCase()) title = ZU.capitalizeTitle(title.toLowerCase(), true); // Some acts are capitalized + + // Remove some unnecessary information + var cleanedTitle = title; + let pLCite = title.match(/(\d+ p\.l\. \d+)/i); + let statCite = title.match(/(\d+ stat\. 
\d+)/i); + let enactedCite = title.match(/\d+ enacted [a-zA-Z0-9.]+ \d+/gi); + let part = title.match(/(part \d+(?: of \d+)?)/i); + if (pLCite) cleanedTitle = cleanedTitle.replace(pLCite[1], ''); + if (statCite) cleanedTitle = cleanedTitle.replace(statCite[1], ''); + if (part) cleanedTitle = cleanedTitle.replace(part[1], ''); + if (enactedCite) { + // Remove every enacted cite + for (var value of Object.values(enactedCite)) { + cleanedTitle = cleanedTitle.replace(value, ''); + } + } + cleanedTitle = cleanedTitle.replace(/(^\s*,)|(,\s*$)/g, ''); // Trim commas and whitespace + cleanedTitle = cleanedTitle.replace(/(^\s*,)|(,\s*$)/g, ''); // Another one + if (ZU.trim(cleanedTitle) === "") { // If the title's empty now, put it as the highest precedence citation in the title + if (pLCite) cleanedTitle = pLCite[1]; + else if (statCite) cleanedTitle = statCite[1]; + else if (enactedCite) { + cleanedTitle = enactedCite[0] + " & " + (Object.keys(enactedCite).length - 1) + " more"; + } + } + newStatute.title = cleanedTitle; + + // Reporter & citation formatting + var statutesAtLarge, publicLawNo; + let potentialReporter = text(doc, 'a.SS_ActiveRptr'); + if (potentialReporter) { // Sometimes Lexis is weird and doesn't give an ActiveRptr + if (/stat\./i.test(potentialReporter)) statutesAtLarge = potentialReporter; + else if (/pub\./i.test(potentialReporter) + || /p\.l\./i.test(potentialReporter)) { + publicLawNo = potentialReporter; + } + } + + let otherReporters = doc.querySelectorAll('span.SS_NonPaginatedRptr'); + + for (var i = 0; i < otherReporters.length; i++) { + var nextReporter = otherReporters[i].textContent; + if (/stat\./i.test(nextReporter)) statutesAtLarge = nextReporter; + else if (/pub\./i.test(nextReporter) + || /p\.l\./i.test(nextReporter)) { + publicLawNo = nextReporter; + } + } + + // Turn publicLawNo into the public law fields + if (/\d+-\d+/.test(publicLawNo)) { // Ex. P.L. 
115-164 + let numPos = publicLawNo.search(/\d+-\d+/); + newStatute.publicLawNumber = publicLawNo.substring(numPos, publicLawNo.substring(numPos + 1).indexOf(' ')); // Gets 115-164 + + newStatute.session = newStatute.publicLawNumber.substring(0, newStatute.publicLawNumber.indexOf('-')); + } + else { // Ex. 115 P.L. 164 + let pLNumbers = publicLawNo.match(/(\d+) p\.l\. (\d+)/i); + newStatute.session = pLNumbers[1]; + newStatute.publicLawNumber = pLNumbers[1] + '-' + pLNumbers[2]; + } + + // Turn statutesAtLarge into the code#/code/section fields + // TODO in styles, check for "Stat." as the code, and if so, don't append a section symbol + let statNumbers = statutesAtLarge.match(/(\d+) stat\. (\d+)/i); + newStatute.codeNumber = statNumbers[1]; + newStatute.code = "Stat."; + newStatute.section = statNumbers[2]; + } + else { // Codified statute + // Title & citation formatting + if (title.match(/^\d+/)) { // Starts with digit, organized by title, ex. 47 U.S.C.S. § 230 + // Sadly, named groups aren't working + let groups = title.match(/^(\d+)\s([a-zA-Z0-9. ]+) § ([0-9.()a-zA-Z]+)/); + newStatute.codeNumber = groups[1]; + newStatute.code = groups[2]; + newStatute.section = groups[3]; + } + else { // Starts with letter, organized by code, ex. Tex. Bus. & Com. Code § 26.01 + let groups = title.match(/^([a-zA-Z&. ]+) § ([0-9.()a-zA-Z]+)/); + newStatute.code = groups[1]; + newStatute.section = groups[2]; + } + + // Reporter formatting, theoretically unnecessary but nice to have if it's there + /* + * Matches: + * P.L. 117-327 + * Pub. L. 117-327 + * Pub. Law 117-327 + * Pub. L. No. 117-327 + * Pub. Law No. 117-327 + * Public Law 117-327 + * Public Law Number 117-327 + * Public Law No. 117-327 + */ + let pL = info.match(/(p\.l\.|pub\. 
l(?:aw|\.)(?: no\.)?|public law(?: number| no\.)?)\s(\d+-\d+)/i); + if (pL) newStatute.publicLawNumber = pL[2]; + + if (newStatute.publicLawNumber) newStatute.session = newStatute.publicLawNumber.substring(0, newStatute.publicLawNumber.indexOf('-')); + } + + newStatute.notes.push({ note: "Document Info: " + info }); // Since the info section is all over the place, just dump the whole thing in for manual cite checks + + newStatute.complete(); + } +} + + +/** BEGIN TEST CASES **/ +var testCases = [ +] +/** END TEST CASES **/ diff --git a/newspapers.com.js b/newspapers.com.js index dd7caf6a49..abfa0f20ae 100644 --- a/newspapers.com.js +++ b/newspapers.com.js @@ -2,14 +2,14 @@ "translatorID": "22dd8e35-02da-4968-b306-6efe0779a48d", "label": "newspapers.com", "creator": "Peter Binkley", - "target": "^https?://www\\.newspapers\\.com/clip/", + "target": "^https?://[^/]+\\.newspapers\\.com/(clip|article)/", "minVersion": "3.0", "maxVersion": "", "priority": 100, "inRepository": true, "translatorType": 4, "browserSupport": "gcsibv", - "lastUpdated": "2020-10-29 03:32:09" + "lastUpdated": "2023-04-05 15:26:20" } /* @@ -39,23 +39,17 @@ function detectWeb(_doc, _url) { return "newspaperArticle"; } +function doWeb(doc, url) { + if (url.includes('/clip/')) { + scrapeClip(doc, url); + } + else { + scrapeArticle(doc, url); + } +} -function doWeb(doc, _url) { +function scrapeClip(doc, url) { var newItem = new Zotero.Item("newspaperArticle"); - var scripts = doc.getElementsByTagName("script"); - var json = ''; - var jsonre = /var staPageDetail = JSON.parse\((.+?)\);/; - for (var i = 0; i < scripts.length; i++) { - var arr = scripts[i].textContent.match(jsonre); - if (arr) { - json = arr[1]; - break; - } - } - - // one JSON.parse to unstringify the json string, and one to parse it into an object - // the replace fixes escaped apostrophes in the source, which JSON.parse considers invalid - var details = JSON.parse(JSON.parse(json.replace(/\\'/g, "'"))); var metaArr = {}; var metaTags 
= doc.getElementsByTagName("meta"); @@ -64,9 +58,9 @@ function doWeb(doc, _url) { metaArr[metaTag.getAttribute("property")] = metaTag.getAttribute("content"); } } - newItem.title = details.citation.title; + newItem.title = text(doc, '#mainContent h1') || text(doc, '[itemprop="about"]'); // remove the unnecessary xid param - newItem.url = details.citation.url.replace(/\?xid=[0-9]*$/, ""); + newItem.url = attr(doc, 'link[rel="canonical"]', 'href'); /* The user can append the author to the title with a forward slash @@ -82,41 +76,69 @@ function doWeb(doc, _url) { newItem.creators.push(Zotero.Utilities.cleanAuthor(author, "author")); } } - - newItem.abstractNote = details.media.note; - var uniqueID = newItem.url.match(/\/clip\/(\d+)/)[1]; - var pdfurl = "https://www.newspapers.com/clippings/download/?id=" + uniqueID; - newItem.attachments.push({ - title: "Full Text PDF", - mimeType: "application/pdf", - url: pdfurl - }); - - newItem.publicationTitle = details.source.publisherName; + newItem.publicationTitle = text(doc, '[itemprop="name"]'); // details["source"]["title"] gives a string like // "Newspapers.com - The Akron Beacon Journal - 1939-10-30 - Page Page 15" - var editiontokens = details.source.title.replace(/ - /g, "|").split("|"); - if (editiontokens.length == 3) { // there's an edition label - newItem.edition = editiontokens[1]; - } - newItem.pages = editiontokens.slice(-1)[0].replace(/Page/g, ''); - newItem.date = details.source.publishedDate; - newItem.place = details.source.publishedLocation; + newItem.pages = text(doc, '[itemprop="position"]').replace(/Page/g, ''); + newItem.date = ZU.strToISO(text(doc, '[itemprop="dateCreated"]')); + newItem.place = text(doc, '[itemprop="locationCreated"]'); + + newItem.attachments.push(makeImageAttachment(url)); + newItem.attachments.push(makePDFAttachment(url)); // handle empty title if (newItem.title === "") { - newItem.title = "Clipped From " + newItem.publicationTitle; + newItem.title = "Article clipped from " + 
newItem.publicationTitle + ""; } newItem.complete(); } +function scrapeArticle(doc, url) { + let item = new Zotero.Item('newspaperArticle'); + let json = JSON.parse(text(doc, 'script[type="application/ld+json"]')); + + item.publicationTitle = json.publisher && ZU.unescapeHTML(json.publisher.legalName); + item.title = ZU.trimInternal(ZU.unescapeHTML(json.about)) + || 'Article clipped from ' + item.publicationTitle + ''; + item.abstractNote = ZU.unescapeHTML(json.text); + item.place = ZU.unescapeHTML(json.locationCreated); + item.date = json.datePublished; + item.pages = json.pageStart && ZU.unescapeHTML(json.pageStart.replace('Page', '')); + item.url = attr(doc, 'link[rel="canonical"]', 'href'); + item.attachments.push(makeImageAttachment(url)); + item.attachments.push(makePDFAttachment(url)); + + item.complete(); +} + +function getID(url) { + return url.match(/\/(\d+)/)[1]; +} + +function makePDFAttachment(url) { + return { + title: 'Full Text PDF', + mimeType: 'application/pdf', + url: 'https://www.newspapers.com/clippings/download/?id=' + getID(url) + }; +} + +function makeImageAttachment(url) { + return { + title: 'Image', + mimeType: 'image/jpeg', + url: 'https://img.newspapers.com/img/img?clippingId=' + getID(url) + }; +} + /** BEGIN TEST CASES **/ var testCases = [ { "type": "web", "url": "https://www.newspapers.com/clip/7960447/my-day-eleanor-roosevelt/", + "detectedItemType": "newspaperArticle", "items": [ { "itemType": "newspaperArticle", @@ -135,6 +157,10 @@ var testCases = [ "publicationTitle": "The Akron Beacon Journal", "url": "https://www.newspapers.com/clip/7960447/my-day-eleanor-roosevelt/", "attachments": [ + { + "title": "Image", + "mimeType": "image/jpeg" + }, { "title": "Full Text PDF", "mimeType": "application/pdf" @@ -149,10 +175,11 @@ var testCases = [ { "type": "web", "url": "https://www.newspapers.com/clip/18535448/the-sunday-leader/", + "detectedItemType": "newspaperArticle", "items": [ { "itemType": "newspaperArticle", - "title": "Clipped 
From The Sunday Leader", + "title": "Article clipped from The Sunday Leader", "creators": [], "date": "1887-07-17", "libraryCatalog": "newspapers.com", @@ -161,6 +188,10 @@ var testCases = [ "publicationTitle": "The Sunday Leader", "url": "https://www.newspapers.com/clip/18535448/the-sunday-leader/", "attachments": [ + { + "title": "Image", + "mimeType": "image/jpeg" + }, { "title": "Full Text PDF", "mimeType": "application/pdf" @@ -175,6 +206,7 @@ var testCases = [ { "type": "web", "url": "https://www.newspapers.com/clip/31333699/driven-from-governors-office-ohio/", + "detectedItemType": "newspaperArticle", "items": [ { "itemType": "newspaperArticle", @@ -187,6 +219,74 @@ var testCases = [ "publicationTitle": "Rushville Republican", "url": "https://www.newspapers.com/clip/31333699/driven-from-governors-office-ohio/", "attachments": [ + { + "title": "Image", + "mimeType": "image/jpeg" + }, + { + "title": "Full Text PDF", + "mimeType": "application/pdf" + } + ], + "tags": [], + "notes": [], + "seeAlso": [] + } + ] + }, + { + "type": "web", + "url": "https://www.newspapers.com/article/the-times-picayune-telegraphed-to-the-ne/120087578/", + "detectedItemType": "newspaperArticle", + "items": [ + { + "itemType": "newspaperArticle", + "title": "Telegraphed to the New Orleans Picayune. Latest from Charleston. Fort Sumter Returns Fire", + "creators": [], + "date": "1861-04-13", + "abstractNote": "Telegraphed to the New Orleans Picayune. LATEST FROM CHARLESTON. FORT SUMTER RETflUS FIRE. SULLI VAN12AND MORRIS ISLAND BATTERIES AT WORK. BREACH MADE IN FORT SUMTER. War Vessels Reported Outside. By the Southwestern Line. Charleston, April 12. The batteries of Sullivan's Island, Morris Island and other points opened fire on Fort Sumter at half - past four o'clock this morning. Fort Sumter returned the fire. A brisk cannonading is being kept up. There is no infoimation from the seaboard. The military are under arms. 
The whole population is on the streets, and the harbor is filled with anxious spectators. SECONB DISPATCH. The Moating battery is doing good service. Up to eleven o clock there has been no loea on our side. Fort Sumter replied at 7 o'clock this morning, and has kept up an astonishing fire ever since. Stevens's battery is slightly injured. Three sbejls are fired per minute. Four hundred, in all, have fallen. A breach is expected to be made in Fort Sumter to - morrow. Major Anderson's fire is principally directed I against the floating battery. j War vessels are reported outside the harbor. Only two soldiers are wounded on Salli - ! van's Island. The range is more perfect from the land batteries. Every shot tells. It ia thought from Mnjor Anderson's fire thai he haa more men than was supposed. Fort Sumter will succumb by to - morrow. It is raining at Charleston, but there - is no cessation of the batteries. A continuous steady fire on both sides is beinc kept up. The cutter Harriet Lane, and the steam gnu boat Crntader, are reported olf the bar, but have not entered the harbor. The War Department have as yet no official diepatches. (Jen. Beauregard was at the batteries all day. , The Government expects Fort Sumter to succumb to - morrow. third dispatch The firing continued all day. Two of Fort Sumter's guns are silenced, and it is reported a breach has been made through the southeast wall. No casualty has yet happened to any of the forces. Only seven of the nineteen batteries have opened fire on Fort Sumter. The remainder are held ready for the expected fleet. Two thousand men reached the city this morning and immediately embarked for Morris Island. FOURTH DI fAT H. Charleston, April 10, 11 P. M. Tne bombardment of Fort Saniter is going on every twenty minutes from the mortars It is supposed Major Anderson is resting his men for the night. Three vessels of war are reported outside tho bar. They cannot get in on account of the roughness of the sea. 
No one has as yet received any injury. The floating battery works admirably well. Every inlet to the harbor is well guarded. Our forces are having a lively time of it.", + "libraryCatalog": "newspapers.com", + "pages": "4", + "place": "New Orleans, Louisiana", + "publicationTitle": "The Times-Picayune", + "url": "https://www.newspapers.com/article/the-times-picayune-telegraphed-to-the-ne/120087578/", + "attachments": [ + { + "title": "Image", + "mimeType": "image/jpeg" + }, + { + "title": "Full Text PDF", + "mimeType": "application/pdf" + } + ], + "tags": [], + "notes": [], + "seeAlso": [] + } + ] + }, + { + "type": "web", + "url": "https://nydailynews.newspapers.com/article/daily-news/121098969/", + "detectedItemType": "newspaperArticle", + "items": [ + { + "itemType": "newspaperArticle", + "title": "Article clipped from Daily News", + "creators": [], + "date": "1965-02-26", + "abstractNote": "Donavena 8-5 Choice; Can He Kayo Folley? By Jim McCulIey Professional oddsmakers, otherwise referred to as bookies, evidently are counting on Oscar (Ringo) Bona- vena to flatten Zora Folley the Garden. Otherwise, why young thumper from Argentina an 8-5 favorite over the ringwise No. 5 heavyweight contender from Arizona? Only two fighters in Folley's 13-year career have outpointed him England's Henry Cooper, in London, and big Ernie Terrell, in New York. Five men have stopped Zora, however. IT DOESN'T SEEM possible Bonavena, with only eight pro fights under his belt, could win a decision over tonight's 32-year-old opponent. Oscar has stopped seven of his eight opponents, however, and, of course, does have a powerful body and a punishing punch in either mitt. The fight mob is really puzzled over this fight. Some of those well versed in fisticuffs can't understand how the odds-bodkins can make 22-year-old Bonavena such a big favorite. Some 10,000 fans are expected to come see for themselves and put another $40,- 000 into the current boxing revival. 
\"I KNOW FOLLEY dogs it at times,\" said a former heavyweight contender, who did not want to be named because he is now an official with the boxing commission. ''But Bonavena is a real novice compared to Zora. It seems to me Folley should be a big favorite, but then the kid does have a punch and he is game. It's possible he can reach Folley and knock him out.\" The price, for Folley backers, Is most enticing. \"I CAN'T RESIST the price,\" said a knowledgeable fight man who has been known to wager a bob now and then when the figures are right. \"Know something, 1 think it will be down close to pick 'em before they get into the ring.\" One thing is certain. Folley can't lose another fight in New York at this time, or he is through as a top contender. He is going for a payday on the gamble that he can go the distance with Oscar; and there is a chance he might stop the young man, too, though nobody has done that yet. RIGHT NOW, FOLLEY is unbeaten in hi3 last six bouts since losing to Terrell here July 27, '63. In that span he has whipped George Chuvalo, easily, and has recorded a draw with European champion Karl Maldenbfrger in Germany. Zora's overall record stands 68-7-4, for 79 professional fights, and includes 38 knockouts, some proof that he can punch as well as box. Only opponent to go the route (10 rounds) with Bonavena was Dick Wipperman, last Nov. 13 here. Oscar came back to the Garden a month later and knocked out Billy Stephan in six. The South American still is unranked among the big boys, but a win tonight will put him up there where he can start hollering. History shows heavyweights do mature a lot quicker than the lighter men, and Oscar may ev.en. be an unusual young fighter, v . . ( . r in 10 rounds or less tonight at would they continue to list the - Vlsic lliv ;.. Vnn-t ST!rt -. lM. FEB. 26, 1958 ZDhe BOSTON CELTICS WOM THEIR SECOMt STRAIGHT N.&.A. EASTERN CROWN BY DOWNING DETROIT, 106-99, AS &1LL RUSSELL COUTftOLU&THE BOARDS. 
BOBCOUSYAN& BILL SHAfeMAH EACH SCORED 18 POIWTS. Lincoln Downs Results 1ST Clmp.: 4-np: 5 f.: off 1:33. Ravenala Prince (Garry)5.ti0 4i 2 SO Mission Bound (Parker) 6.10 .'i.8'1 Favorite Act (Bradley) K.MI T-l:02, Also Lord Culpeper. Your Reporter, Deacon Shnne. Prmrie Rose. Rinsr Shut, Fearless Leader, ilaryg Gilt. Soft Glance. 2D Clmg-.; 4-np: 7 f.: off 2:00. Idle Threats (Allan) 4 no 2 SO Grey Whirl (Giovanni) 3.40 3.00 Good Effort (Maeda) B.20 T-1:32t4. Also Greek Paire. Inquisition. Frozen North, Fast Bid. Foxy Sway. (Daily Double. 8-1, Paid :!.\". liOl 3D Clm?:3yrs:mdns:5 f :off 2 :2!) . Dogrwood Pateh(MaRia)7.ai) o.no 4.20 I.L Abie K. t Bradley) 13. NO U.KO Peaceful T. (Donahue) H.uO T.-l:t)3. Also Doe I.ark. AlHnx. Miss Pilot. Sum Bomb. Fast Bell. Greek Action, Win Joe. Dont Btatne Babe. 4TH Clmar.: 4-up; 7 t.: off 2:58. Irish Dotty (Bradley) 4.4D 3.20 2. SO Sibling- (Allan) 9.80 6.20 Brimstone Road (Row an 6. Of) T.-l :35 . Also Stahlstown. Emerson Hill. Patti Dowd. Ou The Lawn. Sieve H.. Game Start. Set. 5TH Clma:.: 3-up: 8 t.; off 3:254. Ancient Queen (Lamonte)-4.80 3. no 2.40 Wlwndilly (Merrier) 3 20 2 .So Lady Mink (Bradley) 2.80 T-l:02. Alio Mandolas. Lady Rhody. O. K. Debbie. Jury Verdict. Swift Salonga. Mix n Match. La Calvados. 6TH Clm?: 3-4 yrs; 5 f: off 3:52. Tessie Tansor(Davern)12.60 o.BO 5.00 French Line (Myers) 4.80 5 40 Captain Bronze (Allan) 10. hi) T.-l:02 9i. Alyso Rosie Anirel. Lony-bridge Lu Lu. Star Status, Toute Ma Vie. Tompkins County. 7TH Alw.: 3-4-yos.: 5 fur. off 4:20. Lories Honey (Hole) 24.20 20 3.8\" Rndoon (Clinch) 2.40 2.40 Presta Sun (Gamb'della) 5.00 T.-l:03. Also Green Toea. Anthony Scarfo. Prince O Morn. Captain Lockitup. Caronia. 8TH Clmr.: 4-up: 1 m.: off 4:48. ratcount (Alberts) 13.HO 5 HO 4.20 Lone Peak (Rodriguez) 5.60 3 flu Kilda (Ledezma) 3.40 T-l:48Si. Also Hue or Spank. Carb-anrel, Whitey. Wild Desire. 9TH Clmg-: 41iip: 1 m: off 5:16. 
Oportscaster (Allan) 20.80 8.KO 7.20 Waste Of Time(Miller) 49.20 2B.20 Da.vFromDallas(G's'do) 20.40 T.-1:5H4. Also Symboleer, Dandy Randy. Sea Tread. My Buyer. Cosmic Rule. Busted Budeet. Another Take, Presented. (Twin Double 8-1 8-3 Paid $3.51 1.20) , Att, 4,744. Handle $364,968. ' r think ( ConraoLf) THEY'LL 7t1-fT EVER SvSXv. ' C COME J uAV' BE A LOHG JfeTV", + "libraryCatalog": "newspapers.com", + "pages": "60", + "place": "New York, New York", + "publicationTitle": "Daily News", + "url": "https://www.newspapers.com/article/daily-news/121098969/", + "attachments": [ + { + "title": "Image", + "mimeType": "image/jpeg" + }, { "title": "Full Text PDF", "mimeType": "application/pdf"