Skip to content

Commit

Permalink
GVKFetcher: Ensure that first letter of subtitle is always capitalized
Browse files Browse the repository at this point in the history
Minor improvements:
 * replace calls to datafield.getAttribute("tag") by a single variable
  • Loading branch information
koppor committed Nov 21, 2015
1 parent b2c8db5 commit 7047a1d
Show file tree
Hide file tree
Showing 3 changed files with 141 additions and 25 deletions.
64 changes: 40 additions & 24 deletions src/main/java/net/sf/jabref/importer/fetcher/GVKParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,22 @@
import net.sf.jabref.importer.ImportFormatReader;
import net.sf.jabref.model.entry.BibtexEntry;
import net.sf.jabref.model.entry.IdGenerator;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

import com.google.common.base.Strings;

public class GVKParser {

private static final Log LOGGER = LogFactory.getLog(GVKParser.class);


public List<BibtexEntry> parseEntries(InputStream is)
throws ParserConfigurationException, SAXException, IOException {
DocumentBuilder dbuild = DocumentBuilderFactory.newInstance().newDocumentBuilder();
Expand Down Expand Up @@ -91,20 +99,21 @@ private BibtexEntry parseEntry(Element e) {
while (iter.hasNext()) {
Element datafield = iter.next();

// System.out.println(datafield.getAttributeValue("tag"));
String tag = datafield.getAttribute("tag");
LOGGER.debug("tag: " + tag);

// mak
if (datafield.getAttribute("tag").equals("002@")) {
if (tag.equals("002@")) {
mak = getSubfield("0", datafield);
}

//ppn
if (datafield.getAttribute("tag").equals("003@")) {
if (tag.equals("003@")) {
ppn = getSubfield("0", datafield);
}

//author
if (datafield.getAttribute("tag").equals("028A")) {
if (tag.equals("028A")) {
String vorname = getSubfield("d", datafield);
String nachname = getSubfield("a", datafield);

Expand All @@ -116,7 +125,7 @@ private BibtexEntry parseEntry(Element e) {
author = author.concat(vorname + " " + nachname);
}
//author (weiterer)
if (datafield.getAttribute("tag").equals("028B")) {
if (tag.equals("028B")) {
String vorname = getSubfield("d", datafield);
String nachname = getSubfield("a", datafield);

Expand All @@ -129,7 +138,7 @@ private BibtexEntry parseEntry(Element e) {
}

//editor
if (datafield.getAttribute("tag").equals("028C")) {
if (tag.equals("028C")) {
String vorname = getSubfield("d", datafield);
String nachname = getSubfield("a", datafield);

Expand All @@ -142,24 +151,24 @@ private BibtexEntry parseEntry(Element e) {
}

//title and subtitle
if (datafield.getAttribute("tag").equals("021A")) {
if (tag.equals("021A")) {
title = getSubfield("a", datafield);
subtitle = getSubfield("d", datafield);
}

//publisher and address
if (datafield.getAttribute("tag").equals("033A")) {
if (tag.equals("033A")) {
publisher = getSubfield("n", datafield);
address = getSubfield("p", datafield);
}

//date
if (datafield.getAttribute("tag").equals("011@")) {
if (tag.equals("011@")) {
date = getSubfield("a", datafield);
}

//date, volume, number, pages (year bei Zeitschriften (evtl. redundant mit 011@))
if (datafield.getAttribute("tag").equals("031A")) {
if (tag.equals("031A")) {
date = getSubfield("j", datafield);
volume = getSubfield("e", datafield);
number = getSubfield("a", datafield);
Expand All @@ -170,7 +179,7 @@ private BibtexEntry parseEntry(Element e) {
// 036D seems to contain more information than the other fields
// overwrite information using that field
// 036D also contains information normally found in 036E
if (datafield.getAttribute("tag").equals("036D")) {
if (tag.equals("036D")) {
// 021 might have been present
if (title != null) {
// convert old title (contained in "a" of 021A) to volume
Expand All @@ -189,7 +198,7 @@ private BibtexEntry parseEntry(Element e) {
}

//series and number
if (datafield.getAttribute("tag").equals("036E")) {
if (tag.equals("036E")) {
series = getSubfield("a", datafield);
number = getSubfield("l", datafield);
String kor = getSubfield("b", datafield);
Expand All @@ -200,17 +209,17 @@ private BibtexEntry parseEntry(Element e) {
}

//note
if (datafield.getAttribute("tag").equals("037A")) {
if (tag.equals("037A")) {
note = getSubfield("a", datafield);
}

//edition
if (datafield.getAttribute("tag").equals("032@")) {
if (tag.equals("032@")) {
edition = getSubfield("a", datafield);
}

//isbn
if (datafield.getAttribute("tag").equals("004A")) {
if (tag.equals("004A")) {
String isbn_10 = getSubfield("0", datafield);
String isbn_13 = getSubfield("A", datafield);

Expand All @@ -226,7 +235,7 @@ private BibtexEntry parseEntry(Element e) {

// Hochschulschriftenvermerk
// Bei einer Verlagsdissertation ist der Ort schon eingetragen
if (datafield.getAttribute("tag").equals("037C")) {
if (tag.equals("037C")) {
if (address == null) {
address = getSubfield("b", datafield);
address = removeSortCharacters(address);
Expand All @@ -252,23 +261,23 @@ private BibtexEntry parseEntry(Element e) {
* Buchbeiträgen Verlag und Ort wichtig sind
* (sonst in Kategorie 033A).
*/
if (datafield.getAttribute("tag").equals("027D")) {
if (tag.equals("027D")) {
journal = getSubfield("a", datafield);
booktitle = getSubfield("a", datafield);
address = getSubfield("p", datafield);
publisher = getSubfield("n", datafield);
}

//pagetotal
if (datafield.getAttribute("tag").equals("034D")) {
if (tag.equals("034D")) {
pagetotal = getSubfield("a", datafield);

// S, S. etc. entfernen
pagetotal = pagetotal.replaceAll(" S\\.?$", "");
}

// Behandlung von Konferenzen
if (datafield.getAttribute("tag").equals("030F")) {
if (tag.equals("030F")) {
address = getSubfield("k", datafield);

if (!entryType.equals("proceedings")) {
Expand All @@ -292,17 +301,17 @@ private BibtexEntry parseEntry(Element e) {
//SRU-Schnittstelle gelieferten Daten zur
//Quelle unvollständig sind (z.B. nicht Serie
//und Nummer angegeben werden)
if (datafield.getAttribute("tag").equals("039B")) {
if (tag.equals("039B")) {
quelle = getSubfield("8", datafield);
}
if (datafield.getAttribute("tag").equals("046R")) {
if (tag.equals("046R")) {
if (quelle.equals("") || (quelle == null)) {
quelle = getSubfield("a", datafield);
}
}

// URLs behandeln
if (datafield.getAttribute("tag").equals("009P")) {
if (tag.equals("009P")) {
if (datafield.getAttribute("occurrence").equals("03")
|| datafield.getAttribute("occurrence").equals("05")) {
if (url == null) {
Expand Down Expand Up @@ -367,8 +376,15 @@ private BibtexEntry parseEntry(Element e) {
if (title != null) {
result.setField("title", title);
}
if (subtitle != null) {
result.setField("subtitle", subtitle);
if (!Strings.isNullOrEmpty(subtitle)) {
// Ensure that the first letter of the subtitle is an upper-case letter.
// The subtitle may be only one character long, so the remainder is appended only if it exists.
// This mirrors Apache Commons Lang's StringUtils.capitalize (https://commons.apache.org/proper/commons-lang/javadocs/api-release/org/apache/commons/lang3/StringUtils.html#capitalize%28java.lang.String%29); it is done inline because we don't want to add an additional dependency ('org.apache.commons:commons-lang3:3.4').
String newSubtitle = Character.toString(Character.toUpperCase(subtitle.charAt(0)));
if (subtitle.length() > 1) {
newSubtitle += subtitle.substring(1);
}
result.setField("subtitle", newSubtitle);
}
if (publisher != null) {
result.setField("publisher", publisher);
Expand Down
27 changes: 26 additions & 1 deletion src/test/java/net/sf/jabref/importer/fetcher/GVKParserTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ private void doTest(String xmlName, int expectedSize, List<String> resourceNames
GVKParser parser = new GVKParser();
List<BibtexEntry> entries = parser.parseEntries(is);
Assert.assertNotNull(entries);
Assert.assertEquals(entries.size(), expectedSize);
Assert.assertEquals(expectedSize, entries.size());
int i = 0;
for (String resourceName : resourceNames) {
BibtexEntryUtil.doAssertEquals(GVKParser.class, resourceName, entries.get(i));
Expand All @@ -46,4 +46,29 @@ public void resultFor797485368() throws Exception {
public void GMP() throws Exception {
doTest("gvk_gmp.xml", 2, Arrays.asList(new String[] {"gvk_gmp.1.bib", "gvk_gmp.2.bib"}));
}

@Test
public void subTitleTest() throws Exception {
try (InputStream is = GVKParser.class.getResourceAsStream("gvk_artificial_subtitle_test.xml")) {
GVKParser parser = new GVKParser();
List<BibtexEntry> entries = parser.parseEntries(is);
Assert.assertNotNull(entries);
Assert.assertEquals(5, entries.size());

BibtexEntry entry = entries.get(0);
Assert.assertEquals(null, entry.getField("subtitle"));

entry = entries.get(1);
Assert.assertEquals("C", entry.getField("subtitle"));

entry = entries.get(2);
Assert.assertEquals("Word", entry.getField("subtitle"));

entry = entries.get(3);
Assert.assertEquals("Word1 word2", entry.getField("subtitle"));

entry = entries.get(4);
Assert.assertEquals("Word1 word2", entry.getField("subtitle"));
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<zs:searchRetrieveResponse xmlns:zs="http://www.loc.gov/zing/srw/">
<zs:version>1.1</zs:version>
<zs:numberOfRecords>5</zs:numberOfRecords>
<zs:records>
<zs:record>
<zs:recordSchema>picaxml</zs:recordSchema>
<zs:recordPacking>xml</zs:recordPacking>
<zs:recordData>
<record xmlns="info:srw/schema/5/picaXML-v1.0">
<datafield tag="021A">
<subfield code="d"></subfield>
</datafield>
</record>
</zs:recordData>
<zs:recordPosition>1</zs:recordPosition>
</zs:record>
<zs:record>
<zs:recordSchema>picaxml</zs:recordSchema>
<zs:recordPacking>xml</zs:recordPacking>
<zs:recordData>
<record xmlns="info:srw/schema/5/picaXML-v1.0">
<datafield tag="021A">
<subfield code="d">c</subfield>
</datafield>
</record>
</zs:recordData>
<zs:recordPosition>2</zs:recordPosition>
</zs:record>
<zs:record>
<zs:recordSchema>picaxml</zs:recordSchema>
<zs:recordPacking>xml</zs:recordPacking>
<zs:recordData>
<record xmlns="info:srw/schema/5/picaXML-v1.0">
<datafield tag="021A">
<subfield code="d">word</subfield>
</datafield>
</record>
</zs:recordData>
<zs:recordPosition>3</zs:recordPosition>
</zs:record>
<zs:record>
<zs:recordSchema>picaxml</zs:recordSchema>
<zs:recordPacking>xml</zs:recordPacking>
<zs:recordData>
<record xmlns="info:srw/schema/5/picaXML-v1.0">
<datafield tag="021A">
<subfield code="d">word1 word2</subfield>
</datafield>
</record>
</zs:recordData>
<zs:recordPosition>4</zs:recordPosition>
</zs:record>
<zs:record>
<zs:recordSchema>picaxml</zs:recordSchema>
<zs:recordPacking>xml</zs:recordPacking>
<zs:recordData>
<record xmlns="info:srw/schema/5/picaXML-v1.0">
<datafield tag="021A">
<subfield code="d">Word1 word2</subfield>
</datafield>
</record>
</zs:recordData>
<zs:recordPosition>5</zs:recordPosition>
</zs:record>
</zs:records>
<zs:echoedSearchRetrieveRequest>
<zs:version>1.1</zs:version>
<zs:query>pica.all=797485368</zs:query>
<zs:maximumRecords>50</zs:maximumRecords>
<zs:recordPacking>xml</zs:recordPacking>
<zs:recordSchema>picaxml</zs:recordSchema>
<zs:sortKeys>Year,,1</zs:sortKeys>
</zs:echoedSearchRetrieveRequest>
</zs:searchRetrieveResponse>

0 comments on commit 7047a1d

Please sign in to comment.