Merge pull request #45 from tleedepriest/load_scopus_csv

Add ability to load Scopus CSV files
NLeSC · Jun 8, 2023 · 8fee09a · 8fee09a
2 parents ac1c1d1 + a126a49
commit 8fee09a
Show file tree

Hide file tree

Showing 5 changed files with 162 additions and 0 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,6 +6,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 ### Added
+- Add support for loading CSV files exported from Scopus (Thanks tleedepriest!)
+
 ### Changed
 ### Removed
 ### Fixed

diff --git a/litstudy/__init__.py b/litstudy/__init__.py
@@ -6,6 +6,7 @@
     load_csv,
     load_ieee_csv,
     load_ris_file,
+    load_scopus_csv,
     load_springer_csv,
     refine_crossref,
     refine_scopus,

diff --git a/litstudy/sources/__init__.py b/litstudy/sources/__init__.py
@@ -8,6 +8,7 @@
 from .ris import load_ris_file
 from .arxiv import search_arxiv
 from .csv import load_csv
+from .scopus_csv import load_scopus_csv
 
 __all__ = [
     "fetch_crossref",
@@ -17,6 +18,7 @@
     "load_csv",
     "load_ieee_csv",
     "load_ris_file",
+    "load_scopus_csv",
     "load_springer_csv",
     "refine_crossref",
     "refine_scopus",

diff --git a/litstudy/sources/scopus_csv.py b/litstudy/sources/scopus_csv.py
@@ -0,0 +1,117 @@
+"""
+Support for loading CSV files exported from Scopus.
+"""
+from typing import List, Optional
+from ..types import Document, Author, DocumentSet, DocumentIdentifier, Affiliation
+from ..common import robust_open
+import csv
+
+
+class ScopusCsvAffiliation(Affiliation):
+    def __init__(self, name):
+        self._name = name
+
+    @property
+    def name(self):
+        return self._name
+
+
+class ScopusCsvAuthor(Author):
+    def __init__(self, name, affiliation):
+        self._name = name
+        self._affiliation = affiliation
+
+    @property
+    def name(self):
+        return self._name
+
+    @property
+    def affiliations(self):
+        return [ScopusCsvAffiliation(self._affiliation)]
+
+
+class ScopusCsvDocument(Document):
+    def __init__(self, entry):
+        doi = entry.get("DOI")
+        title = entry.get("Title")
+        pubmed_id = entry.get("PubMed ID")
+        eid = entry.get("EID")
+        identifier = DocumentIdentifier(title, doi=doi, pubmed=pubmed_id, eid=eid)
+        super().__init__(identifier)
+        self.entry = entry
+
+    @property
+    def title(self) -> Optional[str]:
+        return self.entry.get("Title") or None
+
+    @property
+    def authors(self) -> List[ScopusCsvAuthor]:
+        auths_affs = self.entry.get("Authors with affiliations")
+        auths_id = self.entry.get("Author(s) ID", "")
+        # author_last, first initial, affiliation; .....
+        if not auths_affs:
+            return []
+        auths_affs = auths_affs.split("; ")
+        auths = [", ".join(auth_aff.split(", ")[0:2]) for auth_aff in auths_affs]
+        affs = [", ".join(auth_aff.split(", ")[2:]) for auth_aff in auths_affs]
+        # try to add id to author name
+        auths_id = auths_id.split(";")[:-1]  # remove empty string last el
+        if len(auths) == len(auths_id):
+            auths = [f"{name} (ID: {auth_id})" for name, auth_id in zip(auths, auths_id)]
+        return [ScopusCsvAuthor(a, b) for a, b in zip(auths, affs)]
+
+    @property
+    def publisher(self) -> Optional[str]:
+        return self.entry.get("Publisher") or None
+
+    @property
+    def publication_year(self) -> Optional[int]:
+        year = self.entry.get("Year")
+        if not year:
+            return None
+
+        try:
+            return int(year)
+        except:
+            return None
+
+    @property
+    def keywords(self) -> Optional[List[str]]:
+        keywords = self.entry.get("Author Keywords")
+        if not keywords:
+            return None
+        return keywords.split("; ")
+
+    @property
+    def abstract(self) -> Optional[str]:
+        abstract = self.entry.get("Abstract")
+        if not abstract:
+            return None
+        return abstract
+
+    @property
+    def citation_count(self) -> Optional[int]:
+        citation_count = self.entry.get("Cited by")
+        if not citation_count:
+            return None
+        return int(citation_count)
+
+    @property
+    def language(self) -> Optional[str]:
+        return self.entry.get("Language of Original Document") or None
+
+    @property
+    def publication_source(self) -> Optional[str]:
+        return self.entry.get("Source title") or None
+
+    @property
+    def source_type(self) -> Optional[str]:
+        return self.entry.get("Document Type") or None
+
+
+def load_scopus_csv(path: str) -> DocumentSet:
+    """Import CSV file exported from Scopus"""
+    with robust_open(path) as f:
+        lines = csv.DictReader(f)
+        docs = [ScopusCsvDocument(line) for line in lines]
+        return DocumentSet(docs)
diff --git a/tests/test_sources_scopus_csv.py b/tests/test_sources_scopus_csv.py
@@ -0,0 +1,40 @@
+from litstudy.sources.scopus_csv import load_scopus_csv
+import os
+
+
+def test_load_scopus_csv():
+    path = os.path.dirname(__file__) + "/resources/scopus.csv"
+    docs = load_scopus_csv(path)
+    for num, doc in enumerate(docs):
+        title = doc.title
+        doc_id_title = doc.id.title
+        doc_id_doi = doc.id.doi
+        doc_id_pubmed = doc.id.pubmed
+        doc_id_scopus = doc.id.scopusid
+        pub_year = doc.publication_year
+        keywords = doc.keywords
+        abstract = doc.abstract
+        citation_count = doc.citation_count
+        publication_source = doc.publication_source
+        source_type = doc.source_type
+        for author in doc.authors:
+            author_name = author.name
+            for aff in author.affiliations:
+                affiliation = aff.name
+        if num == 0:
+            assert title == doc_id_title
+            assert doc.title == "Scalable molecular dynamics with NAMD"
+            assert doc.abstract.startswith("NAMD is a parallel molecular dynamics code")
+            assert doc.publication_source == "Journal of Computational Chemistry"
+            assert doc.language == "English"
+            assert doc.publisher == "John Wiley and Sons Inc."
+            assert doc.citation_count == 13169
+            assert doc.keywords == [
+                "Biomolecular simulation",
+                "Molecular dynamics",
+                "Parallel computing",
+            ]
+            assert doc.publication_year == 2005
+
+            assert len(doc.authors) == 10
+            assert doc.authors[0].name == "Phillips, J.C. (ID: 57202138757)"