Skip to content

Commit

Permalink
Merge pull request #45 from tleedepriest/load_scopus_csv
Browse files Browse the repository at this point in the history
Add ability to load Scopus CSV files
  • Loading branch information
stijnh committed Jun 8, 2023
2 parents ac1c1d1 + a126a49 commit 8fee09a
Show file tree
Hide file tree
Showing 5 changed files with 162 additions and 0 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]
### Added
- Add support for loading CSV files exported from Scopus (Thanks tleedepriest!)

### Changed
### Removed
### Fixed
Expand Down
1 change: 1 addition & 0 deletions litstudy/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
load_csv,
load_ieee_csv,
load_ris_file,
load_scopus_csv,
load_springer_csv,
refine_crossref,
refine_scopus,
Expand Down
2 changes: 2 additions & 0 deletions litstudy/sources/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from .ris import load_ris_file
from .arxiv import search_arxiv
from .csv import load_csv
from .scopus_csv import load_scopus_csv

__all__ = [
"fetch_crossref",
Expand All @@ -17,6 +18,7 @@
"load_csv",
"load_ieee_csv",
"load_ris_file",
"load_scopus_csv",
"load_springer_csv",
"refine_crossref",
"refine_scopus",
Expand Down
117 changes: 117 additions & 0 deletions litstudy/sources/scopus_csv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
"""
Support for loading CSV files exported from Scopus.
"""
from typing import List, Optional
from ..types import Document, Author, DocumentSet, DocumentIdentifier, Affiliation
from ..common import robust_open
import csv


class ScopusCsvAffiliation(Affiliation):
    """Affiliation taken from a Scopus CSV export; it carries only a name."""

    def __init__(self, name):
        # Stored verbatim; no normalisation is applied to the text.
        self._name = name

    @property
    def name(self):
        """Return the affiliation text exactly as passed to the constructor."""
        return self._name


class ScopusCsvAuthor(Author):
    """Author taken from a Scopus CSV export: a display name plus one affiliation string."""

    def __init__(self, name, affiliation):
        self._name = name
        self._affiliation = affiliation

    @property
    def name(self):
        """Return the author name exactly as passed to the constructor."""
        return self._name

    @property
    def affiliations(self):
        """Return a one-element list wrapping the stored affiliation string.

        A fresh ``ScopusCsvAffiliation`` is created on every access.
        """
        affiliation = ScopusCsvAffiliation(self._affiliation)
        return [affiliation]


class ScopusCsvDocument(Document):
    """Document backed by one row of a Scopus CSV export.

    ``entry`` is the mapping for a single row (as produced by
    ``csv.DictReader``), keyed by the Scopus column headers. Every property
    reads from that mapping on access; missing or empty cells yield ``None``
    (or an empty list for ``authors``).
    """

    def __init__(self, entry):
        doi = entry.get("DOI")
        title = entry.get("Title")
        pubmed_id = entry.get("PubMed ID")
        eid = entry.get("EID")
        identifier = DocumentIdentifier(title, doi=doi, pubmed=pubmed_id, eid=eid)
        super().__init__(identifier)
        self.entry = entry

    def _int_field(self, key: str) -> Optional[int]:
        """Return column ``key`` parsed as an int, or ``None`` if empty or malformed."""
        raw = self.entry.get(key)
        if not raw:
            return None

        try:
            return int(raw)
        except (TypeError, ValueError):
            # A malformed cell must not abort processing of the whole export.
            return None

    @property
    def title(self) -> Optional[str]:
        """Return the "Title" column, or ``None`` if it is empty."""
        return self.entry.get("Title") or None

    @property
    def authors(self) -> List[ScopusCsvAuthor]:
        """Return the authors parsed from the "Authors with affiliations" column.

        Each "; "-separated item is split on ", ": the first two parts form
        the author name and the remainder forms the affiliation. When the
        "Author(s) ID" column holds exactly one ID per author, the ID is
        appended to the name as ``"<name> (ID: <id>)"``.
        """
        auths_affs = self.entry.get("Authors with affiliations")
        if not auths_affs:
            return []

        # Each item looks like: author_last, first initial, affiliation
        names = []
        affiliations = []
        for auth_aff in auths_affs.split("; "):
            parts = auth_aff.split(", ")
            names.append(", ".join(parts[:2]))
            affiliations.append(", ".join(parts[2:]))

        # The ID cell may be missing (None), may or may not end with a
        # trailing ";", and may have spaces after the separators. Dropping
        # empty pieces handles all of these without discarding a real ID.
        raw_ids = self.entry.get("Author(s) ID") or ""
        author_ids = [piece.strip() for piece in raw_ids.split(";") if piece.strip()]

        # Only attach IDs when they line up one-to-one with the authors.
        if len(names) == len(author_ids):
            names = [f"{name} (ID: {auth_id})" for name, auth_id in zip(names, author_ids)]

        return [ScopusCsvAuthor(name, aff) for name, aff in zip(names, affiliations)]

    @property
    def publisher(self) -> Optional[str]:
        """Return the "Publisher" column, or ``None`` if it is empty."""
        return self.entry.get("Publisher") or None

    @property
    def publication_year(self) -> Optional[int]:
        """Return the "Year" column as an int, or ``None`` if empty or not a number."""
        return self._int_field("Year")

    @property
    def keywords(self) -> Optional[List[str]]:
        """Return the "Author Keywords" column split on "; ", or ``None`` if empty."""
        keywords = self.entry.get("Author Keywords")
        if not keywords:
            return None
        return keywords.split("; ")

    @property
    def abstract(self) -> Optional[str]:
        """Return the "Abstract" column, or ``None`` if it is empty."""
        return self.entry.get("Abstract") or None

    @property
    def citation_count(self) -> Optional[int]:
        """Return the "Cited by" column as an int, or ``None`` if empty or not a number."""
        return self._int_field("Cited by")

    @property
    def language(self) -> Optional[str]:
        """Return the "Language of Original Document" column, or ``None`` if empty."""
        return self.entry.get("Language of Original Document") or None

    @property
    def publication_source(self) -> Optional[str]:
        """Return the "Source title" column, or ``None`` if it is empty."""
        return self.entry.get("Source title") or None

    @property
    def source_type(self) -> Optional[str]:
        """Return the "Document Type" column, or ``None`` if it is empty."""
        return self.entry.get("Document Type") or None


def load_scopus_csv(path: str) -> DocumentSet:
    """Import CSV file exported from Scopus.

    The file at ``path`` is opened with ``robust_open`` and read with
    ``csv.DictReader``; each row becomes one ``ScopusCsvDocument`` and the
    documents are returned together as a ``DocumentSet``.
    """
    with robust_open(path) as handle:
        reader = csv.DictReader(handle)
        # Materialise every row while the file is still open.
        documents = list(map(ScopusCsvDocument, reader))

    return DocumentSet(documents)
40 changes: 40 additions & 0 deletions tests/test_sources_scopus_csv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
from litstudy.sources.scopus_csv import load_scopus_csv
import os


def test_load_scopus_csv():
    """Load the Scopus CSV fixture and verify the parsed fields of its first record."""
    path = os.path.join(os.path.dirname(__file__), "resources", "scopus.csv")
    docs = load_scopus_csv(path)
    checked_first = False

    for num, doc in enumerate(docs):
        # Touch every property of every record so that a row which makes a
        # property raise is caught, even though only the first row is asserted.
        _ = (
            doc.title,
            doc.id.title,
            doc.id.doi,
            doc.id.pubmed,
            doc.id.scopusid,
            doc.publication_year,
            doc.keywords,
            doc.abstract,
            doc.citation_count,
            doc.publication_source,
            doc.source_type,
        )
        for author in doc.authors:
            _ = author.name
            for aff in author.affiliations:
                _ = aff.name

        if num != 0:
            continue

        checked_first = True
        assert doc.title == doc.id.title
        assert doc.title == "Scalable molecular dynamics with NAMD"
        assert doc.abstract.startswith("NAMD is a parallel molecular dynamics code")
        assert doc.publication_source == "Journal of Computational Chemistry"
        assert doc.language == "English"
        assert doc.publisher == "John Wiley and Sons Inc."
        assert doc.citation_count == 13169
        assert doc.keywords == [
            "Biomolecular simulation",
            "Molecular dynamics",
            "Parallel computing",
        ]
        assert doc.publication_year == 2005

        assert len(doc.authors) == 10
        assert doc.authors[0].name == "Phillips, J.C. (ID: 57202138757)"

    # Without this, an empty or unreadable fixture would let the test pass
    # without running a single assertion.
    assert checked_first, "scopus.csv fixture yielded no documents"

0 comments on commit 8fee09a

Please sign in to comment.