example https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3539694

In [1]:
!if [ ! -f ArticleScraper.ipynb ]; then curl https://raw.githubusercontent.com/Karocyt/covid-scraping/master/ArticleScraper.ipynb > ArticleScraper.ipynb; else echo "Already done"; fi
%run ArticleScraper.ipynb

Already done
Ign:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Hit:2 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/ InRelease
Get:3 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Ign:4 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Hit:5 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu bionic InRelease
Hit:6 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release
Hit:7 http://archive.ubuntu.com/ubuntu bionic InRelease
Hit:8 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Get:9 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Hit:10 http://ppa.launchpad.net/marutter/c2d4u3.5/ubuntu bionic InRelease
Get:11 http://archive.ubuntu.com/ubuntu bionic-backports InRelease [74.6 kB]
Fetched 252 kB in 2s (146 kB/s)
Reading package lists... Done
Reading package lists... Don

In [0]:
from selenium.webdriver.common.by import By
import time

In [0]:
example_dicts = [{'authors': 'C You, Y Deng, W Hu, J Sun, Q Lin, F Zhou… - Available at SSRN …, 2020 - papers.ssrn.com',
  'extra_link': 'https://www.medrxiv.org/content/medrxiv/early/2020/02/11/2020.02.08.20021253.full.pdf',
  'extra_link_text': '[PDF] medrxiv.org',
  'link': 'https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3539694',
  'preview': 'Background: The 2019 novel coronavirus (2019-nCoV) outbreak in Wuhan, China has\nattracted world-wide attention. As of February 11, 2020, a total of 44,730 cases of\npneumonia associated with the 2019-nCoV were confirmed by the National Health …',
  'title': 'Estimation of the Time-Varying Reproduction Number of COVID-19 Outbreak in China'},
 {'authors': 'J Gao, P Zheng, Y Jia, H Chen, Y Mao… - Available at SSRN …, 2020 - papers.ssrn.com',
  'extra_link': '',
  'extra_link_text': '',
  'link': 'https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3541120',
  'preview': 'Background: Huge citizens expos social media during a novel coronavirus disease (COVID-\n19) outbroke in Wuhan, China. We assess the prevalence of mental health problems and\nexamine their association with social media exposure. Methods: We conducted a cross …',
  'title': 'Mental Health Problems and Social Media Exposure During COVID-19 Outbreak'},
 {'authors': 'C GU, W Jiang, T Zhao, B Zheng - Available at SSRN 3551006, 2020 - papers.ssrn.com',
  'extra_link': '',
  'extra_link_text': '',
  'link': 'https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3551006',
  'preview': 'The statistics show that the mortality of COVID-19 is 20 times higher than seasonal flu and\nclose to that of Spanish flu, hence it is becoming an absolute priority for every country to take\nefficient measure to limit the transmission of COVID-19. In this short paper, we propose a …',
  'title': 'Mathematical Recommendations to Fight Against COVID-19'},
  ]

In [0]:
class SsrnSingleScraper(ArticleScraper):
  page_dict = None

  def __init__(self, page_dict):
    ArticleScraper.wd.get(page_dict['link'])
    self.page_dict = page_dict
    ArticleScraper.__init__(self, page_dict['link'])

  @property
  def title(self):
    return self.page.head.title.text.split(' by ')[0]

  @property
  def authors(self):
    return self.page.head.title.text.split(' by ')[1].split(" ::")[0]

  @property
  def doi(self):
    return None

  @property
  def abstract(self):
    div = self.page.find("div", {"class": "abstract-text"})
    text = div.find("p").text
    return text

  @property
  def date(self):
    xpath = '//*[@id="selectable"]/text()'
    text = self.tree.xpath(xpath)[0]
    try:
      date = datetime.strptime(text.split(")")[-2].split("(")[-1], '%m/%d/%Y')
    except:
      date = datetime.strptime(text.split(")")[-2].split("(")[-1], '%B %d, %Y')
    return date.strftime(self.date_format)

  @property
  def body(self):
    return None

  @property
  def source(self):
    return "SSRN"

  @property
  def source_impact_factor(self):
    return None

  @property
  def search_keyword(self):
    return None

  @property
  def categories(self):
    return None

  @property
  def licence(self):
    return None

  @property
  def citations(self):
    return None

  @property
  def organization(self):
    authors = ArticleScraper.wd.find_element_by_class_name("authors cell authors-full-width")
    return authors.find_element(By.TAG_NAME, 'p').text

  @property
  def keywords(self):
    keywords = self.page.find(text='Keywords:').parent.parent.text.split(': ')[1]
    return ",".join(keywords.split(';'))

  @property
  def references(self):
    wd = ArticleScraper.wd
    wd.find_element_by_xpath('//*[@id="references-widget"]/button').click()
    time.sleep(0.2)
    li_list = wd.find_elements_by_xpath('//*[@id="references-widget"]/ol/li')
    refs = []
    for elem in li_list:
      ref = {}
      try:
        ref['title'] = elem.find_element(By.CLASS_NAME, 'reference-title').text
      except:
        ref['title'] = ""
      
      try:
        ref['authors'] = elem.find_element(By.CLASS_NAME, 'author-list').text
      except:
        ref['authors'] = ""
      
      try:
        link = elem.find_element(By.TAG_NAME,  'a')
        ref['link'] = link.get_attribute('href')
      except:
        ref['link'] = ""
      refs.append(ref)
    
    return refs

  @property
  def link(self):
    return self.page_dict['link']

  @property
  def extralinks(self):
    return self.page_dict['extra_link']
  

In [56]:
class SsrnScraper():
  def parse(self, pages_dict):
    for d in pages_dict:
      yield SsrnSingleScraper(d).parse()  

scraper = SsrnScraper()
for d in scraper.parse(example_dicts):
  for k in d:
    print(k, d[k], sep=": ")
  print()

NoSuchElementException: ignored