# Scraping dependencies
Installations and imports

In [1]:
# Python packages for the scraper.
# NOTE(review): the PyPI name `datetime` resolves to the DateTime distribution (see the
# pip output below), whereas the notebook's `from datetime import datetime` is served by
# the standard library — this install is probably unnecessary; confirm before removing.
!pip3 install datetime selenium PyPDF2

!apt-get update # update ubuntu to correctly run apt-install
# Install the Chromium driver package, then copy the chromedriver binary into /usr/bin.
!apt install chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin

Collecting datetime
[?25l  Downloading https://files.pythonhosted.org/packages/73/22/a5297f3a1f92468cc737f8ce7ba6e5f245fcfafeae810ba37bd1039ea01c/DateTime-4.3-py2.py3-none-any.whl (60kB)
[K     |████████████████████████████████| 61kB 1.8MB/s 
[?25hCollecting selenium
[?25l  Downloading https://files.pythonhosted.org/packages/80/d6/4294f0b4bce4de0abf13e17190289f9d0613b0a44e5dd6a7f5ca98459853/selenium-3.141.0-py2.py3-none-any.whl (904kB)
[K     |████████████████████████████████| 911kB 6.8MB/s 
[?25hCollecting PyPDF2
[?25l  Downloading https://files.pythonhosted.org/packages/b4/01/68fcc0d43daf4c6bdbc6b33cc3f77bda531c86b174cac56ef0ffdb96faab/PyPDF2-1.26.0.tar.gz (77kB)
[K     |████████████████████████████████| 81kB 8.5MB/s 
Collecting zope.interface
[?25l  Downloading https://files.pythonhosted.org/packages/2c/9e/bfbe2dd9c911602fa908d8446d8d8aad58a0f9b881c3a96b27cd981270b2/zope.interface-5.0.1-cp36-cp36m-manylinux2010_x86_64.whl (226kB)
[K     |████████████████████████████████| 2

In [0]:
import sys
import os
import time
from datetime import datetime
import traceback

from selenium import webdriver
from selenium.webdriver.common.by import By
from IPython.display import Image, display

# Adjustable parameters

### Class defaults

In [0]:
# Default delay, in seconds, between attempts; ArticleScraper.parse uses this value as
# the default for its `retry_sec` parameter.
retry_sec = 60

### Selenium settings

In [0]:
# NOTE(review): sys.path is Python's module search path, so this entry does not by itself
# make the chromedriver executable discoverable; the install cell also copies the binary
# to /usr/bin. Presumably this line is redundant — confirm before removing.
sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')

def init_selenium():
    """Create a Chrome webdriver configured for headless scraping.

    The browser is started with a fixed set of command-line switches and with
    download preferences that save files into the current working directory
    without prompting.

    Returns:
        The object produced by ``webdriver.Chrome('chromedriver', options=...)``.
    """
    # Command-line switches, applied in this exact order.
    browser_flags = (
        "--disable-gpu",
        "--disable-extensions",
        "--window-size=1920,1080",
        "--kiosk",
        "--start-maximized",
        "--headless",
        "--no-sandbox",
        "--disable-dev-shm-usage",
    )

    opts = webdriver.ChromeOptions()
    for flag in browser_flags:
        opts.add_argument(flag)

    # Download behaviour: target directory is wherever the notebook is running.
    download_prefs = {
        "download.default_directory": os.getcwd(),
        "download.prompt_for_download": False,
        "download.directory_upgrade": True,
        #"plugins.always_open_pdf_externally": True
    }
    opts.add_experimental_option("prefs", download_prefs)

    return webdriver.Chrome("chromedriver", options=opts)

# ArticleScraper Class  
(Note the singular in 'Article'.)  
Has a Selenium webdriver (Chrome) available in `self.wd`.

### Constructor
Takes a page dictionary (as provided by Ivan) in its constructor.  
Loads the current link in Selenium at creation.

Example input format:  
```{'authors': 'C You, Y Deng, W Hu, J Sun, Q Lin, F Zhou… - Available at SSRN …, 2020 - papers.ssrn.com',
  'extra_link': 'https://www.medrxiv.org/content/medrxiv/early/2020/02/11/2020.02.08.20021253.full.pdf',
  'extra_link_text': '[PDF] medrxiv.org',
  'link': 'https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3539694',
  'preview': 'Background: The 2019 novel coronavirus (2019-nCoV) outbreak in Wuhan, China has\nattracted world-wide attention. As of February 11, 2020, a total of 44,730 cases of\npneumonia associated with the 2019-nCoV were confirmed by the National Health …',
  'title': 'Estimation of the Time-Varying Reproduction Number of COVID-19 Outbreak in China'}
  ```

### Parse method
Takes an optional `retry_sec` parameter (seconds; default 60). Set it to `None` to disable retries.  
Return value: a dictionary of the relevant fields.


In [0]:
class ArticleScraper:
  """Base class for scraping a single article page with Selenium.

  Subclasses override the properties below to extract each field from the page
  loaded in `self.wd`; `parse` then gathers every property into one dictionary.
  `title`, `authors`, `source` and `link` raise NotImplementedError here and
  must be overridden; every other field defaults to None.
  """

  # Created once, when the class body executes; as a class attribute the driver
  # is shared by every instance and subclass through `self.wd`.
  wd = init_selenium()
  # strftime pattern used by `acquisition_date`.
  date_format = "%Y-%m-%d"

  def __init__(self, page_dict):
    """Store the page description and load its link in the browser.

    Args:
      page_dict: mapping describing the article; its 'link' entry is the URL
        that gets loaded.

    Raises:
      KeyError: if `page_dict` has no 'link' entry.
    """
    # variables initialization
    self.page_dict = page_dict
    self.url = page_dict['link']

    # loading page
    self.wd.get(self.url)

  @property
  def title(self):
    """Value for the 'Title' field; subclasses must override."""
    raise NotImplementedError()

  @property
  def authors(self):
    """Value for the 'Authors' field; subclasses must override."""
    raise NotImplementedError()

  @property
  def doi(self):
    """Value for the 'DOI' field; None unless overridden."""
    return None

  @property
  def abstract(self):
    """Value for the 'Abstract' field; None unless overridden."""
    return None

  @property
  def date(self):
    """Value for the 'Date' field; None unless overridden."""
    return None

  @property
  def body(self):
    """Value for the 'Full body' field; None unless overridden."""
    return None

  @property
  def source(self):
    """Value for the 'Source' field; subclasses must override."""
    raise NotImplementedError()

  @property
  def source_impact_factor(self):
    """Value for the 'Source impact factor' field; None unless overridden."""
    return None

  @property
  def search_keyword(self):
    """Value for the 'Search keyword' field; None unless overridden."""
    return None

  @property
  def categories(self):
    """Value for the 'Category' field; None unless overridden."""
    return None

  @property
  def licence(self):
    """Deprecated spelling of `license`.

    Kept so that subclasses still overriding `licence` keep working; new code
    should override `license`, which feeds the 'Licensing' field.
    """
    return None

  @property
  def license(self):
    """Value for the 'Licensing' field.

    Delegates to `self.licence` instead of aliasing the base property object,
    so an override of either spelling in a subclass is honoured by `parse`.
    """
    return self.licence

  @property
  def acquisition_date(self):
    """Current local date formatted with `date_format`."""
    return datetime.now().strftime(self.date_format)

  @property
  def citations(self):
    """Value for the 'Citations' field; None unless overridden."""
    return None

  @property
  def organization(self):
    """Value for the 'Organization affiliated' field; None unless overridden."""
    return None

  @property
  def keywords(self):
    """Value for the 'Keywords' field; None unless overridden."""
    return None

  @property
  def references(self):
    """Value for the 'References' field; None unless overridden."""
    return None

  @property
  def link(self):
    """Value for the 'Link' field; subclasses must override."""
    raise NotImplementedError()

  @property
  def extralinks(self):
    """Value for the 'Extra links' field; None unless overridden."""
    return None

  def _collect_fields(self):
    """Evaluate every field property and return them as one dictionary."""
    return {'Title': self.title,
            'Authors': self.authors,
            'DOI': self.doi,
            'Abstract': self.abstract,
            'Date': self.date,
            'Full body': self.body,
            'Source': self.source,
            'Source impact factor': self.source_impact_factor,
            'Search keyword': self.search_keyword,
            'Category': self.categories,
            'Licensing': self.license,
            'Document acquisition date': self.acquisition_date,
            'Citations': self.citations,
            'Organization affiliated': self.organization,
            'Keywords': self.keywords,
            'References': self.references,
            'Link': self.link,
            'Extra links': self.extralinks,
    }

  def _show_failure_context(self):
    """Best-effort display of a screenshot and the browser's current URL.

    Never raises: the browser may be the component that failed, and a broken
    diagnostic must not abort the retry loop in `parse`.
    """
    try:
      self.wd.save_screenshot("temp.png")
      display(Image(filename="temp.png"))
      print("Current URL: %s" %(self.wd.current_url))
    except Exception:
      print("WARNING: could not capture browser diagnostics")
      traceback.print_exc()

  def parse(self, retry_sec = retry_sec):
    """Collect all fields of the article into a dictionary.

    On failure the error, its traceback, a screenshot and the current URL are
    shown; the page is then reloaded and the extraction retried after
    `retry_sec` seconds, repeating until it succeeds.

    Args:
      retry_sec: seconds to wait before each retry, or None to give up after
        the first failure. The default is the module-level `retry_sec` value
        captured when the class is defined.

    Returns:
      The dictionary of fields, or None if extraction failed and no retry was
      attempted (`retry_sec` is None, or a required property is not
      implemented).
    """
    data = None
    reload_page = False
    while True:
      try:
        # Reloading inside the try block lets a failed reload be retried like
        # any other error instead of escaping the loop.
        if reload_page:
          self.wd.get(self.url)
        data = self._collect_fields()
        break
      except Exception as e:
        print("ERROR: while handling %s" % (self.url), e)
        # Print the traceback first so it is never lost if diagnostics fail.
        traceback.print_exc()
        self._show_failure_context()
        if retry_sec is None:
          break
        if isinstance(e, NotImplementedError):
          # Reloading the page cannot supply a missing override; retrying
          # would loop forever.
          print("Not retrying: a required property is not implemented")
          break
        print("Retrying in %s seconds..." % (retry_sec))
        time.sleep(retry_sec)
        print("Retrying now")
        reload_page = True
    return data
    

