## Fair Scraper

In [2]:
"""
Scrapes information from FAIRsharing.org
"""
import os
import requests
from lxml import html
import pandas as pd
import json
#from ncats_translator_dqa import config

In [3]:
import sys
sys.path.insert(0, '/Users/pedrohserrano/NCATS-Translator-DQA') 
import ncats_translator_dqa as dqa

#### fair_scraper(url)
Scrapes FAIRsharing.org for some basic information.

    Scrapes FAIRsharing.org for some basic information, including title, scope and data types, terminology artifacts,
    and conditions of use.

    :param url: String url to page to scrape
    :return: FAIRPrelimStats object

- URL of the FAIRsharing.org record to scrape

In [4]:
url = 'https://fairsharing.org/biodbcore-000015'

In [5]:
# output message
#if config.verbose:
#    print('Scraping: ' + url)

In [6]:
# load the page
# The timeout keeps the request from blocking forever if the server never
# answers; raise_for_status() stops here on a 4xx/5xx response instead of
# letting an error page be parsed as if it were a record.
page = requests.get(url, timeout=30)
page.raise_for_status()

In [7]:
# parse the HTML
html_content = html.fromstring(page.content)

- Get the title

In [8]:
title = html_content.xpath('//div[@class="title-text"]/h2/text()[last()]')

In [9]:
# xpath() returns a list of matches; fall back to an empty title when the
# heading was not found instead of failing with a bare IndexError.
title = title[0].strip() if title else ''

In [10]:
title = [title]
title

['ChEMBL: a large-scale bioactivity database for drug discovery']

- Get the tags

In [11]:
# Find the listed items under Scope and data types.
# Each tag is rendered as:
# <li class="bio-tag domain">
#     <span class="bio-icon-tag"style="padding-right: 5px"></span>
#     Approved drug
# </li>
# text()[last()] keeps only the final text node of each <li>, i.e. the label
# that follows the icon <span>; its surrounding whitespace is still attached.
sad = html_content.xpath('//li[@class="bio-tag domain"]/text()[last()]')

In [12]:
sad = [x.strip() for x in sad]
sad

['Approved drug', 'Biomedical Science', 'Peptide', 'Small molecule']

- Get the terminology artifacts

In [13]:
# Terminology Artifacts are listed as links in the <ul> next to the heading:
# <p><span class="heavier">Terminology Artifacts</span></p>
# <ul class="record-list-link">
#     <li class="small"><a href="/bsg-s000039" target="_blank">Chemical Entities of Biological Interest</a></li>
#     <li class="small"><a href="/bsg-s000136" target="_blank">PSI Molecular Interaction Controlled Vocabulary</a></li>
# </ul>
# Climb from the heading <span> to the element that holds both the <p> and
# the <ul>, then collect the whitespace-trimmed link texts.
ta = [
    label.strip()
    for label in html_content.xpath(
        '//span[text()="Terminology Artifacts"]/../../ul/li/a/text()'
    )
]

In [14]:
ta

['Chemical Entities of Biological Interest',
 'PSI Molecular Interaction Controlled Vocabulary']

- Get the license

In [15]:
# Get license information.
# Licenses are grouped under the "Conditions of Use" heading:
# <div class="standard-unit">
#     <p class="section-title"><span class="heavier">Conditions of Use</span></p>
# Every <span class="section-header"> two levels above the heading <span>
# opens one license group.
lic_groups = html_content.xpath(
    '//span[text()="Conditions of Use"]/../../span[@class="section-header"]'
)

lic_info = []
for lic_group in lic_groups:
    # "Applies to" label: first text node, with runs of whitespace collapsed
    applies_to = ' '.join(lic_group.xpath('text()')[0].split())
    # License names: texts inside the <span>s of the first <ul> after the header
    licenses = [
        x.strip()
        for x in lic_group.xpath('following-sibling::ul[1]/li/span//text()')
    ]
    # Store each group as an (applies_to, licenses) tuple
    lic_info.append((applies_to, licenses))

In [16]:
lic_info

[('Applies to: Data use',
  ['Creative Commons Attribution-ShareAlike 3.0 Unported (CC BY-SA 3.0)'])]

In [17]:
lic_strings = []
sep = '; '

In [18]:
# Render every (applies_to, licenses) tuple as "applies_to = {lic1; lic2}".
for lic in lic_info:
    lic_strings.append(lic[0] + " = {" + sep.join(lic[1]) + "}")
# Join once, after the loop: the result is the same as before, but lic_string
# is now also defined (as '') when no license information was scraped.
lic_string = sep.join(lic_strings)

In [19]:
licence = [lic_string]
licence

['Applies to: Data use = {Creative Commons Attribution-ShareAlike 3.0 Unported (CC BY-SA 3.0)}']

- FAIR Scraper elements  
url, title, sad, ta, lic_info

#### fair_table(fpss, file_output)
Writes a list of preliminary statistics from multiple FAIRsharing.org urls to a CSV file

    :param fpss: List of FAIRPrelimStats
    :param file_output: Path to output file to write to (String)
    :return:

In [20]:
fpss = [[url], title, sad, ta, licence]
fpss

[['https://fairsharing.org/biodbcore-000015'],
 ['ChEMBL: a large-scale bioactivity database for drug discovery'],
 ['Approved drug', 'Biomedical Science', 'Peptide', 'Small molecule'],
 ['Chemical Entities of Biological Interest',
  'PSI Molecular Interaction Controlled Vocabulary'],
 ['Applies to: Data use = {Creative Commons Attribution-ShareAlike 3.0 Unported (CC BY-SA 3.0)}']]

In [21]:
num_fpss = len(fpss)
num_fpss

5

In [22]:
titles = ['url', 'title', 'scope and data types', 'terminology artifacts', 'license']

In [23]:
# Pair each column title with the scraped values at the same position.
summary = dict(zip(titles, fpss))

In [24]:
summary

{'license': ['Applies to: Data use = {Creative Commons Attribution-ShareAlike 3.0 Unported (CC BY-SA 3.0)}'],
 'scope and data types': ['Approved drug',
  'Biomedical Science',
  'Peptide',
  'Small molecule'],
 'terminology artifacts': ['Chemical Entities of Biological Interest',
  'PSI Molecular Interaction Controlled Vocabulary'],
 'title': ['ChEMBL: a large-scale bioactivity database for drug discovery'],
 'url': ['https://fairsharing.org/biodbcore-000015']}

In [96]:
def writeToJSONFile(path, fileName, data):
    """Serialize ``data`` as JSON into ``./<path>/<fileName>.json``.

    The target is always built relative to the current working directory and
    the directory must already exist; an existing file is overwritten.

    :param path: Directory to write into (String)
    :param fileName: File name without the ``.json`` extension (String)
    :param data: JSON-serializable object to write
    :return:
    """
    destination = ''.join(('./', path, '/', fileName, '.json'))
    with open(destination, 'w') as out_file:
        json.dump(data, out_file)

In [97]:
writeToJSONFile('./','metrics',summary)