## FAIRSharing metrics

In [1]:
from rdflib import Graph, Literal, URIRef, Namespace, RDF
from rdflib.namespace import DCTERMS, XSD

In [2]:
import json

In [3]:
# Load the metrics record from metrics.json. The context manager closes the
# file handle as soon as parsing finishes instead of leaving it to the GC.
with open('metrics.json') as metrics_file:
    metrics = json.load(metrics_file)

In [4]:
metrics

{'license': ['Applies to: Data use = {Creative Commons Attribution-ShareAlike 3.0 Unported (CC BY-SA 3.0)}'],
 'scope and data types': ['Approved drug',
  'Biomedical Science',
  'Peptide',
  'Small molecule'],
 'terminology artifacts': ['Chemical Entities of Biological Interest',
  'PSI Molecular Interaction Controlled Vocabulary'],
 'title': 'ChEMBL: a large-scale bioactivity database for drug discovery',
 'url': 'https://fairsharing.org/biodbcore-000015'}

In [5]:
# Load the catalog mapping from downloadURL.json. The context manager closes
# the file handle as soon as parsing finishes.
with open('downloadURL.json') as catalog_file:
    catalog = json.load(catalog_file)

In [16]:
# Show every catalog entry as "<key> <value>", one per line
for catalog_key, catalog_value in catalog.items():
    print(catalog_key, catalog_value)

https://fairsharing.org/biodbcore-000015 http://bio2rdf.org#CHEMBL
https://fairsharing.org/biodbcore-000456 http://bio2rdf.org#pubmed
https://fairsharing.org/biodbcore-000739 http://bio2rdf.org#clinvar
https://fairsharing.org/bsg-s000909 http://bio2rdf.org#wikipathways
https://fairsharing.org/biodbcore-000304 http://bio2rdf.org#drugbank


In [7]:
#!conda install -c conda-forge rdflib -y
#do it once

Write out dataset data quality metrics in RDF using the W3C Data Quality Vocabulary (DQV).

            :param dataset_id: ID to be used in URI for this data set (String)
            :param fps: FAIRsharing preliminary stats (FAIRPrelimStats) [optional]
            :param down_url: Download URL of dataset (String) [optional]
            :param byte_size: Size of dataset in bytes [optional]
            :return: None

Converting preliminary statistics to W3C DQV

In [17]:
# Define the namespaces used to build the graph.
# NOTE(review): `rdf`, `xsd` and `dcterms` duplicate the RDF / XSD / DCTERMS
# objects imported from rdflib in the first cell — consider using those.
#ns_local = Namespace("http://ncats.nih.gov/")
dqv = Namespace("http://www.w3.org/ns/dqv#")
# NOTE(review): confirm this is the intended IRI for the HCLS terms
hcls = Namespace("http://www.w3.org/hcls#")
bio2rdf = Namespace("http://bio2rdf.org#")
skos = Namespace("http://www.w3.org/2004/02/skos/core#")
xsd = Namespace("http://www.w3.org/2001/XMLSchema#")
# The PROV-O namespace IRI uses the http scheme; with https the terms would
# not match the ones published by the W3C.
prov = Namespace("http://www.w3.org/ns/prov#")
dcat = Namespace("http://www.w3.org/ns/dcat#")
rdf = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
dcterms = Namespace("http://purl.org/dc/terms/")

In [18]:
# Empty RDF graph that the following cells populate with triples
g = Graph()

In [19]:
# Resolve the dataset URI from the catalog instead of hard-coding it: the
# catalog is keyed by the FAIRsharing record URL stored in metrics['url']
# (for the ChEMBL record this yields "http://bio2rdf.org#CHEMBL").
dataset = catalog[metrics['url']]

In [20]:
# URI of the dataset's distribution node: the dataset URI with a fixed suffix
distribution = f"{dataset}datasetDistribution"

- General

In [9]:
# Describe the dataset node: its type, its title and its distribution
dataset_node = URIRef(dataset)
dataset_statements = [
    (rdf.type, dcat.Dataset),
    (dcterms.title, Literal(metrics['title'], lang="en")),
    (dcat.distribution, URIRef(distribution)),
]
for predicate, value in dataset_statements:
    g.add((dataset_node, predicate, value))

In [10]:
# Add information about the distribution.
# The node is typed with the class dcat:Distribution; dcat:distribution
# (lower case) is the property that links a dataset to its distribution and
# is not a valid rdf:type value.
g.add((URIRef(distribution), rdf.type, dcat.Distribution))
# NOTE(review): metrics['url'] is the FAIRsharing record URL — confirm it is
# really the intended dcat:downloadURL value.
g.add((URIRef(distribution), dcat.downloadURL, URIRef(metrics['url'])))
g.add((URIRef(distribution), dcterms.title, Literal(metrics['title'], lang="en")))

- Licensing

If the metrics include a license entry, then do:
    

In [21]:
# License entry of the metrics record; the displayed value is a list of strings
measurement = metrics['license']
measurement

['Applies to: Data use = {Creative Commons Attribution-ShareAlike 3.0 Unported (CC BY-SA 3.0)}']

In [None]:
# Link the distribution to its licensing measurement.
# The object must be hcls.licence — the node that the measurement cell
# describes — otherwise the distribution points at an undescribed node.
g.add((URIRef(distribution), dqv.hasQualityMeasurement, hcls.licence))

In [13]:
# Add information about the Measurement
g.add((hcls.licence, rdf.type, dqv.QualityMeasurement))
g.add((hcls.licence, dqv.computedOn, URIRef(distribution)))
# metrics['license'] holds a list of statements: emit one dqv:value per entry
# rather than the string form of the whole list. A bare string is tolerated
# so it is not iterated character by character.
license_values = metrics['license']
if isinstance(license_values, str):
    license_values = [license_values]
for license_text in license_values:
    g.add((hcls.licence, dqv.value, Literal(license_text, lang="en")))
g.add((hcls.licence, dqv.isMeasurementOf, dqv.L2metric))

In [14]:
# Add information about the licensing Metric.
# A metric is an instance of dqv:Metric (dqv:QualityMeasurement is the class
# of the measurement itself), its description goes in skos:definition (SKOS
# has no `type` term), and the DQV property is spelled expectedDataType.
g.add((dqv.L2metric, rdf.type, dqv.Metric))
g.add((dqv.L2metric, skos.definition, Literal("Detection of a license in the documentation of the dataset", lang="en")))
g.add((dqv.L2metric, dqv.expectedDataType, xsd.string))
g.add((dqv.L2metric, dqv.inDimension, dqv.Licensing))

In [15]:
# Describe the Licensing dimension and the category it is filed under
licensing_dim = dqv.Licensing
licensing_statements = [
    (licensing_dim, rdf.type, dqv.Dimension),
    (licensing_dim, skos.prefLabel, Literal("Licensing", lang="en")),
    (licensing_dim, skos.definition, Literal("Licensing is defined as the granting of permission for a consumer to re-use a dataset under defined conditions", lang="en")),
    (licensing_dim, dqv.inCategory, dqv.Accessibility),
    (dqv.Accessibility, rdf.type, dqv.Category),
]
for statement in licensing_statements:
    g.add(statement)

- Interoperability

In [22]:
# Terminology artifacts listed in the metrics record; the displayed value is a
# list of vocabulary names
measurement = metrics['terminology artifacts']
measurement

['Chemical Entities of Biological Interest',
 'PSI Molecular Interaction Controlled Vocabulary']

In [17]:
# Link the distribution to its vocabulary re-use measurement.
# URIRef() wraps only the distribution URI; predicate and object are separate
# members of the triple.
g.add((URIRef(distribution), dqv.hasQualityMeasurement, hcls.vocabReuse))

In [19]:
# Add information about the Measurement
g.add((hcls.vocabReuse, rdf.type, dqv.QualityMeasurement))
g.add((hcls.vocabReuse, dqv.computedOn, URIRef(distribution)))
# Emit one dqv:value per listed vocabulary, however many there are, instead
# of assuming exactly two entries. A bare string is tolerated so it is not
# iterated character by character.
vocabulary_values = metrics['terminology artifacts']
if isinstance(vocabulary_values, str):
    vocabulary_values = [vocabulary_values]
for vocabulary_name in vocabulary_values:
    g.add((hcls.vocabReuse, dqv.value, Literal(vocabulary_name, lang="en")))
g.add((hcls.vocabReuse, dqv.isMeasurementOf, dqv.IO2metric))

In [20]:
# Add information about the vocabulary re-use Metric.
# A metric is an instance of dqv:Metric (dqv:QualityMeasurement is the class
# of the measurement itself), its description goes in skos:definition (SKOS
# has no `type` term), and the DQV property is spelled expectedDataType.
g.add((dqv.IO2metric, rdf.type, dqv.Metric))
g.add((dqv.IO2metric, skos.definition, Literal("Re-use of existing vocabularies.", lang="en")))
g.add((dqv.IO2metric, dqv.expectedDataType, xsd.string))
g.add((dqv.IO2metric, dqv.inDimension, dqv.Interoperability))

In [21]:
# Describe the Interoperability dimension and the category it is filed under
interoperability_dim = dqv.Interoperability
interoperability_statements = [
    (interoperability_dim, rdf.type, dqv.Dimension),
    (interoperability_dim, skos.prefLabel, Literal("Interoperability", lang="en")),
    (interoperability_dim, skos.definition, Literal("Interoperability is the degree to which the format and structure of the information conforms to previously returned information as well as data from other sources.", lang="en")),
    (interoperability_dim, dqv.inCategory, dqv.Representational),
    (dqv.Representational, rdf.type, dqv.Category),
]
for statement in interoperability_statements:
    g.add(statement)

- Relevancy

In [22]:
# Add information about the data set
# Done

In [23]:
# The distribution node itself is already described in the General section;
# here it is only linked to the coverage measurement. Reuse `distribution`
# rather than rebuilding the URI so the suffix is defined in one place.
g.add((URIRef(distribution), dqv.hasQualityMeasurement, hcls.coverage))

In [24]:
# Scope and data types listed in the metrics record; the displayed value is a
# list of strings
measurement = metrics['scope and data types']
measurement

['Approved drug', 'Biomedical Science', 'Peptide', 'Small molecule']

In [25]:
# Add information about the Measurement
g.add((hcls.coverage, rdf.type, dqv.QualityMeasurement))
g.add((hcls.coverage, dqv.computedOn, URIRef(distribution)))
# Emit one dqv:value per scope / data-type entry. Indexing [0]..[3] raised
# IndexError for shorter lists and silently dropped any further entries.
# A bare string is tolerated so it is not iterated character by character.
scope_values = metrics['scope and data types']
if isinstance(scope_values, str):
    scope_values = [scope_values]
for scope_entry in scope_values:
    g.add((hcls.coverage, dqv.value, Literal(scope_entry, lang="en")))
g.add((hcls.coverage, dqv.isMeasurementOf, dqv.R2metric))

In [26]:
# Add information about the Coverage (Metric).
# A metric is an instance of dqv:Metric (dqv:QualityMeasurement is the class
# of the measurement itself), its description goes in skos:definition (SKOS
# has no `type` term), and the DQV property is spelled expectedDataType.
g.add((dqv.R2metric, rdf.type, dqv.Metric))
g.add((dqv.R2metric, skos.definition, Literal("Coverage of scope and datatypes in the dataset.", lang="en")))
g.add((dqv.R2metric, dqv.expectedDataType, xsd.string))
g.add((dqv.R2metric, dqv.inDimension, dqv.Relevancy))

In [27]:
# Describe the Relevancy dimension and the category it is filed under
relevancy_dim = dqv.Relevancy
relevancy_statements = [
    (relevancy_dim, rdf.type, dqv.Dimension),
    (relevancy_dim, skos.prefLabel, Literal("Relevancy", lang="en")),
    (relevancy_dim, skos.definition, Literal("Relevancy refers to the provision of information which is in accordance with the task at hand and important to the users query", lang="en")),
    (relevancy_dim, dqv.inCategory, dqv.Contextual),
    (dqv.Contextual, rdf.type, dqv.Category),
]
for statement in relevancy_statements:
    g.add(statement)

In [29]:
# Write the graph as Turtle. The format is passed explicitly because the
# default serializer depends on the rdflib version (older releases default to
# RDF/XML), which would put non-Turtle content in a .ttl file.
g.serialize(destination='test.ttl', format='turtle')

In [30]:
# Write the graph a second time to test.html.
# NOTE(review): no `format` is passed, so rdflib's default serializer is used;
# the .html extension presumably does not select a format — confirm which
# output format is intended here.
g.serialize(destination='test.html')

In [None]:
# Creates and adds a new measurement to the graph
#:param measurement_label: A unique label for the measurement. Leave empty for auto naming.
#:return: The new measurement node
# NOTE(review): this looks like a fragment of a method body; neither
# `measurement_label` nor `n_measurements` is defined in the cells shown, so
# the cell cannot run on its own — confirm where they come from.
if len(measurement_label) == 0:
    # No label supplied: bump the counter and derive a zero-padded label from
    # it, e.g. 'measurement0001'
    n_measurements += 1
    measurement_label = 'measurement' + '%04d' % n_measurements

In [None]:
def serialize(self, file, format='ttl'):
    """Serialize the graph held in ``self.g`` to ``file``.

    When ``config.verbose`` is truthy a confirmation line is printed after a
    successful write. An ``IOError`` raised while writing is not propagated;
    an error line is written to standard error instead.

    :param file: Path to the file to write to (String)
    :param format: RDF format passed through to the graph serializer
                   (default: 'ttl')
    :return: None
    """
    try:
        # Delegate the actual writing to the underlying graph
        self.g.serialize(destination=file, format=format)

        # Optional confirmation, controlled by the verbosity setting
        if config.verbose:
            success_message = 'Preliminary statistics in W3C DQV written to: ' + file
            print(success_message)
    except IOError:
        failure_message = 'Error while trying to serialize preliminary stats RDF graph to file: ' + file
        print(failure_message, file=sys.stderr)

## Testing

In [None]:
# FAIRsharing.org record URLs to run the scraper against.
# Most entries use the biosharing.org host; only the last two use
# fairsharing.org.
urls = ['https://biosharing.org/biodbcore-000015',
        'https://biosharing.org/biodbcore-000037',
        'https://biosharing.org/biodbcore-000081',
        'https://biosharing.org/biodbcore-000095',
        'https://biosharing.org/biodbcore-000104',
        'https://biosharing.org/biodbcore-000137',
        'https://biosharing.org/biodbcore-000155',
        'https://biosharing.org/biodbcore-000156',
        'https://biosharing.org/biodbcore-000173',
        'https://biosharing.org/biodbcore-000304',
        'https://biosharing.org/biodbcore-000329',
        'https://biosharing.org/biodbcore-000330',
        'https://biosharing.org/biodbcore-000341',
        'https://biosharing.org/biodbcore-000417',
        'https://biosharing.org/biodbcore-000438',
        'https://biosharing.org/biodbcore-000441',
        'https://biosharing.org/biodbcore-000455',
        'https://biosharing.org/biodbcore-000470',
        'https://biosharing.org/biodbcore-000495',
        'https://biosharing.org/biodbcore-000525',
        'https://biosharing.org/biodbcore-000544',
        'https://biosharing.org/biodbcore-000552',
        'https://biosharing.org/biodbcore-000663',
        'https://biosharing.org/biodbcore-000730',
        'https://biosharing.org/biodbcore-000805',
        'https://biosharing.org/biodbcore-000826',
        'https://biosharing.org/biodbcore-000842',
        'https://fairsharing.org/biodbcore-000618',
        'https://fairsharing.org/biodbcore-000340']

# Write the results to the configured output folder.
# makedirs(exist_ok=True) also creates missing parent folders and does not
# fail if the folder already exists or appears between a check and the call.
dir_output = config.path_output
os.makedirs(dir_output, exist_ok=True)

# List of preliminary statistics results
stats_list = []

# Process each url
for url in urls:
    # Scrape the page
    stats = fair_scraper.fair_scraper(url)
    stats_list.append(stats)

    # Output filename based on the last path segment of the url
    filename = url.split('/')[-1] + '_rdf.ttl'
    output_file = os.path.join(dir_output, filename)

    # Use the dataset title, reduced to its alphanumeric characters, as the
    # local identifier
    dataset_id = "".join(c for c in stats.title if c.isalnum()) + 'Dataset'

    # Write out preliminary statistics using W3C DQV
    stats_rdf = prelim_stats_rdf.PrelimStatsRDF(dataset_id, stats)
    stats_rdf.serialize(output_file, format='ttl')

# Target path for the CSV table; the call that writes it is currently disabled
file_output = os.path.join(dir_output, 'FAIRsharing_table.csv')
#fair_scraper.fair_table(stats_list, file_output)