# Preparation

In [None]:
## If using Google Drive (Google Colab only)
# The import is guarded so that "Restart & Run All" also works on a kernel
# where the `google.colab` package is not installed (e.g. local Jupyter).
try:
  from google.colab import drive
except ImportError:
  drive = None
  print("google.colab is not available; skipping the Google Drive mount.")
else:
  drive.mount('/content/drive')

In [None]:
# install dependencies

!pip install pyshacl
!pip install rdflib
!pip install sparqlwrapper

import regex as re
import time

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

from pyshacl import validate
from rdflib import Graph, URIRef, BNode, Literal 
from SPARQLWrapper import SPARQLWrapper, JSON
from tqdm import tqdm

# Data Collection

In [None]:
def query_sparql(query, sparql_endpoint, max_retries=5, retry_delay=1.0):
  """
  Run a query against a SPARQL endpoint (such as DBpedia or Wikidata) and
  return the result bindings as a table.

  Failed attempts are retried with an increasing pause instead of being
  repeated immediately and forever, so a permanently failing query (for
  example a syntax error) surfaces as an exception rather than a hang.

  Parameters
  ----------
  query : str
    A SPARQL query to be run
  sparql_endpoint : str
    A SPARQL API endpoint
  max_retries : int or None, optional
    Number of additional attempts made after the first failure.
    ``None`` retries indefinitely.
  retry_delay : float, optional
    Seconds to wait before the first retry; the wait is doubled after every
    failed attempt.

  Returns
  -------
  DataFrame
    One row per result binding, flattened with ``pd.json_normalize``

  Raises
  ------
  Exception
    The error of the last attempt once ``max_retries`` is exhausted
  """

  # set up the query
  sparqlwd = SPARQLWrapper(sparql_endpoint)
  sparqlwd.setQuery(query)
  sparqlwd.setReturnFormat(JSON)

  attempt = 0
  delay = retry_delay

  # get the data and transform the result into pandas dataframe
  while True:
    try:
      results = sparqlwd.query().convert()
      return pd.json_normalize(results['results']['bindings'])
    except Exception:
      # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupt working
      if max_retries is not None and attempt >= max_retries:
        raise
      attempt += 1
      time.sleep(delay)
      delay *= 2

First of all, get all the entities which will be validated using SHACL.

In [None]:
# template: replace the "..." with your own query
# (the example assignment below overrides it; remove the example when you do)
query = """
...
"""

# example: retrieve all instances of Country class with a limit of 1000
# the dbo: prefix is declared explicitly so the query does not depend on the
# endpoint pre-registering that namespace
query = """
PREFIX dbo: <http://dbpedia.org/ontology/>
SELECT DISTINCT ?entity
WHERE {
    ?entity a dbo:Country .
}
LIMIT 1000
"""

# execute the query using query_sparql function
data = query_sparql(query, "http://dbpedia.org/sparql")
# keep a copy of the retrieved entities on disk
data.to_csv("DEFINED/PATH", index=False)
data.head()

Then, get all the required property values for all entities.

In [None]:
def get_data_prop(df, prop_list, window_size, sparql_endpoint,
                  max_retries=None, retry_delay=5):
  """
  Query the property value given all the instances to be validated.

  The instances are sent to the endpoint in windows of ``window_size`` rows,
  one query per property and window.

  Parameters
  ----------
  df : DataFrame
    A table containing all the instances to be validated. It must have an
    ``entity`` column whose values are inserted verbatim into the ``VALUES``
    clause of the query.
  prop_list : list
    A list of properties to be checked; each one is inserted verbatim into
    the query
  window_size : int
    A number of data instances used in one query (must be positive)
  sparql_endpoint : str
    A SPARQL API endpoint
  max_retries : int or None, optional
    Number of additional attempts per window after a failed query.
    ``None`` (default) keeps retrying until the query succeeds.
  retry_delay : float, optional
    Seconds to wait between attempts

  Returns
  -------
  DataFrame
    A table consisting of all the properties of instances along with their
    values; an empty table when there was nothing to query

  Raises
  ------
  ValueError
    If ``window_size`` is smaller than 1
  """

  if window_size < 1:
    raise ValueError("window_size must be a positive integer")

  # read the entities from the `df` argument (not from a notebook global)
  entities = list(df['entity'])
  size = len(entities)
  list_data = []

  for prop in prop_list:
    # window-looping through length of data; range() stops before `size`, so
    # no query with an empty VALUES clause is sent
    for idx_lower in range(0, size, window_size):
      idx_upper = min(idx_lower + window_size, size)

      # check the index
      print(idx_lower, idx_upper)

      values = ' '.join(entities[idx_lower:idx_upper])
      query = f"""
SELECT ?s ?p ?o
WHERE {{
  VALUES ?s {{{values}}}
  BIND({prop} AS ?p)
  ?s ?p ?o .
}}
"""

      attempt = 0
      while True:
        try:
          res = query_sparql(query, sparql_endpoint)
          break
        except Exception:
          # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupt working
          if max_retries is not None and attempt >= max_retries:
            raise
          attempt += 1
          time.sleep(retry_delay)

      list_data.append(res)

  # pd.concat raises on an empty list, e.g. for an empty `df` or `prop_list`
  if not list_data:
    return pd.DataFrame()

  return pd.concat(list_data, ignore_index=True, sort=False)

In [None]:
# wrap every entity IRI in angle brackets and store it in the `entity` column
data['entity'] = data['entity.value'].map("<{}>".format)

# SPARQL endpoint URL, e.g. http://dbpedia.org/sparql
endpoint = "..."
# properties whose values are fetched for every entity
prop_list = ['...']

# run the windowed queries, 50 entities per query
data_prop = get_data_prop(df=data,
                          prop_list=prop_list,
                          window_size=50,
                          sparql_endpoint=endpoint)

# Shapes Generation

## Manual

In [None]:
# template: replace the "..." with your own shapes graph
# (the example assignment below overrides it; remove the example when you do)
shapes_graph = \
"""
...
"""

# example: a shapes graph requiring every instance of the Country class to
# have at least one rdfs:label, rdfs:comment and dbo:abstract value
shapes_graph = \
"""
@prefix : <http://example.org/ns#> .
@prefix dash: <http://datashapes.org/dash#> .
@prefix dbo: <http://dbpedia.org/ontology/> .
@prefix dbp: <http://dbpedia.org/property/> .
@prefix dbr: <http://dbpedia.org/resource/> .
@prefix dct: <http://purl.org/dc/terms/> .
@prefix ex: <http://example.org/ns#> .
@prefix foaf: <http://xmlns.com/foaf/0.1/> .
@prefix geo: <http://www.opengis.net/ont/geosparql#> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix schema: <http://schema.org/> .
@prefix sh: <http://www.w3.org/ns/shacl#> .
@prefix skos: <http://www.w3.org/2004/02/skos/core#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
@prefix wd: <http://www.wikidata.org/entity/> .
@prefix wdt: <http://www.wikidata.org/prop/direct/> .

ex:CountryLabelDescriptionShape
    a sh:NodeShape;
    sh:targetClass dbo:Country ;
    sh:property [
        sh:path rdfs:label ;
        sh:minCount 1 ;
    ] ;
    sh:property [
        sh:path rdfs:comment ;
        sh:minCount 1 ;
    ] ;
    sh:property [
        sh:path dbo:abstract ;
        sh:minCount 1 ;
    ] .
"""

In [None]:
# save the shapes graph
# Turtle documents are UTF-8, so the encoding is set explicitly instead of
# relying on the platform default
with open("DEFINED/PATH", "w", encoding="utf-8") as text_file:
    print(shapes_graph, file=text_file)

## Automated

In [None]:
# Turtle prefix declarations; a later cell concatenates them with the generated
# node shape and property shapes to form the complete shapes graph
prefixes = """
@prefix : <http://example.org/ns#> .
@prefix dash: <http://datashapes.org/dash#> .
@prefix dbc: <http://dbpedia.org/resource/Category:> .
@prefix dbo: <http://dbpedia.org/ontology/> .
@prefix dbp: <http://dbpedia.org/property/> .
@prefix dbr: <http://dbpedia.org/resource/> .
@prefix dct: <http://purl.org/dc/terms/> .
@prefix ex: <http://example.org/ns#> .
@prefix foaf: <http://xmlns.com/foaf/0.1/> .
@prefix geo: <http://www.opengis.net/ont/geosparql#> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix person: <http://dbpedia.org/ontology/Person> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix schema: <http://schema.org/> .
@prefix sh: <http://www.w3.org/ns/shacl#> .
@prefix skos: <http://www.w3.org/2004/02/skos/core#> .
@prefix sock: <https://cs.ui.ac.id/ns/sock#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
@prefix wd: <http://www.wikidata.org/entity/> .
@prefix wdt: <http://www.wikidata.org/prop/direct/> .
"""

In [None]:
def construct_property_shape(props, prefix="dbo"):
  """
  Construct the property shape.

  Every property gets one ``sh:PropertyShape`` that requires at least one
  value (``sh:minCount 1``) on the path ``<prefix>:<property>``.

  Parameters
  ----------
  props : list
    A list of property local names (e.g. ``"capital"``) used to create
    property shape
  prefix : str, optional
    The Turtle prefix put in front of every property in ``sh:path``.
    Defaults to ``"dbo"``.

  Returns
  -------
  (str, list)
    A string of property shapes, a list of property shapes' name

  Raises
  ------
  ValueError
    If a property name is empty
  """

  # required variable initiation
  list_prop_shape_name = []
  shape_chunks = []

  for prop in props:
    if not prop:
      raise ValueError("property names must be non-empty strings")

    # upper-case only the first character; the rest of the name is kept as is
    property_shape_name = prop[0].upper() + prop[1:] + "PropertyShape"

    # property shape template
    property_shape_pattern = \
f"""
ex:{property_shape_name} a sh:PropertyShape ;
    sh:path {prefix}:{prop} ;
    sh:minCount 1 .
"""

    shape_chunks.append(property_shape_pattern)
    list_prop_shape_name.append(property_shape_name)

  # join once at the end instead of growing a string inside the loop
  return ''.join(shape_chunks), list_prop_shape_name

In [None]:
def construct_node_shape(cls, node_shape_name, list_prop_shape_name):
  """
  Construct the node shape.

  Parameters
  ----------
  cls : str
    A class local name used as ``sh:targetClass dbo:<cls>``
  node_shape_name : str
    A name for a node shape; the shape is emitted as ``ex:<node_shape_name>Shape``
  list_prop_shape_name : list
    A list of property shape name (without prefix); may be empty, in which
    case the node shape is emitted without any ``sh:property``

  Returns
  -------
  (str, str)
    A string of node shape, and ``node_shape_name`` exactly as it was passed in
  """

  # revise the properties by adding their prefixes
  list_property_shape = ['ex:' + prop for prop in list_prop_shape_name]
  list_property_shape = ', \n\t\t'.join(list_property_shape)

  lines = [
    f"ex:{node_shape_name}Shape",
    "    a sh:NodeShape ;",
  ]
  if list_property_shape:
    lines.append(f"    sh:targetClass dbo:{cls} ;")
    lines.append(f"    sh:property {list_property_shape} .")
  else:
    # "sh:property  ." with no object is not valid Turtle, so end the shape
    # right after the target class
    lines.append(f"    sh:targetClass dbo:{cls} .")

  node_shape = "\n" + "\n".join(lines) + "\n"

  return node_shape, node_shape_name

In [None]:
# create property shape
# NOTE(review): this cell used to read `prop['p.value']`, but no notebook-level
# variable named `prop` is defined in the visible cells; `data_prop` (built in
# the Data Collection section) is presumably the intended source - confirm.
# Only DBpedia-ontology properties are kept because construct_property_shape
# writes every path as dbo:<name>; each property is listed once.
DBO_NS = "http://dbpedia.org/ontology/"
prop_list = [iri.split("/")[-1]
             for iri in data_prop['p.value'].dropna().unique()
             if iri.startswith(DBO_NS)]
property_shape, list_prop_shape_name = construct_property_shape(prop_list)

# create node_shape
node_shape, node_shape_name = construct_node_shape("Country",
                                                   "CountrySchemaShapes",
                                                   list_prop_shape_name)

shapes_graph = prefixes + node_shape + property_shape

# Turtle documents are UTF-8, so the encoding is set explicitly
with open("DEFINED/PATH", "w", encoding="utf-8") as text_file:
    print(shapes_graph, file=text_file)

# Data Validation

In [None]:
# load data containing all the property values for the entities
# every column is read as text so that literal values such as "2020" or "NA"
# are not silently turned into numbers or missing values; only empty fields
# are treated as missing
data_prop = pd.read_csv("DEFINED/PATH", dtype=str, keep_default_na=False, na_values=[""])

# handle NaN values in language attribute
# the column may be absent, so the fill is guarded instead of raising KeyError
if 'o.xml:lang' in data_prop.columns:
  data_prop[['o.xml:lang']] = data_prop[['o.xml:lang']].fillna('not specified')


def _row_to_object(row):
  """
  Build the rdflib object term for one row of ``data_prop``.

  Handles the term types of the SPARQL JSON results: ``uri``, ``bnode``,
  ``literal`` (optionally with a language tag or a datatype) and the older
  ``typed-literal``.
  """
  value = row['o.value']
  if pd.isna(value):
    value = ''  # an empty literal is read back from the CSV as missing

  term_type = row['o.type']
  if term_type in ('literal', 'typed-literal'):
    lang = row.get('o.xml:lang')
    datatype = row.get('o.datatype')
    if isinstance(lang, str) and lang != 'not specified':
      return Literal(value, lang=lang)
    if isinstance(datatype, str):
      return Literal(value, datatype=datatype)
    return Literal(value)
  if term_type == 'bnode':
    return BNode(value)
  return URIRef(value)


# convert data into data graph
data_graph = Graph()

# add instance relation for all entities
# only used for checking with target for a certain class
# if not, just skip it
# (`data` is the entity table created in the Data Collection section)
rdf_type = URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type')
target_class = URIRef('http://dbpedia.org/ontology/Country') # example: Country class
for _, row in data.iterrows():
  data_graph.add((URIRef(row['entity.value']), rdf_type, target_class))

# add node-property relation for all entities
for _, row in data_prop.iterrows():
  data_graph.add((URIRef(row['s.value']), URIRef(row['p.value']), _row_to_object(row)))

In [None]:
# load shapes graph
shapes_graph = Graph()
shapes_file = "DEFINED/PATH"
# the shapes cells of this notebook write Turtle; naming the format avoids
# depending on rdflib guessing it from the file extension
shapes_graph.parse(shapes_file, format="turtle")

In [None]:
def validate_graph(shapes_graph, data_graph, is_advanced=False):
  """
  Run the PySHACL engine on a data graph against a shapes graph.

  Parameters
  ----------
  shapes_graph : Graph
    The shapes graph holding the constraints
  data_graph : Graph
    The data graph holding the instances to validate and their property values
  is_advanced : boolean, optional
    Forwarded unchanged as the ``advanced`` argument of ``pyshacl.validate``

  Returns
  -------
  tuple
    Whatever ``pyshacl.validate`` returns; the notebook unpacks it as
    (conforms, report graph, report text). The report graph is requested in
    serialized ``"ttl"`` form, so it is not an rdflib ``Graph`` object.
  """

  engine_options = {
    "shacl_graph": shapes_graph,
    "advanced": is_advanced,
    "serialize_report_graph": "ttl",
  }

  return validate(data_graph=data_graph, **engine_options)

# validate the data graph against the shapes graph; the three results are the
# conformance value, the validation report graph and the validation report text
conforms, report_graph, report_text = validate_graph(shapes_graph, data_graph)

In [None]:
def create_report_validation(df, use_col, report_graph, prop_list):
  """
  Transform validation result from a form of graph to a form of table (Pandas dataframe).

  For every property the table gets one column holding 0.0 when the report
  contains a validation result for that instance and property path, and 1.0
  otherwise.

  Parameters
  ----------
  df : DataFrame
    A table containing all the instances to be validated
  use_col : str
    The name of the column of ``df`` whose values are compared with the
    focus nodes of the report
  report_graph : str or bytes
    A validation report serialized as Turtle, as a result of SHACL validation
  prop_list : list
    A list of property to be checked. Each entry is inserted verbatim into a
    SPARQL query as the ``sh:resultPath`` value; ``dbo:`` and ``rdfs:`` are
    declared in the query.

  Returns
  -------
  DataFrame
    A table consisting of instances with the value of validation, plus a
    ``complete_all`` column with the share of complete properties per instance
  """

  report = Graph()
  report.parse(data=report_graph, format="turtle")

  # a property listed twice must not be counted twice in `complete_all`
  props = list(dict.fromkeys(prop_list))

  # work on a copy so that columns can be added without touching `df`
  validation = df[[use_col]].copy()

  for prop in props:
    report_query = f"""
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
SELECT DISTINCT ?focusNode
WHERE {{
[] <http://www.w3.org/ns/shacl#result> ?id .
?id <http://www.w3.org/ns/shacl#focusNode> ?focusNode ;
    <http://www.w3.org/ns/shacl#resultPath> {prop} .
}}
"""
    # a set keeps one entry per instance even when the report holds several
    # results for the same instance and path, so no row of `df` is duplicated
    incomplete = {str(row.focusNode) for row in report.query(report_query)}
    validation[prop] = (~validation[use_col].isin(incomplete)).astype(float)

  # the row-wise mean equals sum / number of properties; it is NaN (not a
  # ZeroDivisionError) when no property was given
  validation['complete_all'] = validation[props].mean(axis=1)

  return validation

prop_list = ['rdfs:label', 'rdfs:comment', 'dbo:abstract']
validation = create_report_validation(data, "entity.value", report_graph, prop_list)

# `path` is not defined in any visible cell of this notebook, so this cell
# failed with a NameError on a fresh kernel; set it to your own output
# directory (keep the trailing slash)
path = "DEFINED/PATH/"
validation.to_csv(path+"DBP-LDC01M-20220518-Country-validation.csv", index=False)

# Visualization

In [None]:
def create_completeness_info_viz(validation, prop_list, title):
  """
  Draw a horizontal stacked bar chart of the validation result.

  Parameters
  ----------
  validation : DataFrame
    A table of validated instances with one 1/0 column per property
  prop_list : list
    The properties (column names of ``validation``) to show
  title : str
    The chart title
  """

  label_names = {1: 'Complete', 0: 'Incomplete'}

  def share_per_label(prop):
    # relative frequency of each value found in the property's column
    shares = validation[prop].value_counts(normalize=True)
    table = shares.rename_axis('label').reset_index(name='percentage')
    table['property'] = prop
    return table

  # one small table per property, stacked into a single long table
  comp_summary = pd.concat([share_per_label(prop) for prop in prop_list],
                           ignore_index=True,
                           sort=False)

  # replace the 1 / 0 values by readable names
  comp_summary['label'] = comp_summary['label'].map(label_names)

  # 100 px per property once there are more than three, otherwise 300 px
  chart_height = 300 if len(prop_list) <= 3 else 100*len(prop_list)
  x_axis = dict(tickmode='array',
                tickvals=[0, 0.2, 0.4, 0.6, 0.8, 1],
                ticktext=['0%', '20%', '40%', '60%', '80%', '100%'])

  fig = px.bar(comp_summary, x='percentage', y='property', color='label', title=title)
  fig.update_layout(autosize=False, width=800, height=chart_height, xaxis=x_axis)

  fig.show()

# list all the properties to be visualized; replace the '...' placeholder with
# names of columns that exist in `validation`
prop_list = ['...']
# prop_list = ['rdfs:label', 'rdfs:comment', 'dbo:abstract'] # example


# draw the completeness chart for the listed properties
create_completeness_info_viz(validation,
                             prop_list,
                             '...',) # insert title