This script assumes that the following data directories exist (paths are relative to your home directory):
* "~/.BEAR/rawdata/bearb/hour/alldata.IC.nt/"
* "~/.BEAR/rawdata/bearb/hour/alldata.CB.nt/"

alldata.IC.nt and alldata.CB.nt should contain the independent copies (IC) and the change sets (CB), respectively, of the BEAR-B hourly dataset.
BEAR webpage: https://aic.ai.wu.ac.at/qadlod/bear.html
On the webpage, go to "Description of the dataset" / "Get the dataset" / "hour" and download alldata.IC.nt.tar.gz and alldata.CB.nt.tar.gz.

In [2]:
import pandas as pd
from pathlib import Path
# conda install -c conda-forge sparqlwrapper
from SPARQLWrapper import SPARQLWrapper, JSON, Wrapper, GET
from rdflib import Graph
from rdflib.term import Literal

# Notebook-wide pandas display settings: show up to 300 columns and never truncate rows.
pd.set_option('display.max_columns', 300)
pd.options.display.max_rows = None

In [6]:
""" Parameters """
# Root directory of the BEAR-B hourly dataset; the loader functions below expect the
# sub-directories alldata.IC.nt/ and alldata.CB.nt/ inside it.
dataset_dir = "{0}/.BEAR/rawdata/bearb/hour".format(Path.home())
# SPARQL endpoint of the GraphDB repository 'BEAR-B_hourly_TB'.
get_endpoint_graphdb = "http://192.168.0.241:7200/repositories/BEAR-B_hourly_TB"

""" Functions """
def _to_df(result: Wrapper.QueryResult) -> pd.DataFrame:
    """
    :param result:
    :return: Dataframe
    """
    pd.set_option('display.max_columns', None)
    pd.set_option('display.max_colwidth', None)

    def format_value(res_value):
        value = res_value["value"]
        lang = res_value.get("xml:lang", None)
        datatype = res_value.get("datatype", None)
        if lang is not None:
            value += "@" + lang
        if datatype is not None:
            value += " [" + datatype + "]"
        return value

    results = result.convert()

    column_names = []
    for var in results["head"]["vars"]:
        column_names.append(var)
    df = pd.DataFrame(columns=column_names)

    values = []
    for r in results["results"]["bindings"]:
        row = []
        for col in results["head"]["vars"]:
            if col in r:
                result_value = format_value(r[col])
            else:
                result_value = None
            row.append(result_value)
        values.append(row)
    df = df.append(pd.DataFrame(values, columns=df.columns))

    return df

def number_of_triples(version: int) -> int:
    """
    Count the triples in the independent copy (IC) file of the given version.

    The file is read from ``<dataset_dir>/alldata.IC.nt/00<version zero-padded to 4 digits>.nt``.
    A line counts as a triple if, after cutting its last two characters, it splits on spaces into
    three parts (subject, predicate, object); blank or malformed lines are ignored.

    :param version: IC version number (1 refers to file 000001.nt).
    :return: number of lines recognised as triples.
    :raises FileNotFoundError: if the IC file of this version does not exist.
    """
    ic_path = "{0}/alldata.IC.nt/00{1}.nt".format(dataset_dir, str(version).zfill(4))
    # Decode explicitly as UTF-8 so the result does not depend on the platform's locale, and
    # count while streaming instead of materialising a list and a DataFrame first.
    with open(ic_path, "r", encoding="utf-8") as ic_file:
        return sum(1 for line in ic_file if len(line[:-2].split(" ", 2)) == 3)

def cb_to_df(version: int, added_or_deleted: str = "added"):
    """
    Load one change set (CB) file into a DataFrame with the columns 's', 'p' and 'o'.

    The file read is ``<dataset_dir>/alldata.CB.nt/data-<added_or_deleted>_<version-1>-<version>.nt``.
    Each line has its last two characters cut and is then split on the first two spaces; lines that
    do not yield three parts are skipped. Because only two characters are cut, the object value of a
    line ending in " .\\n" keeps its trailing space, exactly as in ``ic_to_df`` so that the frames
    can be merged on s/p/o.

    :param version: the actual version. E.g. for triples added between v1 and v2
        (data-added_1-2.nt) 2 would be the parameter value.
    :param added_or_deleted: file name infix of the change set to load; the notebook uses
        "added" (default) and "deleted".
    :return: DataFrame of the change set. NOTE: for ``version == 1`` a message string is returned
        instead of a DataFrame (kept for backward compatibility), so callers must not pass 1.
    :raises FileNotFoundError: if the change set file does not exist.
    """
    if version == 1:
        return "This is the initial version. There is no change set previous to this version. Choose are version " \
               "higher than 1 and lower than 1300."

    version_prev = version - 1
    cb_path = "{0}/alldata.CB.nt/data-{1}_{2}-{3}.nt".format(dataset_dir, added_or_deleted, version_prev, version)

    # Decode explicitly as UTF-8 so parsing does not depend on the platform's locale.
    with open(cb_path, "r", encoding="utf-8") as cb_file:
        candidates = (line[:-2].split(" ", 2) for line in cb_file)
        triples = [parts for parts in candidates if len(parts) == 3]

    return pd.DataFrame(triples, columns=['s', 'p', 'o'])


def ic_to_df(version: int):
    """
    Load the independent copy (IC) of one version into a DataFrame with the columns 's', 'p', 'o'.

    The file read is ``<dataset_dir>/alldata.IC.nt/00<version zero-padded to 4 digits>.nt``.
    Each line has its last two characters cut and is then split on the first two spaces; lines that
    do not yield three parts are skipped. Because only two characters are cut, the object value of a
    line ending in " .\\n" keeps its trailing space, exactly as in ``cb_to_df`` so that the frames
    can be merged on s/p/o.

    :param version: IC version number (1 refers to file 000001.nt).
    :return: DataFrame with one row per triple.
    :raises FileNotFoundError: if the IC file of this version does not exist.
    """
    ic_path = "{0}/alldata.IC.nt/00{1}.nt".format(dataset_dir, str(version).zfill(4))

    # Decode explicitly as UTF-8 so parsing does not depend on the platform's locale.
    with open(ic_path, "r", encoding="utf-8") as ic_file:
        candidates = (line[:-2].split(" ", 2) for line in ic_file)
        triples = [parts for parts in candidates if len(parts) == 3]

    return pd.DataFrame(triples, columns=['s', 'p', 'o'])


def check_ic_cb_consistency(first_version: int = 2, last_version: int = 1299):
    """
    Compare, version by version, the size difference of consecutive ICs with the change set sizes.

    For every version v in ``first_version..last_version`` (inclusive) the row contains the triple
    count of IC v, of IC v-1, of the "added" and the "deleted" change set leading to v, the net
    change (added - deleted), the IC size difference, and a flag telling whether
    ``len(IC v-1) + added - deleted == len(IC v)``.

    Each file is loaded once per version (the IC count of v is reused as the "previous" count
    of v+1) instead of being re-read for every column.

    :param first_version: first version to check; must be >= 2 because version 1 has no
        preceding change set. Defaults to 2.
    :param last_version: last version to check (inclusive). Defaults to 1299.
    :return: DataFrame with one row per checked version.
    :raises ValueError: if ``first_version`` is lower than 2.
    """
    if first_version < 2:
        raise ValueError("first_version must be >= 2 because version 1 has no preceding change set.")

    rows = []
    cnt_ic_prev = None
    for version in range(first_version, last_version + 1):
        if cnt_ic_prev is None:
            # Only the very first iteration has to load the previous IC explicitly.
            cnt_ic_prev = len(ic_to_df(version - 1))
        cnt_ic = len(ic_to_df(version))
        cnt_added = len(cb_to_df(version))
        cnt_deleted = len(cb_to_df(version, "deleted"))

        is_consistent = cnt_ic_prev + cnt_added - cnt_deleted == cnt_ic
        rows.append([version, cnt_ic, cnt_ic_prev, cnt_added, cnt_deleted,
                     cnt_added - cnt_deleted, cnt_ic - cnt_ic_prev,
                     is_consistent])
        cnt_ic_prev = cnt_ic

    return pd.DataFrame(rows, columns=['version', 'cnt_trpls_IC', 'cnt_trpls_IC_prev', 'cnt_added_trpls',
                                       'cnt_deleted_trpls', 'cnt_trpls_added_net',
                                       'cnt_trpls_diff_ICs', 'flag_changes_consistent?'])

def check_ic_tb_consistency(get_endpoint: str, first_version: int = 1, last_version: int = 5):
    """
    Compare the triple count of each IC file with the triple count of the matching named graph
    served by a SPARQL endpoint.

    For every version v in ``first_version..last_version`` (inclusive) all triples of the graph
    whose ``owl:versionInfo`` in ``<http://example.org/versions>`` equals ``v - 1`` are fetched
    and counted, and the count is compared with ``len(ic_to_df(v))``. A progress line is printed
    per version.

    :param get_endpoint: URL of the SPARQL endpoint to query.
    :param first_version: first IC version to check. Defaults to 1.
    :param last_version: last IC version to check (inclusive). Defaults to 5.
    :return: DataFrame with the columns 'version', 'cnt_trpls_ic', 'cnt_trpls_tb', 'check_flag'.
    """
    # TODO: check whether the ICs and individual versions 
    # in alldata.TB_star.ttl are consistent with each other

    # The graph's versionInfo is filled in with "IC version - 1" further below.
    query_template = """ 
            SELECT *  {{
                GRAPH <http://example.org/versions> {{
                    ?graph <http://www.w3.org/2002/07/owl#versionInfo> {0} .
                }}
                graph ?graph {{
                    ?s ?p ?o.
                }}
            }}
        """

    # One client is enough; only the query text changes per version.
    sparql = SPARQLWrapper(get_endpoint)
    sparql.setReturnFormat(JSON)

    records = []
    for version in range(first_version, last_version + 1):
        # TB
        sparql.setQuery(query_template.format(version - 1))
        df_tb = _to_df(sparql.query())
        cnt_trpls_tb = len(df_tb)

        # IC
        cnt_trpls_ic = len(ic_to_df(version))

        records.append([version, cnt_trpls_ic, cnt_trpls_tb, cnt_trpls_tb == cnt_trpls_ic])
        print("Version {0} processed.".format(version))

    # Build the result once instead of growing a DataFrame row by row.
    return pd.DataFrame(records, columns=['version', 'cnt_trpls_ic', 'cnt_trpls_tb', 'check_flag'])

def check_ic_tb_star_consistency(get_endpoint: str):
    """
    Placeholder, not implemented yet: the body consists of this docstring only, so calling the
    function returns None and ``get_endpoint`` is not used.

    Draft query kept for a future implementation. It counts the triples ``<<?s ?p ?o>>`` whose
    ``vers:valid_from`` / ``vers:valid_until`` interval contains a fixed execution timestamp:

    PREFIX vers:<https://github.com/GreenfishK/DataCitation/versioning/>
    PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
    select (count(*) as ?cnt_trpls) where {
        select ?s ?p ?o where {
            <<?s ?p ?o>> vers:valid_from ?valid_from.
            <<?s ?p ?o>> vers:valid_until ?valid_until.
            bind("2021-12-22T15:43:56.493+02:00"^^xsd:dateTime as ?TimeOfExecution)
            filter(?valid_from <= ?TimeOfExecution &&  ?TimeOfExecution < ?valid_until)
        }
    }
    
    """

def print_stats(version: int):
    """
    Print triple-count statistics for one version and its predecessor.

    Printed are: the size of IC ``version`` and of IC ``version - 1``, the sizes of the "added" and
    "deleted" change sets leading to ``version``, whether
    ``len(IC prev) + added - deleted == len(IC version)`` holds, and the number of triples that
    occur in both the added and the deleted change set.

    Every dataset is loaded exactly once and reused for all figures.

    :param version: version to report on; its predecessor ``version - 1`` must exist as well.
    :raises AssertionError: if the number of added triples already present in the previous IC
        differs from the number of triples contained in both change sets.
    """
    ic_curr = ic_to_df(version)
    ic_prev = ic_to_df(version - 1)
    cb_add = cb_to_df(version)
    cb_del = cb_to_df(version, "deleted")

    print("Number of triples in IC version {0}: {1}".format(version, len(ic_curr)))
    print("Number of triples in previous IC version {0}: {1}".format(version-1, len(ic_prev)))
    print("Number of added triples in version {0} compared to previous version: {1}".
          format(version, len(cb_add)))
    print("Number of deleted triples in version {0} compared to previous version: {1}".
          format(version, len(cb_del)))

    check_flag = len(ic_prev) + len(cb_add) - len(cb_del) == len(ic_curr)

    print("Check whether the change numbers reflect the difference between two ICs: {0} + {1} - {2} = {3}: Equation {4}".
          format(len(ic_prev), len(cb_add), len(cb_del),
                 len(ic_curr), check_flag))

    # Triples that are in both change sets, i.e. deleted and added again (or vice versa).
    df_diff = cb_add.merge(cb_del, on=['s', 'p', 'o'], how="inner")
    # Added triples that already existed in the previous IC; these must be exactly the ones above.
    prev_and_added = ic_prev.merge(cb_add, on=['s', 'p', 'o'], how="inner")
    assert len(prev_and_added) == len(df_diff)

    # Uses the local intersection (previously an undefined/global name was referenced here).
    print("Number of triples that have been deleted and added again (or vice versa) "
          "in version {0} compared to previous version: {1}".format(version, len(df_diff)))

In [5]:
# Sanity checks on the raw files. Each dataset (IC v1, IC v2, added/deleted change set 1->2) is
# loaded once and reused by the checks below.
print("Verify that the number of triples in version 1 is 33502")
assert number_of_triples(1) == 33502, "Unexpected number of triples in IC version 1"

print("Verify that the number of triples in version 1299 is 43907")
assert number_of_triples(1299) == 43907, "Unexpected number of triples in IC version 1299"

print("Verify that all added triples between version 1 and 2 are included in version 2")
v2_df = ic_to_df(2)
v1_v2_cb = cb_to_df(2)
df = v2_df.merge(v1_v2_cb, on=['s', 'p', 'o'], how="inner")
assert len(df) == len(v1_v2_cb), "Some added triples are missing in IC version 2"


print("Verify that all triples that are included in version 1 and version 2 are also reflected in the "
      "intersection of the added and deleted change sets. Thus, they must have been deleted and then added again.")
# There are some triples that have been deleted and then added again between version 1 and 2
v1_df = ic_to_df(1)
v1_v2_cb_del = cb_to_df(2, "deleted")
df_diff = v1_v2_cb.merge(v1_v2_cb_del, on=['s', 'p', 'o'], how="inner")
df = v1_df.merge(v1_v2_cb, on=['s', 'p', 'o'], how="inner")
assert len(df) == len(df_diff), "Added triples already present in IC version 1 do not match the added/deleted overlap"


print("Verify that the deleted triples between version 1 and version 2 are included in version 1")
df = v1_df.merge(v1_v2_cb_del, on=['s', 'p', 'o'], how="inner")
assert len(df) == len(v1_v2_cb_del), "Some deleted triples are missing in IC version 1"

Verify that the number of triples in version 1 is 33502
Verify that the number of triples in version 1299 is 43907
Verify that all added triples between version 1 and 2 are included in version 2
Verify that all triples that are included in version 1 and version 2 are also reflected in the intersection of the added and deleted change sets. Thus, they must have been deleted and then added again.
Verify that the deleted triples between version 1 and version 2 are included in version 1


In [4]:
# Run the IC/CB consistency check with its default settings; the returned DataFrame is shown as
# the cell output.
check_ic_cb_consistency()

Unnamed: 0,version,cnt_trpls_IC,cnt_trpls_IC_prev,cnt_added_trpls,cnt_deleted_trpls,cnt_trpls_added_net,cnt_trpls_diff_ICs,flag_changes_consistent?
0,2,33502,33502,155,81,74,0,False
1,3,33501,33502,26,12,14,-1,False
2,4,33501,33501,27,11,16,0,False
3,5,33498,33501,17,20,-3,-3,True
4,6,33497,33498,52,31,21,-1,False
5,7,33484,33497,28,24,4,-13,False
6,8,33486,33484,23,11,12,2,False
7,9,33487,33486,23,6,17,1,False
8,10,33487,33487,11,6,5,0,False
9,11,33487,33487,17,17,0,0,True


In [4]:
# Requires a local SPARQL endpoint served by GraphDB Free. GraphDB needs to be installed and the
# alldata.TB.nq dataset imported into a new repository named 'BEAR-B_hourly_TB'. Use the "server files"
# import, as the dataset is too large to be imported normally: create a folder 'graphdb-import' in your
# home directory and place the file alldata.TB.nq there.
check_ic_tb_consistency(get_endpoint=get_endpoint_graphdb)

Version 1 processed.
Version 2 processed.
Version 3 processed.
Version 4 processed.
Version 5 processed.


Unnamed: 0,version,cnt_trpls_ic,cnt_trpls_tb,check_flag
0,1,33502,33502,True
1,2,33502,33502,True
2,3,33501,33501,True
3,4,33501,33501,True
4,5,33498,33498,True
