# Preamble

In [1]:
from SPARQLWrapper import SPARQLWrapper, JSON
from json import JSONDecodeError
from nltk import word_tokenize
from datetime import datetime
import pandas as pd
import unicodedata
import nltk
import json
import bz2

In [2]:
# Knowledge sources that are evaluated below (paths relative to this notebook).
PATH_CAUSENET = "../../data/causality-graphs/causenet-full.jsonl.bz2"
PATH_CONCEPTNET = "../../data/external/knowledge-bases/conceptnet-assertions-5.6.0.csv"
PATH_WIKIDATA = "../../data/external/knowledge-bases/wikidata-20181001-all.json.bz2"

# Question-answering splits; they are merged into one evaluation set later.
PATH_QA_TRAIN = "../../data/question-answering/causality-qa-training.json"
PATH_QA_VALID = "../../data/question-answering/causality-qa-validation.json"

# Directory used both as NLTK download target and as NLTK search path.
PATH_NLTK_RESOURCES = "../../data/external/nltk/"

In [3]:
# Download the 'punkt' NLTK package into the project-local directory and
# register that directory so NLTK looks there when resolving resources.
nltk.download('punkt', PATH_NLTK_RESOURCES)
nltk.data.path.append(PATH_NLTK_RESOURCES)

[nltk_data] Downloading package punkt to
[nltk_data]     ../../data/downloads/nltk_data/...
[nltk_data]   Package punkt is already up-to-date!


# Question Answering

In [4]:
# Load both question-answering splits. The context managers close the file
# handles again (a bare open() inside json.load() leaves them dangling), and
# the explicit encoding keeps the result independent of the platform default.
with open(PATH_QA_TRAIN, encoding='utf-8') as train_file:
    train = json.load(train_file)
with open(PATH_QA_VALID, encoding='utf-8') as valid_file:
    valid = json.load(valid_file)

# merge MS MARCO datasets for causality-QA
causality_qa_dataset = train + valid

In [5]:
# Accumulator for the evaluation results: it starts empty and is replaced by
# the return value of every evaluate_causality_graph(...) call further down.
evaluation_matrix = None

In [46]:
class QueryException(Exception):
    """Signals that a query function could not evaluate a question."""

In [47]:
def evaluate_causality_graph(graph_name, causality_graph,
                             query, evaluation_matrix,
                             query_with_entities=False,
                             dataset=None):
    """Score one knowledge source against the yes/no causality questions.

    Args:
        graph_name: Row label used for this knowledge source.
        causality_graph: Opaque object handed to ``query`` unchanged.
        query: Callable ``query(causality_graph, question)``; its result is
            interpreted as a boolean ("relation found"). It may raise
            ``QueryException`` to have the question skipped.
        evaluation_matrix: Previously accumulated DataFrame, or ``None`` when
            this is the first evaluation.
        query_with_entities: If true, questions whose ``'entities:dbo'`` list
            contains ``None`` are skipped.
        dataset: Iterable of question dicts. Defaults to the notebook-level
            ``causality_qa_dataset`` so existing calls keep working.

    Returns:
        A DataFrame consisting of ``evaluation_matrix`` (if given) followed by
        one new row with the columns ``Yes>Yes``, ``Yes>No``, ``No>Yes``,
        ``No>No`` (ground truth > prediction) and ``Total``.
    """
    if dataset is None:
        # Backward-compatible default: the merged dataset defined in the notebook.
        dataset = causality_qa_dataset

    # Keyed by (ground-truth label, relation found in graph).
    counts = {('Yes', True): 0, ('Yes', False): 0,
              ('No', True): 0, ('No', False): 0}

    for question in dataset:
        if query_with_entities and None in question['entities:dbo']:
            # consider only questions that could be linked
            continue

        try:
            in_causality_graph = bool(query(causality_graph, question))
        except QueryException:
            continue

        truth = question['answer:Extracted']
        if truth == ['Yes']:
            label = 'Yes'
        elif truth == ['No']:
            label = 'No'
        else:
            # Anything that is not a plain yes/no answer is not counted.
            continue

        counts[(label, in_causality_graph)] += 1

    confusion_matrix = pd.DataFrame({
        "Yes>Yes": {graph_name: counts[('Yes', True)]},
        "Yes>No": {graph_name: counts[('Yes', False)]},
        "No>Yes": {graph_name: counts[('No', True)]},
        "No>No": {graph_name: counts[('No', False)]},
        "Total": {graph_name: sum(counts.values())}
    })

    if evaluation_matrix is None:
        # First evaluation: nothing to append to.
        return confusion_matrix
    return pd.concat([evaluation_matrix, confusion_matrix])

## CauseNet

In [4]:
def load_jsonl(path):
    """Read a bz2-compressed JSON-lines file into a list.

    Args:
        path: Path (string) of the ``.jsonl.bz2`` file.

    Returns:
        A list with one decoded JSON value per non-blank line, in file order.
    """
    print("Loading... " + path)
    # The context manager closes the compressed stream even if decoding fails.
    with bz2.open(path, mode='rt', encoding='utf-8') as document:
        # Blank lines (e.g. a trailing newline) carry no record; skip them.
        return [json.loads(line) for line in document if line.strip()]

In [8]:
def get_subgraph(graph, source_type):
    """Return the relations that have at least one source of ``source_type``.

    A relation is kept (once, in its original order) as soon as any entry of
    its ``'sources'`` list has a matching ``'type'``.
    """
    return [
        relation
        for relation in graph
        if any(source['type'] == source_type
               for source in relation['sources'])
    ]

In [9]:
# CauseNet-Wrapper for fast queries
class Graph():
    """Set-backed index of directed cause/effect edges for quick lookups."""

    def __init__(self, causality_graph):
        """Index every relation of ``causality_graph``.

        Each relation contributes one string key built from
        ``relation['causal_relation']['cause']['concept']`` and the
        corresponding effect concept; duplicates collapse in the set.
        """
        self.relations = {
            self._edge_key(entry['causal_relation']['cause']['concept'],
                           entry['causal_relation']['effect']['concept'])
            for entry in causality_graph
        }

    @staticmethod
    def _edge_key(cause, effect):
        """Build the string under which an edge is stored."""
        return cause + " -> " + effect

    def has_edge(self, cause, effect):
        """Tell whether the edge ``cause -> effect`` was indexed."""
        return self._edge_key(cause, effect) in self.relations

In [None]:
# Read the whole CauseNet file into memory as a list of decoded JSON records.
causenet = load_jsonl(PATH_CAUSENET)

Loading... ../../data/causality-graphs/causenet-full.jsonl.bz2


In [11]:
# Source types for which a restricted sub-graph is evaluated separately.
source_types = [
    'clueweb12_sentence',
    'wikipedia_sentence',
    'wikipedia_list',
    'wikipedia_infobox',
]

# The full graph comes first, followed by one graph per source type.
causality_graphs = dict(causenet=Graph(causenet))

for source_type in source_types:
    causality_graphs.update(
        {source_type: Graph(get_subgraph(causenet, source_type))})

In [12]:
def normalize_causal_concept(string):
    """Turn free text into an underscore-joined, normalized concept key.

    The text is tokenized with ``word_tokenize``; every token is lowercased
    and NFKC-normalized, then the tokens are joined with ``'_'``.
    """
    # as in conceptNet
    # https://en.wikipedia.org/wiki/Unicode_equivalence#Normalization
    normalized_tokens = (
        unicodedata.normalize('NFKC', token.lower())
        for token in word_tokenize(string)
    )
    return '_'.join(normalized_tokens)

In [13]:
def get_concept_from_question(question, index):
    """Extract and normalize one concept span of a question.

    ``question['query'][index]`` provides the first and last (inclusive)
    positions of the span inside ``question['question:POS']``; the first
    element of every entry in that span is joined with spaces and passed
    through ``normalize_causal_concept``.
    """
    span = question['query'][index]
    first, last = span[0], span[1]
    # presumably (token, tag) pairs; only element 0 is used here
    entries = question['question:POS'][first:last + 1]
    words = ' '.join(entry[0] for entry in entries)
    return normalize_causal_concept(words)

In [14]:
def query_causenet(causality_graph, question):
    """Check whether the question's cause -> effect edge is in the graph.

    Span 0 of the question is taken as the cause, span 1 as the effect.
    """
    cause, effect = (get_concept_from_question(question, position)
                     for position in (0, 1))
    return causality_graph.has_edge(cause, effect)

In [15]:
# Evaluate the full CauseNet graph and every per-source sub-graph.
# NOTE: each call returns the previous matrix plus one new row, so running
# this cell a second time without resetting evaluation_matrix adds the same
# rows again.
for graph_key in causality_graphs:
    parameters = (graph_key, causality_graphs[graph_key],
                  query_causenet, evaluation_matrix)
    evaluation_matrix = evaluate_causality_graph(*parameters)

In [16]:
# Display the counts collected so far, formatted with thousands separators.
evaluation_matrix.style.format("{0:,}")

Unnamed: 0,Yes>Yes,Yes>No,No>Yes,No>No,Total
causenet,487,1342,53,287,2169
clueweb12_sentence,480,1349,52,288,2169
wikipedia_sentence,51,1778,8,332,2169
wikipedia_list,8,1821,0,340,2169
wikipedia_infobox,9,1820,1,339,2169


## ConceptNet

In [17]:
def load_conceptnet(path=None):
    """Read the tab-separated ConceptNet assertions file as triples.

    Args:
        path: File to read. Defaults to ``PATH_CONCEPTNET`` so that existing
            calls without arguments keep working.

    Returns:
        A list of ``(column 2, column 1, column 3)`` tuples, one per row that
        has at least four tab-separated columns, in file order.
    """
    if path is None:
        path = PATH_CONCEPTNET

    conceptnet_triples = []

    # Stream the file line by line instead of materialising it with
    # readlines(), and close the handle once done.
    with open(path, encoding='utf-8') as assertions:
        for row in assertions:
            elements = row.rstrip("\n").split("\t")
            if len(elements) < 4:
                # Blank or truncated row: nothing to build a triple from.
                continue
            conceptnet_triples.append(
                (elements[2], elements[1], elements[3]))

    return conceptnet_triples

In [18]:
# Load every ConceptNet assertion as a triple; filtering happens afterwards.
conceptnet = load_conceptnet()

In [19]:
# ConceptNet relations that are kept as causal.
causal_properties = ['/r/CausesDesire', '/r/Causes']

# Set of all triples whose middle element is one of the causal relations.
conceptnet_causality = {
    triple for triple in conceptnet
    if triple[1] in causal_properties
}

In [20]:
def query_conceptnet(causality_graph, question):
    """Check whether ConceptNet holds the question's '/r/Causes' triple.

    Both concepts are looked up as English nodes (``/c/en/...``). Only the
    ``/r/Causes`` relation is tested here.
    """
    prefix = '/c/en/'
    cause_node = prefix + get_concept_from_question(question, 0)
    effect_node = prefix + get_concept_from_question(question, 1)
    return (cause_node, '/r/Causes', effect_node) in causality_graph

In [21]:
# Append the ConceptNet row to the evaluation matrix.
# NOTE: re-running this cell appends the same row again.
parameters = ('conceptnet', conceptnet_causality,
              query_conceptnet, evaluation_matrix)
evaluation_matrix = evaluate_causality_graph(*parameters)

In [22]:
# Display the counts collected so far, formatted with thousands separators.
evaluation_matrix.style.format("{0:,}")

Unnamed: 0,Yes>Yes,Yes>No,No>Yes,No>No,Total
causenet,487,1342,53,287,2169
clueweb12_sentence,480,1349,52,288,2169
wikipedia_sentence,51,1778,8,332,2169
wikipedia_list,8,1821,0,340,2169
wikipedia_infobox,9,1820,1,339,2169
conceptnet,1,1828,0,340,2169


## DBpedia Live

In [23]:
# Predicates that in_dbpedia_live does not accept as evidence for a causal
# link between two entities.
not_causal_properties = [
    'http://www.w3.org/2000/01/rdf-schema#seeAlso',
    'http://dbpedia.org/ontology/wikiPageWikiLink',
    'http://purl.org/linguistics/gold/hypernym',
]

In [24]:
def send_query(endpoint, query):
    """Run ``query`` on ``endpoint`` and return the converted JSON result.

    The endpoint is configured in place: its query text and its return
    format (JSON) are set before the request is sent.
    """
    endpoint.setQuery(query)
    endpoint.setReturnFormat(JSON)
    return endpoint.query().convert()


def in_dbpedia_live(endpoint, cause, effect):
    """Tell whether any accepted predicate links ``cause`` to ``effect``.

    Args:
        endpoint: SPARQL endpoint object passed on to ``send_query``.
        cause: Subject resource URI.
        effect: Object resource URI.

    Returns:
        ``True`` if at least one predicate returned for
        ``<cause> ?p <effect>`` is not listed in ``not_causal_properties``;
        ``False`` otherwise, including when there is no binding at all.
    """
    query = "SELECT ?p WHERE {<" + str(cause) + "> ?p <" + str(effect) + ">}"
    results = send_query(endpoint, query)

    # Inspect every binding: returning from inside the loop would let the
    # first predicate decide alone, hiding an accepted predicate that is
    # listed after an excluded one.
    return any(binding['p']['value'] not in not_causal_properties
               for binding in results['results']['bindings'])

In [25]:
def query_dbpedia_live(causality_graph, question):
    """Look up the question's two linked entities in DBpedia Live.

    ``causality_graph`` is the SPARQL endpoint here; the first two entries
    of ``question['entities:dbo']`` are used as cause and effect.
    """
    linked_entities = question['entities:dbo']
    cause_entity, effect_entity = linked_entities[0], linked_entities[1]
    return in_dbpedia_live(causality_graph, cause_entity, effect_entity)

In [26]:
# Shared SPARQL endpoint; used for the DBpedia Live evaluation and, further
# down, by query_wikidata to resolve interwiki links.
dbpedia_live_endpoint = SPARQLWrapper("http://live.dbpedia.org/sparql")

In [27]:
# The endpoint is a live service, so print the time of the run next to the
# result. Only questions whose entities could all be linked are evaluated
# (query_with_entities=True).
# NOTE: re-running this cell appends the same row again.
print(f"DBpedia Live ({datetime.now()}):")
parameters = ('dbpedia live', dbpedia_live_endpoint,
              query_dbpedia_live, evaluation_matrix, True)
evaluation_matrix = evaluate_causality_graph(*parameters)

DBpedia Live (2020-08-10 19:56:55.805839):


In [28]:
# Display the counts collected so far, formatted with thousands separators.
evaluation_matrix.style.format("{0:,}")

Unnamed: 0,Yes>Yes,Yes>No,No>Yes,No>No,Total
causenet,487,1342,53,287,2169
clueweb12_sentence,480,1349,52,288,2169
wikipedia_sentence,51,1778,8,332,2169
wikipedia_list,8,1821,0,340,2169
wikipedia_infobox,9,1820,1,339,2169
conceptnet,1,1828,0,340,2169
dbpedia live,11,1179,2,231,1423


## Wikidata

In [29]:
# Wikidata property ids whose statements are extracted as causal relations.
wikidata_causal_predicates = [
    'P509',  # cause of death
    'P780',  # symptoms
    'P828',  # has cause
    'P1542',  # has effect
    'P770',  # cause of destruction
    'P1478',  # has immediate cause
    'P1479',  # has contributing factor
    'P1534',  # end cause
]

In [30]:
def load_wikidata_causality(wikidata_causal_predicates, path=None):
    """Extract item-to-item statements for the given properties from a dump.

    The dump is read line by line; lines that do not decode to a JSON object
    (such as the enclosing ``[`` and ``]``) are skipped.

    Args:
        wikidata_causal_predicates: Property ids to extract.
        path: bz2-compressed dump to read. Defaults to ``PATH_WIKIDATA`` so
            that existing calls keep working.

    Returns:
        A list of ``(item id, property id, value id)`` tuples in dump order.
        Statements without a datavalue, or whose value is not an object
        carrying an ``'id'``, are ignored.
    """
    if path is None:
        path = PATH_WIKIDATA

    causal_predicates = frozenset(wikidata_causal_predicates)
    causal_wikidata = []

    with bz2.open(path, mode='rt', encoding='utf-8') as dump:
        for line in dump:
            try:
                # Only strip a trailing comma: blindly cutting the last
                # character would destroy the final entity, which has none.
                item = json.loads(line.strip().rstrip(','))
            except JSONDecodeError:
                continue

            if not isinstance(item, dict):
                continue

            claims = item.get('claims') or {}
            if not isinstance(claims, dict):
                continue

            for wikidata_property, statements in claims.items():
                if wikidata_property not in causal_predicates:
                    continue
                for statement in statements:
                    datavalue = statement['mainsnak'].get('datavalue')
                    if datavalue is None:
                        continue
                    value = datavalue['value']
                    # A plain string value would turn "'id' in value" into a
                    # substring test and then fail on value['id'].
                    if not isinstance(value, dict) or 'id' not in value:
                        continue
                    causal_wikidata.append(
                        (item['id'], wikidata_property, value['id']))

    return causal_wikidata

In [31]:
# Keep only the (subject, object) item pairs and store them in a set:
# query_wikidata performs one membership test per question, which would be
# a linear scan on a list.
wikidata_causality = {
    (subject, obj)
    for subject, _, obj in load_wikidata_causality(wikidata_causal_predicates)
}

In [54]:
def get_wikidata_items(question, endpoint):
    """Resolve a question's DBpedia entities to Wikidata item numbers.

    For every entity in ``question['entities:dbo']`` the endpoint is asked
    for ``owl:sameAs`` targets below ``http://wikidata.dbpedia.org/resource/``.
    All hits are collected in one flat sequence.

    Returns:
        A tuple with the last path segment of every hit, minus its first
        character.
    """
    # use DBpedia's interwiki-links for linking against Wikidata
    # NOTE(review): the hits of all entities end up in the same flat tuple,
    # so its length alone does not show which entity a hit belongs to.
    linked_uris = []

    for entity in question['entities:dbo']:
        sparql_query = """
            SELECT ?o WHERE { <""" + entity + """> owl:sameAs ?o
                 FILTER (STRSTARTS(str(?o),
                     "http://wikidata.dbpedia.org/resource/"))
            }
        """
        bindings = send_query(endpoint, sparql_query)["results"]["bindings"]
        linked_uris.extend(binding["o"]["value"] for binding in bindings)

    return tuple(uri.split("/")[-1][1:] for uri in linked_uris)

In [55]:
def query_wikidata(wikidata, question):
    """Check whether the question's item pair is among the Wikidata pairs.

    The two items are resolved through ``dbpedia_live_endpoint``.

    Raises:
        QueryException: If the lookup does not yield exactly two items.
    """
    item_numbers = get_wikidata_items(question, dbpedia_live_endpoint)
    if len(item_numbers) != 2:
        raise QueryException("no interwiki-links available")

    cause_number, effect_number = item_numbers
    return ('Q' + cause_number, 'Q' + effect_number) in wikidata

In [56]:
# Append the Wikidata row; only questions whose entities could all be linked
# are evaluated (query_with_entities=True).
# NOTE: re-running this cell appends the same row again.
parameters = ('wikidata', wikidata_causality,
              query_wikidata, evaluation_matrix, True)
evaluation_matrix = evaluate_causality_graph(*parameters)

In [57]:
# Display the final counts, formatted with thousands separators.
evaluation_matrix.style.format("{0:,}")

Unnamed: 0,Yes>Yes,Yes>No,No>Yes,No>No,Total
causenet,487,1342,53,287,2169
clueweb12_sentence,480,1349,52,288,2169
wikipedia_sentence,51,1778,8,332,2169
wikipedia_list,8,1821,0,340,2169
wikipedia_infobox,9,1820,1,339,2169
conceptnet,1,1828,0,340,2169
dbpedia live,11,1179,2,231,1423
wikidata,10,1098,0,223,1331
