In [9]:
import rdflib
import networkx as nx
import os
import pandas as pd
%matplotlib inline

In [165]:
def construct_graph(abox_file, _format='turtle'):
    """Constructs an `networkX` graph from a rdf file
    @param abox_file: the path of rdf file
    @param _format: in which format is the rdf encoded?
    @returns a `networkX` graph
    """
    graph = nx.DiGraph()
    g=rdflib.Graph()
    g.load(abox_file, format=_format)
    for s,p,o in g:      
        graph.add_edge(s, o, name=p)
        subject_graph = rdflib.Graph()
        # For our subjects we extract type assertions from dbpedia ontology
        subject_graph.load(s)
        for s2,p2,o2 in subject_graph:  
            if str(p2) == 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type' and 'dbpedia.org/ontology' in str(o2):
                graph.add_edge(s2, o2, name=p2)
                
        # TODO: Can we increase performance by doing the same for our objects?
        object_graph = rdflib.Graph()
        object_graph.load(o)
        for s2,p2,o2 in object_graph:  
            if str(p2) == 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type' and 'dbpedia.org/ontology' in str(o2):
                graph.add_edge(s2, o2, name=p2)
                
    return graph

def extract_paths(graph, max_depth=2):
    """Extract all random walks from a graph, having a maximum depth of max_depth
    @param graph: `networkX` graph to extract the paths from
    @param max_depth: maximum length of the extracted paths
    @returns a list of extracted paths; each entry is of the form (entity -> property -> entity -> ...)
    """
    paths = []
    for node in graph.nodes_iter():
        if graph.out_degree(node):
            for path in (nx.all_simple_paths(graph, node, target, max_depth) for target in graph.nodes_iter()):
                path = list(path)
                if path:
                    path=list(path)[0]
                    new_path=[]
                    for i in range(len(path)-1):
                        new_path.append(graph[path[i]][path[i+1]]['name'])
                        new_path.append(path[i+1])
                    paths.append(tuple(new_path))
    return set(paths)

def find_path(graph, path):
    """Can we find a certain path in the given graph?
    @param graph: `networkX` graph to locate the path in
    @param path: the path that needs to be located
    @returns a boolean, whether or not the path was found
    """
    current_node = graph
    for i in range(1, len(path), 2):
        if not path[i] in current_node:
            return False
    return True

def path_to_string(path):
    string = ''
    for i in range(len(path)):
        string += str(path[i].split('/')[-1].split('#')[-1]) + '_'
#     string += path[-1].split('/')[-1]
    return string[:-1]

In [166]:
# Parse all triples from file and put them in NetworkX DiGraph
graph = construct_graph('lincoln-abox')

# Extract all random walks from the graph
paths = extract_paths(graph)

# Can we find one of our paths back in our graph?    
for test_path in paths:
    if find_path(graph, test_path):
        print('found')
    else:
        print('not found')

found
found
found
found
found
found


In [167]:
all_paths = []
for abox in os.listdir('aboxes'):
    graph = construct_graph('aboxes'+os.sep+abox)
    all_paths += extract_paths(graph, max_depth=2)
    
all_paths = set(all_paths)    
print(len(set(all_paths)))

396


In [169]:
dbpedia_111 = pd.read_csv('datasets/dbpedia2014_dolce_hermit_111_rpk.csv', sep='\t').fillna(0)

In [170]:
vectors = []
for abox in os.listdir('aboxes'):
    graph = construct_graph('aboxes'+os.sep+abox)
    label = dbpedia_111[dbpedia_111['Statement'] == open('aboxes'+os.sep+abox).read()].iloc[0, :]['Consistent']
    vector = [abox, label]
    for path in all_paths:
        if find_path(graph, path):
            vector.append( 1 )
        else:
            vector.append( 0 )
    vectors.append(vector)
df = pd.DataFrame(vectors, columns=['abox', 'label']+[path_to_string(path) for path in all_paths])
df.to_csv('reconstructed_dbpedia_111.csv', index=False)

In [174]:
print(len(df.columns[list(df.sum() > 1)]))
print(len(df.columns))

68
398


In [173]:
print(len(dbpedia_111.columns[list(dbpedia_111.sum() > 1)]))
print(len(dbpedia_111.columns))

92
218


In [144]:
dbpedia_11

Unnamed: 0,Consistent,subject_http://dbpedia.org/ontology/Town,subject_http://dbpedia.org/ontology/Settlement,subject_http://dbpedia.org/ontology/PopulatedPlace,subject_http://dbpedia.org/ontology/Wikidata:Q532,subject_http://dbpedia.org/ontology/Place,http://dbpedia.org/ontology/isPartOf_http://dbpedia.org/ontology/AdministrativeRegion,http://dbpedia.org/ontology/isPartOf_http://dbpedia.org/ontology/Region,http://dbpedia.org/ontology/isPartOf_http://dbpedia.org/ontology/PopulatedPlace,http://dbpedia.org/ontology/isPartOf_http://dbpedia.org/ontology/Wikidata:Q532,...,http://dbpedia.org/ontology/country_http://dbpedia.org/ontology/Country,http://dbpedia.org/ontology/country_http://dbpedia.org/ontology/PopulatedPlace,http://dbpedia.org/ontology/country_http://dbpedia.org/ontology/Wikidata:Q532,http://dbpedia.org/ontology/country_http://dbpedia.org/ontology/Place,subject_http://dbpedia.org/ontology/CareerStation,http://dbpedia.org/ontology/team_http://dbpedia.org/ontology/SoccerClub,http://dbpedia.org/ontology/team_http://dbpedia.org/ontology/SportsTeam,http://dbpedia.org/ontology/team_http://dbpedia.org/ontology/Organisation,http://dbpedia.org/ontology/team_http://dbpedia.org/ontology/Agent,http://dbpedia.org/ontology/isPartOf_http://dbpedia.org/ontology/Settlement
0,1,1,1,1,1,1,1,1,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,1,0,1,1,1,1,0,0,0,0,...,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0


In [155]:
from decisiontree import DecisionTree
from treeconstructor import C45Constructor, CARTConstructor
from ensemble import RFClassification
from collections import Counter
from sklearn.metrics import accuracy_score

dbpedia_11 = pd.read_csv('datasets/dbpedia2014_dolce_hermit_11_prk.csv', sep='\t', index_col=[0]).fillna(0)

feature_cols_11 = list(dbpedia_11.drop('Consistent', axis=1).columns)
label_col = 'Consistent'

dbpedia_11_train = dbpedia_11.head(int(len(dbpedia_11)*0.8)).astype(int)
# dbpedia_11_test = dbpedia_11.tail(int(len(dbpedia_11)*0.2)).astype(int)

c45 = C45Constructor()
cart = CARTConstructor()
rf = RFClassification()

tree = cart.construct_classifier(dbpedia_11_train, feature_cols_11, label_col)
tree.visualise('test.pdf')
# predictions = tree.evaluate_multiple(dbpedia_11_test.drop('Consistent', axis=1)).astype(int)
# print(accuracy_score(dbpedia_11_test['Consistent'].astype(int), predictions))

In [164]:
dbpedia_11 = pd.read_csv('reconstructed_dbpedia_11.csv', sep=',', index_col=[0, 1]).fillna(0)

print(dbpedia_11)

feature_cols_11 = list(dbpedia_11.drop('label', axis=1).columns)
label_col = 'label'

dbpedia_11_train = dbpedia_11.head(int(len(dbpedia_11)*0.8)).astype(int)
# dbpedia_11_test = dbpedia_11.tail(int(len(dbpedia_11)*0.2)).astype(int)

c45 = C45Constructor()
cart = CARTConstructor()
rf = RFClassification()

tree = cart.construct_classifier(dbpedia_11_train, feature_cols_11, label_col)
tree.visualise('test.pdf')

                        label  type_IceHockeyPlayer  \
   abox                                               
0  abox_dbpedia_9.ttl   False                     0   
1  abox_dbpedia_4.ttl    True                     0   
2  abox_dbpedia_2.ttl    True                     0   
3  abox_dbpedia_7.ttl   False                     0   
4  abox_dbpedia_5.ttl    True                     0   
5  abox_dbpedia_0.ttl    True                     0   
6  abox_dbpedia_10.ttl   True                     0   
7  abox_dbpedia_3.ttl    True                     1   
8  abox_dbpedia_1.ttl    True                     0   
9  abox_dbpedia_8.ttl    True                     0   
10 abox_dbpedia_6.ttl   False                     0   

                        location_Mumbai_type_Place  type_Organisation  \
   abox                                                                 
0  abox_dbpedia_9.ttl                            0                  1   
1  abox_dbpedia_4.ttl                            0               