In [1]:
import rdflib
import networkx as nx
import os
import pandas as pd
%matplotlib inline

In [58]:
def construct_graph(abox_file, _format='turtle'):
    """Constructs an `networkX` graph from a rdf file
    @param abox_file: the path of rdf file
    @param _format: in which format is the rdf encoded?
    @returns a `networkX` graph
    """
    graph = nx.DiGraph()
    g=rdflib.Graph()
    g.load(abox_file, format=_format)
    for s,p,o in g:   
        graph.add_edge(s, o, name=p)
        subject_graph = rdflib.Graph()
        # For our subjects we extract type assertions from dbpedia ontology
        subject_graph.load(s)
        for s2,p2,o2 in subject_graph: 
            if p2.encode('utf-8') == 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type' and 'dbpedia.org/ontology' in o2.encode('utf-8'):
                graph.add_edge(s2, o2, name=p2)
                
        # TODO: Can we increase performance by doing the same for our objects?
        object_graph = rdflib.Graph()
        object_graph.load(o)
        for s2,p2,o2 in object_graph:  
            if p2.encode('utf-8') == 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type' and 'dbpedia.org/ontology' in o2.encode('utf-8'):
                graph.add_edge(s2, o2, name=p2)
    return graph

def extract_paths(graph, max_depth=2):
    """Extract all random walks from a graph, having a maximum depth of max_depth
    @param graph: `networkX` graph to extract the paths from
    @param max_depth: maximum length of the extracted paths
    @returns a list of extracted paths; each entry is of the form (entity -> property -> entity -> ...)
    """
    paths = []
    for node in graph.nodes_iter():
        if graph.out_degree(node):
            for path in (nx.all_simple_paths(graph, node, target, max_depth) for target in graph.nodes_iter()):
                path = list(path)
                if path:
                    path=list(path)[0]
                    new_path=[]
                    for i in range(len(path)-1):
                        new_path.append(graph[path[i]][path[i+1]]['name'])
                        new_path.append(path[i+1])
                    paths.append(tuple(new_path))
    return set(paths)

def find_path(graph, path):
    """Can we find a certain path in the given graph?
    @param graph: `networkX` graph to locate the path in
    @param path: the path that needs to be located
    @returns a boolean, whether or not the path was found
    """
    current_node = graph
    for i in range(1, len(path), 2):
        if not path[i] in current_node:
            return False
    return True

def path_to_string(path):
    string = ''
    for i in range(len(path)):
        string += str(path[i].split('/')[-1].split('#')[-1]) + '_'
#     string += path[-1].split('/')[-1]
    return string[:-1]

In [136]:
# Parse all triples from file and put them in NetworkX DiGraph
graph = construct_graph('lincoln-abox')

# Extract all random walks from the graph
paths = extract_paths(graph)
print(paths)
# Can we find one of our paths back in our graph?    
for test_path in paths:
    if find_path(graph, test_path):
        print('found')
    else:
        print('not found')

http://dbpedia.org/resource/Abraham_Lincoln
http://dbpedia.org/ontology/OfficeHolder
http://dbpedia.org/ontology/TimePeriod
http://dbpedia.org/resource/Abraham_Lincoln__1
http://dbpedia.org/ontology/Agent
http://dbpedia.org/ontology/Person
set([(rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'), rdflib.term.URIRef(u'http://dbpedia.org/ontology/TimePeriod')), (rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'), rdflib.term.URIRef(u'http://dbpedia.org/ontology/Agent')), (rdflib.term.URIRef(u'http://dbpedia.org/ontology/termPeriod'), rdflib.term.URIRef(u'http://dbpedia.org/resource/Abraham_Lincoln__1'), rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'), rdflib.term.URIRef(u'http://dbpedia.org/ontology/TimePeriod')), (rdflib.term.URIRef(u'http://dbpedia.org/ontology/termPeriod'), rdflib.term.URIRef(u'http://dbpedia.org/resource/Abraham_Lincoln__1')), (rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'), rdflib

In [133]:
def extract_data(graph):
    new_graph=rdflib.Graph()
    for s,p,o in g:
        new_graph.add((s,p,o))
        subject_graph = rdflib.Graph()
        # For our subjects we extract type assertions from dbpedia ontology
        subject_graph.load(s)
        for s2,p2,o2 in subject_graph: 
            if p2.encode('utf-8') == 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type' and 'dbpedia.org/ontology' in o2.encode('utf-8'):
                new_graph.add((s2,p2,o2))
        obj_graph = rdflib.Graph()
        # For our subjects we extract type assertions from dbpedia ontology
        obj_graph.load(o)
        for s2,p2,o2 in obj_graph: 
            if p2.encode('utf-8') == 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type' and 'dbpedia.org/ontology' in o2.encode('utf-8'):
                new_graph.add((s2,p2,o2))
        
    return new_graph

def gen_paths(graph,node,depth,max_depth,all_paths,path):
    #print('current depth:',depth,' max depth: ',max_depth)
    #print('node: ', node)
    if depth<max_depth:
        if  'dbpedia.org/ontology' in node.encode('utf-8') or 'dbpedia.org/resource' in node.encode('utf-8'):
            for p,o in graph.predicate_objects(node):               
                if p.encode('utf-8') == 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type' and 'dbpedia.org/ontology' in o.encode('utf-8'):
                    if len(path) == 0:
                        #print('  found new subj: ', 'subj_'+str(o))
                        all_paths.append('subj_'+str(o))
                    else:
                        new_path=path+'_'+str(o)
                        #print('  found new extended type: ',new_path)
                        all_paths.append(new_path)
                else:
                    #print('    extending: ',path+'_'+p)
                    gen_paths(graph,o,depth+1,max_depth,all_paths,path+'_'+str(p))
    else:
        all_paths.append(path)
        

In [138]:
#version pieter
all_paths = []
for abox in os.listdir('aboxes'):
    file = 'aboxes'+os.sep+abox
    g=rdflib.Graph()
    g.load(file, format='turtle')
    ext_data = extract_data(g)

    for s,p,o in g:
        gen_paths(ext_data,s,0,2,all_paths,'')
all_paths = set(all_paths)
print(all_paths)
print(len(all_paths))

set(['_http://dbpedia.org/ontology/spokenIn_http://dbpedia.org/ontology/Country', 'subj_http://dbpedia.org/ontology/PopulatedPlace', '_http://dbpedia.org/ontology/doctoralAdvisor_http://dbpedia.org/ontology/Scientist', 'subj_http://dbpedia.org/ontology/Satellite', '_http://dbpedia.org/ontology/headquarter_http://dbpedia.org/ontology/Settlement', '_http://dbpedia.org/ontology/occupation_http://dbpedia.org/ontology/PersonFunction', 'subj_http://dbpedia.org/ontology/SocietalEvent', '_http://dbpedia.org/ontology/deathPlace_http://dbpedia.org/ontology/AdministrativeRegion', '_http://dbpedia.org/ontology/launchPad_http://dbpedia.org/ontology/Location', '_http://dbpedia.org/ontology/spokenIn_http://dbpedia.org/ontology/Place', 'subj_http://dbpedia.org/ontology/Weapon', '_http://dbpedia.org/ontology/leaderName_http://dbpedia.org/ontology/Agent', 'subj_http://dbpedia.org/ontology/Monarch', 'subj_http://dbpedia.org/ontology/SportsTeamMember', '_http://dbpedia.org/ontology/notableWork_http://dbpe

In [None]:
#version gilles
all_paths = []
print(os.sep)
for abox in os.listdir('aboxes'):
    graph = construct_graph('aboxes'+os.sep+abox)
    print('aboxes'+os.sep+abox)
    new_paths = extract_paths(graph, max_depth=2)
    print(new_paths)
    all_paths += new_paths
    
all_paths = set(all_paths)    
print(len(set(all_paths)))

In [139]:
dbpedia_111 = pd.read_csv('datasets/dbpedia2014_dolce_hermit_111_rpk.csv', sep='\t').fillna(0)

In [None]:
#version gilles
vectors = []
for abox in os.listdir('aboxes'):
    graph = construct_graph('aboxes'+os.sep+abox)
    label = dbpedia_111[dbpedia_111['Statement'] == open('aboxes'+os.sep+abox).read()].iloc[0, :]['Consistent']
    vector = [abox, label]
    for path in all_paths:
        if find_path(graph, path):
            vector.append( 1 )
        else:
            vector.append( 0 )
    vectors.append(vector)
df = pd.DataFrame(vectors, columns=['abox', 'label']+[path_to_string(path) for path in all_paths])
df.to_csv('reconstructed_dbpedia_111.csv', index=False)

In [152]:
#version pieter
vectors = []
for abox in os.listdir('aboxes'):
    label = dbpedia_111[dbpedia_111['Statement'] == open('aboxes'+os.sep+abox).read()].iloc[0, :]['Consistent']
    vector = [abox, label]
    local_paths = []
    g=rdflib.Graph()
    g.load('aboxes'+os.sep+abox, format='turtle')
    ext_data = extract_data(g)

    for s,p,o in g:
        gen_paths(ext_data,s,0,2,local_paths,'')
    for path in all_paths:
        if path in local_paths:
            vector.append( 1 )
        else:
            vector.append( 0 )
    vectors.append(vector)
df = pd.DataFrame(vectors, columns=['abox', 'label']+[path_to_string(path) for path in all_paths])
df.to_csv('reconstructed_dbpedia_111.csv', index=False)

In [153]:
print(len(df.columns[list(df.sum() > 1)]))
print(len(df.columns))
print(df.columns)

81
208
Index([u'abox', u'label',
       u'__h_t_t_p_:___d_b_p_e_d_i_a_._o_r_g__o_n_t_o_l_o_g_y__s_p_o_k_e_n_I_n___h_t_t_p_:___d_b_p_e_d_i_a_._o_r_g__o_n_t_o_l_o_g_y__C_o_u_n_t_r_y',
       u's_u_b_j___h_t_t_p_:___d_b_p_e_d_i_a_._o_r_g__o_n_t_o_l_o_g_y__P_o_p_u_l_a_t_e_d_P_l_a_c_e',
       u'__h_t_t_p_:___d_b_p_e_d_i_a_._o_r_g__o_n_t_o_l_o_g_y__d_o_c_t_o_r_a_l_A_d_v_i_s_o_r___h_t_t_p_:___d_b_p_e_d_i_a_._o_r_g__o_n_t_o_l_o_g_y__S_c_i_e_n_t_i_s_t',
       u's_u_b_j___h_t_t_p_:___d_b_p_e_d_i_a_._o_r_g__o_n_t_o_l_o_g_y__S_a_t_e_l_l_i_t_e',
       u'__h_t_t_p_:___d_b_p_e_d_i_a_._o_r_g__o_n_t_o_l_o_g_y__h_e_a_d_q_u_a_r_t_e_r___h_t_t_p_:___d_b_p_e_d_i_a_._o_r_g__o_n_t_o_l_o_g_y__S_e_t_t_l_e_m_e_n_t',
       u'__h_t_t_p_:___d_b_p_e_d_i_a_._o_r_g__o_n_t_o_l_o_g_y__o_c_c_u_p_a_t_i_o_n___h_t_t_p_:___d_b_p_e_d_i_a_._o_r_g__o_n_t_o_l_o_g_y__P_e_r_s_o_n_F_u_n_c_t_i_o_n',
       u's_u_b_j___h_t_t_p_:___d_b_p_e_d_i_a_._o_r_g__o_n_t_o_l_o_g_y__S_o_c_i_e_t_a_l_E_v_e_n_t',
       u'__h_t_t_p_:___d_b_p_e_d

In [29]:
print(len(dbpedia_111.columns[list(dbpedia_111.sum() > 1)]))
print(len(dbpedia_111.columns))

92
218


In [154]:
print(dbpedia_111.columns)

Index([u'Statement', u'Consistent',
       u'subject_http://dbpedia.org/ontology/OfficeHolder',
       u'subject_http://dbpedia.org/ontology/Person',
       u'subject_http://dbpedia.org/ontology/Agent',
       u'http://dbpedia.org/ontology/termPeriod_http://dbpedia.org/ontology/TimePeriod',
       u'subject_http://dbpedia.org/ontology/Country',
       u'subject_http://dbpedia.org/ontology/PopulatedPlace',
       u'subject_http://dbpedia.org/ontology/Wikidata:Q532',
       u'subject_http://dbpedia.org/ontology/Place',
       ...
       u'http://dbpedia.org/ontology/child_http://dbpedia.org/ontology/MilitaryPerson',
       u'http://dbpedia.org/ontology/child_http://dbpedia.org/ontology/Person',
       u'http://dbpedia.org/ontology/child_http://dbpedia.org/ontology/Agent',
       u'http://dbpedia.org/ontology/parent_http://dbpedia.org/ontology/BritishRoyalty',
       u'http://dbpedia.org/ontology/parent_http://dbpedia.org/ontology/Royalty',
       u'http://dbpedia.org/ontology/parent_http

In [31]:
dbpedia_111

Unnamed: 0,Statement,Consistent,subject_http://dbpedia.org/ontology/OfficeHolder,subject_http://dbpedia.org/ontology/Person,subject_http://dbpedia.org/ontology/Agent,http://dbpedia.org/ontology/termPeriod_http://dbpedia.org/ontology/TimePeriod,subject_http://dbpedia.org/ontology/Country,subject_http://dbpedia.org/ontology/PopulatedPlace,subject_http://dbpedia.org/ontology/Wikidata:Q532,subject_http://dbpedia.org/ontology/Place,...,http://dbpedia.org/ontology/child_http://dbpedia.org/ontology/MilitaryPerson,http://dbpedia.org/ontology/child_http://dbpedia.org/ontology/Person,http://dbpedia.org/ontology/child_http://dbpedia.org/ontology/Agent,http://dbpedia.org/ontology/parent_http://dbpedia.org/ontology/BritishRoyalty,http://dbpedia.org/ontology/parent_http://dbpedia.org/ontology/Royalty,http://dbpedia.org/ontology/parent_http://dbpedia.org/ontology/Person,http://dbpedia.org/ontology/parent_http://dbpedia.org/ontology/Agent,subject_http://dbpedia.org/ontology/Insect,subject_http://dbpedia.org/ontology/Manga,subject_http://dbpedia.org/ontology/Comics
0,<http://dbpedia.org/resource/Abraham_Lincoln> ...,True,1,1,1,1,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,<http://dbpedia.org/resource/Algeria> <http://...,True,0,0,0,0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,<http://dbpedia.org/resource/Aldous_Huxley> <h...,True,0,1,1,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,<http://dbpedia.org/resource/A._A._Milne> <htt...,True,0,1,1,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,<http://dbpedia.org/resource/Apiaceae> <http:/...,False,0,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,<http://dbpedia.org/resource/Albert_Camus> <ht...,True,0,1,1,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,<http://dbpedia.org/resource/Allen_Ginsberg> <...,True,0,1,1,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,<http://dbpedia.org/resource/Alexis_Carrel> <h...,True,0,1,1,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,<http://dbpedia.org/resource/Alan_Garner> <htt...,True,0,1,1,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,<http://dbpedia.org/resource/Andr%C3%A9-Marie_...,True,0,1,1,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [155]:
from decisiontree import DecisionTree
from treeconstructor import C45Constructor, CARTConstructor
from ensemble import RFClassification
from collections import Counter
from sklearn.metrics import accuracy_score

dbpedia_11 = pd.read_csv('datasets/dbpedia2014_dolce_hermit_11_prk.csv', sep='\t', index_col=[0]).fillna(0)

feature_cols_11 = list(dbpedia_11.drop('Consistent', axis=1).columns)
label_col = 'Consistent'

dbpedia_11_train = dbpedia_11.head(int(len(dbpedia_11)*0.8)).astype(int)
# dbpedia_11_test = dbpedia_11.tail(int(len(dbpedia_11)*0.2)).astype(int)

c45 = C45Constructor()
cart = CARTConstructor()
rf = RFClassification()

tree = cart.construct_classifier(dbpedia_11_train, feature_cols_11, label_col)
tree.visualise('test.pdf')
# predictions = tree.evaluate_multiple(dbpedia_11_test.drop('Consistent', axis=1)).astype(int)
# print(accuracy_score(dbpedia_11_test['Consistent'].astype(int), predictions))



In [156]:
dbpedia_11 = pd.read_csv('reconstructed_dbpedia_11.csv', sep=',', index_col=[0, 1]).fillna(0)

print(dbpedia_11)

feature_cols_11 = list(dbpedia_11.drop('label', axis=1).columns)
label_col = 'label'

dbpedia_11_train = dbpedia_11.head(int(len(dbpedia_11)*0.8)).astype(int)
# dbpedia_11_test = dbpedia_11.tail(int(len(dbpedia_11)*0.2)).astype(int)

c45 = C45Constructor()
cart = CARTConstructor()
rf = RFClassification()

tree = cart.construct_classifier(dbpedia_11_train, feature_cols_11, label_col)
tree.visualise('test.pdf')

                        label  type_IceHockeyPlayer  \
   abox                                               
0  abox_dbpedia_9.ttl   False                     0   
1  abox_dbpedia_4.ttl    True                     0   
2  abox_dbpedia_2.ttl    True                     0   
3  abox_dbpedia_7.ttl   False                     0   
4  abox_dbpedia_5.ttl    True                     0   
5  abox_dbpedia_0.ttl    True                     0   
6  abox_dbpedia_10.ttl   True                     0   
7  abox_dbpedia_3.ttl    True                     1   
8  abox_dbpedia_1.ttl    True                     0   
9  abox_dbpedia_8.ttl    True                     0   
10 abox_dbpedia_6.ttl   False                     0   

                        location_Mumbai_type_Place  type_Organisation  \
   abox                                                                 
0  abox_dbpedia_9.ttl                            0                  1   
1  abox_dbpedia_4.ttl                            0               