In [None]:
##import necessary modules

%matplotlib inline

import os
import csv
import json
import rdflib
import arcpy
import glob
import requests
import visJS2jupyter.visJS_module
import zipfile
import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd

from rdflib import Graph
from gastrodon import LocalEndpoint, one, QName
from arcgis.gis import GIS

##Variables

#Disabled alternative input path, kept for reference.
#sourcePath = r'C:\Users\jame9353\Box Sync\Data\NetOwl\esri_all_out2'
#Folder that is walked to find the documents submitted to NetOwl.
docsPath = r'C:\Users\jame9353\Documents\GitHub\RDFtoNeo4J\Demo\Sample Documents'
#Folder where netowlCurl writes its output and from which the RDF files are parsed.
rdfOutDir = r'C:\Users\jame9353\Documents\GitHub\RDFtoNeo4J\Demo\Data'
#Extension passed to netowlCurl and appended to each output file name.
rdfOutExt = ".rdf"
#Folder that receives the CSV exports (triples.csv, predicates.csv).
fileOutDir = r'C:\Users\jame9353\Documents\GitHub\RDFtoNeo4J\Demo\outData'
#NOTE(review): the paths above are absolute and machine-specific; adjust before running elsewhere.

#Makes a connection to ArcGIS Online
gis = GIS("https://esrifederal.maps.arcgis.com", client_id = "Sama2eyhY8UFPwQb")

In [None]:
#Defines a function that sends a single document to the NetOwl API and
# saves the response to disk.
#The function checks the document's file extension and sets the
# Content-Type header of the POST request accordingly.
#Function has three inputs:
#    1.  inFile:  This is the file that will be passed to the NetOwl API
#    2.  outPath: Path where the output file will be saved
#    3.  outExtension:  The extension appended to the saved file's name (".rdf", etc.)

def netowlCurl(inFile, outPath, outExtension):
    """Send one document to the NetOwl API and save the response text to disk.

    The Content-Type header is chosen from the file extension (.txt,
    .html/.htm, .pdf, .docx, matched case-insensitively); any other extension
    is sent without a Content-Type header. The response body is written to
    ``<outPath>/<file name of inFile><outExtension>`` as UTF-8 text.

    Args:
        inFile: Path of the document to submit.
        outPath: Directory for the output file; created if it does not exist.
        outExtension: Suffix appended to the document's file name (e.g. ".rdf").

    Returns:
        None. The result is the file written under outPath.
    """
    headers = {
        'accept': 'application/rdf+xml',
        #NOTE(review): the API key is hardcoded; consider loading it from the environment.
        'Authorization': 'netowl ff5e6185-5d63-459b-9765-4ebb905affc8',
    }

    #Extension matching is case-insensitive so that e.g. ".PDF" is recognised too.
    lowerName = inFile.lower()
    if lowerName.endswith(".txt"):
        headers['Content-Type'] = 'text/plain'
        print("Document is a text file...")
    elif lowerName.endswith((".html", ".htm")):
        headers['Content-Type'] = 'text/html'
        print("Document is an HTML file...")
    elif lowerName.endswith(".pdf"):
        headers['Content-Type'] = 'application/pdf'
        print("Document is a PDF...")
    elif lowerName.endswith(".docx"):
        #NOTE(review): 'application/msword' is the legacy .doc type - confirm what the API expects for .docx.
        headers['Content-Type'] = 'application/msword'
        print("Document is a Word Document...")

    params = (
        ('language', 'english'),
    )

    #Read the document as raw bytes and release the handle immediately.
    with open(inFile, 'rb') as docHandle:
        data = docHandle.read()

    #NOTE(review): verify=False disables TLS certificate verification - confirm this is intended.
    response = requests.post('https://api.netowl.com/api/v2/_process', headers=headers, params=params, data=data, verify=False)
    if not response.ok:
        #The body is still written below (best effort), but make the failure visible.
        print("Warning: NetOwl returned HTTP " + str(response.status_code) + " for " + inFile)

    #Name the output after the file that was actually passed in, not a notebook-level variable.
    fileName = os.path.basename(inFile)
    os.makedirs(outPath, exist_ok=True)
    outFile = os.path.join(outPath, fileName + outExtension)
    with open(outFile, "w", encoding="utf-8") as rdfHandle:
        rdfHandle.write(response.text)

In [None]:
#Builds the list of full paths for every file found anywhere under docsPath.
docs = [
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk(docsPath)
    for name in names
]

#Submits each collected document to NetOwl, one call per file.
#The output folder and extension come from the variables cell.

for d in docs:
    netowlCurl(d, rdfOutDir, rdfOutExt)

#Creates a Graph object that accumulates the triples parsed from
# every RDF/XML file in the next step.
g = Graph()

#Walks through the output path of the netowlCurl function and parses every file
# that carries the configured output extension (rdfOutExt).
#The directory list is bound to _dirs so the built-in dir() stays usable in later cells.
for root, _dirs, files in os.walk(rdfOutDir):
    for file in files:
        if file.endswith(rdfOutExt):
            filePath = os.path.join(root, file)
            print("Parsing " + file + "...")
            try:
                g.parse(filePath, format='xml')
            except Exception as ex:
                #Best effort: report which file failed and continue with the rest.
                print("Could not parse " + file + ": " + str(ex))

#Create Local SPARQL Endpoint on graph created in previous step
e = LocalEndpoint(g)

In [None]:
#Shows the size of the parsed graph g as the cell output.
len(g)

In [None]:
#Selects every triple whose subject has both an rdfs:label and an rdf:type,
# returning the triple together with that label and type.
triplesSel=e.select("""
   SELECT ?s ?p ?o ?label ?type{
      ?s ?p ?o .
      ?s rdfs:label ?label .
      ?s rdf:type ?type .
    }
""")
#Writes the result to triples.csv and reads it back into df, which later cells filter.
#The output folder is created first so a fresh run does not fail on a missing directory.
os.makedirs(fileOutDir, exist_ok=True)
tripleList = os.path.join(fileOutDir, 'triples.csv')
triplesSel.to_csv(tripleList, sep=',', encoding='utf-8')
df = pd.read_csv(tripleList)

In [None]:
#Counts the matching triples per subject rdf:type (the query groups by ?type),
# ordered from the largest count down.
#NOTE(review): ?p is selected but not part of GROUP BY - confirm whether grouping by ?p was intended.
properties=e.select("""
   SELECT ?p ?type(COUNT(*) AS ?cnt) {
      ?s ?p ?o .
      ?s rdf:type ?type . 
   } GROUP BY ?type ORDER BY DESC(?cnt)
""")

#Writes the full data frame to a CSV document, creating the output folder if needed
os.makedirs(fileOutDir, exist_ok=True)
file_name = os.path.join(fileOutDir, 'predicates.csv') 
properties.to_csv(file_name, sep=',', encoding='utf-8')

#Shows the top 10 rows; this must be the last expression of the cell to be displayed
properties.head(10)

In [None]:
#Pie chart of the "cnt" values of the first ten rows of properties,
# drawn with matplotlib's xkcd style and without a y-axis label.
top_ten_counts = properties.head(10)["cnt"]
with plt.xkcd():
    pie_axes = top_ten_counts.plot.pie(figsize=(6,6))
    pie_axes.set_ylabel('')

In [None]:
#Sets pandas' display width and maximum column width to 150 characters
for display_option in ("display.width", "display.max_colwidth"):
    pd.set_option(display_option, 150)

#Selects every subject that has a netowl:Entity.Person..name value,
# together with that value, the subject's rdfs:label and its rdf:type
sparql=e.select("""
   SELECT ?s ?o ?label ?type{
      ?s netowl:Entity.Person..name ?o .
      ?s rdfs:label ?label .
      ?s rdf:type ?type .
    }
""")
#Cell output: the result indexed by label (sparql itself is not modified)
sparql.set_index("label")

In [None]:
#Keeps only the rows of df whose predicate column contains the text 'Place'
place_mask = df['p'].str.contains('Place')
address = df[place_mask]

#Imports the filtered rows into ArcGIS, using the label column as the "Address" field
locations = gis.content.import_data(address, {"Address" : "label"})

#Adds the imported layer to the portal content as a
# "Feature Collection" item titled EarlyBird_Articles
layer_definition = dict(locations.layer)
collection_text = json.dumps({"featureCollection": {"layers": [layer_definition]}})
loc_properties = {
    "title":"EarlyBird_Articles",
    "text": collection_text,
    "type":"Feature Collection"}
loc = gis.content.add(loc_properties)

In [None]:
#Creates a map widget from the GIS connection and displays it as the cell output.
#NOTE(review): the name `map` shadows Python's built-in map() for the rest of the notebook.
map = gis.map()
map

In [None]:
#Adds the imported locations to the map widget created in the previous cell.
map.add_layer(locations)

In [None]:
nxG = nx.Graph()

#Queries the SPARQL endpoint for every triple whose subject and object both have an
# rdfs:label, returning the two labels, the predicate and the subject's rdf:type.
#NOTE(review): ?label is selected but never bound in the pattern - presumably an empty column.
edgeGraph=e.select("""
   SELECT ?labelS ?labelO ?p ?label ?type{
      ?s ?p ?o .
      ?s rdfs:label ?labelS .
      ?o rdfs:label ?labelO .
      ?s rdf:type ?type .
    }
""")

#Adds one edge per result row between the first column (subject label) and the second
# (object label); the remaining row values are stored on the edge as typeDict.
#.iloc is used because integer keys on a label-indexed Series are deprecated as
# positional access in recent pandas releases.
for i, elrow in edgeGraph.iterrows():
    nxG.add_edge(elrow.iloc[0], elrow.iloc[1], typeDict=elrow.iloc[1:].to_dict())

nodes = list(nxG.nodes()) # must cast to list to maintain compatibility between nx 1.11 and 2.0
edges = list(nxG.edges())

#Layout positions are scaled by 1000 to give the node coordinates
pos = nx.spring_layout(nxG)

nodes_dict = [{"id":n,
              "x":pos[n][0]*1000,
              "y":pos[n][1]*1000} for n in nodes]

#Edges refer to their endpoints by position in the nodes list
node_map = dict(zip(nodes,range(len(nodes))))

edges_dict = [{"source":node_map[source_node], "target":node_map[target_node],
              "title":'test'} for source_node, target_node in edges]

In [None]:
#Renders the most recently built nodes_dict / edges_dict with visJS2jupyter.
visJS2jupyter.visJS_module.visjs_network(nodes_dict,edges_dict)

In [None]:
import nxviz as nv

In [None]:
#Draws nxG as an nxviz MatrixPlot.
mp = nv.MatrixPlot(nxG)
mp.draw() 

In [None]:
#Draws nxG as an nxviz ArcPlot.
ap = nv.ArcPlot(nxG)
ap.draw() 

In [None]:
#Draws nxG as an nxviz CircosPlot.
cp = nv.CircosPlot(nxG)
cp.draw() 

In [None]:
peopleGraph = nx.Graph()

#Selects every subject that has a netowl:Entity.Person..name value, returning the
# subject's rdfs:label, the subject itself and the name value.
peopleQuery=e.select("""
   SELECT ?label ?s ?o{
      ?s netowl:Entity.Person..name ?o .
      ?s rdfs:label ?label .
    }
""")

#Adds one edge per result row between the first column (label) and the second (?s);
# the remaining row values are stored on the edge as typeDict.
#.iloc is used because integer keys on a label-indexed Series are deprecated as
# positional access in recent pandas releases.
for i, elrow in peopleQuery.iterrows():
    peopleGraph.add_edge(elrow.iloc[0], elrow.iloc[1], typeDict=elrow.iloc[1:].to_dict())

nodes = list(peopleGraph.nodes()) # must cast to list to maintain compatibility between nx 1.11 and 2.0
edges = list(peopleGraph.edges())

#Layout positions are scaled by 1000 to give the node coordinates
pos = nx.spring_layout(peopleGraph)

nodes_dict = [{"id":n,
              "x":pos[n][0]*1000,
              "y":pos[n][1]*1000} for n in nodes]

#Edges refer to their endpoints by position in the nodes list
node_map = dict(zip(nodes,range(len(nodes))))

edges_dict = [{"source":node_map[source_node], "target":node_map[target_node],
              "title":'test'} for source_node, target_node in edges]

In [None]:
#Displays the person query result.
peopleQuery

In [None]:
#Shows the size of peopleGraph as the cell output.
len(peopleGraph)

In [None]:
#Draws peopleGraph as an nxviz CircosPlot.
pcp = nv.CircosPlot(peopleGraph)
pcp.draw() 

In [None]:
#Queries the SPARQL endpoint and generates a list of the edges to be used in the Graph
#Runs the same query text as the edgeGraph cell and keeps the raw result for inspection.
#NOTE(review): ?label is selected but never bound in the pattern - presumably an empty column.
testEdgeGraph=e.select("""
   SELECT ?labelS ?labelO ?p ?label ?type{
      ?s ?p ?o .
      ?s rdfs:label ?labelS .
      ?o rdfs:label ?labelO .
      ?s rdf:type ?type .
    }
""")

In [None]:
#Displays the full query result.
testEdgeGraph