In [1]:
import getpass
import glob
import json
import os
import zipfile

import arcpy
import pandas as pd
import requests
from arcgis.gis import GIS
from gastrodon import LocalEndpoint, one, QName
from rdflib import Graph

INFO:rdflib:RDFLib Version: 4.2.2


In [2]:
# --- Notebook configuration ---------------------------------------------------
# Folder that is walked recursively to collect the input documents.
docsPath = r'C:\Users\jame9353\Documents\GitHub\RDFtoNeo4J\Demo\Early Bird'
# Folder that receives the RDF files returned by NetOwl and the derived CSV/zip files.
rdfOutDir = r'C:\Users\jame9353\Documents\GitHub\RDFtoNeo4J\Demo\Data'
# Prefix used for the titles of the items created on the portal.
outBaseName = 'EarlyBird'
# Despite its name this is the portal URL handed to GIS() below.
# NOTE(review): the URL uses plain http — confirm whether it should be https.
fileOutDir = 'http://esrifederal.maps.arcgis.com'

# Credentials are never stored in the notebook: the user name may be overridden
# with ARCGIS_USERNAME, and the password comes from ARCGIS_PASSWORD or, when
# that variable is unset/empty, from an interactive prompt.
userName = os.environ.get('ARCGIS_USERNAME', 'james_jones_federal')
passWord = os.environ.get('ARCGIS_PASSWORD') or getpass.getpass(
    'ArcGIS password for %s: ' % userName)

gis = GIS(fileOutDir, username=userName, password=passWord)

# Extension appended to each document name when its NetOwl response is saved.
rdfOutExt = ".rdf"

In [3]:
def netowlCurl(inFile, outPath, outExtension, apiKey=None):
    """Send one document to the NetOwl API and save the RDF/XML response.

    Parameters
    ----------
    inFile : str
        Path of the document to process. A Content-Type header is set for
        ``.txt``, ``.pdf`` and ``.docx`` files (matched case-insensitively);
        any other extension is posted without an explicit Content-Type.
    outPath : str
        Directory for the result; created (including parents) if missing.
    outExtension : str
        Suffix appended to the input file's base name, e.g. ``".rdf"``.
    apiKey : str, optional
        NetOwl API key. When omitted, the ``NETOWL_API_KEY`` environment
        variable is used so the key does not have to live in the notebook.

    Returns
    -------
    str
        Path of the file that was written.

    Raises
    ------
    ValueError
        If no API key was passed and ``NETOWL_API_KEY`` is not set.
    requests.HTTPError
        If NetOwl answers with a 4xx/5xx status; nothing is written then.
    """
    if apiKey is None:
        apiKey = os.environ.get('NETOWL_API_KEY')
    if not apiKey:
        raise ValueError(
            "No NetOwl API key: pass apiKey=... or set the NETOWL_API_KEY environment variable")

    headers = {
        'accept': 'application/rdf+xml',
        'Authorization': 'netowl ' + apiKey,
    }

    loweredName = inFile.lower()
    if loweredName.endswith(".txt"):
        headers['Content-Type'] = 'text/text'
        print("Document is a text file...")
    elif loweredName.endswith(".pdf"):
        headers['Content-Type'] = 'application/pdf'
        print("Document is a PDF...")
    elif loweredName.endswith(".docx"):
        headers['Content-Type'] = 'application/msword'
        print("Document is a Word Document...")

    params = (
        ('language', 'english'),
    )

    # Read the payload with a context manager so the handle is always closed.
    with open(inFile, 'rb') as srcHandle:
        data = srcHandle.read()

    # NOTE(review): verify=False disables TLS certificate checking while an API
    # key is being sent; kept as-is because the reason it was added is not
    # visible here — confirm whether it can be removed.
    response = requests.post('https://api.netowl.com/api/v2/_process', headers=headers, params=params, data=data,
                             verify=False)
    # Fail loudly instead of saving an error page as if it were RDF.
    response.raise_for_status()

    os.makedirs(outPath, exist_ok=True)
    # Name the output after the file that was actually processed (the parameter),
    # not after whatever a module-level loop variable happens to hold.
    fileName = os.path.basename(inFile)
    outFile = os.path.join(outPath, fileName + outExtension)
    with open(outFile, "w", encoding="utf-8") as dstHandle:
        dstHandle.write(response.text)
    return outFile

In [4]:
# Full path of every file found anywhere below docsPath (no extension filter).
docs = [
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk(docsPath)
    for name in names
]

In [5]:
# Send every collected document through NetOwl; each call writes one output file into rdfOutDir.
# NOTE(review): keep the loop variable named `d` unless netowlCurl is confirmed not to read a
# module-level `d` — renaming it here could change which file name is used for the output.
for d in docs:
    netowlCurl(d, rdfOutDir, rdfOutExt)

Document is a Word Document...




Document is a Word Document...




Document is a Word Document...




In [6]:
# Empty rdflib graph; the parse loop below adds the triples of every RDF file to it.
g = Graph()

In [7]:
# Load every file under rdfOutDir that ends with rdfOutExt into the graph `g`.
# Loop names avoid `dir` and `file` so the builtin `dir` stays usable in later cells.
for root, _subdirs, files in os.walk(rdfOutDir):
    for rdfName in files:
        if not rdfName.endswith(rdfOutExt):
            continue
        filePath = os.path.join(root, rdfName)
        print("Parsing " + rdfName + "...")
        try:
            g.parse(filePath, format='xml')
        except Exception as ex:
            # Best effort: one unreadable file must not abort the whole load,
            # but report which file failed so it can be re-processed.
            print("Failed to parse " + filePath + ": " + str(ex))

Parsing Coordinates.docx.rdf...
Parsing e20070103_01.htm.rdf...
Parsing e20070103_02.htm.rdf...
Parsing e20070103_03.htm.rdf...
Parsing e20070103_04.htm.rdf...
Parsing e20070103_05.htm.rdf...
Parsing e20070103_06.htm.rdf...
Parsing e20070103_07.htm.rdf...
Parsing Sample Document 2.docx.rdf...
Parsing Sample Document 3.docx.rdf...


In [8]:
# Wrap the graph in a gastrodon LocalEndpoint; later cells query it through e.select(...).
e = LocalEndpoint(g)

In [9]:
# Select every subject that has a NetOwl mail-address name (?o) together with its rdfs:label.
address=e.select("""
   SELECT ?s ?o ?label{
      ?s netowl:Entity.Address.Mail..name ?o .
      ?s rdfs:label ?label .
    }
""")
# Display only: set_index returns a new frame, so `address` itself keeps its original index.
address.set_index("label")

Unnamed: 0_level_0,s,o
label,Unnamed: 1_level_1,Unnamed: 2_level_1
"10824 Cross School Rd, Reston, VA 20191",file:///C:/Users/jame9353/Documents/GitHub/RDF...,"10824 Cross School Rd, Reston, VA 20191"
"8615 Westwood Center Drive, Vienna, VA 22182",file:///C:/Users/jame9353/Documents/GitHub/RDF...,"8615 Westwood Center Drive, Vienna, VA 22182"
"380 New York Street, Redlands, CA 92373",file:///C:/Users/jame9353/Documents/GitHub/RDF...,"380 New York Street, Redlands, CA 92373"
"8605 Westwood Center Drive, Tysons, VA 22182",file:///C:/Users/jame9353/Documents/GitHub/RDF...,"8605 Westwood Center Drive, Tysons, VA 22182"


In [10]:
# Import the address frame into the portal; the next cell reads `locations.layer` from the result.
# NOTE(review): the mapping presumably tells import_data to geocode the "label" column as the
# address — confirm against the arcgis ContentManager.import_data documentation.
locations = gis.content.import_data(address, {"Address" : "label"})

INFO:arcgis._impl.connection:Token expired during post request, fetching a new token and retrying


In [11]:
# Store the imported address layer on the portal as a "Feature Collection" item.
titleAdd = outBaseName + "_Address"
loc_properties = dict(
    title=titleAdd,
    text=json.dumps(dict(featureCollection=dict(layers=[dict(locations.layer)]))),
    type="Feature Collection",
)
loc = gis.content.add(loc_properties)

In [12]:
# Select every triple (?s ?p ?o) whose subject also has an rdfs:label and an rdf:type.
triplesSel=e.select("""
   SELECT ?s ?p ?o ?label ?type{
      ?s ?p ?o .
      ?s rdfs:label ?label .
      ?s rdf:type ?type .
    }
""")

In [13]:
# Write the triples to triples.csv in rdfOutDir and read the file straight back into `df`.
# NOTE(review): the round-trip presumably exists to turn the query result into plain
# string columns for the .str filter in the next cell — confirm before simplifying.
tripleList = os.path.join(rdfOutDir, 'triples.csv')
triplesSel.to_csv(tripleList, sep=',', encoding='utf-8')
df = pd.read_csv(tripleList)

In [14]:
# Keep only the triples whose predicate contains the literal text "Place".
# na=False treats a missing predicate as "no match" instead of producing an NA mask
# (which cannot be used for boolean indexing); regex=False makes the match a plain
# substring test. The name `address` is kept because the next cell imports it.
address = df[df['p'].str.contains('Place', na=False, regex=False)]

In [15]:
# Import the "Place" rows into the portal; this rebinds `locations`, replacing the
# result of the earlier address import.
locations = gis.content.import_data(address, {"Address" : "label"})

In [16]:
# Store the imported place layer on the portal as a "Feature Collection" item.
titlePlace = outBaseName + "_Place"
loc_properties = dict(
    title=titlePlace,
    text=json.dumps(dict(featureCollection=dict(layers=[dict(locations.layer)]))),
    type="Feature Collection",
)
loc = gis.content.add(loc_properties)

In [17]:
# Query every subject that has a NetOwl MGRS coordinate name (?o) together with its
# rdfs:label. This cell only runs the query; the CSV is written in the following cell.
mgrs=e.select("""
   SELECT ?s ?o ?label{
      ?s netowl:Entity.Numeric.Coordinate.Mgrs..name ?o .
      ?s rdfs:label ?label .
    }
""")

In [18]:
# Write the MGRS query result to mgrs_coords.csv in rdfOutDir.
# The frame is written with its existing index and an "o" column, which is the column
# the coordinate-conversion cell reads. (A bare `mgrs.set_index("o")` used to precede
# this; its return value was discarded, so it had no effect on the CSV.)
mgrs_file = os.path.join(rdfOutDir,'mgrs_coords.csv')
mgrs.to_csv(mgrs_file, sep=',', encoding='utf-8')

In [19]:
# Convert the MGRS strings in the CSV (column "o") to decimal-degree points and
# write them to a shapefile in <rdfOutDir>/OutShp.
outShpDir = os.path.join(rdfOutDir, 'OutShp')
outName = outBaseName + 'MGRS.shp'
# makedirs(exist_ok=True) replaces the exists()/mkdir pair: no check-then-create gap,
# and missing parent folders are created as well.
os.makedirs(outShpDir, exist_ok=True)
arcpy.ConvertCoordinateNotation_management(
    in_table=mgrs_file,
    out_featureclass=os.path.join(outShpDir, outName),
    # The same column is passed for x and y, as in the original call.
    x_field="o",
    y_field="o",
    input_coordinate_format="MGRS",
    output_coordinate_format="DD_NUMERIC",
    id_field="",
    spatial_reference="GEOGCS['GCS_WGS_1984',DATUM['D_WGS_1984',SPHEROID['WGS_1984',6378137.0,298.257223563]],PRIMEM['Greenwich',0.0],UNIT['Degree',0.0174532925199433]];-400 -400 1000000000;-100000 10000;-100000 10000;8.98315284119522E-09;0.001;0.001;IsHighPrecision",
    in_coor_system="GEOGCS['GCS_WGS_1984',DATUM['D_WGS_1984',SPHEROID['WGS_1984',6378137.0,298.257223563]],PRIMEM['Greenwich',0.0],UNIT['Degree',0.0174532925199433]]",
    exclude_invalid_records="INCLUDE_INVALID",
)

ValueError: The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

ValueError: The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

ValueError: The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

ValueError: The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

<Result 'C:\\Users\\jame9353\\Documents\\GitHub\\RDFtoNeo4J\\Demo\\Data\\OutShp\\EarlyBirdMGRS.shp'>

In [20]:
# Zip the shapefile created in the previous step.
# `outZip` is a module-level name that zipShapefilesInDir rebinds (via `global`) to the
# path of the last zip it writes; the upload cell then reads it.
# NOTE(review): it is seeded with the portal URL (fileOutDir), so if no .shp is found the
# upload cell would receive that URL instead of a zip path — confirm this is intended.
outZip = fileOutDir

def zipShapefilesInDir(inDir, outDir):
    """Zip every ``*.shp`` found directly in ``inDir`` into ``outDir``.

    One ``<name>.zip`` is produced per shapefile by calling ``zipShapefile``.
    The module-level ``outZip`` is rebound to each zip path in turn, so after
    the call it holds the path of the last archive handled.

    Returns False when ``inDir`` does not exist, True otherwise (also when no
    shapefile was found).
    """
    global outZip

    if not os.path.exists(inDir):
        arcpy.AddMessage("Input directory %s does not exist!" % inDir)
        return False

    if not os.path.exists(outDir):
        arcpy.AddMessage("Creating output directory %s" % outDir)
        os.mkdir(outDir)

    arcpy.AddMessage("Zipping shapefile(s) in folder %s to output folder %s" % (inDir, outDir))

    shpPattern = os.path.join(inDir, "*.shp")
    for shpPath in glob.glob(shpPattern):
        stem, _ext = os.path.splitext(os.path.basename(shpPath))
        outZip = os.path.join(outDir, stem + ".zip")
        zipShapefile(shpPath, outZip)
    return True


def zipShapefile(inShapefile, newZipFN):
    """Zip a shapefile together with its sidecar files.

    Every file that shares the shapefile's path minus its extension
    (``<base>.*``) is added to ``newZipFN`` under its bare file name, except
    files whose extension is ``.zip``. An existing ``newZipFN`` is deleted
    first.

    Parameters
    ----------
    inShapefile : str
        Path of the ``.shp`` file.
    newZipFN : str
        Path of the zip archive to create.

    Returns
    -------
    bool
        False if ``inShapefile`` does not exist or an old archive is still
        present after deletion; True once the archive has been written.
    """
    arcpy.AddMessage('Starting to Zip ' + (inShapefile) + ' to ' + (newZipFN))

    if not os.path.exists(inShapefile):
        arcpy.AddMessage(inShapefile + ' Does Not Exist')
        return False

    if os.path.exists(newZipFN):
        arcpy.AddMessage('Deleting ' + newZipFN)
        os.remove(newZipFN)

    if os.path.exists(newZipFN):
        arcpy.AddMessage('Unable to Delete' + newZipFN)
        return False

    # Build "<base>.*" from the real path: no lower-casing (which finds nothing on
    # case-sensitive filesystems), only the final extension is stripped, and glob
    # metacharacters occurring in the path itself are escaped.
    siblingPattern = glob.escape(os.path.splitext(inShapefile)[0]) + ".*"

    # The context manager closes the archive even if adding a member fails.
    with zipfile.ZipFile(newZipFN, 'w') as zipobj:
        for infile in glob.glob(siblingPattern):
            # The archive being written matches the pattern too when it sits next
            # to the shapefile; skip it and any other .zip.
            if os.path.splitext(infile)[1].lower() != ".zip":
                arcpy.AddMessage("Zipping %s" % (infile))
                zipobj.write(infile, os.path.basename(infile), zipfile.ZIP_DEFLATED)

    return True

# Zip the shapefile(s) in outShpDir into rdfOutDir; as a side effect the module-level
# `outZip` ends up holding the path of the last zip written. The returned flag is displayed.
zipShapefilesInDir(outShpDir, rdfOutDir)

True

In [21]:
# Upload the zipped shapefile (path held in `outZip`) to the portal as a "Shapefile" item
# and publish that item; `mgrsLyr` is whatever publish() returns.
titleMGRS = outBaseName + '_MGRS'
tempItem = gis.content.add({"title":titleMGRS, "type":"Shapefile"}, outZip)
mgrsLyr = tempItem.publish()

In [22]:
# Query every subject that has a NetOwl lat/long coordinate name (?o) together with its
# rdfs:label.
latlong=e.select("""
   SELECT ?s ?o ?label{
      ?s netowl:Entity.Numeric.Coordinate.Latlong..name ?o .
      ?s rdfs:label ?label .
    }
""")
# Display only: set_index returns a new frame, so `latlong` itself keeps its original index.
latlong.set_index("label")

Unnamed: 0_level_0,s,o
label,Unnamed: 1_level_1,Unnamed: 2_level_1
