In [None]:
import datetime
import json
import os
import string
import sys
import time
import traceback

import azure.cosmos.cosmos_client as cosmos_client
import requests
from gremlin_python.driver import client, serializer
from gremlin_python.driver import client as gremlin_client

from helpers import keys
from helpers import nlp_helper

In [None]:
# Settings for the Cosmos DB SQL API account; the endpoint and key come from
# the project's keys helper.
config = {
    'ENDPOINT': keys.cosmos_uri,
    'PRIMARYKEY': keys.cosmos_primary_key,
}

# Names of the database and of the three containers created later on.
config.update({
    'DATABASE': 'NetOwl',
    'CONTAINER': 'Entities',
    'LINK-CONTAINER': 'Links',
    'EVENT-CONTAINER': 'Events',
})

In [None]:
# Locations used by the monitor loop. Each one can be overridden through an
# environment variable so the notebook is not tied to a single workstation;
# when the variable is unset the original hard-coded value is used.

# Folder that is watched for new .htm documents
docs_path = os.environ.get(
    'NETOWL_DOCS_PATH', r'C:\Users\jame9353\Box Sync\Data\Early Bird')

# Folder passed to nlp_helper.netowl_curl and read back for the JSON output
json_out_dir = os.environ.get('NETOWL_JSON_DIR', r'C:\Data\json')

# GeoEvent Server REST receiver that geo entities are posted to
geoevent_url = os.environ.get(
    'NETOWL_GEOEVENT_URL',
    r'https://ge-1.eastus.cloudapp.azure.com:6143/geoevent/rest/receiver/netowl-geoentities-in')

# Extension appended to the document file name to locate its JSON output
out_ext = ".json"

In [None]:
print("Connecting to Cosmos DB Graph Client...")

# The account key must not live in the notebook. It is looked up on the keys
# helper first and then in the environment.
# NOTE(review): `cosmos_graph_key` / `COSMOS_GRAPH_KEY` are names chosen here;
# add one of them to your setup. The key that used to be embedded in this cell
# has been exposed in source and should be rotated.
graph_key = getattr(keys, 'cosmos_graph_key', None) or os.environ.get('COSMOS_GRAPH_KEY')
if not graph_key:
    raise RuntimeError(
        "No Cosmos DB graph key configured: set keys.cosmos_graph_key or the "
        "COSMOS_GRAPH_KEY environment variable")

# `gremlin_client` is the gremlin_python driver module. The plain name `client`
# is rebound to the Cosmos SQL client in a later cell, so using the alias keeps
# this cell re-runnable.
graph_client = gremlin_client.Client(
    'wss://pilot-graph.gremlin.cosmosdb.azure.com:443/', 'g',
    username="/dbs/NetOwl/colls/Links",
    password=graph_key,
    message_serializer=serializer.GraphSONMessageSerializer()
)
print("Successfully connected to Cosmos DB Graph Client")

In [None]:
# Connect to the Cosmos DB SQL API, then create the database and the three
# containers (entities, links, events) named in `config`.
# NOTE(review): every step below is an unconditional Create* call; presumably
# these fail if the database/containers already exist - confirm before re-running.

print("Connecting to Cosmos DB SQL API...")
client = cosmos_client.CosmosClient(
    url_connection=config['ENDPOINT'],
    auth={'masterKey': config['PRIMARYKEY']}
)

print("Creating Database...")
db = client.CreateDatabase({'id': config['DATABASE']})

# Options passed unchanged to every CreateContainer call
options = {'offerThroughput': 400}

# Entities
container_definition = {'id': config['CONTAINER']}
# Links
link_container_definition = {'id': config['LINK-CONTAINER']}
# Events
event_container_definition = {'id': config['EVENT-CONTAINER']}

print("Creating {0} container...".format(config['CONTAINER']))
container = client.CreateContainer(db['_self'], container_definition, options)

print("Creating {0} container...".format(config['LINK-CONTAINER']))
link_container = client.CreateContainer(db['_self'], link_container_definition, options)

print("Creating {0} container...".format(config['EVENT-CONTAINER']))
event_container = client.CreateContainer(db['_self'], event_container_definition, options)

In [None]:
numchars = 100  # number of characters to retrieve for head/tail
POLL_INTERVAL_SECONDS = 1  # pause between two scans of docs_path
MAX_DOCUMENTS = 5  # stop monitoring once this many documents were processed

# Maps a NetOwl ontology to the (type, subtype) pair set on geo entities for
# symbology. Ontologies that are not listed use DEFAULT_SYMBOLOGY.
DEFAULT_SYMBOLOGY = ("placename", "unknown")
GEO_SYMBOLOGY = {
    "entity:place:city": ("placename", "city"),
    "entity:place:country": ("placename", "country"),
    "entity:place:province": ("placename", "province"),
    "entity:place:continent": ("placename", "continent"),
    "entity:numeric:coordinate:mgrs": ("coordinate", "MGRS"),
    "entity:numeric:coordinate:latlong": ("coordinate", "latlong"),
    "entity:address:mail": ("address", "mail"),
    "entity:place:other": ("placename", "descriptor"),
    "entity:place:landform": ("placename", "landform"),
    "entity:organization:facility": ("placename", "facility"),
    "entity:place:water": ("placename", "water"),
    "entity:place:county": ("placename", "county"),
}


def _parse_link_refs(entity, rdfvalue):
    """Return (refrels, linkdescs) built from an entity's 'link-ref' list.

    refrels holds the 'idref' of every link reference. linkdescs gets one
    description per reference that carries a 'role-type': a sentence naming
    the role for "source" references, a fixed placeholder otherwise.
    """
    refrels = []
    linkdescs = []
    for ref in entity['link-ref']:
        refrels.append(ref['idref'])  # keep these - all references
        if 'role-type' not in ref:
            continue
        if ref['role-type'] == "source":
            linkdescs.append(rdfvalue + " is a " + ref['role'] + " in "
                             + ref['entity-arg'][0]['value'])
        else:
            linkdescs.append("This item has parent links but no children")
    return refrels, linkdescs


def _process_entity(e, doc_prefix, filename, uniquets, bigstring):
    """Store one NetOwl entity in the Entities container.

    Geo entities (a 'geodetic' entry without 'entity-ref', or a mail address
    that is geocoded) are also posted to GeoEvent. Returns (rdfobj, is_geo).
    All link/head/tail state is local, so nothing carries over from the
    previously processed entity.
    """
    rdfvalue = nlp_helper.cleanup_text(e['value'])  # value (ie name)
    rdfid = doc_prefix + "-" + e['id']  # unique to each entity
    ontology = e['ontology']

    # link references, parsed for every entity that has them
    haslinks = 'link-ref' in e
    refrels, linkdescs = _parse_link_refs(e, rdfvalue) if haslinks else ([], [])

    # decide which type of obj to make - geo or non-geo
    isGeo = False
    lat = longg = None
    if 'geodetic' in e and 'entity-ref' not in e:
        # entities carrying 'entity-ref' are already plotted -> stay non-geo
        lat = float(e['geodetic']['latitude'])
        longg = float(e['geodetic']['longitude'])
        isGeo = True

    if ontology == "entity:address:mail":
        # NOTE(review): the original called a bare, undefined geocode_address();
        # assumed to live in nlp_helper like the other former `nof` helpers - confirm.
        location = nlp_helper.geocode_address(e['value'])  # returns x,y
        lat = location['y']
        longg = location['x']
        isGeo = True

    # set up head and tail (empty strings when the mention has none)
    newhead = ""
    newtail = ""
    if 'entity-mention' in e:
        em = e['entity-mention'][0]
        if 'head' in em:
            newhead = nlp_helper.get_head(bigstring, int(em['head']), numchars)
        if 'tail' in em:
            newtail = nlp_helper.get_tail(bigstring, int(em['tail']), numchars)

    if isGeo:
        if haslinks:
            rdfobj = nlp_helper.RDFitemGeo(rdfid, rdfvalue, longg, lat, uniquets,
                                           filename, refrels)
            ld = str(linkdescs)
            if len(ld) > 255:
                ld = ld[:254]  # shorten long ones
            rdfobj.set_link_details(ld)
        else:
            rdfobj = nlp_helper.RDFitemGeo(rdfid, rdfvalue, longg, lat, uniquets, filename)
            rdfobj.set_link_details("No links for this point")

        # set type for symbology
        rdftype, rdfsubtype = GEO_SYMBOLOGY.get(ontology, DEFAULT_SYMBOLOGY)
        rdfobj.set_type(rdftype)
        rdfobj.set_subtype(rdfsubtype)

        rdfobj.set_head(newhead)
        rdfobj.set_tail(newtail)
        item = rdfobj.toJSON()
        client.CreateItem(container['_self'], {
            "head": rdfobj.head,
            "id": rdfobj.id,
            "lat": rdfobj.lat,
            "linkdetails": rdfobj.linkdetails,
            "links": rdfobj.links,
            "long": rdfobj.long,
            "orgdoc": rdfobj.orgdoc,
            "subtype": rdfobj.subtype,
            "tail": rdfobj.tail,
            "timest": rdfobj.timest,
            "type": rdfobj.type,
            "value": rdfobj.value
        })
        nlp_helper.post_to_geoevent(item, geoevent_url)
    else:  # not geo
        if haslinks:
            rdfobj = nlp_helper.RDFitem(rdfid, rdfvalue, uniquets, filename, ontology, refrels)
        else:
            rdfobj = nlp_helper.RDFitem(rdfid, rdfvalue, uniquets, filename, ontology)

        # stored for linked and unlinked entities alike
        rdfobj.set_head(newhead)
        rdfobj.set_tail(newtail)
        client.CreateItem(container['_self'], {
            "value": rdfobj.value,
            "links": rdfobj.links,
            "orgdoc": rdfobj.orgdoc,
            "id": rdfobj.id,
            "type": rdfobj.type,
            "head": rdfobj.head,
            "tail": rdfobj.tail
        })

    return rdfobj, isGeo


def _process_link(link, doc_prefix, uniquets):
    """Store one NetOwl link in the Links container and return the link object.

    Returns None (nothing stored) when the link does not carry two
    'entity-arg' entries, since both endpoints are needed.
    """
    args = link.get('entity-arg', [])
    if len(args) < 2:
        return None
    source, target = args[0], args[1]

    linkobj = nlp_helper.RDFlinkItem(
        doc_prefix + "-" + link['id'],
        doc_prefix + "-" + source['idref'],
        doc_prefix + "-" + target['idref'],
        source['value'], target['value'],
        source['role'], target['role'],
        source['role-type'], target['role-type'],
        uniquets)

    client.CreateItem(link_container['_self'], {
        "linkid": linkobj.linkid,
        "fromid": linkobj.fromid,
        "toid": linkobj.toid,
        "fromvalue": linkobj.fromvalue,
        "tovalue": linkobj.tovalue,
        "fromrole": linkobj.fromrole,
        "torole": linkobj.torole,
        "fromroletype": linkobj.fromroletype,
        "toroletype": linkobj.toroletype
    })
    return linkobj


def _process_event(event, doc_prefix, filename, uniquets):
    """Store one NetOwl event in the Events container and return the event object.

    The from/to fields come from the first and second 'entity-arg'; a missing
    argument leaves the corresponding fields as None.
    """
    fromid = fromvalue = fromrole = None
    toid = tovalue = torole = None

    args = event.get('entity-arg', [])
    if args:
        fromid = doc_prefix + "-" + args[0]['idref']
        fromvalue = args[0]['value']
        fromrole = args[0]['role']
    if len(args) > 1:
        toid = doc_prefix + "-" + args[1]['idref']
        tovalue = args[1]['value']
        torole = args[1]['role']

    eventobj = nlp_helper.RDFeventItem(event['value'], event['id'], fromid, toid,
                                       fromvalue, tovalue, fromrole,
                                       torole, filename, uniquets)

    client.CreateItem(event_container['_self'], {
        "eventvalue": eventobj.eventvalue,
        "eventid": eventobj.eventid,
        "fromid": eventobj.fromid,
        "toid": eventobj.toid,
        "fromvalue": eventobj.fromvalue,
        "tovalue": eventobj.tovalue,
        "fromrole": eventobj.fromrole,
        "torole": eventobj.torole,
        "orgdoc": eventobj.orgdoc
    })
    return eventobj


def process_document(filename):
    """Run one document from docs_path through NetOwl and store the results.

    Returns (rdfobjs, rdfobjsGeo, linkobjs, eventobjs), the lists of stored
    non-geo entities, geo entities, links and events, or None when the NetOwl
    output contains no 'entity' section.

    Graph (Gremlin) insertion of links and events was already disabled in the
    original cell (the insert calls were commented out), so no Gremlin queries
    are built here.
    """
    filepath = os.path.join(docs_path, filename)
    nlp_helper.netowl_curl(filepath, json_out_dir, out_ext, keys.netowl_key)
    outfile = os.path.join(json_out_dir, filename + out_ext)

    with open(outfile, 'r', encoding="utf-8") as file:
        rdfstring = json.load(file)

    uniquets = str(time.time())  # unique time stamp for each doc
    doc = rdfstring['document'][0]  # gets main part
    doc_prefix = filename.split(".")[0]  # prefix that makes ids unique per doc

    bigstring = ""  # keeps track of what was sent
    if 'text' in doc:
        text = doc['text'][0]
        if 'content' in text:
            bigstring = text['content']

    if 'entity' not in doc:
        print("ERROR: Nothing returned from NetOwl, or other unspecified error.")  # NOQA E501
        return None

    rdfobjs = []
    rdfobjsGeo = []
    for entity in doc['entity']:
        rdfobj, is_geo = _process_entity(entity, doc_prefix, filename, uniquets, bigstring)
        (rdfobjsGeo if is_geo else rdfobjs).append(rdfobj)

    linkobjs = []
    for link in doc.get('link', []):
        linkobj = _process_link(link, doc_prefix, uniquets)
        if linkobj is not None:
            linkobjs.append(linkobj)

    eventobjs = []
    for event in doc.get('event', []):
        eventobjs.append(_process_event(event, doc_prefix, filename, uniquets))

    return rdfobjs, rdfobjsGeo, linkobjs, eventobjs


# Watch docs_path for new documents, rescanning every POLL_INTERVAL_SECONDS
before = {}
count = 0
errors = 0

print("Beginning monitor of " + str(docs_path) + " at " + str(datetime.datetime.now()))

while True:

    # Compare the current folder contents with the previous scan and list adds and removes
    after = dict.fromkeys(os.listdir(docs_path))
    added = [f for f in after if f not in before]
    removed = [f for f in before if f not in after]

    if added:
        print("Added: ", ", ".join(added))
    if removed:
        print("Removed: ", ", ".join(removed))
    before = after

    for filename in added:
        if not filename.endswith(".htm"):
            continue

        print("Processing " + filename + " at " + str(datetime.datetime.now()))
        start = time.time()

        result = process_document(filename)
        if result is None:
            # skip only this document; the other new files are still processed
            errors += 1
            continue
        rdfobjs, rdfobjsGeo, linkobjs, eventobjs = result

        end = time.time()
        process_time = end - start
        print("Non-Geospatial Entities Found: " + str(len(rdfobjs)))
        print("Geospatial Entities Found: " + str(len(rdfobjsGeo)))
        print("Links Found: " + str(len(linkobjs)))
        print("Events Found: " + str(len(eventobjs)))
        print("Process took {0} seconds".format(str(process_time)))
        count += 1

    if count >= MAX_DOCUMENTS:
        print("Exiting")
        break

    # wait before the next scan instead of spinning on os.listdir
    time.sleep(POLL_INTERVAL_SECONDS)