In [9]:
from gremlin_python import statics
from gremlin_python.structure.graph import Graph
from gremlin_python.process.graph_traversal import __
from gremlin_python.process.strategies import *
from gremlin_python.driver.driver_remote_connection import DriverRemoteConnection

In [10]:
graph = Graph()
# Remote connection to the Gremlin Server WebSocket endpoint on localhost:8182;
# the second argument ('g') names the traversal source to use on the server.
connection = DriverRemoteConnection('ws://localhost:8182/gremlin', 'g')
# Call connection.close() when the application shuts down so the open connection is released.
g = graph.traversal().withRemote(connection)
# Reuse 'g' across the application: every cell below issues its traversals through it.

In [11]:
# Start from a fresh graph with no nodes or edges.
# Destructive: this drops every vertex currently stored behind 'g'.
g.V().drop().iterate()

[['V'], ['drop'], ['none'], ['values', '_ipython_canary_method_should_not_exist_'], ['values', '_ipython_canary_method_should_not_exist_']]

# Exercise 1

In [12]:
def getOrCreate(name):
    '''Return the vertex for `name`, creating it if it is not in the graph yet.

    Vertices are stored with both their label and their 'name' property set
    to `name`, so the lookup is has(label, 'name', value) with the same
    string in both places.

    Parameters:
        name: vertex name read from the CSV. The string 'None' (or a real
            None) means "no node / no relationship".

    Returns:
        The existing or newly created vertex, or None when `name` denotes
        "no node".
    '''
    # Treating None as no node/relationship in the graph
    if name is None or name == 'None':
        return None

    # Get-or-create in a single traversal: fold() collects any match into a
    # list, coalesce() unfolds that list when it is non-empty and otherwise
    # falls through to addV(). One round trip, and no window between the
    # existence check and the insert.
    return (g.V().has(name, 'name', name)
            .fold()
            .coalesce(__.unfold(),
                      __.addV(name).property('name', name))
            .next())

In [13]:
# Creating the graph from alerts.csv.
# Every row contributes an 'is' edge from the second column to the first and,
# when the third column is not 'None', an 'is' edge from the third column to
# the first plus a 'knows' edge from the second column to the third.
with open('data/alerts.csv', 'r') as file:
    for line in file:
        line = line.strip()
        # A blank line has no columns to index; skip it instead of raising IndexError
        if not line:
            continue

        # CSV is comma delimited
        columns = line.split(',')

        # Getting vertices corresponding to the columns in the CSV
        v1 = getOrCreate(columns[0])
        v2 = getOrCreate(columns[1])
        v3 = getOrCreate(columns[2])

        # Creating the edge between the first and second column
        g.V(v2).addE('is').to(v1).property('type', 'is').iterate()

        # Some vertices in the third column are None, indicating no relationship.
        # Edges are created only when the third-column vertex is not None.
        # The check is applied only to the third column, but it extends easily
        # to the other columns if they can contain None as well.
        if v3 is not None:
            g.V(v3).addE('is').to(v1).property('type', 'is').iterate()
            g.V(v2).addE('knows').to(v3).property('type', 'knows').iterate()

# Exercise 2

In [14]:
# List of vertices to iterate over: one property map per vertex, fetched once.
# Property values come back wrapped in lists, hence vertex['name'][0] in the cells below.
verticesList = g.V().valueMap().toList()

In [134]:
# The solutions to the four Exercise 2 queries are written to a file named exercise2.txt.


def _vertexByName(name):
    """Start a fresh traversal at the vertex whose label and 'name' are both `name`."""
    # A new traversal is built on every call because chaining further steps
    # extends the same traversal object, so one cannot be shared between queries.
    return g.V().has(name, 'name', name)


def _writeSectionBreak(out):
    """Write the divider (blank lines around 40 asterisks) that separates two queries."""
    out.write("\n\n")
    out.write("*"*40)
    out.write("\n\n")


with open('exercise2.txt', 'w') as file:

    # ---- QUERY 1: in / out / overall degree of every vertex ----
    file.write("QUERY 1:\n")

    for vertex in verticesList:
        nodeName = vertex['name'][0]

        # Getting degree (in)
        inDegree = _vertexByName(nodeName).inE().count().next()

        # Getting degree (out)
        outDegree = _vertexByName(nodeName).outE().count().next()

        # Getting degree (in/out): adding the two counts saves a third
        # round trip for both().count()
        overallDegree = inDegree + outDegree
        file.write("For node {:20s}\n".format(nodeName))
        file.write("In degree      -> {:20s}\n".format(str(inDegree)))
        file.write("Out degree     -> {:20s}\n".format(str(outDegree)))
        file.write("Overall degree -> {:20s}\n".format(str(overallDegree)))

    _writeSectionBreak(file)

    # ---- QUERY 2: longest chain of incoming edges ----
    file.write("QUERY 2:\n")

    # Initializing the max length, the vertex it belongs to, and that vertex's chains
    maximumChainLength = 0
    maximumChainVertex = ''
    maximumChainPaths = []

    for vertex in verticesList:

        nodeName = vertex['name'][0]

        # Every chain reachable by repeatedly walking incoming edges, rendered
        # as labels. simplePath() stops a chain from revisiting an element, so
        # the walk terminates even if the graph contains a cycle.
        chainPaths = (_vertexByName(nodeName)
                      .repeat(__.inE().outV().simplePath())
                      .emit().path().by(__.label()).toList())

        # The chain length counts vertices AND edges on the path, so the
        # relationships are reported along with the vertices. The maximum is
        # taken explicitly instead of trusting the emission order; a vertex
        # with no incoming edges has no chain and gets length 0.
        longestChainLength = max((len(path) for path in chainPaths), default=0)

        # The longest chain length for each node is written
        file.write("For node {:20s}\n".format(nodeName))
        file.write("Longest Chain for this Node -> {}\n".format(longestChainLength))

        # Finding the maximum chain length, and the vertex that chain belongs to
        # (on a tie the later vertex wins)
        if longestChainLength >= maximumChainLength:
            maximumChainLength = longestChainLength
            maximumChainVertex = nodeName
            maximumChainPaths = chainPaths

    file.write("*"*20)
    file.write("\nLongest Chain   -> {}\n".format(maximumChainLength))
    file.write("Longest Path:\n")

    # Several chains of the winning vertex can share the maximum length;
    # each of them is written to the file as a solution
    for path in maximumChainPaths:
        if len(path) == maximumChainLength:
            for node in path:
                file.write("{} ".format(node))
                file.write("<- ")
            file.write("start\n")

    _writeSectionBreak(file)

    # ---- QUERY 3: vertices connected to 'ztf4' ----
    file.write("QUERY 3:\n")

    # dedup() makes a neighbour that is linked through several (parallel)
    # edges count, and print, only once
    connectedVertices = _vertexByName('ztf4').both().dedup().valueMap().toList()
    connectedVerticesCount = len(connectedVertices)
    file.write("Vertices connected to ztf4  -> {}\n".format(connectedVerticesCount))
    file.write("The vertices are ")
    for vertex in connectedVertices:
        file.write("{} ".format(vertex['name'][0]))

    _writeSectionBreak(file)

    # ---- QUERY 4: subgraph around 'unknown' ----
    file.write("QUERY 4:\n")

    # Getting a subgraph in gremlin-python is not as straightforward as in pure
    # Gremlin, where
    #   g.V().has('unknown', 'name', 'unknown').bothE().subgraph('subgraph')
    #    .outV().bothE().subgraph('subgraph').cap('subgraph').next()
    # would do for a simple subgraph retrieval. Submitting that query as a
    # script through the client and reading back the result was considered as
    # well, but is reportedly not recommended; it remains an option if this
    # workaround is not sufficient.
    # Workaround: collect the edges incident to the neighbours of 'unknown'.
    # dedup() is needed because of parallel edges; the chained by() modulators
    # project each edge's properties plus the labels of its IN and OUT vertices.
    subGraph = (_vertexByName('unknown')
                .bothE().otherV().as_('vertex2')
                .bothE().dedup()
                .project('v', 'IN', 'OUT')
                .by(__.valueMap(True))
                .by(__.inV().label().fold())
                .by(__.outV().label().fold())
                .toList())

    file.write("Nodes and edges in subgraph:\n")

    # One line per edge: source label, edge type, destination label
    for node in subGraph:
        file.write("{} {} {}\n".format(node['OUT'][0], node['v']['type'],
                                      node['IN'][0]))