In [1]:
!pip install py2neo pandas matplotlib



In [2]:
import os

import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
from py2neo import Graph

# Use the FiveThirtyEight matplotlib style for any figures drawn in this notebook.
plt.style.use('fivethirtyeight')
# Render floats in displayed DataFrames with three decimal places.
pd.set_option('display.float_format', lambda x: '%.3f' % x)

import pandas as pd
from collections import Counter
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score

In [13]:
# Connection settings are taken from the environment when present so that real
# credentials never have to be written into the notebook; the fallbacks are the
# notebook's original local-development values.
graph = Graph(
    os.environ.get("NEO4J_URI", "bolt://localhost:7687"),
    auth=(
        os.environ.get("NEO4J_USER", "neo4j"),
        os.environ.get("NEO4J_PASSWORD", "neo"),
    ),
)

In [15]:
# Link every "nAutoV" crash to every "AutoV" crash that is LOCATED_AT the same
# intersection. cr1 and cr2 are the grouping keys of the aggregating WITH, so
# collect(cr1)[0].Year is cr1's own Year and count(*) is the number of matched
# shared-intersection paths for the pair (stored on the relationship as `coint`).
# NOTE(review): the MERGE pattern is undirected, while later cells match these
# relationships as (cr1)-[...]->(cr2); presumably MERGE creates them from cr1
# to cr2 — confirm.
query = """
MATCH (cr1:Crashes)-[:LOCATED_AT]->(int:Intersections)<-[:LOCATED_AT]-(cr2:Crashes)
WHERE cr1.Type = "nAutoV" AND cr2.Type = "AutoV"
WITH cr1, cr2, int
ORDER BY cr1, cr1.Year
WITH cr1, cr2, collect(cr1)[0].Year AS year, count(*) AS coint
MERGE (cr1)-[col:COLOC {Year: year}]-(cr2)
SET col.coint = coint;
"""

# .stats() shows the update counters of the write query.
graph.run(query).stats()

constraints_added: 0
constraints_removed: 0
contained_updates: True
indexes_added: 0
indexes_removed: 0
labels_added: 0
labels_removed: 0
nodes_created: 0
nodes_deleted: 0
properties_set: 520854
relationships_created: 260427
relationships_deleted: 0

In [16]:
# Same pairing of "nAutoV" and "AutoV" crashes at a shared intersection, but
# only for pairs whose cr1 Year is before 2015, stored as COLOC_EARLY
# relationships. These are the relationships the training cells query.
# `coint` again holds the number of matched shared-intersection paths.
query = """
MATCH (cr1:Crashes)-[:LOCATED_AT]->(int:Intersections)<-[:LOCATED_AT]-(cr2:Crashes)
WHERE cr1.Type = "nAutoV" AND cr2.Type = "AutoV"
WITH cr1, cr2, int
ORDER BY cr1, cr1.Year
WITH cr1, cr2, collect(cr1)[0].Year AS year, count(*) AS coint
WHERE year < 2015
MERGE (cr1)-[col:COLOC_EARLY {Year: year}]-(cr2)
SET col.coint = coint;
"""

# .stats() shows the update counters of the write query.
graph.run(query).stats()

constraints_added: 0
constraints_removed: 0
contained_updates: True
indexes_added: 0
indexes_removed: 0
labels_added: 0
labels_removed: 0
nodes_created: 0
nodes_deleted: 0
properties_set: 412464
relationships_created: 206232
relationships_deleted: 0

In [17]:
# Same pairing of "nAutoV" and "AutoV" crashes at a shared intersection, but
# only for pairs whose cr1 Year equals 2015, stored as COLOC_LATE
# relationships. These are the relationships the test cells query.
# NOTE(review): pairs with a Year after 2015 fall into neither COLOC_EARLY nor
# COLOC_LATE — confirm whether the data contains any.
query = """
MATCH (cr1:Crashes)-[:LOCATED_AT]->(int:Intersections)<-[:LOCATED_AT]-(cr2:Crashes)
WHERE cr1.Type = "nAutoV" AND cr2.Type = "AutoV"
WITH cr1, cr2, int
ORDER BY cr1, cr1.Year
WITH cr1, cr2, collect(cr1)[0].Year AS year, count(*) AS coint
WHERE year = 2015
MERGE (cr1)-[col:COLOC_LATE {Year: year}]-(cr2)
SET col.coint = coint;
"""

# .stats() shows the update counters of the write query.
graph.run(query).stats()

constraints_added: 0
constraints_removed: 0
contained_updates: True
indexes_added: 0
indexes_removed: 0
labels_added: 0
labels_removed: 0
nodes_created: 0
nodes_deleted: 0
properties_set: 108390
relationships_created: 54195
relationships_deleted: 0

In [18]:
# Sanity check: count the COLOC_LATE relationships. The pattern is directed,
# so each relationship is matched once.
query = """
MATCH ()-[:COLOC_LATE]->()
RETURN count(*) AS count
"""

graph.run(query).to_data_frame()

Unnamed: 0,count
0,54195


In [19]:
# Sanity check: count the COLOC_EARLY relationships. The pattern is directed,
# so each relationship is matched once.
query = """
MATCH ()-[:COLOC_EARLY]->()
RETURN count(*) AS count
"""

graph.run(query).to_data_frame()

Unnamed: 0,count
0,206232


In [20]:
def down_sample(df, random_state=1):
    """Balance a binary-labelled frame by discarding surplus majority rows.

    Rows of whichever class (label 0 or label 1) is larger are sampled and
    dropped until both classes have the same size, then the remaining rows
    are shuffled. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``label`` column whose values are 0 or 1.
    random_state : int, default 1
        Seed used both for choosing the rows to drop and for the final
        shuffle, so repeated calls on the same frame give the same result.

    Returns
    -------
    pandas.DataFrame
        A shuffled copy of ``df`` with equal numbers of 0 and 1 labels; the
        original index labels are kept.
    """
    balanced = df.copy()
    n_negative = int((balanced["label"] == 0).sum())
    n_positive = int((balanced["label"] == 1).sum())
    surplus = n_negative - n_positive
    if surplus:
        # Drop from whichever class is larger; sampling a negative number of
        # rows (positives outnumbering negatives) would raise ValueError.
        majority_label = 0 if surplus > 0 else 1
        to_drop = (
            balanced[balanced["label"] == majority_label]
            .sample(n=abs(surplus), random_state=random_state)
            .index
        )
        balanced = balanced.drop(to_drop)
    # Seeded shuffle keeps the notebook reproducible across runs.
    return balanced.sample(frac=1, random_state=random_state)

In [23]:
# Positive training examples: one row per COLOC_EARLY relationship between two
# Crashes nodes, labelled 1. node1/node2 are the Neo4j internal node ids.
train_existing_links = graph.run("""
MATCH (cr1:Crashes)-[:COLOC_EARLY]->(cr2:Crashes)
RETURN id(cr1) AS node1, id(cr2) AS node2, 1 AS label
""").to_data_frame()



In [24]:
# Negative training examples: pairs of nodes joined by a COLOC_EARLY path of
# length 2 or 3 but with no direct COLOC_EARLY relationship, labelled 0.
# NOTE(review): COLOC_EARLY is only ever created between an "nAutoV" crash and
# an "AutoV" crash, so nodes two hops apart are of the same type and could
# never be linked — this may make the classification task trivial; confirm
# that this sampling is intended.
train_missing_links = graph.run("""
MATCH (cr:Crashes)
WHERE (cr)-[:COLOC_EARLY]-()
MATCH (cr)-[:COLOC_EARLY*2..3]-(other)
WHERE not((cr)-[:COLOC_EARLY]-(other))
RETURN id(cr) AS node1, id(other) AS node2, 0 AS label
""").to_data_frame()
# The same pair can be reached along several paths; keep one row per pair.
train_missing_links = train_missing_links.drop_duplicates()

In [25]:
# Stack negatives and positives into one frame with a fresh 0..n-1 index.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
training_df = pd.concat([train_missing_links, train_existing_links], ignore_index=True)
training_df['label'] = training_df['label'].astype('category')
# Balance the two classes and shuffle the rows before training.
training_df = down_sample(training_df)

In [28]:
# Preview the first ten rows of the balanced, shuffled training pairs.
training_df.head(10)

Unnamed: 0,label,node1,node2
72602,0,38671,57635
484438,1,74409,60263
578885,1,116064,131109
187221,0,79102,93496
502261,1,81444,35660
534242,1,97998,56683
156300,0,73369,115710
197119,0,82098,119013
446203,1,34712,107112
529145,1,96304,121773


In [29]:
# Positive test examples: one row per COLOC_LATE relationship between two
# Crashes nodes, labelled 1. node1/node2 are the Neo4j internal node ids.
test_existing_links = graph.run("""
MATCH (cr1:Crashes)-[:COLOC_LATE]->(cr2:Crashes)
RETURN id(cr1) AS node1, id(cr2) AS node2, 1 AS label
""").to_data_frame()

In [30]:
# Negative test examples: pairs of nodes joined by a COLOC_LATE path of
# length 2 or 3 but with no direct COLOC_LATE relationship, labelled 0.
# NOTE(review): COLOC_LATE is only ever created between an "nAutoV" crash and
# an "AutoV" crash, so nodes two hops apart are of the same type and could
# never be linked — confirm that this sampling is intended.
test_missing_links = graph.run("""
MATCH (cr:Crashes)
WHERE (cr)-[:COLOC_LATE]-()
MATCH (cr)-[:COLOC_LATE*2..3]-(other)
WHERE not((cr)-[:COLOC_LATE]-(other))
RETURN id(cr) AS node1, id(other) AS node2, 0 AS label
""").to_data_frame()
# The same pair can be reached along several paths; keep one row per pair.
test_missing_links = test_missing_links.drop_duplicates()

In [31]:
# Stack negatives and positives into one frame with a fresh 0..n-1 index.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
test_df = pd.concat([test_missing_links, test_existing_links], ignore_index=True)
test_df['label'] = test_df['label'].astype('category')
# Balance the two classes and shuffle the rows before evaluation.
test_df = down_sample(test_df)

In [32]:
# Preview the first rows of the balanced, shuffled test pairs.
test_df.head()

Unnamed: 0,label,node1,node2
75297,0,62430,71816
218790,1,52368,116394
234555,1,59494,131326
194850,0,127336,88935
146972,0,103433,62622


In [33]:

# Random forest that the experiment cells below re-fit on each feature set:
# 30 trees, depth capped at 10, fixed random_state.
classifier = RandomForestClassifier(
    random_state=0,
    max_depth=10,
    n_estimators=30,
)

In [34]:
def apply_graphy_features(data, rel_type):
    """Add neighbourhood-based link-prediction scores to every (node1, node2) row.

    For each pair, the graph-algorithms functions
    ``algo.linkprediction.commonNeighbors``, ``preferentialAttachment`` and
    ``totalNeighbors`` are evaluated in Neo4j and returned as the columns
    ``cn``, ``pa`` and ``tn``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``node1`` and ``node2`` columns holding Neo4j node ids.
    rel_type : str
        Passed to each function as its ``relationshipQuery`` setting.

    Returns
    -------
    pandas.DataFrame
        ``data`` inner-joined with the feature columns on (node1, node2).
        Pairs for which the query returns no row are therefore dropped.
    """
    query = """
    UNWIND $pairs AS pair
    MATCH (p1) WHERE id(p1) = pair.node1
    MATCH (p2) WHERE id(p2) = pair.node2
    RETURN pair.node1 AS node1,
           pair.node2 AS node2,
           algo.linkprediction.commonNeighbors(
               p1, p2, {relationshipQuery: $relType}) AS cn,
           algo.linkprediction.preferentialAttachment(
               p1, p2, {relationshipQuery: $relType}) AS pa,
           algo.linkprediction.totalNeighbors(
               p1, p2, {relationshipQuery: $relType}) AS tn
    """
    # Score each distinct pair once. Sending a repeated pair would return it
    # repeatedly, turning the merge below into a many-to-many join that
    # multiplies the rows of `data`.
    unique_pairs = data[["node1", "node2"]].drop_duplicates()
    pairs = [
        {"node1": node1, "node2": node2}
        for node1, node2 in unique_pairs.values.tolist()
    ]
    features = graph.run(query, {"pairs": pairs, "relType": rel_type}).to_data_frame()
    return pd.merge(data, features, on=["node1", "node2"])

In [35]:
# Attach cn / pa / tn, computed over the COLOC_EARLY relationships, to the training pairs.
# Not idempotent: re-running this cell merges the feature columns a second time.
training_df = apply_graphy_features(training_df, "COLOC_EARLY")

In [36]:
# Preview the training pairs with their cn / pa / tn columns.
training_df.head()

Unnamed: 0,label,node1,node2,cn,pa,tn
0,0,38671,57635,31.0,961.0,31.0
1,1,74409,60263,0.0,90.0,19.0
2,1,116064,131109,0.0,7533.0,174.0
3,0,79102,93496,11.0,121.0,11.0
4,1,81444,35660,0.0,14605.0,242.0


In [37]:

# Attach cn / pa / tn, computed over the COLOC_LATE relationships, to the test pairs.
# Not idempotent: re-running this cell merges the feature columns a second time.
test_df = apply_graphy_features(test_df, "COLOC_LATE")

In [38]:
# Preview the test pairs with their cn / pa / tn columns.
test_df.head()

Unnamed: 0,label,node1,node2,cn,pa,tn
0,0,62430,71816,4.0,16.0,4.0
1,1,52368,116394,0.0,1771.0,100.0
2,1,59494,131326,0.0,429.0,46.0
3,0,127336,88935,18.0,324.0,18.0
4,0,103433,62622,9.0,81.0,9.0


In [39]:
# Experiment 1: the common-neighbours count is the only feature.
# NOTE(review): in the previewed rows every positive pair has cn = 0 and every
# negative pair has cn > 0, so the perfect scores below may reflect how the
# negatives were sampled rather than real predictive power — verify.
columns = ["cn"]

X = training_df[columns]
y = training_df["label"]
classifier.fit(X, y)

y_test = test_df["label"]
predictions = classifier.predict(test_df[columns])

for metric_name, metric in (
    ("Accuracy", accuracy_score),
    ("Precision", precision_score),
    ("Recall", recall_score),
):
    display(metric_name, metric(y_test, predictions))

# Cell output: (feature, importance) pairs, most important first.
sorted(zip(columns, classifier.feature_importances_), key=lambda pair: -pair[1])

'Accuracy'

1.0

'Precision'

1.0

'Recall'

1.0

[('cn', 1.0)]

In [40]:
# Experiment 2: all three neighbourhood features.
columns = ["cn", "pa", "tn"]

X = training_df[columns]
y = training_df["label"]
classifier.fit(X, y)

y_test = test_df["label"]
predictions = classifier.predict(test_df[columns])

for metric_name, metric in (
    ("Accuracy", accuracy_score),
    ("Precision", precision_score),
    ("Recall", recall_score),
):
    display(metric_name, metric(y_test, predictions))

# Cell output: (feature, importance) pairs, most important first.
sorted(zip(columns, classifier.feature_importances_), key=lambda pair: -pair[1])

'Accuracy'

1.0

'Precision'

1.0

'Recall'

1.0

[('cn', 0.7187707968470105),
 ('tn', 0.19612490409954997),
 ('pa', 0.0851042990534395)]

In [51]:
# Compute triangle counts and clustering coefficients over the COLOC_EARLY
# relationships between Crashes nodes and write them to the `trianglesTrain`
# and `coefficientTrain` node properties.
# NOTE(review): the recorded output reports triangleCount = 0. That is what one
# would expect if COLOC_EARLY only joins "nAutoV" crashes to "AutoV" crashes —
# confirm whether these features can carry any signal.
graph.run("""
CALL algo.triangleCount('Crashes', 'COLOC_EARLY', {concurrency:4, write:true,
writeProperty:'trianglesTrain', clusteringCoefficientProperty:'coefficientTrain'});
""").to_data_frame()

Unnamed: 0,averageClusteringCoefficient,clusteringCoefficientProperty,computeMillis,loadMillis,nodeCount,p1,p10,p100,p25,p5,p50,p75,p90,p95,p99,postProcessingMillis,triangleCount,write,writeMillis,writeProperty
0,0.0,coefficientTrain,49,93,111500,0,0,0,0,0,0,0,0,0,0,58,0,True,1,trianglesTrain


In [44]:
# Compute triangle counts and clustering coefficients over the COLOC_LATE
# relationships between Crashes nodes and write them to the `trianglesTest`
# and `coefficientTest` node properties.
# NOTE(review): the recorded output reports triangleCount = 0 here as well;
# confirm whether these features can carry any signal.
graph.run("""
CALL algo.triangleCount('Crashes', 'COLOC_LATE', { write:true,
writeProperty:'trianglesTest', clusteringCoefficientProperty:'coefficientTest'});
""").to_data_frame()

Unnamed: 0,averageClusteringCoefficient,clusteringCoefficientProperty,computeMillis,loadMillis,nodeCount,p1,p10,p100,p25,p5,p50,p75,p90,p95,p99,postProcessingMillis,triangleCount,write,writeMillis,writeProperty
0,0.0,coefficientTest,20,77,111500,0,0,0,0,0,0,0,0,0,0,795,0,True,1,trianglesTest


In [45]:
def apply_triangles_features(data, triangles_prop, coefficient_prop):
    """Add min/max triangle-count and clustering-coefficient columns per pair.

    For each (node1, node2) pair the two nodes' ``triangles_prop`` and
    ``coefficient_prop`` property values are read from Neo4j, and the smaller
    and larger of each are returned as ``minTriangles``, ``maxTriangles``,
    ``minCoefficient`` and ``maxCoefficient``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``node1`` and ``node2`` columns holding Neo4j node ids.
    triangles_prop : str
        Name of the node property holding the triangle count.
    coefficient_prop : str
        Name of the node property holding the clustering coefficient.

    Returns
    -------
    pandas.DataFrame
        ``data`` inner-joined with the four feature columns on (node1, node2).
        Pairs for which the query returns no row are therefore dropped.
    """
    query = """
    UNWIND $pairs AS pair
    MATCH (p1) WHERE id(p1) = pair.node1
    MATCH (p2) WHERE id(p2) = pair.node2
    RETURN pair.node1 AS node1,
    pair.node2 AS node2,
    apoc.coll.min([p1[$trianglesProp], p2[$trianglesProp]]) AS minTriangles,
    apoc.coll.max([p1[$trianglesProp], p2[$trianglesProp]]) AS maxTriangles,
    apoc.coll.min([p1[$coefficientProp], p2[$coefficientProp]]) AS minCoefficient,
    apoc.coll.max([p1[$coefficientProp], p2[$coefficientProp]]) AS maxCoefficient
    """
    # Look up each distinct pair once. Sending a repeated pair would return it
    # repeatedly, turning the merge below into a many-to-many join that
    # multiplies the rows of `data`.
    unique_pairs = data[["node1", "node2"]].drop_duplicates()
    pairs = [
        {"node1": node1, "node2": node2}
        for node1, node2 in unique_pairs.values.tolist()
    ]
    params = {
        "pairs": pairs,
        "trianglesProp": triangles_prop,
        "coefficientProp": coefficient_prop,
    }
    features = graph.run(query, params).to_data_frame()
    return pd.merge(data, features, on=["node1", "node2"])

In [46]:
# Attach the min/max triangle and coefficient columns, reading the *Train
# properties for the training pairs and the *Test properties for the test pairs.
# Not idempotent: re-running this cell merges the feature columns a second time.
training_df = apply_triangles_features(training_df, "trianglesTrain", "coefficientTrain")
test_df = apply_triangles_features(test_df, "trianglesTest", "coefficientTest")

In [47]:
# Experiment 3: neighbourhood features plus the triangle-based features.
columns = [
    # graph features
    "cn", "pa", "tn",
    # triangle features
    "minTriangles", "maxTriangles", "minCoefficient", "maxCoefficient",
]

X = training_df[columns]
y = training_df["label"]
classifier.fit(X, y)

y_test = test_df["label"]
predictions = classifier.predict(test_df[columns])

for metric_name, metric in (
    ("Accuracy", accuracy_score),
    ("Precision", precision_score),
    ("Recall", recall_score),
):
    display(metric_name, metric(y_test, predictions))

# Cell output: (feature, importance) pairs, most important first.
sorted(zip(columns, classifier.feature_importances_), key=lambda pair: -pair[1])

'Accuracy'

1.0

'Precision'

1.0

'Recall'

1.0

[('cn', 0.7678281959822872),
 ('tn', 0.18136962022697048),
 ('pa', 0.05080218379074231),
 ('minTriangles', 0.0),
 ('maxTriangles', 0.0),
 ('minCoefficient', 0.0),
 ('maxCoefficient', 0.0)]

In [50]:
# Run label propagation over the COLOC_EARLY relationships between Crashes
# nodes (iterations setting 10, direction BOTH) and write each node's
# community to the `partitionTrain` property.
graph.run("""
CALL algo.labelPropagation("Crashes", "COLOC_EARLY", 
{iterations: 10, writeProperty: 'partitionTrain', write: true, direction: 'BOTH'});
""").to_data_frame()

Unnamed: 0,communityCount,computeMillis,didConverge,iterations,loadMillis,nodes,p1,p10,p100,p25,...,p75,p90,p95,p99,postProcessingMillis,seedProperty,weightProperty,write,writeMillis,writeProperty
0,97769,131,True,3,114,111500,1,1,276,1,...,1,1,1,2,717,,,True,39,partitionTrain


In [52]:
# Run label propagation over the COLOC_LATE relationships between Crashes
# nodes (iterations setting 10, direction BOTH) and write each node's
# community to the `partitionTest` property.
graph.run("""
CALL algo.labelPropagation("Crashes", "COLOC_LATE", 
{iterations: 10, writeProperty: 'partitionTest', write: true, direction: 'BOTH'});
""").to_data_frame()

Unnamed: 0,communityCount,computeMillis,didConverge,iterations,loadMillis,nodes,p1,p10,p100,p25,...,p75,p90,p95,p99,postProcessingMillis,seedProperty,weightProperty,write,writeMillis,writeProperty
0,104082,39,True,3,53,111500,1,1,167,1,...,1,1,1,1,428,,,True,19,partitionTest


In [53]:
# Stream Louvain communities over COLOC_EARLY with intermediate communities
# included, and store the first entry of each node's `communities` list in
# the `louvainTrain` node property.
graph.run("""
CALL algo.louvain.stream("Crashes", "COLOC_EARLY", {includeIntermediateCommunities:true})
YIELD nodeId, community, communities
WITH algo.getNodeById(nodeId) AS node, communities[0] AS smallestCommunity
SET node.louvainTrain = smallestCommunity;
""").stats()

constraints_added: 0
constraints_removed: 0
contained_updates: True
indexes_added: 0
indexes_removed: 0
labels_added: 0
labels_removed: 0
nodes_created: 0
nodes_deleted: 0
properties_set: 111500
relationships_created: 0
relationships_deleted: 0

In [54]:
# Stream Louvain communities over COLOC_LATE with intermediate communities
# included, and store the first entry of each node's `communities` list in
# the `louvainTest` node property.
graph.run("""
CALL algo.louvain.stream("Crashes", "COLOC_LATE", {includeIntermediateCommunities:true})
YIELD nodeId, community, communities
WITH algo.getNodeById(nodeId) AS node, communities[0] AS smallestCommunity
SET node.louvainTest = smallestCommunity;
""").stats()

constraints_added: 0
constraints_removed: 0
contained_updates: True
indexes_added: 0
indexes_removed: 0
labels_added: 0
labels_removed: 0
nodes_created: 0
nodes_deleted: 0
properties_set: 111500
relationships_created: 0
relationships_deleted: 0

In [55]:
def apply_community_features(data, partition_prop, louvain_prop):
    """Add same-community indicator columns for every (node1, node2) row.

    For each pair, ``algo.linkprediction.sameCommunity`` is evaluated in Neo4j
    once with ``partition_prop`` and once with ``louvain_prop``; the results
    are returned as the columns ``sp`` and ``sl``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``node1`` and ``node2`` columns holding Neo4j node ids.
    partition_prop : str
        Name of the node property compared to produce ``sp``.
    louvain_prop : str
        Name of the node property compared to produce ``sl``.

    Returns
    -------
    pandas.DataFrame
        ``data`` inner-joined with ``sp`` and ``sl`` on (node1, node2).
        Pairs for which the query returns no row are therefore dropped.
    """
    query = """
    UNWIND $pairs AS pair
    MATCH (p1) WHERE id(p1) = pair.node1
    MATCH (p2) WHERE id(p2) = pair.node2
    RETURN pair.node1 AS node1,
    pair.node2 AS node2,
    algo.linkprediction.sameCommunity(p1, p2, $partitionProp) AS sp,    
    algo.linkprediction.sameCommunity(p1, p2, $louvainProp) AS sl
    """
    # Look up each distinct pair once. Sending a repeated pair would return it
    # repeatedly, turning the merge below into a many-to-many join that
    # multiplies the rows of `data`.
    unique_pairs = data[["node1", "node2"]].drop_duplicates()
    pairs = [
        {"node1": node1, "node2": node2}
        for node1, node2 in unique_pairs.values.tolist()
    ]
    params = {
        "pairs": pairs,
        "partitionProp": partition_prop,
        "louvainProp": louvain_prop,
    }
    features = graph.run(query, params).to_data_frame()
    return pd.merge(data, features, on=["node1", "node2"])

In [56]:
# Attach the sp / sl same-community columns, using the *Train properties for
# the training pairs and the *Test properties for the test pairs.
# Not idempotent: re-running this cell merges the feature columns a second time.
training_df = apply_community_features(training_df, "partitionTrain", "louvainTrain")
test_df = apply_community_features(test_df, "partitionTest", "louvainTest")

In [57]:
# Experiment 4: neighbourhood, triangle and community features together.
columns = [
    # graph features
    "cn", "pa", "tn",
    # triangle features
    "minTriangles", "maxTriangles", "minCoefficient", "maxCoefficient",
    # community features
    "sp", "sl",
]

X = training_df[columns]
y = training_df["label"]
classifier.fit(X, y)

y_test = test_df["label"]
predictions = classifier.predict(test_df[columns])

for metric_name, metric in (
    ("Accuracy", accuracy_score),
    ("Precision", precision_score),
    ("Recall", recall_score),
):
    display(metric_name, metric(y_test, predictions))

# Cell output: (feature, importance) pairs, most important first.
sorted(zip(columns, classifier.feature_importances_), key=lambda pair: -pair[1])

'Accuracy'

1.0

'Precision'

1.0

'Recall'

1.0

[('cn', 0.770354725082145),
 ('tn', 0.14840471761155746),
 ('pa', 0.08114053875225755),
 ('sl', 0.00010001855404000553),
 ('minTriangles', 0.0),
 ('maxTriangles', 0.0),
 ('minCoefficient', 0.0),
 ('maxCoefficient', 0.0),
 ('sp', 0.0)]