In [1]:
import os

import neo4j
from neo4j import GraphDatabase

import pandas

# Connection settings are read from the environment so that credentials do
# not have to be committed with the notebook. The fallbacks are the values
# that were previously hard-coded for a local development instance.
uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
user = os.environ.get("NEO4J_USER", "neo4j")
pwd = os.environ.get("NEO4J_PASSWORD", "password")

driver = GraphDatabase.driver(uri, auth=(user, pwd))

# Fail fast in the configuration cell if the server is unreachable or the
# credentials are rejected, instead of in the first cell that runs a query.
driver.verify_connectivity()

In [2]:
# Create SHARED_PII relationships between clients that share identifiers.
#
# Two clients are paired when both point (via HAS_EMAIL, HAS_PHONE or HAS_SSN)
# at the same node. `elementId(c) < elementId(d)` keeps a single ordered pair
# per two clients, and `cnt` is the number of matched paths for that pair.
#
# MERGE matches on the relationship type only and SET then writes the count,
# so re-running the cell updates the existing relationship. Putting `count`
# inside the MERGE pattern would add a second SHARED_PII relationship whenever
# the count differs from the stored one.
query = """
MATCH (c:Client)-[:HAS_EMAIL|HAS_PHONE|HAS_SSN]->(n)<-[:HAS_EMAIL|HAS_PHONE|HAS_SSN]-(d:Client)
WHERE elementId(c) < elementId(d)
WITH c, d, count(*) AS cnt
MERGE (c)-[r:SHARED_PII]->(d)
SET r.count = cnt
"""

with driver.session() as session:
    # consume() drains the result while the session is still open, so a
    # server-side failure is raised in this cell rather than being dropped
    # when the session closes.
    summary = session.run(query).consume()

# Update statistics reported by the server for this query.
summary.counters

ServiceUnavailable: Couldn't connect to localhost:7687 (resolved to ()):
Failed to establish connection to ResolvedIPv6Address(('::1', 7687, 0, 0)) (reason [Errno 111] Connection refused)
Failed to establish connection to ResolvedIPv4Address(('127.0.0.1', 7687)) (reason [Errno 111] Connection refused)

In [None]:
# Project Client nodes and their SHARED_PII relationships into the in-memory
# GDS graph 'clientClusters'. Relationships are projected as UNDIRECTED and
# carry the `count` relationship property.
query = """
CALL gds.graph.project(
    'clientClusters',
    {
        Client: { label: 'Client' }
    },
    {
        SHARED_PII: {
            type: 'SHARED_PII',
            orientation: 'UNDIRECTED',
            properties: { count: { property: 'count' } }
        }
    }
)
YIELD graphName, nodeCount, relationshipCount
"""

with driver.session() as session:
    # data() fetches every record inside the session, so a failure of the
    # procedure is raised here and the yielded counts can be displayed.
    projection = session.run(query).data()

projection

In [None]:
# Run weakly connected components on the 'clientClusters' projection and tag
# every client that belongs to a component of at least two clients with the
# component id in `secondPartyFraudRing`.
#
# The nodes returned by gds.util.asNode are collected and updated directly.
# The earlier form collected their `id` property and then re-MATCHed each
# client by that property, which costs an extra lookup per client and relies
# on `id` being unique.
query = """
CALL gds.wcc.stream(
    'clientClusters',
    {
        nodeLabels: ['Client'],
        relationshipTypes: ['SHARED_PII'],
        consecutiveIds: true
    }
)
YIELD nodeId, componentId
WITH componentId AS clusterId, collect(gds.util.asNode(nodeId)) AS clients
WHERE size(clients) >= 2
UNWIND clients AS client
SET client.secondPartyFraudRing = clusterId
"""

with driver.session() as session:
    # consume() drains the result while the session is still open, so a
    # server-side failure is raised in this cell.
    summary = session.run(query).consume()

# Update statistics reported by the server for this query.
summary.counters


In [None]:
# Project Client, Email, Phone and SSN nodes together with the HAS_EMAIL,
# HAS_PHONE and HAS_SSN relationships into the in-memory GDS graph
# 'similarity', and return the name and size of the projection.
query = """
CALL gds.graph.project(
    'similarity',
    ['Client', 'Email', 'Phone', 'SSN'],
    {
        HAS_EMAIL: {},
        HAS_PHONE: {},
        HAS_SSN: {}
    }
)
YIELD graphName, nodeCount, relationshipCount
WITH graphName, nodeCount, relationshipCount
RETURN graphName, nodeCount, relationshipCount
"""

with driver.session() as session:
    result = session.run(query)
    # values() must be read while the session is open; it yields one list
    # per returned row.
    rows = result.values()
    print(rows)

[['similarity', 9134, 7299]]


In [None]:
# Run node similarity on the 'similarity' projection in mutate mode: the
# results are added to the in-memory graph as SIMILAR_TO relationships with a
# `jaccardScore` property, using topK = 15.
query = """
CALL gds.nodeSimilarity.mutate(
    'similarity',
    {
        mutateProperty: 'jaccardScore',
        mutateRelationshipType: 'SIMILAR_TO',
        topK: 15
    }
)
"""

with driver.session() as session:
    # consume() drains the result while the session is still open, so a
    # failure of the procedure is raised in this cell rather than being
    # dropped when the session closes.
    session.run(query).consume()

In [None]:
# Write the SIMILAR_TO relationships of the 'similarity' projection, together
# with their `jaccardScore` property, back to the database.
query = """
CALL gds.graph.relationship.write('similarity', 'SIMILAR_TO', 'jaccardScore', {}) YIELD graphName;
"""

with driver.session() as session:
    # consume() drains the result while the session is still open, so a
    # failure of the procedure is raised in this cell rather than being
    # dropped when the session closes.
    session.run(query).consume()


In [None]:
# Compute degree centrality for Client nodes over the SIMILAR_TO
# relationships of the 'similarity' projection, weighted by `jaccardScore`,
# and write the result to the database as `secondPartyFraudScore`.
query = """
CALL gds.degree.write(
    'similarity',
    {
        nodeLabels: ['Client'],
        relationshipTypes: ['SIMILAR_TO'],
        relationshipWeightProperty: 'jaccardScore',
        writeProperty: 'secondPartyFraudScore'
    }
)
"""

with driver.session() as session:
    # consume() drains the result while the session is still open, so a
    # failure of the procedure is raised in this cell rather than being
    # dropped when the session closes.
    session.run(query).consume()

In [None]:
# Add the SecondPartyFraudster label to clients whose secondPartyFraudScore
# lies above the 95th percentile of all clients that have a score.
query = """
MATCH (c:Client)
WHERE c.secondPartyFraudScore IS NOT NULL
WITH percentileCont(c.secondPartyFraudScore, 0.95) AS threshold
MATCH (c:Client)
WHERE c.secondPartyFraudScore > threshold
SET c:SecondPartyFraudster
"""

with driver.session() as session:
    # consume() drains the result while the session is still open, so a
    # server-side failure is raised in this cell.
    summary = session.run(query).consume()

# Update statistics reported by the server for this query.
summary.counters


In [None]:
# Fetch the name and id of every client carrying the SecondPartyFraudster
# label and show them as a DataFrame.

query = """
MATCH (c:Client)
WHERE c:SecondPartyFraudster
RETURN c.name, c.id
"""

# The result transformer converts the query result into a pandas DataFrame.
panda_df = driver.execute_query(
    query,
    database_="neo4j",
    result_transformer_=neo4j.Result.to_df,
)

# Limit rendering to two columns, which is what the query returns.
pandas.set_option('display.max_columns', 2)

panda_df

Unnamed: 0,c.name,c.id
0,Audrey Snider,4446527462545779
1,Ayden Mayo,4777446725804270
2,Jack Chan,4282733546081412
3,Ella Dixon,4536072826730040
4,Brooklyn Harrison,4029043591201321
...,...,...
128,Elijah Warren,4413385955087620
129,Xavier Welch,4550448544478545
130,Nathaniel Myers,4114683318919154
131,Katherine Jacobson,4172817689754167


In [None]:
# Drop the in-memory GDS graph projections (not databases) created above so
# that the notebook can be re-run from the top.
#
# The second argument is failIfMissing: passing `false` means a projection
# that does not exist is skipped instead of raising, so this cell is safe to
# run after a partial run or more than once.
query = "CALL gds.graph.drop($name, false) YIELD graphName"

with driver.session() as session:
    for graph_name in ("clientClusters", "similarity"):
        # consume() drains each result before the next query is sent and
        # raises any server-side failure here.
        session.run(query, name=graph_name).consume()