Step 4

In [1]:
import pandas as pd
import os

In [2]:
import neo4j

In [3]:
from neo4j import GraphDatabase

In [4]:
from graphdatascience import GraphDataScience

In [5]:
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")

In [6]:
NEO4J_AUTH = ( "neo4j",  "assignment3")

In [7]:
driver = GraphDatabase.driver(NEO4J_URI, auth=NEO4J_AUTH)

In [8]:
gds = GraphDataScience(NEO4J_URI, auth=NEO4J_AUTH)



Step 5

In [9]:
pii_query = '''
MATCH (c:Client)-[ :HAS_EMAIL |:HAS_PHONE |:HAS_SSN ]-> (n) <-[ :HAS_EMAIL |:HAS_PHONE |:HAS_SSN ]-(d:Client)
WHERE id(c) < id(d)
RETURN c.id, d.id, count(*) AS freq
ORDER BY freq DESC
'''

In [10]:
gds.run_cypher(pii_query)



Unnamed: 0,c.id,d.id,freq
0,4952527271473904,4816336012071985,3
1,4883445100935916,4708373581412325,3
2,4658150168863397,4100374538108184,3
3,4673951123644611,4795773320377768,3
4,4192214340630620,4912097363222923,3
...,...,...,...
754,4910140986334626,4114683318919154,1
755,4454780847105236,4210575070378533,1
756,4721862020593706,4210575070378533,1
757,4445521165797820,4210575070378533,1


Step 6

In [11]:
project_query = '''
CALL gds.graph.project(
    'clientClusters' ,
    { Client: {
        label: 'Client' }
    },
    { SHARED_PII: {
        type: 'SHARED_PII',
        orientation: 'UNDIRECTED',
        properties: {
        count: {
        property: 'count' }
            }
        }
    }
)
YIELD graphName, nodeCount, relationshipCount
'''

In [12]:
gds.run_cypher(project_query)

Unnamed: 0,graphName,nodeCount,relationshipCount
0,clientClusters,2433,1518


Step 7

In [13]:
streaming_query = '''
CALL gds.wcc.stream(
    'clientClusters',
    {
    nodeLabels: ['Client'],
    relationshipTypes: ['SHARED_PII'],
    consecutiveIds: true
    }
)
YIELD nodeId, componentId
RETURN gds.util.asNode(nodeId).id AS clientId, 
componentId AS clusterId
'''

In [14]:
pandasDF = driver.execute_query(
    streaming_query,
    database_="neo4j",
    result_transformer_=  neo4j.Result.to_df
)

In [15]:
print(type(pandasDF))

<class 'pandas.core.frame.DataFrame'>


In [16]:
with driver.session() as session:
    result = session.run(streaming_query)
    print(result.data())

[{'clientId': '4997933060327094', 'clusterId': 0}, {'clientId': '4776276949898423', 'clusterId': 1}, {'clientId': '4858607188760216', 'clusterId': 2}, {'clientId': '4287186486553145', 'clusterId': 3}, {'clientId': '4661202154682409', 'clusterId': 4}, {'clientId': '4649268238636650', 'clusterId': 5}, {'clientId': '4426707672690219', 'clusterId': 6}, {'clientId': '4922246870240518', 'clusterId': 7}, {'clientId': '4415848797892554', 'clusterId': 8}, {'clientId': '4548497513788330', 'clusterId': 9}, {'clientId': '4950634022082174', 'clusterId': 10}, {'clientId': '4860903977910377', 'clusterId': 11}, {'clientId': '4234798486577769', 'clusterId': 12}, {'clientId': '4175792657809755', 'clusterId': 13}, {'clientId': '4361287590543243', 'clusterId': 14}, {'clientId': '4833833649287561', 'clusterId': 15}, {'clientId': '4495151043368906', 'clusterId': 16}, {'clientId': '4028666746330768', 'clusterId': 17}, {'clientId': '4664274185403862', 'clusterId': 18}, {'clientId': '4699974121949998', 'cluste

Step 8

In [17]:
possible_fraud_members = '''
CALL gds.wcc.stream(
    'clientClusters',
    {
    nodeLabels: ['Client'],
    relationshipTypes: ['SHARED_PII'],
    consecutiveIds: true
    }
)
YIELD nodeId, componentId
WITH gds.util.asNode(nodeId) AS clientId , componentId AS clusterId
WITH clusterId, collect(clientId.id) AS clients
WITH clusterId, clients, size(clients) AS clusterSize WHERE clusterSize >= 2
UNWIND clients AS client
MATCH (c:Client) WHERE c.id = client
SET c.secondPartyFraudRing = clusterId
'''

In [18]:
pandasDF = driver.execute_query(
    possible_fraud_members,
    database_="neo4j",
    result_transformer_=  neo4j.Result.to_df
)

In [19]:
with driver.session() as session:
    result = session.run(possible_fraud_members)
    print(result.single())

None


Step 9

In [20]:
bipartite_graph = '''
MATCH (c:Client) WHERE c.secondPartyFraudRing is NOT NULL
WITH collect(c) as clients

MATCH (n) WHERE n:Email OR n:Phone OR n:SSN

WITH clients, collect(n) AS piis
WITH clients + piis AS nodes

MATCH (c:Client) -[:HAS_EMAIL | :HAS_PHONE | :HAS_SSN]->(p)
WHERE c.secondPartyFraudRing is NOT NULL

WITH nodes, collect({source: c, target: p}) as relationships

CALL gds.graph.project.cypher(
    'similarity',
    "UNWIND $nodes as n
        RETURN id(n) AS id,labels(n) AS labels",
    "UNWIND $relationships as r
        RETURN id(r['source']) AS source, id(r['target']) AS target,
    'HAS_PII' as type",
    { parameters:
        { nodes: nodes,
            relationships: relationships }
    }
)
YIELD graphName, nodeCount, relationshipCount
RETURN graphName, nodeCount, relationshipCount

'''

In [21]:
gds.run_cypher(bipartite_graph)



Unnamed: 0,graphName,nodeCount,relationshipCount
0,similarity,7037,1008


Step 10

In [22]:
nodeSimilarity_mutate = '''

CALL gds.nodeSimilarity.mutate(
  'similarity',
  {
    mutateProperty: 'jaccardScore',
    mutateRelationshipType: 'SIMILAR_TO',
    topK: 15
  }
)
YIELD
  preProcessingMillis,
  computeMillis,
  mutateMillis,
  postProcessingMillis,
  relationshipsWritten,
  nodesCompared,
  similarityDistribution,
  configuration
'''

In [23]:
gds.run_cypher(nodeSimilarity_mutate)

Unnamed: 0,preProcessingMillis,computeMillis,mutateMillis,postProcessingMillis,relationshipsWritten,nodesCompared,similarityDistribution,configuration
0,0,85,79,0,1518,336,"{'min': 0.19999980926513672, 'p5': 0.199999809...","{'mutateProperty': 'jaccardScore', 'jobId': 'a..."


Clearing Projections From Memory

In [24]:
gds.run_cypher('''CALL gds.graph.drop('clientClusters', false)''')
gds.run_cypher('''CALL gds.graph.drop('similarity', false)''')



Unnamed: 0,graphName,database,databaseLocation,memoryUsage,sizeInBytes,nodeCount,relationshipCount,configuration,density,creationTime,modificationTime,schema,schemaWithOrientation
0,similarity,neo4j,local,,-1,7037,2526,"{'readConcurrency': 4, 'jobId': '544c6a18-d515...",5.1e-05,2024-12-03T19:44:57.746851900-05:00,2024-12-03T19:44:57.993801100-05:00,"{'graphProperties': {}, 'nodes': {'Mule': {}, ...","{'graphProperties': {}, 'nodes': {'Mule': {}, ..."
