## Commuting Work Flows 
<BR>
<A HREF="https://www.census.gov/data/tables/2015/demo/metro-micro/commuting-flows-2015.html">Commuting Flows Data</A>

In [1]:
from py2neo import Graph
from py2neo import Node
from py2neo import Relationship
import pandas as pd
import numpy as np

I set the graph up in Neo4j with data cleaned up and saved in WorkplaceFlows_clean.csv


LOAD CSV WITH HEADERS FROM "file:///WorkplaceFlows_clean.csv" as row<BR>
MERGE (:County {geoid: row.ResGEOID, county: row.ResCounty, state: row.ResState});


LOAD CSV WITH HEADERS FROM "file:///WorkplaceFlows_clean.csv" as row<BR>
MERGE (res:County {geoid: row.ResGEOID, county: row.ResCounty, state: row.ResState})<BR>
MERGE (pow:County {geoid: row.PoWGEOID, county: row.PoWCounty, state: row.PoWState})<BR>
CREATE (res)-[:COMMUTES {workers: row.Workers, res: row.ResGEOID, pow: row.PoWGEOID}]->(pow);<BR>


CREATE INDEX INDX IF NOT EXISTS<BR>
FOR (n:County)<BR>
ON (n.geoid);<BR>

In [2]:
graph = Graph("bolt://localhost:7687", password='xxx', name='neo4j')

In [3]:
# Quick test to make sure things are connected and working, should be two Albany counties, one in NY, one in WY
query = """
MATCH (a:County) WHERE a.county = 'Albany' RETURN a.geoid, a.state;
"""
graph.run(query)

a.geoid,a.state
36001,New York
56001,Wyoming


In [4]:
query = """
CALL gds.graph.create('counties','County','COMMUTES');
"""
graph.run(query)

nodeProjection,relationshipProjection,graphName,nodeCount,relationshipCount,createMillis
"{County: {label: 'County', properties: {}}}","{COMMUTES: {orientation: 'NATURAL', aggregation: 'DEFAULT', type: 'COMMUTES', properties: {}}}",counties,3223,137806,374


## Centrality Measures
<A HREF="https://neo4j-website.s3.eu-central-1.amazonaws.com/build/html/Algorithms/centrality/centrality.html">Centrality Measures</A>

In [5]:
# Counties with highest centrality score
query = """
CALL gds.alpha.closeness.stream({
    nodeProjection: "County",
    relationshipProjection: "COMMUTES"
})
YIELD nodeId, centrality
RETURN gds.util.asNode(nodeId).geoid AS geoid, 
       gds.util.asNode(nodeId).county AS county,
       gds.util.asNode(nodeId).state AS state,
       centrality AS centralScore
ORDER BY centrality DESC;
"""
result=graph.run(query)
centrality = result.to_data_frame()
centrality.head(10)

Unnamed: 0,geoid,county,state,centralScore
0,48201,Harris,Texas,0.587527
1,17031,Cook,Illinois,0.575357
2,48113,Dallas,Texas,0.552943
3,6037,Los Angeles,California,0.552753
4,13121,Fulton,Georgia,0.548986
5,4013,Maricopa,Arizona,0.547122
6,27053,Hennepin,Minnesota,0.542516
7,48439,Tarrant,Texas,0.540513
8,11001,District of Columbia,District of Columbia,0.540513
9,37119,Mecklenburg,North Carolina,0.539518


In [28]:
#centrality.to_csv("centrality.csv", index=False)

In [29]:
# Counties with highest betweeness score
query = """
CALL gds.betweenness.stream({nodeProjection: "County",relationshipProjection: "COMMUTES"})
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).geoid AS geoid, 
       gds.util.asNode(nodeId).county AS county,
       gds.util.asNode(nodeId).state AS state,
       score AS betweenScore
ORDER BY score DESC;
"""
result=graph.run(query)
betweeness = result.to_data_frame()
betweeness.head(10)

Unnamed: 0,geoid,county,state,betweenScore
0,48201,Harris,Texas,618991.311468
1,17031,Cook,Illinois,469261.902272
2,48113,Dallas,Texas,302108.416209
3,6037,Los Angeles,California,271044.692642
4,4013,Maricopa,Arizona,255406.312504
5,13121,Fulton,Georgia,245489.327364
6,27053,Hennepin,Minnesota,243889.281553
7,48439,Tarrant,Texas,224589.664287
8,47157,Shelby,Tennessee,203442.900749
9,37119,Mecklenburg,North Carolina,192974.112976


In [30]:
#betweeness.to_csv("betweeness.csv", index=False)

In [32]:
# Counties with highest page rank score
query = """
CALL gds.pageRank.stream({  nodeProjection: "County",
relationshipProjection: "COMMUTES",  maxIterations: 50,  dampingFactor: 0.85})
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).geoid AS geoid, 
       gds.util.asNode(nodeId).county AS county,
       gds.util.asNode(nodeId).state AS state,
       score AS pageRank
ORDER BY score DESC;
"""
result=graph.run(query)
pagerank = result.to_data_frame()
pagerank.head(10)

Unnamed: 0,geoid,county,state,pageRank
0,48201,Harris,Texas,17.910908
1,17031,Cook,Illinois,14.672512
2,48113,Dallas,Texas,11.690262
3,11001,District of Columbia,District of Columbia,10.996947
4,13121,Fulton,Georgia,10.884937
5,38105,Williams,North Dakota,10.763918
6,6037,Los Angeles,California,10.295381
7,27053,Hennepin,Minnesota,9.968158
8,47157,Shelby,Tennessee,9.316209
9,37119,Mecklenburg,North Carolina,8.608248


In [33]:
#pagerank.to_csv("pagerank.csv", index=False)

## Community Detection
<A HREF="https://neo4j-website.s3.eu-central-1.amazonaws.com/build/html/Algorithms/community/community.html">Class Notes on Community Detection</A>

<A HREF="https://neo4j.com/docs/graph-data-science/current/algorithms/louvain/">Cypher Louvain Modularity</A><BR>

In [36]:
# Louvain Modularity 
# Louvain quantifies how well a node is assigned to a group by looking at the density of connections within 
# a cluster in comparison to an average or random sample
query = """
CALL gds.louvain.stream({
  nodeProjection: "County",
  relationshipProjection: "COMMUTES",
  includeIntermediateCommunities: true
})
YIELD nodeId, communityId, intermediateCommunityIds
RETURN gds.util.asNode(nodeId).geoid AS geoid, 
       gds.util.asNode(nodeId).county AS county,
       gds.util.asNode(nodeId).state AS state,
       communityId, 
       intermediateCommunityIds
ORDER BY communityId ASC;
"""
result=graph.run(query)
louvainMod = result.to_data_frame()
louvainMod.head(10)

Unnamed: 0,geoid,county,state,communityId,intermediateCommunityIds
0,47029,Cocke,Tennessee,285,[285]
1,47089,Jefferson,Tennessee,285,[285]
2,47129,Morgan,Tennessee,285,[285]
3,47001,Anderson,Tennessee,285,[285]
4,47009,Blount,Tennessee,285,[285]
5,47013,Campbell,Tennessee,285,[285]
6,47073,Hawkins,Tennessee,285,[285]
7,47151,Scott,Tennessee,285,[285]
8,47155,Sevier,Tennessee,285,[285]
9,47063,Hamblen,Tennessee,285,[285]


In [37]:
#louvainMod.to_csv("louvainmod.csv", index=False)

<A HREF="https://neo4j.com/docs/graph-data-science/current/algorithms/label-propagation/">Cypher Label Propagation</A><BR>

In [41]:
# Label Propagation
# In LPA, nodes select their group based on their direct neighbors. 
# This process is well suited to networks where groupings are less clear and weights can 
# be used to help a node determine which community to place itself within
query = """
CALL gds.labelPropagation.stream({
nodeProjection: "County",
relationshipProjection: "COMMUTES",
maxIterations: 10
})
YIELD nodeId, communityId
RETURN collect(gds.util.asNode(nodeId).geoid) AS geoid,
       collect(gds.util.asNode(nodeId).county) AS county,
       collect(gds.util.asNode(nodeId).state) AS state,
       communityId AS label
ORDER BY label ASC;
"""
result=graph.run(query)
labelprop = result.to_data_frame()
labelprop.head(10)

Unnamed: 0,geoid,county,state,label
0,"[01001, 01013, 01015, 01021, 01027, 01029, 010...","[Autauga, Butler, Calhoun, Chilton, Clay, Cleb...","[Alabama, Alabama, Alabama, Alabama, Alabama, ...",0
1,[72049],[Culebra Municipio],[Puerto Rico],3217
2,[72147],[Vieques Municipio],[Puerto Rico],3221


In [42]:
# Not sure I understand the label propagation output, but here it is...
#labelprop.to_csv("labelprop.csv", index=False)

<A HREF="https://neo4j.com/docs/graph-data-science/current/algorithms/wcc/">Cypher Weakly Connected Components</A><BR>

In [71]:
# Weakly Connected Components (SCC)
query = """
CALL gds.wcc.stream({
nodeProjection: 'County',
relationshipProjection: 'COMMUTES'
})
YIELD nodeId, componentId
RETURN gds.util.asNode(nodeId).geoid AS geoid, 
       gds.util.asNode(nodeId).county AS county,
       gds.util.asNode(nodeId).state AS state,
       componentId AS Component
ORDER BY Component, county;
"""
result=graph.run(query)
wcc = result.to_data_frame()
wcc.head(10)
# WCC return all counties in one component, thus not very interesting results

Unnamed: 0,geoid,county,state,Component
0,72011,A??o Municipio,Puerto Rico,0
1,45001,Abbeville,South Carolina,0
2,22001,Acadia Parish,Louisiana,0
3,51001,Accomack,Virginia,0
4,16001,Ada,Idaho,0
5,19001,Adair,Iowa,0
6,40001,Adair,Oklahoma,0
7,29001,Adair,Missouri,0
8,21001,Adair,Kentucky,0
9,16003,Adams,Idaho,0


In [72]:
query = """
CALL gds.wcc.stats({
nodeProjection: 'County',
relationshipProjection: 'COMMUTES'
})
YIELD componentCount
"""
graph.run(query)
# yup, only 1 component

componentCount
1


<A HREF="https://neo4j.com/docs/graph-data-science/current/algorithms/modularity-optimization/">Cypher Modularity Optimization</A> aka Girvan-Newman

In [86]:
query = """
CALL gds.beta.modularityOptimization.stream({
nodeProjection: 'County',
relationshipProjection: 'COMMUTES'
})
YIELD nodeId, communityId
RETURN nodeId,
       gds.util.asNode(nodeId).geoid AS geoid, 
       gds.util.asNode(nodeId).county AS county,
       gds.util.asNode(nodeId).state AS state, 
       communityId
ORDER BY nodeId
"""
graph.run(query)

nodeId,geoid,county,state,communityId
0,1001,Autauga,Alabama,60
1,1013,Butler,Alabama,60
2,1015,Calhoun,Alabama,60


<A HREF="https://neo4j.com/docs/graph-data-science/current/algorithms/triangle-count/">Cypher Triangle Counts</A><BR>
This may not work for directed graphs, but here's a bunch of code anyway.

In [53]:
# General Triangle Count stats
query = """
CALL gds.triangleCount.stats({
nodeProjection:"County",
relationshipProjection:  {COMMUTES: {
type: "COMMUTES",
   orientation: "UNDIRECTED"
 }}
})
YIELD globalTriangleCount,nodeCount
"""
graph.run(query)


globalTriangleCount,nodeCount
1061592,3223


In [65]:
# Print Triangles
query = """
CALL gds.triangleCount.stream({
nodeProjection:"County",
relationshipProjection:  {COMMUTES: {
type: "COMMUTES",
   orientation: "UNDIRECTED"
 }}
})
YIELD nodeId , triangleCount
RETURN gds.util.asNode(nodeId).geoid as geoid,
       gds.util.asNode(nodeId).county as county,
       triangleCount as nbr_triangles
ORDER BY triangleCount DESC;
"""
graph.run(query)

geoid,county,nbr_triangles
17031,Cook,34071
48201,Harris,31655
48113,Dallas,22538


In [62]:
query = """
CALL gds.alpha.triangles({
nodeProjection: "County",
relationshipProjection:  {
    COMMUTES: {type: "COMMUTES",
    orientation: "UNDIRECTED"}
    }
    })
YIELD nodeA, nodeB, nodeC
RETURN gds.util.asNode(nodeA).county AS nodeA,
       gds.util.asNode(nodeB).county AS nodeB,
       gds.util.asNode(nodeC).county AS nodeC;
"""
graph.run(query)

nodeA,nodeB,nodeC
Autauga,Butler,Dallas
Autauga,Butler,Elmore
Autauga,Butler,Jefferson


<A HREF="https://neo4j.com/docs/graph-data-science/current/algorithms/local-clustering-coefficient/">Cypher Local Clustering Coeff</A>

In [64]:
# Clustering Coefficient - ahhh, ok does not support Directed 
query = """
CALL gds.localClusteringCoefficient.stream({
nodeProjection: "County",
relationshipProjection:  {
  DEPENDS_ON: {
    type: "COMMUTES",
    orientation: "UNDIRECTED"
  }
}
})
YIELD nodeId, localClusteringCoefficient
WHERE localClusteringCoefficient > 0
RETURN gds.util.asNode(nodeId).geoid as geoid,
       gds.util.asNode(nodeId).county as county,
       gds.util.asNode(nodeId).state as state,
       localClusteringCoefficient
ORDER BY localClusteringCoefficient DESC;
"""
graph.run(query)

geoid,county,state,localClusteringCoefficient
48261,Kenedy,Texas,1.0952380952380951
32009,Esmeralda,Nevada,0.8666666666666667
6091,Sierra,California,0.8214285714285714
