In [3]:
import pandas as pd
import numpy as np
from py2neo import Graph
from py2neo import Node
from py2neo import Relationship

# Commuting flow preprocessing
<BR>
<A HREF="https://www.census.gov/data/tables/2015/demo/metro-micro/commuting-flows-2015.html">Commuting Flows Data</A>

The data is read. The missing values result from lines indicating commuting flows to other workplaces outside the USA. These rows are deleted.

In [28]:
commuting_flow=pd.read_excel("RawData/commuting_flows.xlsx",dtype={'State FIPS Code': object,'County FIPS Code':object,
                                                                   'State FIPS Code.1':object,
                                                                   'County FIPS Code.1':object})
commuting_flow.drop(columns=[" Margin of Error"], inplace=True)
commuting_flow.dropna(inplace=True)

Combining state and county FIPS code to one FIPS code

In [29]:
commuting_flow['State FIPS Code.1']=commuting_flow['State FIPS Code.1'].str[1:]
commuting_flow["State FIPS Code"]=commuting_flow["State FIPS Code"]+commuting_flow["County FIPS Code"]
commuting_flow["State FIPS Code.1"]=commuting_flow["State FIPS Code.1"]+commuting_flow["County FIPS Code.1"]
commuting_flow.drop(columns=['County FIPS Code.1','County FIPS Code'],inplace=True)
commuting_flow.rename(columns={'State Name':'ResState','County Name':'ResCounty','State Name.1':'PoWState',
                              'County Name.1':'PoWCounty','Workers in Commuting Flow':'Workers',
                              'State FIPS Code.1':'PoWGEOID','State FIPS Code':'ResGEOID'},inplace=True)
commuting_flow['ResCounty']=commuting_flow['ResCounty'].str.replace(' County', '')
commuting_flow['PoWCounty']=commuting_flow['PoWCounty'].str.replace(' County', '')

In [31]:
commuting_flow.to_csv('Data_input_Neo4j/WorkPlaceFlows_clean.csv',index=False)

# Preparing graph based on commuting flows between states

We only consider the commuting flow between states.

In [5]:
between_state=commuting_flow[commuting_flow["ResState"]!=commuting_flow["PoWState"]]

We compute the nodes and edges and save the results in the folder Data_input_neo4j

In [6]:
commuting_edges=between_state.groupby(["ResState","PoWState"],as_index=False).agg({'Workers':'sum'})
commuting_nodes=pd.DataFrame(commuting_edges['ResState'].unique(),columns=['State'])

In [7]:
commuting_edges.to_csv('Data_input_neo4j/commuting_edges.csv',index=False)
commuting_nodes.to_csv('Data_input_neo4j/commuting_nodes.csv',index=False)

## Commuting flow degree centrality

<div class="alert alert-block alert-info"> Make sure that all datafiles that are in the folder Data_input_neo4j are copied into the input folder of your database.</div>

## Make graph

In [4]:
graph = Graph("bolt://localhost:7687", password='xxx', name='neo4j')

Insert the nodes and relationships.

In [8]:
graph.run("MATCH (n) DETACH DELETE n")
query="""LOAD CSV WITH HEADERS FROM $file as row
        with row
        CALL apoc.create.node(['State','Place'],{name:row.State}) YIELD node
        RETURN distinct true"""
graph.run(query,file='file:///commuting_nodes.csv')
query="""LOAD CSV WITH HEADERS FROM $file as row
        with row
        MATCH (source:State{name:row.ResState})
        MATCH (target:State{name:row.PoWState})
        CALL apoc.create.relationship(source,"COMMUTING",{weight:toFloat(row.Workers)},target) YIELD rel
        RETURN distinct true"""
graph.run(query,file='file:///commuting_edges.csv')

true
True


Create an in-memory graph, called 'Commuting'.

In [7]:
query="""CALL gds.graph.create('Commuting','State',{COMMUTING:{properties:'weight', orientation:'UNDIRECTED'}})"""
graph.run(query)

nodeProjection,relationshipProjection,graphName,nodeCount,relationshipCount,createMillis
"{State: {properties: {}, label: 'State'}}","{COMMUTING: {orientation: 'UNDIRECTED', aggregation: 'DEFAULT', type: 'COMMUTING', properties: {weight: {property: 'weight', aggregation: 'DEFAULT', defaultValue: null}}}}",Commuting,52,4748,46


## Perform degree centrality

We can use the gds.degree.stream algorithm to calculate the degree centrality of the states.

In [8]:
query="""CALL gds.degree.stream('Commuting',{relationshipWeightProperty:'weight'})
        YIELD nodeId, score
        RETURN gds.util.asNode(nodeId).name AS Place, score AS Degree_Centrality_com
        ORDER by Degree_Centrality_com desc"""
result = graph.run(query).data() 
df_degree_commuting=pd.DataFrame(result)

The in-memory graph is dropped again.

In [9]:
graph.run("CALL gds.graph.drop('Commuting')")

graphName,database,memoryUsage,sizeInBytes,nodeProjection,relationshipProjection,nodeQuery,relationshipQuery,nodeCount,relationshipCount,nodeFilter,relationshipFilter,density,creationTime,modificationTime,schema
Commuting,neo4j,,-1,"{State: {properties: {}, label: 'State'}}","{COMMUTING: {orientation: 'UNDIRECTED', aggregation: 'DEFAULT', type: 'COMMUTING', properties: {weight: {property: 'weight', aggregation: 'DEFAULT', defaultValue: null}}}}",,,52,4748,,,1.790346907993967,datetime('2022-06-01T17:53:16.610593100+02:00'),datetime('2022-06-01T17:53:16.650904500+02:00'),"{relationships: {COMMUTING: {weight: 'Float (DefaultValue(NaN), PERSISTENT, Aggregation.DEFAULT)'}}, nodes: {State: {}}}"


The results are saved into a csv file.

In [10]:
df_degree_commuting.to_csv('PreprocessedData/degree_commuting_between_states.csv',index=False)

# Preparing graph based on commuting flows between counties


<div class="alert alert-block alert-info"> Make sure that all datafiles that are in the folder Data_input_neo4j are copied into the input folder of your database.</div>

In [5]:
graph.run("MATCH(n) DETACH DELETE n")

I set the graph up in Neo4j with data cleaned up and saved in WorkplaceFlows_clean.csv


LOAD CSV WITH HEADERS FROM "file:///WorkplaceFlows_clean.csv" as row<BR>
MERGE (:County {geoid: row.ResGEOID, county: row.ResCounty, state: row.ResState});


LOAD CSV WITH HEADERS FROM "file:///WorkplaceFlows_clean.csv" as row<BR>
MERGE (res:County {geoid: row.ResGEOID, county: row.ResCounty, state: row.ResState})<BR>
MERGE (pow:County {geoid: row.PoWGEOID, county: row.PoWCounty, state: row.PoWState})<BR>
CREATE (res)-[:COMMUTES {workers: row.Workers, res: row.ResGEOID, pow: row.PoWGEOID}]->(pow);<BR>


CREATE INDEX INDX IF NOT EXISTS<BR>
FOR (n:County)<BR>
ON (n.geoid);<BR>

In [6]:
query="""
LOAD CSV WITH HEADERS FROM "file:///WorkplaceFlows_clean.csv" as row
MERGE (:County {geoid: row.ResGEOID, county: row.ResCounty, state: row.ResState})
"""
graph.run(query)
print("Nodes inserted")

Nodes inserted


In [7]:
query="""
CREATE INDEX INDX IF NOT EXISTS
FOR (n:County)
ON (n.geoid)
"""
graph.run(query)
print("Index created")

Index created


In [8]:
query="""
LOAD CSV WITH HEADERS FROM "file:///WorkplaceFlows_clean.csv" as row
MERGE (res:County {geoid: row.ResGEOID, county: row.ResCounty, state: row.ResState})
MERGE (pow:County {geoid: row.PoWGEOID, county: row.PoWCounty, state: row.PoWState})
CREATE (res)-[:COMMUTES {workers: row.Workers, res: row.ResGEOID, pow: row.PoWGEOID}]->(pow)
"""
graph.run(query)
print("Relationships created")

Relationships created


In [9]:
# Quick test to make sure things are connected and working, should be two Albany counties, one in NY, one in WY
query = """
MATCH (a:County) WHERE a.county = 'Albany' RETURN a.geoid, a.state;
"""
graph.run(query)

a.geoid,a.state
36001,New York
56001,Wyoming


In [12]:
query = """
CALL gds.graph.create('counties','County','COMMUTES');
"""
graph.run(query)

nodeProjection,relationshipProjection,graphName,nodeCount,relationshipCount,createMillis
"{County: {properties: {}, label: 'County'}}","{COMMUTES: {orientation: 'NATURAL', aggregation: 'DEFAULT', type: 'COMMUTES', properties: {}}}",counties,3220,137806,19


## Centrality Measures
<A HREF="https://neo4j-website.s3.eu-central-1.amazonaws.com/build/html/Algorithms/centrality/centrality.html">Centrality Measures</A>

In [13]:
# Counties with highest centrality score
query = """
CALL gds.alpha.closeness.stream({
    nodeProjection: "County",
    relationshipProjection: "COMMUTES"
})
YIELD nodeId, centrality
RETURN gds.util.asNode(nodeId).geoid AS geoid, 
       gds.util.asNode(nodeId).county AS county,
       gds.util.asNode(nodeId).state AS state,
       centrality AS centralScore
ORDER BY centrality DESC;
"""
result=graph.run(query)
centrality = result.to_data_frame()
centrality.head(10)

Unnamed: 0,geoid,county,state,centralScore
0,48201,Harris,Texas,0.587838
1,17031,Cook,Illinois,0.57585
2,48113,Dallas,Texas,0.553378
3,6037,Los Angeles,California,0.552903
4,13121,Fulton,Georgia,0.54913
5,4013,Maricopa,Arizona,0.547449
6,27053,Hennepin,Minnesota,0.542833
7,11001,District of Columbia,District of Columbia,0.540827
8,48439,Tarrant,Texas,0.540645
9,37119,Mecklenburg,North Carolina,0.539829


In [18]:
#centrality.to_csv("PreprocessedData/centrality.csv", index=False)

In [16]:
# Counties with highest betweeness score
query = """
CALL gds.betweenness.stream({nodeProjection: "County",relationshipProjection: "COMMUTES"})
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).geoid AS geoid, 
       gds.util.asNode(nodeId).county AS county,
       gds.util.asNode(nodeId).state AS state,
       score AS betweenScore
ORDER BY score DESC;
"""
result=graph.run(query)
betweeness = result.to_data_frame()
betweeness.head(10)

Unnamed: 0,geoid,county,state,betweenScore
0,48201,Harris,Texas,617958.281608
1,17031,Cook,Illinois,469159.093054
2,48113,Dallas,Texas,301811.660055
3,6037,Los Angeles,California,270725.273712
4,4013,Maricopa,Arizona,255235.306
5,13121,Fulton,Georgia,245228.297783
6,27053,Hennepin,Minnesota,243818.914986
7,48439,Tarrant,Texas,223837.763622
8,47157,Shelby,Tennessee,203355.954
9,37119,Mecklenburg,North Carolina,192922.170706


In [19]:
#betweeness.to_csv("PreprocessedData/betweeness.csv", index=False)

In [20]:
# Counties with highest page rank score
query = """
CALL gds.pageRank.stream({  nodeProjection: "County",
relationshipProjection: "COMMUTES",  maxIterations: 50,  dampingFactor: 0.85})
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).geoid AS geoid, 
       gds.util.asNode(nodeId).county AS county,
       gds.util.asNode(nodeId).state AS state,
       score AS pageRank
ORDER BY score DESC;
"""
result=graph.run(query)
pagerank = result.to_data_frame()
pagerank.head(10)

Unnamed: 0,geoid,county,state,pageRank
0,48201,Harris,Texas,17.906262
1,17031,Cook,Illinois,14.669887
2,48113,Dallas,Texas,11.687301
3,11001,District of Columbia,District of Columbia,10.992795
4,13121,Fulton,Georgia,10.881692
5,38105,Williams,North Dakota,10.762765
6,6037,Los Angeles,California,10.291407
7,27053,Hennepin,Minnesota,9.966452
8,47157,Shelby,Tennessee,9.31456
9,37119,Mecklenburg,North Carolina,8.606624


In [21]:
#pagerank.to_csv("PreprocessedData/pagerank.csv", index=False)

## Community Detection
<A HREF="https://neo4j-website.s3.eu-central-1.amazonaws.com/build/html/Algorithms/community/community.html">Class Notes on Community Detection</A>

<A HREF="https://neo4j.com/docs/graph-data-science/current/algorithms/louvain/">Cypher Louvain Modularity</A><BR>

In [26]:
# Louvain Modularity 
# Louvain quantifies how well a node is assigned to a group by looking at the density of connections within 
# a cluster in comparison to an average or random sample
query = """
CALL gds.louvain.stream({
  nodeProjection: "County",
  relationshipProjection: "COMMUTES",
  includeIntermediateCommunities: true
})
YIELD nodeId, communityId, intermediateCommunityIds
RETURN gds.util.asNode(nodeId).geoid AS geoid, 
       gds.util.asNode(nodeId).county AS county,
       gds.util.asNode(nodeId).state AS state,
       communityId, 
       intermediateCommunityIds
ORDER BY communityId ASC;
"""
result=graph.run(query)
louvainMod = result.to_data_frame()
louvainMod.head(10)

Unnamed: 0,geoid,county,state,communityId,intermediateCommunityIds
0,1001,Autauga,Alabama,88,"[88, 88]"
1,1013,Butler,Alabama,88,"[88, 88]"
2,1015,Calhoun,Alabama,88,"[88, 88]"
3,1021,Chilton,Alabama,88,"[88, 88]"
4,1027,Clay,Alabama,88,"[88, 88]"
5,1029,Cleburne,Alabama,88,"[2272, 88]"
6,1047,Dallas,Alabama,88,"[88, 88]"
7,1051,Elmore,Alabama,88,"[88, 88]"
8,1073,Jefferson,Alabama,88,"[88, 88]"
9,1081,Lee,Alabama,88,"[88, 88]"


In [37]:
#louvainMod.to_csv("PreprocessedData/louvainmod.csv", index=False)

<A HREF="https://neo4j.com/docs/graph-data-science/current/algorithms/label-propagation/">Cypher Label Propagation</A><BR>

In [27]:
# Label Propagation
# In LPA, nodes select their group based on their direct neighbors. 
# This process is well suited to networks where groupings are less clear and weights can 
# be used to help a node determine which community to place itself within
query = """
CALL gds.labelPropagation.stream({
nodeProjection: "County",
relationshipProjection: "COMMUTES",
maxIterations: 10
})
YIELD nodeId, communityId
RETURN collect(gds.util.asNode(nodeId).geoid) AS geoid,
       collect(gds.util.asNode(nodeId).county) AS county,
       collect(gds.util.asNode(nodeId).state) AS state,
       communityId AS label
ORDER BY label ASC;
"""
result=graph.run(query)
labelprop = result.to_data_frame()
labelprop.head(10)

Unnamed: 0,geoid,county,state,label
0,"[01001, 01013, 01015, 01021, 01027, 01029, 010...","[Autauga, Butler, Calhoun, Chilton, Clay, Cleb...","[Alabama, Alabama, Alabama, Alabama, Alabama, ...",0
1,[72049],[Culebra Municipio],[Puerto Rico],5968
2,[72147],[Vieques Municipio],[Puerto Rico],5970


In [42]:
#labelprop.to_csv("PreprocessedData/labelprop.csv", index=False)

<A HREF="https://neo4j.com/docs/graph-data-science/current/algorithms/wcc/">Cypher Weakly Connected Components</A><BR>

In [28]:
# Weakly Connected Components (SCC)
query = """
CALL gds.wcc.stream({
nodeProjection: 'County',
relationshipProjection: 'COMMUTES'
})
YIELD nodeId, componentId
RETURN gds.util.asNode(nodeId).geoid AS geoid, 
       gds.util.asNode(nodeId).county AS county,
       gds.util.asNode(nodeId).state AS state,
       componentId AS Component
ORDER BY Component, county;
"""
result=graph.run(query)
wcc = result.to_data_frame()
wcc.head(10)
# WCC return all counties in one component, thus not very interesting results

Unnamed: 0,geoid,county,state,Component
0,45001,Abbeville,South Carolina,0
1,22001,Acadia Parish,Louisiana,0
2,51001,Accomack,Virginia,0
3,16001,Ada,Idaho,0
4,19001,Adair,Iowa,0
5,40001,Adair,Oklahoma,0
6,29001,Adair,Missouri,0
7,21001,Adair,Kentucky,0
8,16003,Adams,Idaho,0
9,31001,Adams,Nebraska,0


In [29]:
query = """
CALL gds.wcc.stats({
nodeProjection: 'County',
relationshipProjection: 'COMMUTES'
})
YIELD componentCount
"""
graph.run(query)
# yup, only 1 component

componentCount
1


<A HREF="https://neo4j.com/docs/graph-data-science/current/algorithms/modularity-optimization/">Cypher Modularity Optimization</A> aka Girvan-Newman

In [30]:
query = """
CALL gds.beta.modularityOptimization.stream({
nodeProjection: 'County',
relationshipProjection: 'COMMUTES'
})
YIELD nodeId, communityId
RETURN nodeId,
       gds.util.asNode(nodeId).geoid AS geoid, 
       gds.util.asNode(nodeId).county AS county,
       gds.util.asNode(nodeId).state AS state, 
       communityId
ORDER BY nodeId
"""
graph.run(query)

nodeId,geoid,county,state,communityId
0,1001,Autauga,Alabama,10
1,1013,Butler,Alabama,10
2,1015,Calhoun,Alabama,10


<A HREF="https://neo4j.com/docs/graph-data-science/current/algorithms/triangle-count/">Cypher Triangle Counts</A><BR>
This may not work for directed graphs, but here's a bunch of code anyway.

In [31]:
# General Triangle Count stats
query = """
CALL gds.triangleCount.stats({
nodeProjection:"County",
relationshipProjection:  {COMMUTES: {
type: "COMMUTES",
   orientation: "UNDIRECTED"
 }}
})
YIELD globalTriangleCount,nodeCount
"""
graph.run(query)


globalTriangleCount,nodeCount
1061622,3220


In [32]:
# Print Triangles
query = """
CALL gds.triangleCount.stream({
nodeProjection:"County",
relationshipProjection:  {COMMUTES: {
type: "COMMUTES",
   orientation: "UNDIRECTED"
 }}
})
YIELD nodeId , triangleCount
RETURN gds.util.asNode(nodeId).geoid as geoid,
       gds.util.asNode(nodeId).county as county,
       triangleCount as nbr_triangles
ORDER BY triangleCount DESC;
"""
graph.run(query)

geoid,county,nbr_triangles
17031,Cook,34071
48201,Harris,31655
48113,Dallas,22538


In [33]:
query = """
CALL gds.alpha.triangles({
nodeProjection: "County",
relationshipProjection:  {
    COMMUTES: {type: "COMMUTES",
    orientation: "UNDIRECTED"}
    }
    })
YIELD nodeA, nodeB, nodeC
RETURN gds.util.asNode(nodeA).county AS nodeA,
       gds.util.asNode(nodeB).county AS nodeB,
       gds.util.asNode(nodeC).county AS nodeC;
"""
graph.run(query)

nodeA,nodeB,nodeC
Butler,Dallas,Elmore
Calhoun,Chilton,Clay
Autauga,Butler,Dallas


<A HREF="https://neo4j.com/docs/graph-data-science/current/algorithms/local-clustering-coefficient/">Cypher Local Clustering Coeff</A>

In [34]:
# Clustering Coefficient - ahhh, ok does not support Directed 
query = """
CALL gds.localClusteringCoefficient.stream({
nodeProjection: "County",
relationshipProjection:  {
  DEPENDS_ON: {
    type: "COMMUTES",
    orientation: "UNDIRECTED"
  }
}
})
YIELD nodeId, localClusteringCoefficient
WHERE localClusteringCoefficient > 0
RETURN gds.util.asNode(nodeId).geoid as geoid,
       gds.util.asNode(nodeId).county as county,
       gds.util.asNode(nodeId).state as state,
       localClusteringCoefficient
ORDER BY localClusteringCoefficient DESC;
"""
graph.run(query)

geoid,county,state,localClusteringCoefficient
48261,Kenedy,Texas,1.0952380952380951
32009,Esmeralda,Nevada,0.8666666666666667
6091,Sierra,California,0.8214285714285714


In [35]:
query = """
CALL gds.graph.drop('counties');
"""
graph.run(query)

graphName,database,memoryUsage,sizeInBytes,nodeProjection,relationshipProjection,nodeQuery,relationshipQuery,nodeCount,relationshipCount,nodeFilter,relationshipFilter,density,creationTime,modificationTime,schema
counties,neo4j,,-1,"{County: {properties: {}, label: 'County'}}","{COMMUTES: {orientation: 'NATURAL', aggregation: 'DEFAULT', type: 'COMMUTES', properties: {}}}",,,3220,137806,,,0.013295089906784,datetime('2022-06-01T19:10:24.318059400+02:00'),datetime('2022-06-01T19:10:24.338003700+02:00'),"{relationships: {COMMUTES: {}}, nodes: {County: {}}}"
