# Google Storage Read CSV with Neo4j Python API
In the examples that follow, we will be using the Neo4j Python client:
[Neo4j Python client](https://neo4j.com/developer/python/)

## Setup Neo4j instance
Create a free account at [https://sandbox.neo4j.com](https://sandbox.neo4j.com). Choose the "Blank Sandbox - Graph Data Science" option.

When your sandbox has been created, fill in the Bolt URL and password below.

In the examples that follow, we will be using the official Neo4j Python driver (the `neo4j` package), wrapped in a small helper class defined below.

## Setup Neo4j Python client imports

In [None]:
from neo4j import GraphDatabase
import pandas as pd
import warnings

Import visualization libraries for analyzing dataframes in this notebook.

In [None]:
import warnings
import seaborn as sns
from matplotlib import pyplot as plt

Define Neo4j connection variables.  Yours will be different.

In [None]:
# Connection settings for the Neo4j sandbox.  The host and password are
# placeholders: substitute the values shown for your own instance.
neo4j_user = "neo4j"
neo4j_database = "neo4j"
neo4j_password = "***removed***"
neo4j_url = "bolt://***removed***:7687"

Create a helper class `Neo4jPythonClient` for working with the Neo4j Python client.

In [None]:
class Neo4jPythonClient:
    """Small notebook helper around a Neo4j driver.

    The client owns one driver and opens a short-lived session per query.
    Query errors are reported on stdout and turned into a ``None`` return
    value so that a failing cell does not abort the notebook.

    The client can also be used as a context manager, in which case the
    driver is closed when the ``with`` block exits.
    """

    # Message of the warning that ``queryToDf`` silences while it runs.
    _PANDAS_WARNING = (
        r"^pandas support is experimental and might be changed or removed in future versions$"
    )

    def __init__(self, uri, user, pwd):
        """Create the driver for ``uri`` using basic ``(user, pwd)`` auth.

        A failure to create the driver is printed rather than raised; the
        client is then left without a driver and every query method raises
        ``AssertionError``.
        """
        self.__uri = uri
        self.__user = user
        self.__pwd = pwd
        self.__driver = None

        try:
            self.__driver = GraphDatabase.driver(self.__uri, auth=(self.__user, self.__pwd))
        except Exception as e:
            print("Failed to create the driver:", e)

    def __enter__(self):
        """Return the client itself so it can be bound in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the driver on leaving the ``with`` block; exceptions propagate."""
        self.close()

    def close(self):
        """Close the driver if one was created; otherwise do nothing."""
        if self.__driver is not None:
            self.__driver.close()

    def _require_driver(self):
        """Return the driver, raising ``AssertionError`` if none was created."""
        # Raised explicitly (not via ``assert``) so the check also runs
        # when Python is started with -O.
        if self.__driver is None:
            raise AssertionError("Driver not initialized!")
        return self.__driver

    def _run(self, query, parameters, db, consume):
        """Run ``query`` in a fresh session and return ``consume(result)``.

        ``consume`` is called while the session is still open, so it must
        fully read the result.  Any exception is printed and ``None`` is
        returned instead.
        """
        driver = self._require_driver()
        # Only pass ``database`` when the caller asked for a specific one.
        session_options = {} if db is None else {"database": db}

        try:
            with driver.session(**session_options) as session:
                return consume(session.run(query, parameters))
        except Exception as e:
            print("Query failed:", e)
            return None

    def query(self, query, parameters=None, db=None):
        """Run ``query`` and return its records as a list.

        Args:
            query: Cypher text to execute.
            parameters: Optional mapping of Cypher parameters.
            db: Optional database name; the driver default is used if None.

        Returns:
            A list of result records, or ``None`` if the query failed.

        Raises:
            AssertionError: If the driver was never created.
        """
        return self._run(query, parameters, db, list)

    def queryToDf(self, query, parameters=None, db=None):
        """Run ``query`` and return the result of its ``to_df()`` call.

        Takes the same arguments as ``query``.  Returns ``None`` if the
        query failed and raises ``AssertionError`` if there is no driver.
        """
        # Silence the driver's pandas warning for this call only, leaving
        # the process-wide warning filters as they were afterwards.
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", message=self._PANDAS_WARNING)
            return self._run(query, parameters, db, lambda result: result.to_df())

    def queryToPrint(self, query, parameters=None, db=None):
        """Run ``query`` via ``queryToDf`` and print what it returns."""
        print(self.queryToDf(query, parameters, db))

Create connection to Python client

In [None]:
# Build the shared client from the connection settings defined earlier.
gdbClient = Neo4jPythonClient(
    uri=neo4j_url,
    user=neo4j_user,
    pwd=neo4j_password,
)

In [None]:
# Ask the server which Graph Data Science library version is installed
# and print the answer.
gds_version_query = """return gds.version() as gds_version"""
gdbClient.queryToPrint(gds_version_query)

Test Python client by checking if there are records currently in the database.

In [None]:
# Count every node in the database, regardless of label, and print it.
count_nodes_query = """MATCH (n) RETURN count(n) as count"""
gdbClient.queryToPrint(count_nodes_query)

If there are records, wipe out the database

In [None]:
# Recreate the configured database.  queryToPrint only prints its result
# and returns nothing, so reset_db ends up bound to None.
reset_statement = f"CREATE OR REPLACE DATABASE `{neo4j_database}`"
reset_db = gdbClient.queryToPrint(reset_statement)

Recheck the database

# Load MSA data from CSV

Create MSA unique constraint

In [None]:
# Require every MSA node to carry a unique, non-null `name`.
# Written with FOR ... REQUIRE: the older ON ... ASSERT spelling was
# deprecated in Neo4j 4.4 and removed in 5.0, where it is a syntax error
# that the client would only print, leaving the constraint uncreated.
gdbClient.query("""
CREATE CONSTRAINT msa_name IF NOT EXISTS FOR (m:MSA) REQUIRE m.name IS NODE KEY
""")

In [None]:
# Load the census CSV, keep only rows whose name contains 'Metro', and
# merge one MSA node per remaining row.  The metric columns are cast
# from CSV text to integers / a float before being stored.  The query
# returns the number of rows processed as msaCount.
load_csv_query = """
LOAD CSV WITH HEADERS FROM 'https://raw.githubusercontent.com/smithna/datasets/main/CensusDemographicsByMetroArea.csv'
AS row
WITH row WHERE row.name CONTAINS 'Metro'
MERGE (m:MSA {name:row.name})
SET m.population = toInteger(row.population),
m.medianHouseholdIncome = toInteger(row.medianHouseholdIncome),
m.medianHomePrice = toInteger(row.medianHomePrice),
m.percentOver25WithBachelors = toFloat(row.percentOver25WithBachelors)
RETURN count(m) as msaCount"""

gdbClient.queryToPrint(load_csv_query)

Start by creating a [node key constraint](https://neo4j.com/docs/cypher-manual/current/constraints/) that requires that each MSA node has a unique name.

In [None]:
# Require every MSA node to carry a unique, non-null `name`.  Thanks to
# IF NOT EXISTS this is a no-op when the constraint is already present.
# Written with FOR ... REQUIRE: the older ON ... ASSERT spelling was
# deprecated in Neo4j 4.4 and removed in 5.0, where it is a syntax error
# that the client would only print, leaving the constraint uncreated.
gdbClient.query("""
CREATE CONSTRAINT msa_name IF NOT EXISTS FOR (m:MSA) REQUIRE m.name IS NODE KEY
""")

# Run Cypher queries for data profiling
Aggregate and find quantiles.

In [None]:
# For each of the four metrics, report the minimum, the 25th / 50th /
# 75th percentiles and the maximum.  Note that MATCH (n) is not limited
# to the MSA label, so every node in the database is considered.
gdbClient.queryToPrint("""
MATCH (n)
WITH n, 
["population", "medianHouseholdIncome", "medianHomePrice", "percentOver25WithBachelors" ] AS metrics
UNWIND metrics as metric
WITH metric, n[metric] AS value
RETURN metric, min(value) AS minValue,
percentileCont(value, 0.25) AS percentile25, 
percentileCont(value, 0.50) AS percentile50, 
percentileCont(value, 0.75) AS percentile75, 
max(value) AS maxValue
""")

Some of those quantiles look asymmetrical. Let's plot histograms and see what happens if we apply a log transformation.
In the next section we are returning results of cypher to a data frame.

In [None]:
# Fetch one row per MSA node: its name followed by the four raw metrics.
# The column order matters, because the plotting cell addresses the
# metric columns by position (1-4).  pd_msa_df is None if the query fails.
pd_msa_df = gdbClient.queryToDf("""
MATCH (m:MSA)
RETURN m.name AS msa, 
m.population AS population,
m.medianHouseholdIncome AS medianHouseholdIncome,
m.medianHomePrice AS medianHomePrice,
m.percentOver25WithBachelors as percentOver25WithBachelors
""")

In [None]:
# One subplot row per metric (columns 1-4 of pd_msa_df): the raw
# histogram on the left and the log-scaled histogram on the right.
fig, axes = plt.subplots(4, 2)
fig.set_size_inches(15, 30)
for (raw_ax, log_ax), column_index in zip(axes, range(1, 5)):
    metric_values = pd_msa_df.iloc[:, column_index]
    sns.histplot(metric_values, ax=raw_ax)
    sns.histplot(metric_values, log_scale=True, ax=log_ax)

That log transformation looks like it will help. Run the Cypher to store the transformed values in the graph.

In [None]:
# Store log() of each of the four metrics on every MSA node under a
# matching log* property.  The query has no RETURN clause, so the
# printed result carries no rows.
gdbClient.queryToPrint("""
MATCH (m:MSA)
SET 
m.logPopulation = log(m.population),
m.logMedianHouseholdIncome = log(m.medianHouseholdIncome),
m.logMedianHomePrice = log(m.medianHomePrice),
m.logPercentOver25WithBachelors = log(m.percentOver25WithBachelors)
""")

That log transformation looks like it will help. Run the Cypher to store the transformed values in the graph.

In [None]:
# Same log() update as the previous cell, but this version also returns
# the stored values per MSA so they can be inspected in the printout.
log_update = """
MATCH (m:MSA)
SET
m.logPopulation = log(m.population),
m.logMedianHouseholdIncome = log(m.medianHouseholdIncome),
m.logMedianHomePrice = log(m.medianHomePrice),
m.logPercentOver25WithBachelors = log(m.percentOver25WithBachelors)
RETURN m.name AS msa,m.logPopulation,m.logMedianHouseholdIncome,m.logMedianHomePrice,m.logPercentOver25WithBachelors
"""

gdbClient.queryToPrint(log_update)

Check that log attributes were populated

# Create in-memory graph projection
Passing `"*"` as the third argument to `gds.graph.project` tells GDS to include any relationships that exist in the database in the in-memory graph. Because no relationships have been created in the graph yet, there will be no relationships in the in-memory graph projection when it is created.

In [None]:
# Project the MSA nodes into an in-memory GDS graph named 'msa-graph',
# with all relationship types ('*') and the four log* node properties.
graph_project_query = """
    CALL gds.graph.project(
    'msa-graph',
    'MSA',
    '*',
    {nodeProperties: ["logPopulation",
        "logMedianHouseholdIncome",
        "logMedianHomePrice",
        "logPercentOver25WithBachelors"]})
"""

gdbClient.queryToPrint(graph_project_query)

Notice that when we look at the results of `gds.graph.project`, we see that the `relationshipCount` is 0.

# Apply MinMax scaler to property values

In [None]:
# Scale the four log* properties with the MinMax scaler and add the
# result to the in-memory graph as a single node property called
# "scaledProperties".  Mutate mode changes the projection only.
# NOTE(review): this calls the alpha-tier procedure name; confirm it
# exists in the GDS version installed on the target server.
graph_scale_properties_mutations = """
CALL gds.alpha.scaleProperties.mutate("msa-graph", {
                                 nodeProperties: [
                                     "logPopulation",
                                     "logMedianHouseholdIncome",
                                     "logMedianHomePrice",
                                     "logPercentOver25WithBachelors"],
                                 scaler : "MinMax",
                                 mutateProperty : "scaledProperties"
                                 })
                                 """

gdbClient.queryToPrint(graph_scale_properties_mutations)

This next line streams node properties to the procedure caller.

In [None]:
# Stream the "scaledProperties" value of every node in 'msa-graph' into
# a DataFrame with columns nodeId and propertyValue, then display it.
graph_stream_scaled_properties_query = """
CALL gds.graph.streamNodeProperty('msa-graph', 'scaledProperties')
YIELD nodeId, propertyValue
RETURN nodeId, propertyValue
                                 """
pandas_sp_df = gdbClient.queryToDf(graph_stream_scaled_properties_query)
pandas_sp_df


In [None]:
# Expand each propertyValue entry into its own row of columns, then
# plot the distribution of component 0.
scaled_matrix = pd.DataFrame(pandas_sp_df["propertyValue"].tolist())
scaled_matrix.iloc[:, 0].hist()

In [None]:
# Expand each propertyValue entry into its own row of columns, then
# plot the distribution of component 1.
scaled_matrix = pd.DataFrame(pandas_sp_df["propertyValue"].tolist())
scaled_matrix.iloc[:, 1].hist()

In [None]:
# Expand each propertyValue entry into its own row of columns, then
# plot the distribution of component 2.
scaled_matrix = pd.DataFrame(pandas_sp_df["propertyValue"].tolist())
scaled_matrix.iloc[:, 2].hist()

In [None]:
# Expand each propertyValue entry into its own row of columns, then
# plot the distribution of component 3.
scaled_matrix = pd.DataFrame(pandas_sp_df["propertyValue"].tolist())
scaled_matrix.iloc[:, 3].hist()

# Run KNN to create relationships to nearest neighbors
First run in stats mode and look at the similarity distribution.

In [None]:
# Run KNN in stats mode on the scaled property vector (Euclidean
# metric, up to 15 neighbours per node) and keep the raw records:
# knn_stats[0]['similarityDistribution'] is read by the mutate cell.
# sampleRate, randomSeed and concurrency are pinned to fixed values.
# NOTE(review): a literal similarityCutoff is already applied here, so
# the reported distribution (and the p1 taken from it later) describes
# only relationships above that cutoff -- confirm this is intended.
knn_stats_query = """CALL gds.knn.stats("msa-graph",
   {
      nodeProperties:{
      scaledProperties:"EUCLIDEAN"},
      topK:15,
      similarityCutoff: 0.8350143432617188,
      sampleRate:1,
      randomSeed:42,
      concurrency:1
   }
)
YIELD similarityDistribution
RETURN similarityDistribution """

knn_stats=gdbClient.query(knn_stats_query)
print(knn_stats)

Now run KNN in mutate mode to update the in-memory graph projection. We'll exclude the bottom 1% of similarity relationships.

In [None]:
# Build the KNN mutate call, using the p1 value from the stats run as
# the similarity cutoff, then print the final Cypher and execute it.
# Despite the variable name, this runs in mutate mode: IS_SIMILAR
# relationships (with a "similarity" property) are added to the
# in-memory graph only.  Doubled braces are literal braces in the
# f-string.  This cell raises if knn_stats is None or empty, which is
# what a failed stats query leaves behind.
knn_write = f"""CALL gds.knn.mutate("msa-graph",
               {{nodeProperties: {{scaledProperties: "EUCLIDEAN"}},
               topK: 15,
               mutateRelationshipType: "IS_SIMILAR",
               mutateProperty: "similarity",
               similarityCutoff: {knn_stats[0]['similarityDistribution']['p1']},
               sampleRate:1,
               randomSeed:42,
               concurrency:1}}
              ) """

print(knn_write)

gdbClient.queryToPrint(knn_write)

Also write the relationships from the in-memory graph projection back to the on-disk graph.

In [None]:
# Write the IS_SIMILAR relationships and their "similarity" property
# from the 'msa-graph' projection back to the stored graph.
similarity_relationship_writeback = """CALL gds.graph.writeRelationship(
    "msa-graph",
    "IS_SIMILAR",
    "similarity"
)"""

gdbClient.queryToPrint(similarity_relationship_writeback)

Add a `rank` property to the `IS_SIMILAR` relationships for use with Bloom filtering.

In [None]:
# Rank each MSA's outgoing IS_SIMILAR relationships by similarity,
# highest first, and store the 1-based position as `rank`.
# The index range is derived from the number of relationships actually
# collected for the node.  A fixed range(0, 11) would rank only the
# first 12, although KNN is run with topK: 15, leaving the remaining
# relationships without a rank.
add_rank_update = """
MATCH (m:MSA)-[s:IS_SIMILAR]->()
WITH m, s ORDER BY s.similarity DESC
WITH m, collect(s) AS similarities
UNWIND range(0, size(similarities) - 1) AS rank
WITH rank, similarities[rank] AS rel
SET rel.rank = rank + 1
"""

gdbClient.queryToPrint(add_rank_update)

# Run Louvain community detection

In [None]:
# Run Louvain in stats mode over the IS_SIMILAR relationships, weighted
# by "similarity", and print the community count and modularities.
read_louvain = """
CALL gds.louvain.stats('msa-graph',
{relationshipTypes: ["IS_SIMILAR"],
relationshipWeightProperty:"similarity"})
YIELD communityCount, modularities
RETURN communityCount, modularities
"""
gdbClient.queryToPrint(read_louvain)

Now commit louvain communities to database

In [None]:
# Run Louvain with the same configuration in write mode, storing each
# node's community under the "communityId" property.
write_louvain = """
CALL gds.louvain.write('msa-graph',
{relationshipTypes: ["IS_SIMILAR"],
relationshipWeightProperty:"similarity",
 writeProperty:"communityId"})
YIELD communityCount, modularities
RETURN communityCount, modularities
"""

gdbClient.queryToPrint(write_louvain)

# Gather statistics about the communities that were discovered

Get average values for each community and 3 example MSAs for each community.

In [None]:
# Summarise each community: member count, averages of the four raw
# metrics, and up to three example MSA names.  Nodes are ordered first
# by the summed similarity of their IS_SIMILAR relationships to members
# of the same community (descending); the examples are the first three
# names collected per community.  Rows are returned by average
# population, largest first.
# NOTE(review): relies on apoc.coll.sum, so the APOC library must be
# available on the server -- confirm.
community_query = """
MATCH (m:MSA)
WITH m 
ORDER BY apoc.coll.sum([(m)-[s:IS_SIMILAR]->(m2) 
WHERE m.communityId = m2.communityId | s.similarity]) desc
RETURN m.communityId as communityId,
count(m) as msaCount, 
avg(m.population) as avgPopulation,
avg(m.medianHomePrice) as avgHomePrice,
avg(m.medianHouseholdIncome) as avgIncome,
avg(m.percentOver25WithBachelors) as avgPctBachelors,
collect(m.name)[..3] as exampleMSAs
ORDER BY avgPopulation DESC
"""
pd_community = gdbClient.queryToDf(community_query)
pd_community

In [None]:
# Display the summary ordered by community id; the sorted copy is not
# assigned, so pd_community itself keeps its original order.
pd_community.sort_values('communityId')

In [None]:
# One bar chart per summary column (columns 1-5 of pd_community),
# each plotted against communityId on its own subplot.
fig, axes = plt.subplots(5, 1)
fig.set_size_inches(6, 20)
for panel, column_index in zip(axes, range(1, 6)):
    metric_name = pd_community.columns[column_index]
    sns.barplot(data=pd_community, x="communityId", y=metric_name, ax=panel)

The mean gives a quick overview of each property, but it can be skewed by outliers. Compare the empirical cumulative distribution functions (ECDFs) of the communities to get a more complete picture of the distributions.

In [None]:
# Fetch one row per MSA: a "community <id>" label followed by the four
# raw metrics, ordered by that label on the server.
# NOTE(review): an earlier comment here said the ORDER BY should be
# removed; it is still part of the query -- confirm whether it is needed.
detail_query="""
MATCH (m:MSA)
RETURN "community " + m.communityId as communityId,
m.population as population,
m.medianHomePrice as medianHomePrice,
m.medianHouseholdIncome as medianIncome,
m.percentOver25WithBachelors as pctBachelors
order by communityId ASC
"""

# Load the rows into a DataFrame; no further sorting is done client-side.

pd_detail =gdbClient.queryToDf(detail_query)

In [None]:
# Show the per-MSA detail table as plain text.
print(pd_detail)

In [None]:
# One log-scaled ECDF plot per metric (columns 1-4 of pd_detail), with
# a separate curve for each community.
fig, axes = plt.subplots(4, 1)
fig.set_size_inches(6, 20)
for panel, column_index in zip(axes, range(1, 5)):
    metric_name = pd_detail.columns[column_index]
    sns.ecdfplot(data=pd_detail, x=metric_name, hue="communityId", log_scale=True, ax=panel)

Compare two-dimensions on scatter plots

In [None]:
# Median income against population, coloured by community, with both
# axes switched to a log scale.
splot = sns.scatterplot(data=pd_detail, x="medianIncome", y="population", hue="communityId")
splot.set(yscale="log")
splot.set(xscale="log")

In [None]:
# Share of bachelor's degrees against median home price, coloured by
# community, with both axes switched to a log scale.
splot = sns.scatterplot(data=pd_detail, x="pctBachelors", y="medianHomePrice", hue="communityId")
splot.set(yscale="log")
splot.set(xscale="log")

# Assign human-friendly names to the clusters discovered.
The Louvain community detection algorithm is not deterministic. You should get roughly the same clusters as in previous runs, but some edge cases might be assigned to different communities, and the community numbers might be shuffled across different runs.
**This step requires adjustment by hand: choose from community IDs above.**

In [None]:
# Map community ids to human-readable names and print how many MSAs
# received each name.  The ids below are hard-coded from one particular
# run and must be edited to match the ids produced by your own run.
# The CASE has no ELSE branch, so any MSA whose communityId is not
# listed ends up with a null communityName.
gdbClient.queryToPrint("""
MATCH (m:MSA) 
  SET m.communityName = CASE m.communityId 
  WHEN 56 THEN "Large mid-cost metros"
  WHEN 83 THEN "College towns"
  WHEN 100 THEN "Large high-cost metros"
  WHEN 254 THEN "Mid-size metros"
  WHEN 266 THEN "Small metros"
  WHEN 377 THEN "Mid-price metros"
  WHEN 313 THEN "Low-income metros"
  END
return m.communityName, m.communityId, count(*)
""")

Create an index on the `communityName` property to make it searchable in Bloom.

In [None]:
# Index MSA.communityName; IF NOT EXISTS makes the cell safe to re-run.
gdbClient.query("""
CREATE INDEX msa_community_name IF NOT EXISTS
FOR (m:MSA)
ON (m.communityName)
""")