In [None]:
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

<table align="left">

  <td>
    <a href="https://colab.research.google.com/github/benofben/vertex-ai-samples/blob/master/notebooks/community/neo4j/graph_paysim.ipynb" target="_blank">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> Run in Colab
    </a>
  </td>
  <td>
    <a href="https://github.com/benofben/vertex-ai-samples/tree/master/notebooks/community/neo4j/graph_paysim.ipynb" target="_blank">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      View on GitHub
    </a>
  </td>
</table>

# Overview
In this notebook, you will learn how to use Neo4j AuraDS to create graph features.  You'll then use those new features to solve a classification problem with Vertex AI.


## Dataset
This notebook uses a version of the PaySim dataset that has been modified to work with Neo4j's graph database.  PaySim is a synthetic fraud dataset.  The goal is to identify whether or not a given transaction constitutes fraud.  The [original version of the dataset](https://github.com/EdgarLopezPhD/PaySim) has tabular data.

Neo4j has worked on a modified version that generates a graph dataset [here](https://github.com/voutilad/PaySim).  We've pregenerated a copy of that dataset that you can grab [here](https://storage.googleapis.com/neo4j-datasets/paysim.dump).  You'll want to download that dataset and then upload it to Neo4j AuraDS.  AuraDS is a graph data science tool that is offered as a service on GCP.  Instructions on signing up and uploading the dataset are available [here](https://github.com/neo4j-partners/aurads-paysim).

##Costs
This tutorial uses billable components of Google Cloud:

* Cloud Storage
* Vertex AI

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing) and [Cloud Storage pricing](https://cloud.google.com/storage/pricing), and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.

# Setup

## Set up your development environment
We suggest you use Colab for this notebook.

## Set up your Google Cloud project

**The following steps are required, regardless of your notebook environment.**

1. [Select or create a Google Cloud project](https://console.cloud.google.com/cloud-resource-manager). When you first create an account, you get a $300 free credit towards your compute/storage costs.

1. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).

1. [Enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).

1. If you are running this notebook locally, you will need to install the [Cloud SDK](https://cloud.google.com/sdk).

1. Enter your project ID in the cell below. Then run the cell to make sure the
Cloud SDK uses the right project for all the commands in this notebook.

**Note**: Jupyter runs lines prefixed with `!` as shell commands, and it interpolates Python variables prefixed with `$` into these commands.

## Install additional Packages
First off, you'll also need to install a few packages.

In [1]:
!pip install --quiet --upgrade neo4j

[?25l[K     |███▊                            | 10 kB 18.7 MB/s eta 0:00:01[K     |███████▍                        | 20 kB 25.1 MB/s eta 0:00:01[K     |███████████                     | 30 kB 14.3 MB/s eta 0:00:01[K     |██████████████▊                 | 40 kB 10.2 MB/s eta 0:00:01[K     |██████████████████▍             | 51 kB 5.2 MB/s eta 0:00:01[K     |██████████████████████          | 61 kB 5.3 MB/s eta 0:00:01[K     |█████████████████████████▊      | 71 kB 5.9 MB/s eta 0:00:01[K     |█████████████████████████████▍  | 81 kB 6.6 MB/s eta 0:00:01[K     |████████████████████████████████| 89 kB 3.3 MB/s 
[?25h  Building wheel for neo4j (setup.py) ... [?25l[?25hdone


In [2]:
pip install --quiet google-cloud-storage

In [3]:
!pip install --quiet google.cloud.aiplatform

[K     |████████████████████████████████| 1.6 MB 5.3 MB/s 
[K     |████████████████████████████████| 106 kB 53.9 MB/s 
[K     |████████████████████████████████| 45 kB 2.9 MB/s 
[K     |████████████████████████████████| 105 kB 50.0 MB/s 
[K     |████████████████████████████████| 105 kB 40.3 MB/s 
[K     |████████████████████████████████| 105 kB 50.3 MB/s 
[K     |████████████████████████████████| 105 kB 52.4 MB/s 
[K     |████████████████████████████████| 105 kB 26.5 MB/s 
[K     |████████████████████████████████| 104 kB 52.4 MB/s 
[K     |████████████████████████████████| 104 kB 54.2 MB/s 
[K     |████████████████████████████████| 103 kB 54.7 MB/s 
[K     |████████████████████████████████| 103 kB 55.2 MB/s 
[K     |████████████████████████████████| 103 kB 52.9 MB/s 
[K     |████████████████████████████████| 103 kB 53.2 MB/s 
[K     |████████████████████████████████| 97 kB 5.6 MB/s 
[K     |████████████████████████████████| 97 kB 5.5 MB/s 
[K     |██████████████████████

## (Colab only) Restart the kernel
After you install the additional packages, you need to restart the notebook kernel so it can find the packages.

In [4]:
import IPython

app = IPython.Application.instance()
app.kernel.do_shutdown(True)

{'restart': True, 'status': 'ok'}

# Working with Neo4j

## Define Neo4J related variables

You'll need to enter the credentials from your AuraDS instance below.  You can get your credentials by following this [walkthrough](https://github.com/neo4j-partners/aurads-paysim).

The "DB_NAME" is always neo4j for AuraDS.  It is different from the name you gave your database tenant in the AuraDS console.

In [1]:
DB_URL = 'neo4j+s://df9cad2b.databases.neo4j.io'
DB_USER = 'neo4j'
DB_PASS = 'some password'
DB_NAME = 'neo4j'

In this section we're going to connect to Neo4j and look around the database.  We're going to generate some new features in the dataset using Neo4j's Graph Data Science library.  Finally, we'll load the data into a Pandas dataframe so that it's all ready to put into GCP Feature Store.

## Exploring the database

In [4]:
import pandas as pd
from neo4j import GraphDatabase

In [5]:
driver = GraphDatabase.driver(DB_URL, auth=(DB_USER, DB_PASS))

Now, let's explore the data in the database a bit to understand what we have to work with.

In [6]:
# node labels
with driver.session(database = DB_NAME) as session:
  result = session.read_transaction( lambda tx: 
    tx.run(
    """
    CALL db.labels() YIELD label
    CALL apoc.cypher.run('MATCH (:`'+label+'`) RETURN count(*) as freq', {})
    YIELD value
    RETURN label, value.freq AS freq
    """
    ).data()
  )
  df = pd.DataFrame(result)
  display(df)

Unnamed: 0,label,freq
0,Node,0
1,Client,11270
2,Bank,5
3,Merchant,3465
4,Mule,0
5,CashIn,746751
6,CashOut,424574
7,Debit,130284
8,Payment,542443
9,Transfer,0


In [7]:
# relationship types
with driver.session(database = DB_NAME) as session:
  result = session.read_transaction( lambda tx: 
    tx.run(
      """
      CALL db.relationshipTypes() YIELD relationshipType as type
      CALL apoc.cypher.run('MATCH ()-[:`'+type+'`]->() RETURN count(*) as freq', {})
      YIELD value
      RETURN type AS relationshipType, value.freq AS freq
      ORDER by freq DESC
      """
      ).data()
    )
df = pd.DataFrame(result)
display(df)

Unnamed: 0,relationshipType,freq
0,PERFORMED,1844052
1,TO,1844052
2,NEXT,1833720
3,HAS_SSN,11330
4,HAS_EMAIL,11330
5,HAS_PHONE,11330
6,FIRST_TX,10332
7,LAST_TX,10332


In [8]:
# transaction types
with driver.session(database = DB_NAME) as session:
  result = session.read_transaction( lambda tx: 
    tx.run(
    """
    MATCH (t:Transaction)
    WITH sum(t.amount) AS globalSum, count(t) AS globalCnt
    WITH *, 10^3 AS scaleFactor
    UNWIND ['CashIn', 'CashOut', 'Payment', 'Debit', 'Transfer'] AS txType
      CALL apoc.cypher.run('MATCH (t:' + txType + ')
        RETURN sum(t.amount) as txAmount, count(t) AS txCnt', {})
      YIELD value
    RETURN txType,value.txAmount AS TotalMarketValue
    """
    ).data()
  )
  df = pd.DataFrame(result)
  display(df)

Unnamed: 0,txType,TotalMarketValue
0,CashIn,104058200000.0
1,CashOut,53854100000.0
2,Payment,96468140000.0
3,Debit,1016829000.0
4,Transfer,0.0


## Create a New Feature with a Graph Embedding using Neo4j
First we're going to create an in memory graph represtation of the data in Neo4j Graph Data Science (GDS).

Note, if you get an error saying the graph already exists, that's probably because you ran this code before.  You can destroy it using the command in the cleanup section of this notebook.

In [11]:
with driver.session(database = DB_NAME) as session:
  result = session.read_transaction( lambda tx: 
    tx.run(
    """
    CALL gds.graph.create.cypher('client_graph', 
      'MATCH (c:Client) RETURN id(c) as id, c.num_transactions as num_transactions, c.total_transaction_amnt as total_transaction_amnt, c.is_fraudster as is_fraudster',
      'MATCH (c:Client)-[:PERFORMED]->(t:Transaction)-[:TO]->(c2:Client) return id(c) as source, id(c2) as target, sum(t.amount) as amount, "TRANSACTED_WITH" as type ')
    """
    ).data()
  )
  df = pd.DataFrame(result)
  display(df)

Unnamed: 0,nodeQuery,relationshipQuery,graphName,nodeCount,relationshipCount,createMillis
0,"MATCH (c:Client) RETURN id(c) as id, c.num_tra...",MATCH (c:Client)-[:PERFORMED]->(t:Transaction)...,client_graph,11270,26035,461


Now we can generate an embedding from that graph.  This is a new feature we can use in our predictions.  We're using FastRP, which is a more full featured and higher performance of Node2Vec.  You can learn more about that [here](https://neo4j.com/docs/graph-data-science/current/algorithms/fastrp/).

In [12]:
with driver.session(database = DB_NAME) as session:
  result = session.read_transaction( lambda tx: 
    tx.run(
    """
    CALL gds.fastRP.mutate('client_graph',{
      relationshipWeightProperty:'amount',
      iterationWeights: [0.0, 1.00, 1.00, 0.80, 0.60],
      featureProperties: ['num_transactions', 'total_transaction_amnt'],
      propertyRatio: .25, 
      embeddingDimension: 16,
      randomSeed: 1, 
      mutateProperty:'embedding'
    })
    """
    ).data()
  )
  df = pd.DataFrame(result)
  display(df)

Unnamed: 0,nodePropertiesWritten,mutateMillis,nodeCount,createMillis,computeMillis,configuration
0,11270,0,11270,0,18,"{'nodeSelfInfluence': 0, 'relationshipWeightPr..."


Finally we dump that out to a dataframe

In [14]:
with driver.session(database = DB_NAME) as session:
  result = session.read_transaction( lambda tx: 
    tx.run(
    """
    CALL gds.graph.streamNodeProperties
    ('client_graph', ['embedding', 'num_transactions', 'total_transaction_amnt', 'is_fraudster'])
    YIELD nodeId, nodeProperty, propertyValue
    RETURN nodeId, nodeProperty, propertyValue
    """
    ).data()
  )
df = pd.DataFrame(result)
df.head()

Unnamed: 0,nodeId,nodeProperty,propertyValue
0,0,embedding,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,0,num_transactions,4
2,0,total_transaction_amnt,118919
3,0,is_fraudster,1
4,3,embedding,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


Now we need to take that dataframe and shape it into something that better represents our classification problem.

In [15]:
x = df.pivot(index='nodeId', columns='nodeProperty', values='propertyValue')
x = x.reset_index()
x.columns.name = None
x.head()

Unnamed: 0,nodeId,embedding,is_fraudster,num_transactions,total_transaction_amnt
0,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1,4,118919.0
1,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",-9223372036854775808,0,0.0
2,5,"[-4.998395475297457e-09, 5.79870196304455e-09,...",1,80,7484460.0
3,8,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",-9223372036854775808,0,0.0
4,10,"[0.02352502942085266, -0.023524967953562737, 2...",1,227,37580600.0


is_fraudster will have a value of 0 or 1 if populated.  If the value is 10000 then it's unlabled, so we're going to drop it.

In [16]:
x = x.loc[x['is_fraudster'] != 10000]
x.head()

Unnamed: 0,nodeId,embedding,is_fraudster,num_transactions,total_transaction_amnt
0,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1,4,118919.0
1,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",-9223372036854775808,0,0.0
2,5,"[-4.998395475297457e-09, 5.79870196304455e-09,...",1,80,7484460.0
3,8,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",-9223372036854775808,0,0.0
4,10,"[0.02352502942085266, -0.023524967953562737, 2...",1,227,37580600.0


Note that the embedding row is an array.  To make this dataset more consumable, we should flatten that out into multiple individual features: embedding_0, embedding_1, ... embedding_n.

In [17]:
embeddings = pd.DataFrame(x['embedding'].values.tolist()).add_prefix('embedding_')
merged = x.drop(columns=['embedding']).merge(embeddings, left_index=True, right_index=True)
features_df = merged.drop(columns=['is_fraudster', 'num_transactions', 'total_transaction_amnt'])
train_df = merged.drop(columns=['nodeId'])

features_df.to_csv(FEATURES_FILENAME, index=False)

This dataset is too small to use with Vertex AI AutoML Tables. For sake of demonstration, we're going to repeat it a few times. Don't do this in the real world.

In [18]:
pd.concat([train_df for i in range(10)]).to_csv(TRAINING_FILENAME, index=False)

And that's it!  The dataframe now has a nice dataset that we can use with GCP Vertex AI.

# Using Vertex AI with Neo4j data

## Define Google Cloud variables
You'll need to set a few variables for your GCP environment.  PROJECT_ID and STORAGE_BUCKET are most critical.  The others will probably work with the defaults given.

In [21]:
# Edit these variables!
PROJECT_ID = 'neo4jbusinessdev'
STORAGE_BUCKET = 'paysimneo4j1231'

# You can leave these defaults
REGION = 'us-central1'

STORAGE_PATH = 'paysim'
FEATURES_FILENAME = 'features.csv'
TRAINING_FILENAME = 'train.csv'
EMBEDDING_DIMENSION = 16

FEATURESTORE_ID = 'paysim'
ENTITY_NAME = 'payer'

In [22]:
import os
os.environ['GCLOUD_PROJECT'] = PROJECT_ID

## (Colab only) Authenticate your Google Cloud account


In [23]:
try:
  from google.colab import auth as google_auth
  google_auth.authenticate_user()
except:
  pass

##Upload to a GCP Cloud Storage Bucket

To get the data into Vertex AI, we must first put it in a bucket as a CSV.

In [24]:
from google.cloud import storage
client = storage.Client()

In [25]:
bucket = client.bucket(STORAGE_BUCKET)
client.create_bucket(bucket)

<Bucket: paysimneo4j1231>

In [26]:
# Upload our files to that bucket
for filename in [FEATURES_FILENAME, TRAINING_FILENAME]:
  upload_path = os.path.join(STORAGE_PATH, filename)  
  blob = bucket.blob(upload_path)
  blob.upload_from_filename(filename)

## Train and deploy a model on GCP
We'll use the engineered features to train an AutoML Tables model, then deploy it to an endpoint

In [27]:
from google.cloud import aiplatform
aiplatform.init(
    project=PROJECT_ID,
    location=REGION
)

dataset = aiplatform.TabularDataset.create(display_name='paysim', gcs_source=os.path.join('gs://', STORAGE_BUCKET, STORAGE_PATH, TRAINING_FILENAME))
dataset.wait()

print(f'\tDataset: "{dataset.display_name}"')
print(f'\tname: "{dataset.resource_name}"')

	Dataset: "paysim"
	name: "projects/803648085855/locations/us-central1/datasets/3671691537609129984"


In [28]:
embedding_column_names = [ 'embedding_{}'.format(i) for i in range(EMBEDDING_DIMENSION) ]
other_column_names = [
  'num_transactions',
  'total_transaction_amnt'
]
all_columns = other_column_names + embedding_column_names
column_specs = {column: 'numeric' for column in all_columns} 

job = aiplatform.AutoMLTabularTrainingJob(
    display_name = "train-paysim-automl-1",
    optimization_prediction_type = "classification",
    column_specs = column_specs
)

In [29]:
model = job.run(
    dataset = dataset,
    target_column = 'is_fraudster',
    training_fraction_split = 0.8,
    validation_fraction_split = 0.1,
    test_fraction_split = 0.1,
    model_display_name = 'paysim-prediction-model',
    disable_early_stopping = False,
    budget_milli_node_hours = int(1000 / 60 ) #* 5) # Limit to 5 minute running time
)

KeyboardInterrupt: ignored

In [None]:
endpoint = model.deploy(
    machine_type="n1-standard-4",
)

## Loading Data into GCP Feature Store
In this section, we'll take our dataframe with newly engineered features and load that into GCP feature store.

In [None]:
from google.cloud.aiplatform_v1 import FeaturestoreServiceClient
api_endpoint = '{}-aiplatform.googleapis.com'.format(REGION)
fs_client = FeaturestoreServiceClient(client_options={'api_endpoint': api_endpoint})

resource_path = fs_client.common_location_path(PROJECT_ID, REGION)
fs_path = fs_client.featurestore_path(PROJECT_ID, REGION, FEATURESTORE_ID)
entity_path = fs_client.entity_type_path(PROJECT_ID, REGION, FEATURESTORE_ID, ENTITY_NAME)

First, let's check if the Feature Store already exists

In [None]:
from grpc import StatusCode
def check_has_resource(callable):
  has_resource = False
  try:
    callable()
    has_resource = True
  except Exception as e:
    if not hasattr(e, 'grpc_status_code') or e.grpc_status_code != StatusCode.NOT_FOUND:
      raise e
  return has_resource

In [None]:
feature_store_exists = check_has_resource(
    lambda: fs_client.get_featurestore(
      name= fs_path
    )
)

In [None]:
from google.cloud.aiplatform_v1.types import featurestore_service as featurestore_service_pb2
from google.cloud.aiplatform_v1.types import featurestore as featurestore_pb2
from google.cloud.aiplatform_v1.types import feature as feature_pb2
from google.cloud.aiplatform_v1.types import entity_type as entity_type_pb2
from google.cloud.aiplatform_v1.types import io as io_pb2

if not feature_store_exists:
  create_lro = fs_client.create_featurestore(
      featurestore_service_pb2.CreateFeaturestoreRequest(
          parent=resource_path,
          featurestore_id=FEATURESTORE_ID,
          featurestore=featurestore_pb2.Featurestore(
              online_serving_config=featurestore_pb2.Featurestore.OnlineServingConfig(
                  fixed_node_count=1
              ),
          ),
      )
  )

  print(create_lro.result())

In [None]:
entity_type_exists = check_has_resource(    
    lambda: fs_client.get_entity_type(
        name=entity_path
    )
)

if not entity_type_exists:
  users_entity_type_lro = fs_client.create_entity_type(
    featurestore_service_pb2.CreateEntityTypeRequest(
        parent=fs_path,
        entity_type_id=ENTITY_NAME,
        entity_type=entity_type_pb2.EntityType(
            description="Main entity type",
        ),
      )
  )
  print(users_entity_type_lro.result())

  feature_requests = [
    featurestore_service_pb2.CreateFeatureRequest(
      feature=feature_pb2.Feature(
          value_type=feature_pb2.Feature.ValueType.DOUBLE,
          description="Embedding {} from Neo4j".format(i),
      ),
      feature_id="embedding_{}".format(i),
    )
    for i in range(EMBEDDING_DIMENSION)
  ]
  create_features_lro = fs_client.batch_create_features(
      parent=entity_path,
      requests=feature_requests,
  )
  print(create_features_lro.result())

In [None]:
feature_specs = [
  featurestore_service_pb2.ImportFeatureValuesRequest.FeatureSpec(id="embedding_{}".format(i))
  for i in range(EMBEDDING_DIMENSION)         
]

from google.protobuf.timestamp_pb2 import Timestamp
feature_time = Timestamp()
feature_time.GetCurrentTime()
feature_time.nanos = 0

import_request = fs_client.import_feature_values(
    featurestore_service_pb2.ImportFeatureValuesRequest(
      entity_type=entity_path,
      csv_source=io_pb2.CsvSource(
          gcs_source=io_pb2.GcsSource(
              uris=[
                  os.path.join('gs://', STORAGE_BUCKET, STORAGE_PATH, FEATURES_FILENAME)
              ]
          )
      ),
      entity_id_field="nodeId",
      feature_specs=feature_specs,
      worker_count=1,
      feature_time=feature_time
  )
)

print(import_request.result())

## Sending a prediction using features from the feature store

In [None]:
from google.cloud.aiplatform_v1 import FeaturestoreOnlineServingServiceClient
data_client = FeaturestoreOnlineServingServiceClient(
    client_options={"api_endpoint": api_endpoint}
)

In [None]:
# Retrieve Neo4j embeddings from feature store
from google.cloud.aiplatform_v1.types import FeatureSelector, IdMatcher
from google.cloud.aiplatform_v1.types import \
    featurestore_online_service as featurestore_online_service_pb2

feature_selector = FeatureSelector(
    id_matcher=IdMatcher(ids=[
        "embedding_{}".format(i)
        for i in range(EMBEDDING_DIMENSION) 
    ])
)

fs_features = data_client.read_feature_values(
    featurestore_online_service_pb2.ReadFeatureValuesRequest(
        entity_type=entity_path,
        entity_id="5",
        feature_selector=feature_selector,
    )
)

saved_embeddings = dict(zip(
  (fd.id for fd in fs_features.header.feature_descriptors),
  (str(d.value.double_value) for d in fs_features.entity_view.data)
))

In [None]:
# Combine with other features. These might be sourced per transaction
all_features = {
    'num_transactions': '80',
    'total_dollar_amnt': '7484459.618641878'
}

all_features.update(saved_embeddings)

instances = [{
    key: str(value) for key, value in all_features.items()
}]

In [None]:
# Send a prediction
endpoint.predict(instances=instances)

# Cleanup


## Neo4j cleanup

To delete the Graph Data Science representation of the graph, run this:

In [10]:
with driver.session(database = DB_NAME) as session:
    result = session.read_transaction( lambda tx: 
        tx.run(
        """
        CALL gds.graph.drop('client_graph')
        """
        ).data()
    )

## Google Cloud cleanup

Delete the feature store and turn down the endpoint

In [None]:
fs_client.delete_featurestore(
    request=featurestore_service_pb2.DeleteFeaturestoreRequest(
        name=fs_client.featurestore_path(PROJECT_ID, REGION, FEATURESTORE_ID),
        force=True,
    )
).result()

endpoint.delete()