In [64]:
from neo4j import GraphDatabase
from sklearn.manifold import TSNE
import numpy as np
import altair as alt
import pandas as pd
import math

In [65]:
import sys
!{sys.executable} -m pip install anvil-uplink



In [66]:
import anvil.server

anvil.server.connect('SKVESGU4C4YA5DIDK4EPB23X-MJC5CLDPJDN5BS7P')

# Presentation of thre neo4j graph embeddings techinques 

* Random Projection
* node2Vec
* GraphSAGE

1. Create neo4j (I'm using 4.1.3 version) local database running on `bolt://localhost:7687`
2. Connect to neo4j from jupyter.

In [67]:
driver = GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "Roads"))
host = "bolt://localhost:7687"
user = "neo4j"
password = "Roads"

3. Import data to your graph db (EuropeanRoads dataset https://github.com/neo4j-examples/graph-embeddings)


Data has simple schema, it is made of one type of nodes (Place) and one type of relation (EROAD). Places are European towns and EROADS are roads connecting towns.



In [68]:
MAX = 100000
def find_three_similar_countries(cities, df):
    x = []
    y = []
    for city in cities:
        city_data = df.loc[df['name'] == city]
        x.append(city_data.iloc[0].x)
        y.append(city_data.iloc[0].y)
        
    centroid_x = sum(x) / len(cities)
    centroid_y = sum(y) / len(cities)
    
    first_min = MAX
    second_min = MAX
    third_min = MAX
    
    first_city = ""
    second_city = ""
    third_city = ""
    
    for i in df.iterrows():
        city_name = i[1]['name']
        
        if(city_name not in cities):
            distance = math.sqrt((i[1].x - centroid_x)*(i[1].x - centroid_x) + (i[1].y - centroid_y)*(i[1].y - centroid_y))

            if  distance < first_min:
                third_min = second_min
                second_min = first_min
                first_min = distance

                third_city = second_city
                second_city = first_city
                first_city = city_name

            elif distance < second_min:
                third_min = second_min
                second_min = distance

                third_city = second_city
                second_city = city_name

            elif distance < third_min:
                third_min = distance

                third_city = city_name
    return [(first_city, first_min), (second_city, second_min), (third_city, third_min)]

## FAST RP

#### 2 dimensional embedding

In [69]:
with driver.session(database="neo4j") as session:
    fastRP_2d = pd.DataFrame(session.run("""CALL gds.alpha.randomProjection.stream({
          nodeProjection: "ReferencePlace",
          relationshipProjection: {
            eroad: {
              type: "EROAD",
              orientation: "UNDIRECTED"
            }
          },
          embeddingSize: 2,
          maxIterations: 1
        })
        YIELD nodeId, embedding
        RETURN nodeId, gds.util.asNode(nodeId).name AS name, gds.util.asNode(nodeId).country_code as countryCode, embedding;"""))
    fastRP_2d = fastRP_2d.rename(columns={0: "nodeId", 1: "name", 2: "countryCode", 3: "embedding"})

In [70]:
fastRP_2d_embeddings = fastRP_2d["embedding"]
fastRP_2d_df = pd.DataFrame(data = {
    "nodeId": fastRP_2d["nodeId"],
    "name": fastRP_2d["name"],
    "country_code": fastRP_2d["countryCode"],
    "x": [value[0] for value in fastRP_2d_embeddings],
    "y": [value[1] for value in fastRP_2d_embeddings]
})
pd.set_option('display.max_rows', 1000)
fastRP_2d_df.head(894)

Unnamed: 0,nodeId,name,country_code,x,y
0,0,Larne,GB,-0.408248,0.0
1,1,Belfast,GB,-0.204124,-0.204124
2,2,Dublin,IRL,-0.306186,0.306186
3,3,Wexford,IRL,-0.918559,0.0
4,4,Rosslare,IRL,0.306186,0.0
5,5,La Coruña,E,0.0,0.0
6,6,Pontevedra,E,0.0,0.0
7,7,Valença do Minho,P,-0.612372,0.0
8,8,Porto,P,0.408248,0.0
9,9,Aveiro,P,-0.306186,0.306186


In [71]:
alt.Chart(fastRP_2d_df).mark_circle().encode(
    x='x',
    y='y',
    color='country_code',
    tooltip=['name', 'country_code']
).properties(width=700, height=400)

In [72]:
find_three_similar_countries(["Warszawa"], fastRP_2d_df)

[('Luxembourg', 0.0), ('Bratislava', 0.0), ('Liège', 0.03092948585314384)]

#### 10 dimensional embedding

In [73]:
with driver.session(database="neo4j") as session:
    fastRP_10d = pd.DataFrame(session.run("""CALL gds.alpha.randomProjection.stream({
          nodeProjection: "ReferencePlace",
          relationshipProjection: {
            eroad: {
              type: "EROAD",
              orientation: "UNDIRECTED"
            }
          },
          embeddingSize: 10,
          maxIterations: 10
        })
        YIELD nodeId, embedding
        RETURN nodeId, gds.util.asNode(nodeId).name AS name, gds.util.asNode(nodeId).country_code as countryCode, embedding;"""))
    fastRP_10d = fastRP_10d.rename(columns={0: "nodeId", 1: "name", 2: "countryCode", 3: "embedding"})

### Dimensions are reduced using t-distributed stochastic neighbor embedding (TSNE)

In [74]:
fastRP_10d_embedding = TSNE(n_components=2, random_state=6).fit_transform(list(fastRP_10d["embedding"]))
fastRP_10d_df = pd.DataFrame(data = {
    "nodeId": fastRP_10d["nodeId"],
    "name": fastRP_10d["name"],
    "country_code": fastRP_10d["countryCode"],
    "x": [value[0] for value in fastRP_10d_embedding],
    "y": [value[1] for value in fastRP_10d_embedding]
})
fastRP_10d_df

Unnamed: 0,nodeId,name,country_code,x,y
0,0,Larne,GB,34.95657,-30.478764
1,1,Belfast,GB,30.953964,-28.477125
2,2,Dublin,IRL,-16.234644,1.666819
3,3,Wexford,IRL,-14.640694,3.570708
4,4,Rosslare,IRL,-17.797628,0.580588
5,5,La Coruña,E,-13.050818,3.027972
6,6,Pontevedra,E,-26.713039,16.842312
7,7,Valença do Minho,P,-0.246116,23.384741
8,8,Porto,P,-28.083023,15.880356
9,9,Aveiro,P,-0.189909,24.342222


In [75]:
alt.Chart(fastRP_10d_df).mark_circle().encode(
    x='x',
    y='y',
    color='country_code',
    tooltip=['name', 'country_code']
).properties(width=700, height=400)

In [76]:
find_three_similar_countries(["Warszawa"], fastRP_10d_df)

[("L'vov", 1.631550342011643),
 ('Kovel', 1.8656277791904239),
 ('Žitomir', 2.0033531339681456)]

# NODE2VEC

#### 2 dimensional embedding

In [77]:
with driver.session(database="neo4j") as session:
    node_2_vec_2d = pd.DataFrame(session.run("""CALL gds.alpha.node2vec.stream({
          nodeProjection: "ReferencePlace",
          relationshipProjection: {
            eroad: {
              type: "EROAD",
              orientation: "UNDIRECTED"
            }
          },
          embeddingSize: 2,
          iterations: 10,
          walkLength: 10
        })
        YIELD nodeId, embedding
        RETURN nodeId, gds.util.asNode(nodeId).name AS name, gds.util.asNode(nodeId).country_code as countryCode, embedding;"""))
    node_2_vec_2d = node_2_vec_2d.rename(columns={0: "nodeId", 1: "name", 2: "countryCode", 3: "embedding"})

In [78]:
node_2_vec_2d_embeddings = node_2_vec_2d["embedding"]
node_2_vec_2d_df = pd.DataFrame(data = {
    "nodeId": node_2_vec_2d["nodeId"],
    "name": node_2_vec_2d["name"],
    "country_code": node_2_vec_2d["countryCode"],
    "x": [value[0] for value in node_2_vec_2d_embeddings],
    "y": [value[1] for value in node_2_vec_2d_embeddings]
})
node_2_vec_2d_df

Unnamed: 0,nodeId,name,country_code,x,y
0,0,Larne,GB,3.468895,1.658854
1,1,Belfast,GB,3.153829,1.414664
2,2,Dublin,IRL,2.362371,0.85505
3,3,Wexford,IRL,2.647601,1.014408
4,4,Rosslare,IRL,2.71389,0.942618
5,5,La Coruña,E,2.242161,0.55875
6,6,Pontevedra,E,3.19953,0.902866
7,7,Valença do Minho,P,3.522139,1.072846
8,8,Porto,P,3.291695,1.066389
9,9,Aveiro,P,2.903151,1.003805


In [79]:
alt.Chart(node_2_vec_2d_df).mark_circle().encode(
    x='x',
    y='y',
    color='country_code',
    tooltip=['name', 'country_code']
).properties(width=700, height=400)

In [80]:
find_three_similar_countries(["Warszawa"], node_2_vec_2d_df)

[('Koper', 0.025937862475640863),
 ('Sheffield', 0.06401747536332258),
 ('Pula', 0.0718478176884237)]

#### 10 dimensional embedding

In [81]:
with driver.session(database="neo4j") as session:
    node_2_vec_10d = pd.DataFrame(session.run("""CALL gds.alpha.node2vec.stream({
          nodeProjection: "ReferencePlace",
          relationshipProjection: {
            eroad: {
              type: "EROAD",
              orientation: "UNDIRECTED"
            }
          },
          embeddingSize: 10,
          iterations: 10,
          walkLength: 10
        })
        YIELD nodeId, embedding
        RETURN nodeId, gds.util.asNode(nodeId).name AS name, gds.util.asNode(nodeId).country_code as countryCode, embedding;"""))
    node_2_vec_10d = node_2_vec_10d.rename(columns={0: "nodeId", 1: "name", 2: "countryCode", 3: "embedding"})

In [82]:
node_2_vec_10d_embedding = TSNE(n_components=2, random_state=6).fit_transform(list(node_2_vec_10d["embedding"]))
node_2_vec_10d_df = pd.DataFrame(data = {
    "nodeId": node_2_vec_10d["nodeId"],
    "name": node_2_vec_10d["name"],
    "country_code": node_2_vec_10d["countryCode"],
    "x": [value[0] for value in node_2_vec_10d_embedding],
    "y": [value[1] for value in node_2_vec_10d_embedding]
})
node_2_vec_10d_df

Unnamed: 0,nodeId,name,country_code,x,y
0,0,Larne,GB,0.980743,43.258156
1,1,Belfast,GB,1.608151,44.019619
2,2,Dublin,IRL,3.904272,43.743423
3,3,Wexford,IRL,5.772392,44.62653
4,4,Rosslare,IRL,6.893045,44.721458
5,5,La Coruña,E,-29.042343,30.832556
6,6,Pontevedra,E,-31.238268,31.496693
7,7,Valença do Minho,P,-32.239754,32.211349
8,8,Porto,P,-33.084057,32.931515
9,9,Aveiro,P,-32.739967,34.194115


In [83]:
alt.Chart(node_2_vec_10d_df).mark_circle(size=60).encode(
    x='x',
    y='y',
    color='country_code',
    tooltip=['name', 'country_code']
).properties(width=700, height=400)

In [84]:
find_three_similar_countries(["Warszawa"], node_2_vec_10d_df)

[('Łowicz', 1.358792929345724),
 ('Elblag', 2.579819061391825),
 ('Opole', 2.9042258921862922)]

# GRAPH SAGE

Convert country codes to integers

In [85]:
with driver.session(database="neo4j") as session:
    cc = pd.DataFrame(session.run("""MATCH (a) RETURN DISTINCT a.country_code"""))

In [86]:
cc_dict = cc.to_dict()
inv_cc = {v: k for k, v in cc_dict[0].items()}
inv_cc

{'GB': 0,
 'IRL': 1,
 'E': 2,
 'P': 3,
 'F': 4,
 'S': 5,
 'FIN': 6,
 'N': 7,
 'RUS': 8,
 'UA': 9,
 'AZ': 10,
 'B': 11,
 'NL': 12,
 'DK': 13,
 'EST': 14,
 'D': 15,
 'CH': 16,
 'L': 17,
 'I': 18,
 'PL': 19,
 'LT': 20,
 'BY': 21,
 'SK': 22,
 'A': 23,
 'CZ': 24,
 'GR': 25,
 'SLO': 26,
 'H': 27,
 'RO': 28,
 'MD': 29,
 'HR': 30,
 'YU': 31,
 'MK': 32,
 'BIH': 33,
 'LV': 34,
 'BG': 35,
 'TR': 36,
 '\xa0': 37,
 'GE': 38}

In [87]:
with driver.session(database="neo4j") as session:
    for key in inv_cc.keys():
        session.run(f"MATCH (a) WHERE a.country_code = '{key}' SET a.country_code_int={inv_cc[key]}")

#### 2 dimensional embedding

In [88]:
with driver.session(database="neo4j") as session:
    graph_sage_2d = pd.DataFrame(session.run("""CALL gds.alpha.graphSage.stream({
          nodeProjection: { 
              ReferencePlace: {
                  label: "ReferencePlace",
                  properties: ["country_code_int"]
                }
            },
          nodePropertyNames: ["country_code_int"],
          relationshipProjection: {
            eroad: {
              type: "EROAD",
              orientation: "UNDIRECTED"
            }
          },
          embeddingSize: 2,
          maxIterations: 10
        })
        YIELD nodeId, embedding
        RETURN nodeId, gds.util.asNode(nodeId).name AS name, gds.util.asNode(nodeId).country_code as countryCode, embedding;"""))
    graph_sage_2d = graph_sage_2d.rename(columns={0: "nodeId", 1: "name", 2: "countryCode", 3: "embedding"})

In [89]:
graph_sage_2d_embeddings = graph_sage_2d["embedding"]
graph_sage_2d_df = pd.DataFrame(data = {
    "nodeId": graph_sage_2d["nodeId"],
    "name": graph_sage_2d["name"],
    "country_code": graph_sage_2d["countryCode"],
    "x": [value[0] for value in graph_sage_2d_embeddings],
    "y": [value[1] for value in graph_sage_2d_embeddings]
})
graph_sage_2d_df

Unnamed: 0,nodeId,name,country_code,x,y
0,0,Larne,GB,0.02912,0.999576
1,1,Belfast,GB,0.029042,0.999578
2,2,Dublin,IRL,0.027916,0.99961
3,3,Wexford,IRL,0.027263,0.999628
4,4,Rosslare,IRL,0.02706,0.999634
5,5,La Coruña,E,0.025524,0.999674
6,6,Pontevedra,E,0.024595,0.999697
7,7,Valença do Minho,P,0.023817,0.999716
8,8,Porto,P,0.023362,0.999727
9,9,Aveiro,P,0.023213,0.999731


In [90]:
alt.Chart(graph_sage_2d_df).mark_circle().encode(
    x='x',
    y='y',
    color='country_code',
    tooltip=['name', 'country_code']
).properties(width=700, height=400)

In [91]:
find_three_similar_countries(["Warszawa"], graph_sage_2d_df)

[('Bari', 1.4859877338326384e-05),
 ('Taranto', 1.4859877338326384e-05),
 ('Goleniów', 1.4897691293376361e-05)]

#### 10 dimensional embedding

In [92]:
with driver.session(database="neo4j") as session:
    graph_sage_10d = pd.DataFrame(session.run("""CALL gds.alpha.graphSage.stream({
          nodeProjection: { 
              ReferencePlace: {
                  label: "ReferencePlace",
                  properties: ["country_code_int"]
                }
            },
          nodePropertyNames: ["country_code_int"],
          relationshipProjection: {
            eroad: {
              type: "EROAD",
              orientation: "UNDIRECTED"
            }
          },
          embeddingSize: 10,
          maxIterations: 10
        })
        YIELD nodeId, embedding
        RETURN nodeId, gds.util.asNode(nodeId).name AS name, gds.util.asNode(nodeId).country_code as countryCode, embedding;"""))
    graph_sage_10d = graph_sage_10d.rename(columns={0: "nodeId", 1: "name", 2: "countryCode", 3: "embedding"})

In [93]:
graph_sage_10d_embedding = TSNE(n_components=2, random_state=6).fit_transform(list(graph_sage_10d["embedding"]))

In [94]:
graph_sage_10d_embeddings = graph_sage_10d["embedding"]
graph_sage_10d_df = pd.DataFrame(data = {
    "nodeId": graph_sage_10d["nodeId"],
    "name": graph_sage_10d["name"],
    "country_code": graph_sage_10d["countryCode"],
    "x": [value[0] for value in graph_sage_10d_embeddings],
    "y": [value[1] for value in graph_sage_10d_embeddings]
})
graph_sage_10d_df

Unnamed: 0,nodeId,name,country_code,x,y
0,0,Larne,GB,0.005574,0.00067
1,1,Belfast,GB,0.005588,0.000686
2,2,Dublin,IRL,0.005816,0.000972
3,3,Wexford,IRL,0.005923,0.0012
4,4,Rosslare,IRL,0.005842,0.001287
5,5,La Coruña,E,0.005553,0.002196
6,6,Pontevedra,E,0.00521,0.003086
7,7,Valença do Minho,P,0.004827,0.004138
8,8,Porto,P,0.004586,0.004924
9,9,Aveiro,P,0.004506,0.005214


In [95]:
alt.Chart(graph_sage_10d_df).mark_circle().encode(
    x='x',
    y='y',
    color='country_code',
    tooltip=['name', 'country_code']
).properties(width=700, height=400)

In [96]:
find_three_similar_countries(["Warszawa"], graph_sage_10d_df)

[('Udine', 0.00025855040970545193),
 ('Canosa di Puglia', 0.00027171195457953564),
 ('Roma', 0.0018749077768394503)]

In [97]:
def get_algorithm_data(algorithm):
    return {
        "Node2Vec-2D": node_2_vec_2d_df,
        "Node2Vec-10D": node_2_vec_10d_df,
        "FastRP-2D": fastRP_2d_df,
        "FastRP-10D": fastRP_10d_df,
        "GraphSage-2D": graph_sage_2d_df,
        "GraphSage-10D": graph_sage_10d_df
    }[algorithm]

In [None]:
@anvil.server.callable
def get_similar_countries(city, algorithm):
    data = get_algorithm_data(algorithm)
    result = find_three_similar_countries(city, data)
    result_string = ""
    for i in range(0, len(result)):
        result_string = result_string + "City: " + str(result[i][0]) + "\nEmbedding: " + str(result[i][1]) + "\n"
    return result_string

anvil.server.wait_forever()

Exception in thread Thread-9:
Traceback (most recent call last):
  File "C:\Users\Klaudia\anaconda3\lib\site-packages\anvil\server.py", line 403, in call
    return _do_call(args, kwargs, fn_name=fn_name)
  File "C:\Users\Klaudia\anaconda3\lib\site-packages\anvil\server.py", line 395, in _do_call
    return _threaded_server.do_call(args, kwargs, fn_name=fn_name, live_object=live_object)
  File "C:\Users\Klaudia\anaconda3\lib\site-packages\anvil\_threaded_server.py", line 423, in do_call
    raise error_from_server
anvil._server.AnvilWrappedError: 'Connection to Anvil Uplink server lost'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\Klaudia\anaconda3\lib\threading.py", line 932, in _bootstrap_inner
    self.run()
  File "C:\Users\Klaudia\anaconda3\lib\threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "C:\Users\Klaudia\anaconda3\lib\site-packages\anvil\server.py", line 206, in h

Anvil websocket closed (code 1006, reason=Going away)
Reconnecting Anvil Uplink...
Connecting to wss://anvil.works/uplink
Anvil websocket open
Connected to "Default environment" as SERVER


Exception in thread Thread-79:
Traceback (most recent call last):
  File "C:\Users\Klaudia\anaconda3\lib\site-packages\anvil\server.py", line 403, in call
    return _do_call(args, kwargs, fn_name=fn_name)
  File "C:\Users\Klaudia\anaconda3\lib\site-packages\anvil\server.py", line 395, in _do_call
    return _threaded_server.do_call(args, kwargs, fn_name=fn_name, live_object=live_object)
  File "C:\Users\Klaudia\anaconda3\lib\site-packages\anvil\_threaded_server.py", line 423, in do_call
    raise error_from_server
anvil._server.AnvilWrappedError: 'Connection to Anvil Uplink server lost'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\Klaudia\anaconda3\lib\threading.py", line 932, in _bootstrap_inner
    self.run()
  File "C:\Users\Klaudia\anaconda3\lib\threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "C:\Users\Klaudia\anaconda3\lib\site-packages\anvil\server.py", line 206, in 

Anvil websocket closed (code 1006, reason=Going away)
Reconnecting Anvil Uplink...
Connecting to wss://anvil.works/uplink
Anvil websocket open


Exception in thread Thread-82:
Traceback (most recent call last):
  File "C:\Users\Klaudia\anaconda3\lib\threading.py", line 932, in _bootstrap_inner
    self.run()
  File "C:\Users\Klaudia\anaconda3\lib\threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "C:\Users\Klaudia\anaconda3\lib\site-packages\anvil\server.py", line 206, in heartbeat_until_reopened
    call("anvil.private.echo", "keep-alive")
  File "C:\Users\Klaudia\anaconda3\lib\site-packages\anvil\server.py", line 403, in call
    return _do_call(args, kwargs, fn_name=fn_name)
  File "C:\Users\Klaudia\anaconda3\lib\site-packages\anvil\server.py", line 395, in _do_call
    return _threaded_server.do_call(args, kwargs, fn_name=fn_name, live_object=live_object)
  File "C:\Users\Klaudia\anaconda3\lib\site-packages\anvil\_threaded_server.py", line 377, in do_call
    send_call()
  File "C:\Users\Klaudia\anaconda3\lib\site-packages\anvil\_threaded_server.py", line 371, in send_call
    send_reqresp(

Anvil websocket closed (code 1006, reason=Going away)
Reconnecting Anvil Uplink...
Connecting to wss://anvil.works/uplink
Reconnection failed. Waiting 10 seconds, then retrying.
Reconnecting Anvil Uplink...
Connecting to wss://anvil.works/uplink
Anvil websocket open
Connected to "Default environment" as SERVER
