# E13 - Exercise 2: Graph Embeddings

1. Create an airports graph
2. Create node embeddings with the FastRP algorithm
3. Visualize the embeddings and the data
4. Use the embeddings as input for a clustering ML algorithm of your choice.

Use the European Roads dataset example as reference:
- Tutorial
    - https://neo4j.com/developer/graph-data-science/applied-graphembeddings/
- Code
    - https://github.com/neo4j-examples/applied-graphembeddings/blob/main/notebooks/embedding_visualization.ipynb

In [1]:
import os

import altair as alt
import numpy as np
import pandas as pd
from neo4j import GraphDatabase
from sklearn.manifold import TSNE

#### The following establishes a connection to the Neo4j database.

This can be hosted locally or in the cloud. Adjust the uri and pwd variables below appropriately.

In [9]:
class Neo4jConnection:
    """Small wrapper around a Neo4j driver that runs one query per session.

    The driver is created in ``__init__``. If creation fails the error is
    printed and the driver stays ``None``, in which case ``query`` fails its
    assertion. Instances also work as context managers, so the driver is
    closed when the ``with`` block exits::

        with Neo4jConnection(uri, user, pwd) as conn:
            rows = conn.query("RETURN 1")
    """

    def __init__(self, uri, user, pwd):
        """Store the connection settings and try to create the driver.

        Args:
            uri: Address handed to ``GraphDatabase.driver``.
            user: User name placed in the ``auth`` tuple.
            pwd: Password placed in the ``auth`` tuple.
        """
        self.__uri = uri
        self.__user = user
        self.__pwd = pwd
        self.__driver = None
        try:
            self.__driver = GraphDatabase.driver(self.__uri, auth=(self.__user, self.__pwd))
        except Exception as e:
            # Best effort: report the problem and leave the driver unset.
            print("Failed to create the driver:", e)

    def __enter__(self):
        """Return the connection itself for use inside a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the driver on leaving the ``with`` block; never suppress errors."""
        self.close()
        return False

    def close(self):
        """Close the underlying driver if one was created."""
        if self.__driver is not None:
            self.__driver.close()

    def query(self, query, parameters=None, db='roads'):
        """Run ``query`` in a fresh session and return the records as a list.

        Args:
            query: Query text passed to ``session.run``.
            parameters: Optional query parameters passed to ``session.run``.
            db: Database name for the session; ``None`` opens the session
                without a ``database`` argument.

        Returns:
            A list built from the result of ``session.run``, or ``None`` when
            opening the session or running the query raised (the error is
            printed, not re-raised).

        Raises:
            AssertionError: If the driver was never created.
        """
        assert self.__driver is not None, "Driver not initialized!"
        session = None
        response = None
        try:
            session = self.__driver.session(database=db) if db is not None else self.__driver.session()
            # Materialise the result before the session is closed below.
            response = list(session.run(query, parameters))
        except Exception as e:
            print("Query failed:", e)
        finally:
            if session is not None:
                session.close()
        return response

In [10]:
# Connection settings. NEO4J_URI and NEO4J_PASSWORD override the values below,
# so real credentials do not have to be written into the notebook; the
# fallbacks are the local-development defaults.
uri = os.environ.get('NEO4J_URI', 'bolt://localhost:7687')
pwd = os.environ.get('NEO4J_PASSWORD', 'test1234')

conn = Neo4jConnection(uri=uri, user="neo4j", pwd=pwd)

## Import the data from the database
Here we select a limited number of countries for ease of visualization.

In [11]:
# Cypher: places in the selected countries, with each place's embedding
# property and its country code.
query = '''MATCH (p:Place)-[:IN_COUNTRY]->(country)
           WHERE country.code IN ["E", "GB", "F", "TR", "I", "D", "GR"]
           RETURN p.name AS place, p.embedding AS embedding, country.code AS country
'''

records = conn.query(query)
# conn.query() prints the error and returns None when the query fails; stop
# here with a clear message instead of an opaque "'NoneType' is not iterable".
if records is None:
    raise RuntimeError("Neo4j query failed - see the 'Query failed:' message above.")

# One row per returned record; columns follow the RETURN aliases.
df = pd.DataFrame([dict(record) for record in records])
df.head(10)

Unnamed: 0,place,embedding,country
0,Scotch Corner,"[0.01614914834499359, -0.09021338820457458, -0...",GB
1,Londonderry,"[-0.1937214583158493, -0.07249774783849716, 0....",GB
2,Newbury,"[-0.08637042343616486, 0.0038085298147052526, ...",GB
3,Stranraer,"[-0.0694141611456871, -0.11235079169273376, -0...",GB
4,Edinburgh,"[-0.10838460922241211, 0.015028515830636024, -...",GB
5,Cambridge,"[0.009630650281906128, -0.013554753735661507, ...",GB
6,Manchester,"[0.013831322081387043, 0.07630336284637451, -0...",GB
7,Inverness,"[-0.017810646444559097, 0.0, 0.0, -0.043181713...",GB
8,Bristol,"[0.16178354620933533, -0.06708329916000366, 0....",GB
9,Gretna,"[-0.020820962265133858, -0.059148624539375305,...",GB


***
## Visualization through t-Distributed Stochastic Neighbor Embedding (t-SNE)


In [12]:
# Reduce each place's embedding vector to two dimensions with t-SNE
# (fixed random_state so the layout is repeatable).
tsne_model = TSNE(n_components=2, random_state=6)
X_embedded = tsne_model.fit_transform(list(df.embedding))

# Pair every 2-D point with its place name and country for plotting.
places = df.place
tsne_columns = {
    "place": places,
    "country": df.country,
    "x": list(X_embedded[:, 0]),  # first t-SNE component
    "y": list(X_embedded[:, 1]),  # second t-SNE component
}
tsne_df = pd.DataFrame(data=tsne_columns)
tsne_df.head()



Unnamed: 0,place,country,x,y
0,Scotch Corner,GB,2.777952,-24.824394
1,Londonderry,GB,11.737528,-30.549892
2,Newbury,GB,-2.339439,13.343311
3,Stranraer,GB,10.846086,-27.634544
4,Edinburgh,GB,6.952541,-25.283731


In [13]:
# Scatter plot of the 2-D t-SNE coordinates, one circle per place,
# coloured by country; hovering shows the place name and country.
scatter = alt.Chart(tsne_df).mark_circle(size=60)
scatter = scatter.encode(
    x='x',
    y='y',
    color='country',
    tooltip=['place', 'country'],
)
scatter.properties(width=700, height=400)