In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch_geometric.data import Data, DataLoader
import numpy as np
from graphdatascience import GraphDataScience
from py2neo import Graph
from neo4j import GraphDatabase
import networkx as nx
import matplotlib.pyplot as plt
import torch.nn.functional as F
import pandas as pd
import neo4jupyter
from torch_geometric.nn import GCNConv
from torch_geometric.utils import negative_sampling, remove_self_loops, add_self_loops, to_dense_adj, dense_to_sparse
from torch_geometric.nn import VGAE,GAE
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, average_precision_score
from torch_geometric.utils import to_dense_adj
#neo4jupyter.init_notebook_mode()
import tqdm as notebook_tqdm

In [3]:
uri = "bolt://localhost:7687"
user = "neo4j"
password= "*******"

gds = GraphDataScience(uri, auth=(user, password))

graph = Graph(uri, auth=(user, password))

In [5]:
from neo4j import GraphDatabase
import networkx as nx

# Connect to Neo4j
driver = GraphDatabase.driver(uri, auth=(user, password))

# Define a function to convert Neo4j graph to NetworkX graph
def neo4j_to_networkx(driver):
    # Initialize NetworkX graph
    G = nx.Graph()

    # Query nodes and add them to NetworkX graph with all properties
    with driver.session() as session:
        result = session.run("MATCH (n) RETURN n.id AS nodeid, n.name AS name, properties(n) AS properties")

        for record in result:
            node_id = record["nodeid"]
            properties = record["properties"]
            filtered_properties = {key: value for key, value in properties.items() if key not in ['name', 'id']}
            # Directly use all properties without filtering
            G.add_node(node_id, **filtered_properties)

    # Query relationships and add them to NetworkX graph with properties
    with driver.session() as session:
        result = session.run("MATCH (n)-[r]->(m) RETURN n.id AS nid, n.name AS source, m.id AS mid, m.name AS target, type(r) AS relation_type, properties(r) AS relation_properties")

        sn_id = []
        tn_id = []
        for record in result:
            source = record["source"]
            sn_id.append(record["nid"])
            target = record["target"]
            tn_id.append(record["mid"])
            relation_type = record["relation_type"]
            relation_properties = record["relation_properties"]
            filtered_relation_properties={key: value for key, value in relation_properties.items() if key not in ['name', 'id']}

            # Add the relationship with type and properties
            G.add_edge(record["nid"], record["mid"], **filtered_relation_properties)

    return G, sn_id, tn_id

# Convert Neo4j graph to NetworkX graph
G, sn_id, tn_id = neo4j_to_networkx(driver)

# Close Neo4j driver
driver.close()

# Print summary of NetworkX graph
#print(nx.info(G))


In [6]:
src_n = sn_id
dst_n = tn_id

In [10]:
import ast
import statistics

relation_prop = []
i=0
for ed in G.edges(data=True):
    
    if len(ed[2]) > 0:
        value = ed[2]['value']
        # Check if the value is a string list or a single float
        if isinstance(value, str) and value.startswith('[') and value.endswith(']'):
            # Handle the case where value is a string representation of a list
                value_list = ast.literal_eval(value)  # Convert string to list
                float_values = [float(v) for v in value_list]  # Convert each element to float
                relation_prop.append((ed[0],ed[1],[statistics.median(float_values)]))
        else:
            relation_prop.append((ed[0],ed[1],[float(value)]))

    else:
        relation_prop.append((ed[0], ed[1],[0.0]))
    i=i+1

In [12]:
data = relation_prop

# Initialize the dictionary
item_dict = {}
item_ids_to_keep = [
    30, 4492,6008,10094,10095,10584,10588,25874,30758,31957,32505,
    32909,10585,10590,10586,10587,25871,25873,25872,25058,25059]

# Iterate through items from 0 to 9 (assuming you want to test with 10 item IDs)
for item_id in range(0, len(G)):
    # Filter rows that match the current item_id
    item_rows = [row for row in data if row[0] == item_id]
    
    if item_rows:  # If the item is exist in the list
        filtered_rows = [(row[1], row[2][0]) for row in item_rows if row[1] in item_ids_to_keep]
        
        if ((len(filtered_rows)<21)|(not filtered_rows)):  # If no matching rows exist in item_ids_to_keep
            item_dict[item_id] = [0.0] * 21
        
        else:
            # Sort based on the second column (relationship id)
            filtered_rows.sort(key=lambda x: x[0])
            # Extract the values in sorted order
            sorted_values = [value for _, value in filtered_rows]
            item_dict[item_id] = sorted_values
    else:
        item_dict[item_id] = [0.0] * 21
        

In [13]:
node_properties = {}
for node, data in G.nodes(data=True):
    node_properties[node] = data

In [14]:
sorted_node_properties = dict(sorted(node_properties.items()))

In [16]:
array_of_arrays1 = []
for inner_dict in sorted_node_properties.values():
    inner_array = []
    for value in inner_dict.values():
        inner_array.append(float(value))
    array_of_arrays1.append(inner_array)

# Print the array of arrays

In [17]:
array_of_arrays = [arr if arr else [-1] * 5 for arr in array_of_arrays1]

In [18]:
import numpy as np

# Example list of arrays with some empty arrays
list_of_arrays = list(array_of_arrays)
# Fill empty arrays with NaN
filled_list_of_arrays = [arr if sum(arr) > 0 else np.full_like(list_of_arrays[0],-1) for arr in list_of_arrays]

In [19]:
matrix = np.vstack(filled_list_of_arrays)

In [21]:
matrix_relation_prop = np.array(list(item_dict.values()))

In [22]:
matrix_relation_prop.shape

(33277, 21)

In [23]:
def normalize_row(row):
    # Convert the row to a numpy array if it isn't already
    
    # Get the minimum and maximum values of the row
    min_val = np.min(row)
    max_val = np.max(row)
    
    # Normalize the row to the range [0, 1]
    if max_val != min_val:
        normalized_row = (row - min_val) / (max_val - min_val)
    else:
        # If all values in the row are the same, return an array of 0.5 (arbitrary choice)
        normalized_row = np.full(row.shape, -1)
    
    return normalized_row

def normalize_matrix(matrix):    
    # Apply normalization to each row individually
    normalized_matrix = np.array([normalize_row(row) for row in matrix])
    
    return normalized_matrix

In [24]:
matrix1=normalize_matrix(matrix)

In [25]:
matrix_relation_prop1 = normalize_matrix(matrix_relation_prop)

In [26]:
result_matrix = np.hstack((matrix1,matrix_relation_prop1))

In [29]:
# Connect to Neo4j
driver = GraphDatabase.driver(uri, auth=(user, password))

# Define a function to convert Neo4j graph to NetworkX graph
def neo4j_to_networkx(driver):
    # Initialize NetworkX graph
    G = nx.Graph()

    # Query nodes and add them to NetworkX graph with properties
    with driver.session() as session:
        result = session.run("MATCH (n) RETURN n.id AS nodid, labels(n) AS lab, n.name AS name, properties(n) AS properties")

        node_type = []
        node_idd = []
        for record in result:
            node_id = record["name"]
            properties = record["properties"]
            node_type=node_type+record["lab"]
            node_idd=node_idd+[record["nodid"]]
            filtered_properties = {key: value for key, value in properties.items() if key not in ['name', 'id']}
            G.add_node(node_id, **filtered_properties)

    # Query relationships and add them to NetworkX graph
    with driver.session() as session:
        result = session.run("MATCH (n)-[r]->(m) RETURN n.id AS nid ,n.name AS source,m.id AS mid, m.name AS target, r.name AS relation_name")
 
        sn_id = []
        tn_id = []
        
        for record in result:
            source = record["source"]
            sn_id.append(record["nid"])
            target = record["target"]
            tn_id.append(record["mid"])
            relation_type = record["relation_name"]

            G.add_edge(record["source"], record["target"], relation_type=relation_type)
            #G.add_edge(source,target, relation_type=relation_type)

    return G,sn_id,tn_id,node_type,node_idd

# Convert Neo4j graph to NetworkX graph
G,sn_id,tn_id,node_type,node_idd= neo4j_to_networkx(driver)

# Close Neo4j driver
driver.close()

# Print summary of NetworkX graph

#print(nx.info(G))

In [30]:
node_type_df = pd.DataFrame({'node_id':node_idd,'node_type':node_type})

In [31]:
node_type_df_sorted = node_type_df.sort_values(by='node_id')

In [32]:
def words_to_numbers(words):
    # Create an empty dictionary to map words to unique numbers
    word_to_id = {}
    # Create an empty list to store the resulting numbers
    numbers = []
    
    # Iterate over the list of words
    for word in words:
        # If the word is not already in the dictionary, add it with a new unique number
        if word not in word_to_id:
            word_to_id[word] = len(word_to_id) + 1
        # Append the unique number to the result list
        numbers.append(word_to_id[word])
    
    return np.array(numbers)

In [33]:
node_types = words_to_numbers(node_type_df_sorted.node_type)

### Tunning hyperparameters

In [36]:
import torch
import torch.nn as nn
import torch.nn.functional as F
#from torch_geometric.nn import GCNConv, dropout
from torch_geometric.data import Data
from torch_geometric.utils import negative_sampling, remove_self_loops, to_dense_adj, dense_to_sparse
from torch_geometric.nn import VGAE
from torch_geometric.nn import GCNConv

# Encoder for the DVGAE
class DVGAEEncoder(torch.nn.Module):
    def __init__(self, in_channels, out_channels, num_node_types):
        super(DVGAEEncoder, self).__init__()
        self.conv1 = GCNConv(in_channels, 2 * out_channels)
        self.conv_mu = GCNConv(2 * out_channels, out_channels)
        self.conv_logstd = GCNConv(2 * out_channels, out_channels)
        self.node_type_embedding = torch.nn.Embedding(num_node_types, out_channels)
        self.linear_mu = torch.nn.Linear(out_channels * 2, out_channels)
        self.linear_logstd = torch.nn.Linear(out_channels * 2, out_channels)
        self.dropout = nn.Dropout(0.1)  # Dropout rate of 0.1

    def forward(self, x, edge_index, node_type):
        x = F.relu(self.conv1(x, edge_index))
        x = F.dropout(x, p=0.1, training=self.training)  # Apply dropout
        node_type_emb = self.node_type_embedding(node_type)
        x_mu = self.conv_mu(x, edge_index)
        x_logstd = self.conv_logstd(x, edge_index)
        
        # Concatenate GCN outputs with node type embeddings
        x_mu = torch.cat([x_mu, node_type_emb], dim=-1)
        x_logstd = torch.cat([x_logstd, node_type_emb], dim=-1)
        
        # Pass through linear layers to get the final mu and logstd
        x_mu = self.linear_mu(x_mu)
        x_logstd = self.linear_logstd(x_logstd)
        
        return x_mu, x_logstd

# Decoder for the DVGAE
class DVGAEDecoder(torch.nn.Module):
    def __init__(self):
        super(DVGAEDecoder, self).__init__()
        self.dropout = nn.Dropout(0.3)  # Dropout rate of 0.3

    def forward(self, z, edge_index):
        # Reconstruct edges using the dot product of embeddings
        edge_index = remove_self_loops(edge_index)[0]
        edge_logits = (z[edge_index[0]] * z[edge_index[1]]).sum(dim=-1)
        edge_logits = self.dropout(edge_logits)  # Apply dropout
        return edge_logits

        
# The DVGAE model
class DVGAE(VGAE):
    def __init__(self, encoder, decoder):
        super(DVGAE, self).__init__(encoder, decoder)
         
    def recon_loss(self, z, pos_edge_index, neg_edge_index=None):
        pos_loss = -torch.log(
            torch.sigmoid((z[pos_edge_index[0]] * z[pos_edge_index[1]]).sum(dim=1)) + 1e-15
        ).mean()

        if neg_edge_index is None:
            neg_edge_index = negative_sampling(
                edge_index=pos_edge_index, 
                num_nodes=z.size(0)
            )

        neg_loss = -torch.log(
            1 - torch.sigmoid((z[neg_edge_index[0]] * z[neg_edge_index[1]]).sum(dim=1)) + 1e-15
        ).mean()

        return pos_loss + neg_loss

    def kl_loss(self, mu, logstd):
        return -0.5 * torch.mean(torch.sum(1 + 2 * logstd - mu**2 - torch.exp(2 * logstd), dim=1))

    def reconstruct_graph(self, z, threshold):
        # Remove self-loops
        edge_index = remove_self_loops(data.edge_index)[0]
        
        # Reconstruct edges using the decoder
        edge_logits = self.decoder(z, edge_index)
    
        # Apply sigmoid to get probabilities
        edge_probs = torch.sigmoid(edge_logits)
        
        # Reconstruct the graph based on a threshold
        adj_matrix = to_dense_adj(edge_index, edge_attr=edge_probs)
        reconstructed_adj = (adj_matrix > threshold).float()
        
        # Convert dense adjacency matrix back to sparse format
        reconstructed_edge_index, _ = dense_to_sparse(reconstructed_adj)
        return reconstructed_edge_index


# Define a function for grid search with validation
# Define a function for grid search with validation
def grid_search_with_validation(learning_rates, epochs_list, latent_dims, train_data, val_data):
    best_loss = float('inf')
    best_lr = None
    best_model = None
    best_epoch = None
    best_latent_dim = None  # To store the best latent dimension

    for lr in learning_rates:
        for epoch in epochs_list:
            for latent_dim in latent_dims:  # Iterate over latent dimensions
                model, optimizer = initialize_model(lr, epoch, latent_dim, len(val_data.node_type))

                for _ in range(epoch):
                    model.train()
                    optimizer.zero_grad()
                    z = model.encode(train_data.x, train_data.edge_index, train_data.node_type)
                    mu, logstd = model.encoder(train_data.x, train_data.edge_index, train_data.node_type)
                    recon_loss = model.recon_loss(z, train_data.edge_index)
                    kl_loss = model.kl_loss(mu, logstd)
                    total_loss = recon_loss + (1 / train_data.num_nodes) * kl_loss
                    total_loss.backward()
                    optimizer.step()

                val_loss = evaluate_model(model, val_data)

                if val_loss < best_loss:
                    best_loss = val_loss
                    best_lr = lr
                    best_model = model
                    best_epoch = epoch
                    best_latent_dim = latent_dim  # Store the best latent dimension

    return best_lr, best_model, best_epoch, best_latent_dim  # Return the best latent dimension

    
# Function to initialize the model with given hyperparameters
def initialize_model(lr, epoch, latent_dim, num_node_types):
    encoder = DVGAEEncoder(in_channels, latent_dim, num_node_types)  # Use latent_dim here
    decoder = DVGAEDecoder()
    model = DVGAE(encoder, decoder)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    return model, optimizer

def evaluate_model(model, val_data):
    model.eval()
    with torch.no_grad():
        z = model.encode(val_data.x, val_data.edge_index,val_data.node_type)
        mu, logstd = model.encoder(val_data.x, val_data.edge_index,val_data.node_type)
        recon_loss = model.recon_loss(z, val_data.edge_index)
        kl_loss = model.kl_loss(mu, logstd)
        total_loss = recon_loss + (1 / data.num_nodes) * kl_loss
    return total_loss.item()


# Assuming you have already defined your Data object



# Sample data
torch.manual_seed(123456)
# Sample directed graph with node features and types
num_nodes = result_matrix.shape[0]
edge_index = torch.tensor([src_n, dst_n], dtype=torch.long)  # Directed edges
x = torch.tensor(result_matrix, dtype=torch.float)  # Node features
node_types = torch.tensor(node_types, dtype=torch.long)  # Node types

# Ensure that node types are within the range of unique types
num_node_types = node_types.max().item()+1
data = Data(x=x, edge_index=edge_index, node_type=node_types)
num_nodes = data.num_nodes
num_edges = data.num_edges

# Splitting the node indices
train_ratio = 0.7  # 70% training, 30% validation
num_train_nodes = int(train_ratio * num_nodes)
train_node_indices = torch.arange(num_train_nodes)
val_node_indices = torch.arange(num_train_nodes, num_nodes)

# Filtering the edges based on the selected node indices
train_edge_mask = (data.edge_index[0] < num_train_nodes) & (data.edge_index[1] < num_train_nodes)
train_edge_index = data.edge_index[:, train_edge_mask]
val_edge_mask = ~train_edge_mask
val_edge_index = data.edge_index[:, val_edge_mask]

# Filter node types
train_node_type = data.node_type[train_node_indices]
val_node_type = data.node_type[val_node_indices]

# Creating Data objects for training and validation
train_data = Data(x=x, edge_index=train_edge_index, node_type=node_types)
val_data = Data(x=x, edge_index=val_edge_index, node_type=node_types)

in_channels = data.num_node_features
#out_channels = 50  # Dimension of the latent space


# Perform grid search for learning rate, number of epochs, and latent space dimensions
learning_rates = [0.001, 0.01, 0.1]
epochs = [100, 200, 300]
latent_dims = [50]  # Add your desired latent space dimensions

best_lr, best_model, best_epoch, best_latent_dim = grid_search_with_validation(learning_rates, epochs, latent_dims, train_data, val_data)

# Initialize the model with the best hyperparameters
best_model, optimizer = initialize_model(best_lr, best_epoch, best_latent_dim, len(node_types))


In [37]:
best_latent_dim

50

In [38]:
best_epoch

100

In [39]:
print(best_lr)

0.001


In [41]:
for epoch in range(best_epoch):
    best_model.train()
    optimizer.zero_grad()
    z = best_model.encode(data.x, data.edge_index,data.node_type)
    mu, logstd = best_model.encoder(data.x, data.edge_index,data.node_type)
    recon_loss = best_model.recon_loss(z, data.edge_index)
    kl_loss = best_model.kl_loss(mu, logstd)
    total_loss = recon_loss + (1 / data.num_nodes) * kl_loss
    total_loss.backward()
    optimizer.step()
    print(f"Epoch {epoch + 1}/{best_epoch}, Loss: {total_loss.item()}")

# Obtain the embeddings
best_model.eval()
with torch.no_grad():
    z = best_model.encode(data.x, data.edge_index,data.node_type)
    print("Node Embeddings:")
    print(z)

# Reconstruct the graph
with torch.no_grad():
    reconstructed_edge_index = best_model.reconstruct_graph(z, threshold=0.5)
    print("Reconstructed Edge Index:")
    print(reconstructed_edge_index)


### end of tunning

In [53]:
import torch
import torch.nn as nn
import torch.nn.functional as F
#from torch_geometric.nn import GCNConv, dropout
from torch_geometric.data import Data
from torch_geometric.utils import negative_sampling, remove_self_loops, to_dense_adj, dense_to_sparse
from torch_geometric.nn import VGAE
from torch_geometric.nn import GCNConv

# Encoder for the DVGAE
class DVGAEEncoder(torch.nn.Module):
    def __init__(self, in_channels, out_channels, num_node_types):
        super(DVGAEEncoder, self).__init__()
        self.conv1 = GCNConv(in_channels, 2 * out_channels)
        self.conv_mu = GCNConv(2 * out_channels, out_channels)
        self.conv_logstd = GCNConv(2 * out_channels, out_channels)
        self.node_type_embedding = torch.nn.Embedding(num_node_types, out_channels)
        self.linear_mu = torch.nn.Linear(out_channels * 2, out_channels)
        self.linear_logstd = torch.nn.Linear(out_channels * 2, out_channels)
        self.dropout = nn.Dropout(0.1)  # Dropout rate of 0.1

    def forward(self, x, edge_index, node_type):
        x = F.relu(self.conv1(x, edge_index))
        x = F.dropout(x, p=0.1, training=self.training)  # Apply dropout
        node_type_emb = self.node_type_embedding(node_type)
        x_mu = self.conv_mu(x, edge_index)
        x_logstd = self.conv_logstd(x, edge_index)
        
        # Concatenate GCN outputs with node type embeddings
        x_mu = torch.cat([x_mu, node_type_emb], dim=-1)
        x_logstd = torch.cat([x_logstd, node_type_emb], dim=-1)
        
        # Pass through linear layers to get the final mu and logstd
        x_mu = self.linear_mu(x_mu)
        x_logstd = self.linear_logstd(x_logstd)
        
        return x_mu, x_logstd

# Decoder for the DVGAE
class DVGAEDecoder(torch.nn.Module):
    def __init__(self):
        super(DVGAEDecoder, self).__init__()
        self.dropout = nn.Dropout(0.3)  # Dropout rate of 0.3

    def forward(self, z, edge_index):
        # Reconstruct edges using the dot product of embeddings
        edge_index = remove_self_loops(edge_index)[0]
        edge_logits = (z[edge_index[0]] * z[edge_index[1]]).sum(dim=-1)
        edge_logits = self.dropout(edge_logits)  # Apply dropout
        return edge_logits

        
# The DVGAE model
class DVGAE(VGAE):
    def __init__(self, encoder, decoder):
        super(DVGAE, self).__init__(encoder, decoder)
         
    def recon_loss(self, z, pos_edge_index, neg_edge_index=None):
        pos_loss = -torch.log(
            torch.sigmoid((z[pos_edge_index[0]] * z[pos_edge_index[1]]).sum(dim=1)) + 1e-15
        ).mean()

        if neg_edge_index is None:
            neg_edge_index = negative_sampling(
                edge_index=pos_edge_index, 
                num_nodes=z.size(0)
            )

        neg_loss = -torch.log(
            1 - torch.sigmoid((z[neg_edge_index[0]] * z[neg_edge_index[1]]).sum(dim=1)) + 1e-15
        ).mean()

        return pos_loss + neg_loss

    def kl_loss(self, mu, logstd):
        return -0.5 * torch.mean(torch.sum(1 + 2 * logstd - mu**2 - torch.exp(2 * logstd), dim=1))

    def reconstruct_graph(self, z, threshold):
        # Remove self-loops
        edge_index = remove_self_loops(data.edge_index)[0]
        
        # Reconstruct edges using the decoder
        edge_logits = self.decoder(z, edge_index)
    
        # Apply sigmoid to get probabilities
        edge_probs = torch.sigmoid(edge_logits)
        
        # Reconstruct the graph based on a threshold
        adj_matrix = to_dense_adj(edge_index, edge_attr=edge_probs)
        reconstructed_adj = (adj_matrix > threshold).float()
        
        # Convert dense adjacency matrix back to sparse format
        reconstructed_edge_index, _ = dense_to_sparse(reconstructed_adj)
        return reconstructed_edge_index


# Define a function for grid search with validation
# Define a function for grid search with validation
def grid_search_with_validation(learning_rates, epochs_list, latent_dims, train_data, val_data):
    best_loss = float('inf')
    best_lr = None
    best_model = None
    best_epoch = None
    best_latent_dim = None  # To store the best latent dimension

    for lr in learning_rates:
        for epoch in epochs_list:
            for latent_dim in latent_dims:  # Iterate over latent dimensions
                model, optimizer = initialize_model(lr, epoch, latent_dim, len(val_data.node_type))

                for _ in range(epoch):
                    model.train()
                    optimizer.zero_grad()
                    z = model.encode(train_data.x, train_data.edge_index, train_data.node_type)
                    mu, logstd = model.encoder(train_data.x, train_data.edge_index, train_data.node_type)
                    recon_loss = model.recon_loss(z, train_data.edge_index)
                    kl_loss = model.kl_loss(mu, logstd)
                    total_loss = recon_loss + (1 / train_data.num_nodes) * kl_loss
                    total_loss.backward()
                    optimizer.step()

                val_loss = evaluate_model(model, val_data)

                if val_loss < best_loss:
                    best_loss = val_loss
                    best_lr = lr
                    best_model = model
                    best_epoch = epoch
                    best_latent_dim = latent_dim  # Store the best latent dimension

    return best_lr, best_model, best_epoch, best_latent_dim  # Return the best latent dimension

    
# Function to initialize the model with given hyperparameters
def initialize_model(lr, epoch, latent_dim, num_node_types):
    encoder = DVGAEEncoder(in_channels, latent_dim, num_node_types)  # Use latent_dim here
    decoder = DVGAEDecoder()
    model = DVGAE(encoder, decoder)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    return model, optimizer

def evaluate_model(model, val_data):
    model.eval()
    with torch.no_grad():
        z = model.encode(val_data.x, val_data.edge_index,val_data.node_type)
        mu, logstd = model.encoder(val_data.x, val_data.edge_index,val_data.node_type)
        recon_loss = model.recon_loss(z, val_data.edge_index)
        kl_loss = model.kl_loss(mu, logstd)
        total_loss = recon_loss + (1 / data.num_nodes) * kl_loss
    return total_loss.item()



In [58]:
node_types = words_to_numbers(node_type_df_sorted.node_type)

In [60]:
# Sample data
torch.manual_seed(123456)
# Sample directed graph with node features and types
num_nodes = result_matrix.shape[0]
edge_index = torch.tensor([src_n, dst_n], dtype=torch.long)  # Directed edges
x = torch.tensor(result_matrix, dtype=torch.float) # Node features
node_types = torch.tensor(node_types, dtype=torch.long)  # Node types

# Ensure that node types are within the range of unique types
num_node_types = node_types.max().item()+1
data = Data(x=x, edge_index=edge_index, node_type=node_types)

# Define the model
in_channels = data.num_node_features
out_channels = 50  # Dimension of the latent space
encoder = DVGAEEncoder(in_channels, out_channels, num_node_types)
decoder = DVGAEDecoder()
model = DVGAE(encoder, decoder)

# Training
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
epochs = 100

for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    z = model.encode(data.x, data.edge_index, data.node_type)
    mu, logstd = model.encoder(data.x, data.edge_index, data.node_type)
    recon_loss = model.recon_loss(z, data.edge_index)
    kl_loss = model.kl_loss(mu, logstd)
    total_loss = recon_loss + (1 / data.num_nodes) * kl_loss
    total_loss.backward()
    optimizer.step()
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss.item()}")

# Obtain the embeddings
model.eval()
with torch.no_grad():
    z = model.encode(data.x, data.edge_index, data.node_type)
    print("Node Embeddings:")
    print(z)

# Reconstruct the graph
with torch.no_grad():
    reconstructed_edge_index = model.reconstruct_graph(z, threshold=0.5)
    print("Reconstructed Edge Index:")
    print(reconstructed_edge_index)


In [61]:
# Convert edge indices to sets for easy comparison
original_edge_set = {(edge_index[0, i].item(), edge_index[1, i].item()) for i in range(edge_index.size(1))}
reconstructed_edge_set = {(reconstructed_edge_index[0, i].item(), reconstructed_edge_index[1, i].item()) for i in range(reconstructed_edge_index.size(1))}

# Compute the true positive, false positive, and false negative edges
true_positives = original_edge_set.intersection(reconstructed_edge_set)
false_positives = reconstructed_edge_set - original_edge_set
false_negatives = original_edge_set - reconstructed_edge_set

# Compute metrics
accuracy = len(true_positives) / len(original_edge_set)
precision = len(true_positives) / (len(true_positives) + len(false_positives))
recall = len(true_positives) / (len(true_positives) + len(false_negatives))
f1_score = 2 * (precision * recall) / (precision + recall)

print("Edge-wise Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1_score)


Edge-wise Accuracy: 0.9912135109314532
Precision: 1.0
Recall: 0.9912135109314532
F1 Score: 0.9955873697017872


In [62]:
edge_index

tensor([[13443, 13443, 13963,  ..., 32336, 32336, 32336],
        [28245, 12993, 12993,  ..., 31957, 32505, 32909]])

In [63]:
edge_index.shape

torch.Size([2, 1195437])

In [67]:
z1=z.detach().numpy()

In [69]:
# Connect to Neo4j
driver = GraphDatabase.driver(uri, auth=(user, password))

# Define a function to convert Neo4j graph to NetworkX graph
def neo4j_to_networkx(driver):
    # Initialize NetworkX graph
    G = nx.Graph()

    # Query nodes and add them to NetworkX graph with properties
    with driver.session() as session:
        result = session.run("MATCH (n) RETURN n.id AS nodid, labels(n) AS lab, n.name AS name, properties(n) AS properties")

        node_type = []
        for record in result:
            node_id = record["name"]
            properties = record["properties"]
            node_type=node_type+record["lab"]
            filtered_properties = {key: value for key, value in properties.items() if key not in ['name', 'id']}
            G.add_node(node_id, **filtered_properties)

    # Query relationships and add them to NetworkX graph
    with driver.session() as session:
        result = session.run("MATCH (n)-[r]->(m) RETURN n.id AS nid ,n.name AS source,m.id AS mid, m.name AS target, r.name AS relation_name")
 
        sn_id = []
        tn_id = []
        source =[]
        target = []
        for record in result:
            source.append(record["source"])
            sn_id.append(record["nid"])
            target.append(record["target"])
            tn_id.append(record["mid"])
            relation_type = record["relation_name"]

            G.add_edge(record["source"], record["target"], relation_type=relation_type)
            #G.add_edge(source,target, relation_type=relation_type)

    return G,sn_id,tn_id,source,target,node_type

# Convert Neo4j graph to NetworkX graph
G,sn_id,tn_id,source,target,node_type= neo4j_to_networkx(driver)

# Close Neo4j driver
driver.close()

# Print summary of NetworkX graph

#print(nx.info(G))

In [70]:
source_info = pd.concat([pd.Series(sn_id),pd.Series(source)],axis=1)
source_info1 = source_info.drop_duplicates()
source_info1.columns = ['id','name']

In [71]:
target_info = pd.concat([pd.Series(tn_id),pd.Series(target)],axis=1)
target_info1 = target_info.drop_duplicates()
target_info1.columns = ['id','name']

In [72]:
sr_tg_info = pd.concat([source_info1,target_info1])
sr_tg_info1 = sr_tg_info.drop_duplicates()
sr_tg_info1

Unnamed: 0,id,name
0,13443,P1055632
2,13963,P1303106
3,14137,P1391040
5,14341,P1491610
6,14561,P1593721
...,...,...
350180,8596,Hypnagogic hallucinations
350492,6017,Encephalocele
350699,6868,Febrile seizure (within the age range of 3 mon...
350762,28136,Scrotal pain


In [73]:
sr_tg_info_sorted = sr_tg_info1.sort_values(by='id', ascending=True)

In [74]:
node_embeddings = {node: z1[i] for i, node in zip(sr_tg_info_sorted.id,sr_tg_info_sorted.name)}

In [75]:
import pickle

with open('node_embeddings_50d.pkl', 'wb') as pkl_file:
    pickle.dump(node_embeddings, pkl_file)
