In [None]:
# import packages

from attrdict import AttrDict
from torch_geometric.datasets import TUDataset
from torch_geometric.data import Data
from torch_geometric.utils import to_networkx, from_networkx, to_dense_adj
import torch_geometric.transforms as T

import networkx as nx
import time
import tqdm
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import wget
import zipfile
import os

from GraphRicciCurvature.OllivierRicci import OllivierRicci

import sklearn
from sklearn.mixture import GaussianMixture

In [None]:
# load datasets

# Load the four TU benchmarks from the "data" root directory, each materialised
# as a plain Python list of graphs (same order as the names below).
mutag, enzymes, proteins, imdb = (
    list(TUDataset(root="data", name=dataset_name))
    for dataset_name in ("MUTAG", "ENZYMES", "PROTEINS", "IMDB-BINARY")
)

In [None]:
# implement functions to compute graph attributes

class graph_attributes:
    def __init__(self):
        self.edge_density = []
        self.average_degree = []
        self.degree_assortativity = []
        self.pseudo_diameter = []
        self.relative_size_of_largest_component = []
        self.average_clustering_coefficient = []
        self.transitivity = []
        self.degeneracy = []
        self.gini_coeff_degree = []
        self.edge_homogeneity = []
        self.in_feature_similarity = []
        self.out_feature_similarity = []
        self.feature_angular_snr =[]
        self.homophily_measure = []
        self.attribute_assortativity = []
        self.algebraic_connectivity = []
        self.curvature_gap = []
        self.gini_coeff_curvature = []
        self.variance_explained = []
        self.max_clique = []
        self.circumference = []

    def compute_edge_density(self, data):
        for i in tqdm.tqdm(range(len(data))):
            edge_density = data[i].num_edges / (data[i].num_nodes * (data[i].num_nodes - 1) / 2)
            self.edge_density.append(edge_density)

    def compute_average_degree(self, data):
        for i in tqdm.tqdm(range(len(data))):
            average_degree = data[i].num_edges / data[i].num_nodes
            self.average_degree.append(average_degree)

    def compute_degree_assortativity(self, data):
        """Append the NetworkX degree assortativity coefficient of every graph in ``data``.

        Each graph is converted with ``to_networkx`` (default arguments) and the
        coefficient is stored in ``self.degree_assortativity``.
        """
        for idx in tqdm.tqdm(range(len(data))):
            nx_graph = to_networkx(data[idx])
            self.degree_assortativity.append(
                nx.degree_assortativity_coefficient(nx_graph)
            )

    def compute_pseudo_diameter(self, data):
        """Append the diameter of every graph in ``data`` to ``self.pseudo_diameter``.

        Edge directions are ignored. For a disconnected graph the diameter of
        its largest connected component is used instead; a graph without any
        node is recorded as 0.

        Args:
            data: indexable collection of graphs accepted by ``to_networkx``.
        """
        for i in tqdm.tqdm(range(len(data))):
            # convert the graph to an undirected graph
            G = to_networkx(data[i]).to_undirected()
            if G.number_of_nodes() == 0:
                # connectivity is undefined for the null graph
                self.pseudo_diameter.append(0)
                continue
            if not nx.is_connected(G):
                # list.sort() returns None, so pick the largest component with max()
                largest_component = max(nx.connected_components(G), key=len)
                G = G.subgraph(largest_component)
            pseudo_diameter = nx.diameter(G)
            self.pseudo_diameter.append(pseudo_diameter)

    def compute_relative_size_of_largest_component(self, data):
        """Append, for every graph, the share of nodes lying in its largest connected component.

        Edge directions are ignored (``nx.connected_components`` is only
        defined for undirected graphs). A graph without any node is recorded
        as 0.0.

        Args:
            data: indexable collection of graphs accepted by ``to_networkx``.
        """
        for i in tqdm.tqdm(range(len(data))):
            G = to_networkx(data[i]).to_undirected()
            num_nodes = G.number_of_nodes()
            if num_nodes == 0:
                self.relative_size_of_largest_component.append(0.0)
                continue
            # size of the largest connected component
            largest_component_size = max(len(component) for component in nx.connected_components(G))
            self.relative_size_of_largest_component.append(largest_component_size / num_nodes)

    def compute_average_clustering_coefficient(self, data):
        """Append ``nx.average_clustering`` of every graph in ``data``.

        Each graph is converted with ``to_networkx`` (default arguments); the
        node-averaged clustering coefficient is stored in
        ``self.average_clustering_coefficient``.
        """
        for idx in tqdm.tqdm(range(len(data))):
            nx_graph = to_networkx(data[idx])
            self.average_clustering_coefficient.append(nx.average_clustering(nx_graph))

    def compute_transitivity(self, data):
        """Append the transitivity (``nx.transitivity``) of every graph in ``data``.

        Edge directions are ignored: the graph is converted to an undirected
        graph before the transitivity is computed.

        Args:
            data: indexable collection of graphs accepted by ``to_networkx``.
        """
        for i in tqdm.tqdm(range(len(data))):
            # to_undirected() returns a new graph, so its result has to be kept
            G = to_networkx(data[i]).to_undirected()
            transitivity = nx.transitivity(G)
            self.transitivity.append(transitivity)

    def compute_degeneracy(self, data):
        """Append the degeneracy of every graph in ``data`` to ``self.degeneracy``.

        The degeneracy is the largest k for which the graph has a non-empty
        k-core, i.e. the maximum core number over all nodes. Edge directions
        and self-loops are ignored. A graph without any node is recorded as 0.

        Args:
            data: indexable collection of graphs accepted by ``to_networkx``.
        """
        for i in tqdm.tqdm(range(len(data))):
            # work on an undirected copy so that a bidirectional edge is not counted twice
            G = to_networkx(data[i]).to_undirected()
            # core numbers are not defined by networkx for graphs with self-loops
            G.remove_edges_from(list(nx.selfloop_edges(G)))
            core_numbers = nx.core_number(G)
            degeneracy = max(core_numbers.values(), default=0)
            self.degeneracy.append(degeneracy)

    def compute_gini_coeff_degree(self, data):
        # for each graph, compute the gini coefficient of the degree distribution
        for i in tqdm.tqdm(range(len(data))):
            G = to_networkx(data[i])
            degree_sequence = sorted([d for n, d in G.degree()], reverse=True)
            n = len(degree_sequence)
            gini_coeff_degree = sum([(2 * (i + 1) - n - 1) * degree_sequence[i] for i in range(n)]) / (n * sum(degree_sequence))
            self.gini_coeff_degree.append(gini_coeff_degree)

    def compute_edge_homogeneity(self, data):
        pass

    def compute_in_feature_similarity(self, data):
        pass

    def compute_out_feature_similarity(self, data):
        pass

    def compute_feature_angular_snr(self, data):
        pass

    def compute_homophily_measure(self, data):
        pass

    def compute_attribute_assortativity(self, data):
        """Append the assortativity of the node attributes of every graph in ``data``.

        ``to_networkx`` does not copy node features unless asked to, so the
        ``"node_attr"`` attribute is attached explicitly here: every node gets
        its feature row as a (hashable) tuple, and each distinct feature vector
        is treated as one category by
        ``nx.attribute_assortativity_coefficient``.

        NOTE(review): assumes the node attributes of interest are the rows of
        the graph's ``x`` matrix (e.g. one-hot node labels) - confirm. Graphs
        without ``x`` are recorded as NaN.

        Args:
            data: indexable collection of graphs accepted by ``to_networkx``.
        """
        for i in tqdm.tqdm(range(len(data))):
            features = getattr(data[i], "x", None)
            if features is None:
                self.attribute_assortativity.append(float("nan"))
                continue
            G = to_networkx(data[i])
            feature_tensor = torch.as_tensor(features)
            feature_rows = feature_tensor.reshape(feature_tensor.shape[0], -1).tolist()
            # presumably to_networkx labels the nodes 0..num_nodes-1, matching the rows of x
            node_labels = {node: tuple(feature_rows[node]) for node in G.nodes}
            nx.set_node_attributes(G, node_labels, "node_attr")
            attribute_assortativity = nx.attribute_assortativity_coefficient(G, "node_attr")
            self.attribute_assortativity.append(attribute_assortativity)

    def compute_algebraic_connectivity(self, data):
        """Append the algebraic connectivity of every graph in ``data``.

        The algebraic connectivity is the second smallest eigenvalue of the
        graph Laplacian. ``nx.algebraic_connectivity`` is only defined for
        undirected graphs, so edge directions are dropped first.

        Args:
            data: indexable collection of graphs accepted by ``to_networkx``.
        """
        for i in tqdm.tqdm(range(len(data))):
            # to_networkx yields a directed graph by default; networkx rejects those here
            G = to_networkx(data[i]).to_undirected()
            algebraic_connectivity = nx.algebraic_connectivity(G)
            self.algebraic_connectivity.append(algebraic_connectivity)

    def compute_curvature_gap(self, data):
        """Append the curvature gap of every graph in ``data`` to ``self.curvature_gap``.

        For each graph the Ollivier-Ricci curvature is computed, a
        two-component Gaussian mixture is fitted to the node curvature values,
        and the gap is ``|mu_1 - mu_2| / sqrt(0.5 * (sigma_1^2 + sigma_2^2))``.
        The result is stored as a plain float.

        Args:
            data: indexable collection of graphs accepted by ``to_networkx``.
        """
        for i in tqdm.tqdm(range(len(data))):
            G = to_networkx(data[i])
            rc = OllivierRicci(G, alpha=0.5, verbose="ERROR")
            rc.compute_ricci_curvature()

            # fit a Gaussian mixture model to the Ricci curvature distribution;
            # nodes that carry no curvature value are skipped
            curvature = np.array(
                [value for _, value in rc.G.nodes(data="ricciCurvature") if value is not None],
                dtype=float,
            ).reshape(-1, 1)
            gmm = GaussianMixture(n_components=2, covariance_type="full", random_state=0).fit(curvature)

            mean_gap = abs(gmm.means_[0, 0] - gmm.means_[1, 0])
            # with covariance_type="full" and a single feature, covariances_ has shape
            # (2, 1, 1) and already holds the variances sigma^2 - they must not be squared
            pooled_std = np.sqrt(0.5 * (gmm.covariances_[0, 0, 0] + gmm.covariances_[1, 0, 0]))
            self.curvature_gap.append(float(mean_gap / pooled_std))

    def compute_gini_coeff_curvature(self, data):
        """Append the Gini coefficient of the node curvature distribution of every graph in ``data``.

        Uses the rank formula ``sum_k (2k - n - 1) * x_k / (n * sum_k x_k)``
        on the Ollivier-Ricci node curvatures sorted in ascending order.
        Graphs whose curvatures sum to zero (or that have no curvature value)
        are recorded as NaN.

        NOTE(review): curvature values can be negative, in which case the
        result is not confined to [0, 1] - confirm whether the values should be
        shifted or taken in absolute value first.

        Args:
            data: indexable collection of graphs accepted by ``to_networkx``.
        """
        for i in tqdm.tqdm(range(len(data))):
            G = to_networkx(data[i])
            rc = OllivierRicci(G, alpha=0.5, verbose="ERROR")
            rc.compute_ricci_curvature()
            # the rank formula requires ascending order; nodes without a value are skipped
            curvature = sorted(
                float(value) for _, value in rc.G.nodes(data="ricciCurvature") if value is not None
            )
            n = len(curvature)
            total_curvature = sum(curvature)
            if total_curvature == 0:
                # the coefficient is undefined when the denominator vanishes
                gini_coeff_curvature = float("nan")
            else:
                weighted_sum = sum(
                    (2 * rank - n - 1) * value for rank, value in enumerate(curvature, start=1)
                )
                gini_coeff_curvature = weighted_sum / (n * total_curvature)
            self.gini_coeff_curvature.append(gini_coeff_curvature)

    def compute_variance_explained(self, data):
        """Append the share of feature variance explained by the first principal component.

        For every graph a one-component PCA is fitted on the node feature
        matrix (one row per node) and ``explained_variance_ratio_[0]`` is
        stored in ``self.variance_explained`` as a float.

        NOTE(review): assumes the node attributes are the rows of the graph's
        ``x`` matrix - confirm. Graphs without ``x`` are recorded as NaN.

        Args:
            data: indexable collection of graphs exposing node features as ``x``.
        """
        # `import sklearn` alone does not make the `decomposition` submodule available
        from sklearn.decomposition import PCA

        for i in tqdm.tqdm(range(len(data))):
            features = getattr(data[i], "x", None)
            if features is None:
                self.variance_explained.append(float("nan"))
                continue
            feature_matrix = torch.as_tensor(features).detach().cpu().numpy()
            # PCA expects a 2-D (num_nodes, num_features) array
            feature_matrix = feature_matrix.reshape(feature_matrix.shape[0], -1)
            pca = PCA(n_components=1)
            pca.fit(feature_matrix)
            variance_explained = float(pca.explained_variance_ratio_[0])
            self.variance_explained.append(variance_explained)

    def compute_max_clique(self, data):
        """Append, for every graph, the size of its largest clique divided by its number of nodes.

        Edge directions are ignored because clique enumeration is only defined
        for undirected graphs. The largest clique is taken over the maximal
        cliques returned by ``nx.find_cliques``. A graph without any node is
        recorded as 0.0.

        Args:
            data: indexable collection of graphs accepted by ``to_networkx``.
        """
        for i in tqdm.tqdm(range(len(data))):
            G = to_networkx(data[i]).to_undirected()
            num_nodes = G.number_of_nodes()
            if num_nodes == 0:
                self.max_clique.append(0.0)
                continue
            # graph_clique_number is no longer available in NetworkX 3.x; use find_cliques
            max_clique = max(len(clique) for clique in nx.find_cliques(G))
            self.max_clique.append(max_clique / num_nodes)

    def compute_circumference(self, data):
        """Append, for every graph, the length of its longest simple cycle divided by its number of nodes.

        The graph is converted to an undirected NetworkX graph and all simple
        cycles are enumerated; a graph without any cycle contributes 0.
        """
        for idx in tqdm.tqdm(range(len(data))):
            undirected = to_networkx(data[idx]).to_undirected()
            longest_cycle = max(
                (len(cycle) for cycle in nx.algorithms.cycles.simple_cycles(undirected)),
                default=0,
            )
            self.circumference.append(longest_cycle / undirected.number_of_nodes())

    def compute_all_attributes(self, data):
        self.compute_edge_density(data)
        self.compute_average_degree(data)
        self.compute_degree_assortativity(data)
        self.compute_pseudo_diameter(data)
        self.compute_relative_size_of_largest_component(data)
        self.compute_average_clustering_coefficient(data)
        self.compute_transitivity(data)
        self.compute_degeneracy(data)
        self.compute_gini_coeff_degree(data)
        #self.compute_edge_homogeneity(data)
        #self.compute_in_feature_similarity(data)
        #self.compute_out_feature_similarity(data)
        #self.compute_feature_angular_snr(data)
        #self.compute_homophily_measure(data)
        #self.compute_attribute_assortativity(data)
        self.compute_algebraic_connectivity(data)
        self.compute_curvature_gap(data)
        self.compute_gini_coeff_curvature(data)
        self.compute_variance_explained(data)
        self.compute_max_clique(data)
        self.compute_circumference(data)

    # visualize the distribution of each attribute
    def visualize_attributes(self):
        # only include attributes whose list is non-empty,
        # use 100 bins for the histogram
        plt.figure(figsize=(20, 20))
        plt.subplot(4, 4, 1)
        plt.hist(self.edge_density, bins = 100)
        plt.title("edge density")
        plt.subplot(4, 4, 2)
        plt.hist(self.average_degree, bins = 100)
        plt.title("average degree")
        plt.subplot(4, 4, 3)
        plt.hist(self.degree_assortativity, bins = 100)
        plt.title("degree assortativity")
        plt.subplot(4, 4, 4)
        plt.hist(self.pseudo_diameter, bins = 100)
        plt.title("pseudo diameter")
        plt.subplot(4, 4, 5)
        plt.hist(self.relative_size_of_largest_component, bins = 100)
        plt.title("relative size of largest component")
        plt.subplot(4, 4, 6)
        plt.hist(self.average_clustering_coefficient, bins = 100)
        plt.title("average clustering coefficient")
        plt.subplot(4, 4, 7)
        plt.hist(self.transitivity, bins = 100)
        plt.title("transitivity")
        plt.subplot(4, 4, 8)
        plt.hist(self.degeneracy, bins = 100)
        plt.title("degeneracy")
        plt.subplot(4, 4, 9)
        plt.hist(self.gini_coeff_degree, bins = 100)
        plt.title("gini coefficient of degree distribution")
        plt.subplot(4, 4, 10)
        plt.hist(self.algebraic_connectivity, bins = 100)
        plt.title("algebraic connectivity")
        plt.subplot(4, 4, 11)
        plt.hist(self.curvature_gap, bins = 100)
        plt.title("curvature gap")
        plt.subplot(4, 4, 12)
        plt.hist(self.gini_coeff_curvature, bins = 100)
        plt.title("gini coefficient of curvature distribution")
        plt.subplot(4, 4, 13)
        plt.hist(self.variance_explained, bins = 100)
        plt.title("variance explained by first principal component")
        plt.subplot(4, 4, 14)
        plt.hist(self.max_clique, bins = 100)
        plt.title("relative size of largest clique")
        plt.subplot(4, 4, 15)
        plt.hist(self.circumference, bins = 100)
        plt.title("relative size of largest cycle")

        plt.show()