In [None]:
import os
from google.colab import drive
# Mount Google Drive inside the Colab runtime at /content/gdrive.
drive.mount('/content/gdrive')
!pwd
# NOTE(review): this is a relative path, so it only resolves when the current
# working directory is the one that contains the `gdrive` mount point.
os.chdir('gdrive/My Drive/Graph_based_methods/HBcompare/')
!pwd

### Util.py

In [None]:
import networkx as nx
import numpy as np
import random
import scipy.sparse as sp
from sklearn.model_selection import StratifiedKFold
from scipy.sparse import isspmatrix

"""Adapted from https://github.com/weihua916/powerful-gnns/blob/master/util.py"""

class S2VGraph(object):
    def __init__(self, g, label, node_tags=None, node_features=None):
        """Container for one graph sample and the arrays derived from it.

        Args:
            g: a networkx graph.
            label: an integer graph label.
            node_tags: a list of integer node tags.
            node_features: accepted for interface compatibility only; the
                value passed in is not stored.

        Attributes initialised to placeholders here (load_data fills them):
            node_features: starts as 0.
            edge_mat: starts as 0.
            neighbors: starts as an empty list.
            max_neighbor: starts as 0.
        """
        # Placeholders that are overwritten after construction.
        self.edge_mat = 0
        self.node_features = 0
        self.max_neighbor = 0
        self.neighbors = []
        # Values supplied by the caller.
        self.g = g
        self.label = label
        self.node_tags = node_tags


def load_data(dataset, degree_as_tag,
              data_root='/content/gdrive/My Drive/Graph_based_methods/Graph_Transformer/dataset'):
    '''
        Read a graph dataset from "<data_root>/<dataset>/<dataset>.txt".

        dataset: name of dataset (used as both folder and file name)
        degree_as_tag: if True, replace every node tag with the node degree
        data_root: directory that contains one sub-folder per dataset

        Returns (g_list, num_classes): the list of S2VGraph objects with
        label, neighbors, max_neighbor, edge_mat and one-hot node_features
        filled in, and the number of distinct graph labels.
    '''

    print('loading data')
    g_list = []
    label_dict = {}
    feat_dict = {}

    with open('%s/%s/%s.txt' % (data_root, dataset, dataset), 'r') as f:
        n_g = int(f.readline().strip())
        for _ in range(n_g):
            # Graph header line: "<number of nodes> <graph label>".
            n, l = [int(w) for w in f.readline().strip().split()]
            if l not in label_dict:
                label_dict[l] = len(label_dict)
            g = nx.Graph()
            node_tags = []
            for j in range(n):
                g.add_node(j)
                row = f.readline().strip().split()
                # Node line: "<tag> <num neighbours> <neighbour ids...>".
                # Any tokens after the neighbour ids are extra node
                # attributes; they are not used by this loader.
                n_fields = int(row[1]) + 2
                row = [int(w) for w in row[:n_fields]]
                if row[0] not in feat_dict:
                    feat_dict[row[0]] = len(feat_dict)
                node_tags.append(feat_dict[row[0]])

                for neighbor in row[2:]:
                    g.add_edge(j, neighbor)

            assert len(g) == n

            g_list.append(S2VGraph(g, l, node_tags))

    # add labels and edge_mat
    for g in g_list:
        g.neighbors = [[] for _ in range(len(g.g))]
        for i, j in g.g.edges():
            g.neighbors[i].append(j)
            g.neighbors[j].append(i)
        # default=0 keeps a graph without nodes from crashing max().
        g.max_neighbor = max((len(nb) for nb in g.neighbors), default=0)

        g.label = label_dict[g.label]

        # Store every undirected edge in both directions.
        edges = [list(pair) for pair in g.g.edges()]
        edges.extend([[i, j] for j, i in edges])

        # reshape(-1, 2) keeps the result 2 x E even when there are no edges.
        g.edge_mat = np.array(edges, dtype=np.int32).reshape(-1, 2).T

    if degree_as_tag:
        for g in g_list:
            g.node_tags = list(dict(g.g.degree).values())

    # Extracting unique tag labels
    tagset = set([])
    for g in g_list:
        tagset = tagset.union(set(g.node_tags))

    tagset = list(tagset)
    tag2index = {tagset[i]: i for i in range(len(tagset))}

    # One-hot encode each node's tag as its feature vector.
    for g in g_list:
        g.node_features = np.zeros((len(g.node_tags), len(tagset)), dtype=np.float32)
        g.node_features[range(len(g.node_tags)), [tag2index[tag] for tag in g.node_tags]] = 1

    print('# classes: %d' % len(label_dict))
    print('# maximum node tag: %d' % len(tagset))

    print("# data: %d" % len(g_list))

    return g_list, len(label_dict)


def separate_data(graph_list, fold_idx, seed=0):
    """Return the (train, test) graph lists of one fold of a 10-fold split.

    The split is stratified on each graph's ``label`` and shuffled with
    ``seed``; ``fold_idx`` selects which of the 10 folds is the test fold.
    """
    assert 0 <= fold_idx < 10, "fold_idx must be from 0 to 9."

    labels = [graph.label for graph in graph_list]
    splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
    # Only the labels drive the stratification, so a dummy X is enough.
    folds = list(splitter.split(np.zeros(len(labels)), labels))
    train_idx, test_idx = folds[fold_idx]

    train_graph_list = [graph_list[pos] for pos in train_idx]
    test_graph_list = [graph_list[pos] for pos in test_idx]
    return train_graph_list, test_graph_list
    

"""Get indexes of train and test sets"""
def separate_data_idx(graph_list, fold_idx, seed=0, n_splits=5):
    """Return (train_idx, test_idx) index arrays for one stratified fold.

    Args:
        graph_list: objects exposing a ``label`` attribute.
        fold_idx: which fold is the test fold, from 0 to ``n_splits - 1``.
        seed: random state used to shuffle before splitting.
        n_splits: number of cross-validation folds (default 5).
    """
    # The accepted range must match the number of folds that are built;
    # otherwise an out-of-range fold only fails later with an IndexError.
    assert 0 <= fold_idx < n_splits, "fold_idx must be from 0 to %d." % (n_splits - 1)
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)

    labels = [graph.label for graph in graph_list]
    # Only the labels drive the stratification, so a dummy X is enough.
    idx_list = list(skf.split(np.zeros(len(labels)), labels))
    train_idx, test_idx = idx_list[fold_idx]

    return train_idx, test_idx

"""Convert sparse matrix to tuple representation."""
def sparse_to_tuple(sparse_mx):
    """Convert a sparse matrix (or a list of them) to (coords, values, shape).

    A list argument is converted element by element in place and the same
    list object is returned.
    """
    def _as_triplet(matrix):
        coo = matrix if sp.isspmatrix_coo(matrix) else matrix.tocoo()
        # One (row, col) pair per stored value.
        return np.vstack((coo.row, coo.col)).transpose(), coo.data, coo.shape

    if not isinstance(sparse_mx, list):
        return _as_triplet(sparse_mx)

    for position, matrix in enumerate(sparse_mx):
        sparse_mx[position] = _as_triplet(matrix)
    return sparse_mx


# data_name = "dataset1"
# use_degree_as_tag = False

# graphs, num_classes = load_data(data_name, use_degree_as_tag)

### GCN_layer.py

In [None]:
# %tensorflow_version 1.x
!pip install tensorflow==1.15.0
import tensorflow as tf
print(tf.__version__)
import numpy as np

'''
    Thomas N. Kipf, Max Welling. 2017. Semi-Supervised Classification with Graph Convolutional Networks. ICLR.
    Modified from https://github.com/tkipf/gcn/blob/master/gcn/layers.py
'''

def uniform(shape, scale=0.05, name=None):
    """Uniform init: a float32 variable drawn from [-scale, scale)."""
    # Use the compat.v1 entry point, matching glorot() and the rest of the file.
    initial = tf.compat.v1.random_uniform(shape, minval=-scale, maxval=scale, dtype=tf.float32)
    return tf.Variable(initial, name=name)

def glorot(shape, name=None):
    """Glorot & Bengio (AISTATS 2010) init.

    Draws a float32 variable uniformly from [-r, r) with
    r = sqrt(6 / (shape[0] + shape[1])).
    """
    fan_in, fan_out = shape[0], shape[1]
    limit = np.sqrt(6.0 / (fan_in + fan_out))
    return tf.Variable(
        tf.compat.v1.random_uniform(shape, minval=-limit, maxval=limit, dtype=tf.float32),
        name=name)

def zeros(shape, name=None):
    """Return a float32 variable of the given shape filled with zeros."""
    return tf.Variable(tf.zeros(shape, dtype=tf.float32), name=name)

def ones(shape, name=None):
    """Return a float32 variable of the given shape filled with ones."""
    return tf.Variable(tf.ones(shape, dtype=tf.float32), name=name)

# TF1-style command-line flags handle.
# NOTE(review): FLAGS is not referenced anywhere else in the visible part of
# this file — confirm whether it is still needed.
flags = tf.compat.v1.flags
FLAGS = flags.FLAGS

# Per-name counters used to hand out unique layer IDs.
_LAYER_UIDS = {}

def get_layer_uid(layer_name=''):
    """Return the next unique ID for ``layer_name`` (1 on first use)."""
    next_uid = _LAYER_UIDS.get(layer_name, 0) + 1
    _LAYER_UIDS[layer_name] = next_uid
    return next_uid

def sparse_dropout(x, keep_prob, noise_shape):
    """Dropout for sparse tensors.

    Retains entries of ``x`` according to a random boolean mask of shape
    ``noise_shape`` and rescales the result by 1 / keep_prob.
    """
    # floor(keep_prob + U[0, 1)) is 1 with probability keep_prob, else 0.
    keep_mask = tf.cast(
        tf.floor(keep_prob + tf.compat.v1.random_uniform(noise_shape)),
        dtype=tf.bool)
    retained = tf.compat.v1.sparse_retain(x, keep_mask)
    return retained * (1./keep_prob)

def dot(x, y, sparse=False):
    """Matrix product of x and y; ``sparse=True`` selects the sparse-dense op."""
    if not sparse:
        return tf.matmul(x, y)
    return tf.compat.v1.sparse_tensor_dense_matmul(x, y)

class Layer(object):
    """Base layer class. Defines basic API for all layer objects.
    Implementation inspired by keras (http://keras.io).

    # Properties
        name: String, used as the name scope of the layer. When omitted, it
            is generated from the lower-cased class name and a unique ID.
        logging: Boolean, switches Tensorflow histogram logging on/off

    # Methods
        _call(inputs): Defines computation graph of layer
            (i.e. takes input, returns output)
        __call__(inputs): Wrapper for _call()
        _log_vars(): Log all variables
    """

    def __init__(self, **kwargs):
        permitted = {'name', 'logging'}
        for key in kwargs:
            assert key in permitted, 'Invalid keyword argument: ' + key

        layer_name = kwargs.get('name')
        if not layer_name:
            prefix = self.__class__.__name__.lower()
            layer_name = prefix + '_' + str(get_layer_uid(prefix))

        self.name = layer_name
        self.vars = {}
        self.logging = kwargs.get('logging', False)
        self.sparse_inputs = False

    def _call(self, inputs):
        # Identity by default; subclasses override this.
        return inputs

    def __call__(self, inputs):
        with tf.name_scope(self.name):
            # Input histograms are only written for dense inputs.
            log_inputs = self.logging and not self.sparse_inputs
            if log_inputs:
                tf.compat.v1.summary.histogram(self.name + '/inputs', inputs)
            outputs = self._call(inputs)
            if self.logging:
                tf.compat.v1.summary.histogram(self.name + '/outputs', outputs)
        return outputs

    def _log_vars(self):
        for var_name, tensor in self.vars.items():
            tf.compat.v1.summary.histogram(self.name + '/vars/' + var_name, tensor)


class GraphConvolution(Layer):
    """Graph convolution layer: act(adj @ (dropout(x) @ W) [+ b]).

    Args:
        input_dim: number of input feature columns (rows of the weights).
        output_dim: number of output feature columns.
        placeholders: dict providing 'adj', 'num_features_nonzero' and,
            when ``dropout`` is truthy, 'dropout'.
        dropout: truthy to take the dropout rate from the placeholder,
            falsy to disable dropout.
        sparse_inputs: True when the layer input is a sparse tensor.
        act: activation applied to the output.
        bias: whether to add a learned bias vector.
        featureless: when True the input is ignored and the weights are
            used directly as the pre-aggregation features.
        **kwargs: forwarded to Layer ('name', 'logging').
    """
    def __init__(self, input_dim, output_dim, placeholders, dropout=0.,
                 sparse_inputs=False, act=tf.nn.relu, bias=False,
                 featureless=False, **kwargs):
        super(GraphConvolution, self).__init__(**kwargs)

        # `dropout` is only a switch; the actual rate lives in the placeholder.
        self.dropout = placeholders['dropout'] if dropout else 0.

        self.act = act
        self.adj = placeholders['adj']
        self.sparse_inputs = sparse_inputs
        self.featureless = featureless
        self.bias = bias

        # helper variable for sparse dropout
        self.num_features_nonzero = placeholders['num_features_nonzero']

        with tf.compat.v1.variable_scope(self.name + '_vars'):
            self.vars['weights'] = glorot([input_dim, output_dim], name='weights')
            if self.bias:
                self.vars['bias'] = zeros([output_dim], name='bias')

        if self.logging:
            self._log_vars()

    def _call(self, inputs):
        keep_prob = 1 - self.dropout

        # dropout
        if self.sparse_inputs:
            x = sparse_dropout(inputs, keep_prob, self.num_features_nonzero)
        else:
            # Pass keep_prob by keyword through the compat.v1 API so the value
            # can never be read as a drop *rate* by a different TF version.
            x = tf.compat.v1.nn.dropout(inputs, keep_prob=keep_prob)

        # convolve
        if not self.featureless:
            pre_sup = dot(x, self.vars['weights'], sparse=self.sparse_inputs)
        else:
            pre_sup = self.vars['weights']
        # The adjacency is always multiplied through the sparse-dense op.
        output = dot(self.adj, pre_sup, sparse=True)

        # bias
        if self.bias:
            output += self.vars['bias']

        return self.act(output)

### Model_unsup_gcn.py

In [None]:
class GCN_graph_cls(object):
    """Stack of GCN layers trained with a sampled-softmax loss.

    The outputs of all layers are concatenated per node and scored against
    an embedding matrix with one row per class index in ``vocab_size``;
    ``input_y`` supplies the target index for every row of the input.

    Args:
        feature_dim_size: number of columns of the sparse input features.
        hidden_size: output width of every GCN layer.
        num_GNN_layers: number of stacked GCN layers.
        num_sampled: number of negative classes sampled by the loss.
        vocab_size: number of rows of the embedding matrix.
    """
    def __init__(self, feature_dim_size, hidden_size, num_GNN_layers, num_sampled, vocab_size):
        # Placeholders for input, output
        self.Adj_block = tf.compat.v1.sparse_placeholder(tf.float32, [None, None], name="Adj_block")
        self.X_concat = tf.compat.v1.sparse_placeholder(tf.float32, [None, feature_dim_size], name="X_concat")
        self.num_features_nonzero = tf.compat.v1.placeholder(tf.int32, name="num_features_nonzero")
        self.dropout = tf.compat.v1.placeholder(tf.float32, name="dropout")
        self.input_y = tf.compat.v1.placeholder(tf.int32, [None, 1], name="input_y")

        self.placeholders = {
            'adj': self.Adj_block,
            'dropout': self.dropout,
            'num_features_nonzero': self.num_features_nonzero
        }

        self.input = self.X_concat   # set hidden_size = feature_dim_size if not tuning sizes of hidden stacked layers
        in_hidden_size = feature_dim_size
        print('in_hidden_size = ', in_hidden_size)
        self.output_vectors = []
        # Construct k GNN layers
        for idx_layer in range(num_GNN_layers):
            # Only the first layer consumes the sparse feature placeholder.
            gcn_gnn = GraphConvolution(input_dim=in_hidden_size,
                                       output_dim=hidden_size,
                                       placeholders=self.placeholders,
                                       act=tf.nn.relu,
                                       dropout=True,
                                       sparse_inputs=(idx_layer == 0))

            in_hidden_size = hidden_size

            # run --> output --> input for next layer
            self.input = gcn_gnn(self.input)
            self.output_vectors.append(self.input)

        self.output_vectors = tf.concat(self.output_vectors, axis=1)
        # keep_prob is passed by keyword through compat.v1 so the value can
        # never be read as a drop *rate* by a different TF version.
        self.output_vectors = tf.compat.v1.nn.dropout(self.output_vectors, keep_prob=1-self.dropout)

        with tf.name_scope("embedding"):
            self.embedding_matrix = glorot([vocab_size, hidden_size*num_GNN_layers], name='node_embeddings')
            self.softmax_biases = tf.Variable(tf.zeros([vocab_size]))

        self.total_loss = tf.reduce_mean(
            tf.nn.sampled_softmax_loss(weights=self.embedding_matrix, biases=self.softmax_biases,
                                       inputs=self.output_vectors, labels=self.input_y, num_sampled=num_sampled, num_classes=vocab_size))

        # compat.v1 entry points, consistent with the rest of this file.
        self.saver = tf.compat.v1.train.Saver(tf.compat.v1.global_variables(), max_to_keep=500)
        tf.compat.v1.logging.info('Seting up the main structure')

### Calculate Metrics

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn import preprocessing

def cal_acc_only(predicted, gt):
  """Return the fraction of positions where predicted[i] equals gt[i]."""
  hits = sum(1 for pos, guess in enumerate(predicted) if guess == gt[pos])
  return hits/len(predicted)

def calculate_acc(predicted, gt):
  """Return (accuracy, macro AUC, macro precision, macro recall, macro F1).

  predicted: predicted class labels.
  gt: ground-truth class labels, aligned with ``predicted``.

  AUC, precision, recall and F1 are computed on binarised labels; the
  binarizer is fitted on ``gt`` only.
  """
  # Reuse the shared helper instead of repeating the counting loop.
  acc = cal_acc_only(predicted, gt)

  lb = preprocessing.LabelBinarizer()
  lb.fit(gt)

  gt_binary = lb.transform(gt)
  predicted_binary = lb.transform(predicted)

  auc = roc_auc_score(gt_binary, predicted_binary, average = 'macro')
  # The fourth return value (support) is not needed here.
  precision, recall, f1score, _ = precision_recall_fscore_support(gt_binary, predicted_binary, average = 'macro')

  return acc, auc, precision, recall, f1score

### GCN Graph Setup Utility

In [None]:
#! /usr/bin/env python
!pip install keras
!pip install keract

def FCN_classify(X_train, X_test, label_train, label_test, num_classes):
  """Train a one-hidden-layer dense classifier and predict labels for X_test.

  X_train, label_train: training features and integer class labels.
  X_test: features to classify.
  label_test: not used; kept so existing callers keep working.
  num_classes: width of the softmax output layer.

  Returns the arg-max class index for every row of X_test.
  """
  model = models.Sequential()
  model.add(layers.Dense(128, activation='relu'))
  model.add(layers.Dense(num_classes, activation='softmax'))

  # The last layer already applies softmax, so the loss must be told that it
  # receives probabilities rather than logits.
  model.compile(optimizer='adam',
                loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
                metrics=['accuracy'])

  model.fit(X_train, label_train, epochs = 166, verbose =0)
  # No input shape is declared, so the model is only built by fit();
  # print the summary afterwards.
  model.summary()

  predictions = model.predict(X_test)
  predicted_labels = np.argmax(predictions, axis = 1)

  return predicted_labels

  

def get_Adj_matrix(batch_graph):
    """Build the block-diagonal adjacency (with self-loops) of a graph batch.

    Each graph's ``edge_mat`` is shifted by the number of nodes of the
    graphs before it, and one self-loop entry is added for every node.
    Returns a scipy COO matrix of shape (total_nodes, total_nodes).
    """
    shifted_edges = []
    offset = 0
    for graph in batch_graph:
        shifted_edges.append(graph.edge_mat + offset)
        offset += len(graph.g)
    num_node = offset

    # self-loop
    loops = np.array([range(num_node), range(num_node)])

    block_idx = np.concatenate([np.concatenate(shifted_edges, 1), loops], 1)
    # Every edge entry and every self-loop entry carries weight 1.
    block_val = np.ones(block_idx.shape[1])

    return coo_matrix((block_val, block_idx), shape=(num_node, num_node))

def get_graphpool(batch_graph):
    """Return a (num_graphs, total_nodes) COO matrix mapping graphs to nodes.

    Entry (i, j) is 1 when node j (in the concatenated node numbering of the
    batch) belongs to graph i.
    """
    # boundaries[i] is the index of the first node of graph i.
    boundaries = [0]
    for graph in batch_graph:
        boundaries.append(boundaries[-1] + len(graph.g))

    weights = np.array([1] * boundaries[-1])
    membership = np.array([
        [graph_id, node]
        for graph_id in range(len(batch_graph))
        for node in range(boundaries[graph_id], boundaries[graph_id + 1])
    ])

    return coo_matrix((weights, (membership[:, 0], membership[:, 1])),
                      shape=(len(batch_graph), boundaries[-1]))
  
def get_idx_nodes(selected_graph_idx):
    """Return, as a column vector, the node indices of the selected graphs.

    Reads the module-level ``graph_pool`` matrix: for every selected graph
    row, the columns whose value equals 1 are collected in order.
    """
    per_graph = []
    for graph_id in selected_graph_idx:
        dense_row = graph_pool.getrow(graph_id).toarray()[0]
        per_graph.append(np.where(dense_row == 1)[0])
    return np.reshape(np.concatenate(per_graph), (-1, 1))

def get_batch_data(batch_graph):
    """Assemble the model inputs for a batch of graphs.

    Returns (Adj_block, X_concat, num_features_nonzero), where the first two
    are (coords, values, shape) tuples and the last is the shape of the
    feature value array.
    """
    # features
    stacked = np.concatenate([graph.node_features for graph in batch_graph], 0)
    if "REDDIT" in args.dataset:
        # Repeat the feature columns feature_dim_size times, then scale down.
        stacked = np.tile(stacked, feature_dim_size)
        stacked = stacked * 0.01
    X_concat = sparse_to_tuple(coo_matrix(stacked))

    # adj
    Adj_block = sparse_to_tuple(get_Adj_matrix(batch_graph))

    return Adj_block, X_concat, X_concat[1].shape

class Batch_Loader(object):
    """Callable that draws one random batch from the module-level ``graphs``."""

    def __call__(self):
        # The first batch_size entries of a random permutation form the batch.
        chosen = np.random.permutation(len(graphs))[:args.batch_size]
        batch = [graphs[position] for position in chosen]
        batch_inputs = get_batch_data(batch)
        # Append the node indices of the chosen graphs as the fourth element.
        return batch_inputs + (get_idx_nodes(chosen),)



### Parameters

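Most of the parameters defined here control training. The exception is `dataset`: the training loop does not take its data sets from this argument, but iterates over the `all_datasets` list defined in the training cell, so change that list to choose which data sets are trained.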

In [None]:
import os
import time
import datetime
import pickle as cPickle
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter


# Parameters
# ==================================================

def _str2bool(value):
    """Parse a command-line boolean such as 'true', 'False', '1' or 'no'.

    argparse's ``type=bool`` treats every non-empty string (including
    'False') as True, so the boolean options use this converter instead.
    Raises ValueError for unrecognised text, which argparse reports as a
    usage error.
    """
    if isinstance(value, bool):
        return value
    text = str(value).strip().lower()
    if text in ('1', 'true', 't', 'yes', 'y'):
        return True
    if text in ('0', 'false', 'f', 'no', 'n'):
        return False
    raise ValueError("expected a boolean value, got %r" % (value,))


parser = ArgumentParser("GCN_Unsup", formatter_class=ArgumentDefaultsHelpFormatter, conflict_handler='resolve')

parser.add_argument("--run_folder", default="../", help="")
parser.add_argument("--dataset", default="dataset1", help="Name of the dataset.")
parser.add_argument("--learning_rate", default=0.001, type=float, help="Learning rate")
parser.add_argument("--batch_size", default = 1, type=int, help="Batch Size")
parser.add_argument("--num_epochs", default = 50, type=int, help="Number of training epochs")
parser.add_argument("--saveStep", default=1, type=int, help="")
parser.add_argument("--allow_soft_placement", default=False, type=_str2bool, help="Allow device soft device placement")
parser.add_argument("--log_device_placement", default=False, type=_str2bool, help="Log placement of ops on devices")
parser.add_argument("--model_name", default='MUTAG', help="")
parser.add_argument("--dropout", default=0.5, type=float, help="Dropout")
parser.add_argument("--num_GNN_layers", default=6, type=int, help="Number of stacked layers")
parser.add_argument("--hidden_size", default=64, type=int, help="size of hidden layers")
parser.add_argument('--num_sampled', default=512, type=int, help='')
# In a notebook there is no real command line, so parse an empty argument
# list and rely on the defaults above.
# args = parser.parse_args()
args = parser.parse_args(args = [])

print(args)

# Load the dataset named by --dataset and derive the globals used later.
print("Loading data...")

# These datasets use the node degree as the node tag.
use_degree_as_tag = args.dataset in ('COLLAB', 'IMDBBINARY', 'IMDBMULTI')
graphs, num_classes = load_data(args.dataset, use_degree_as_tag)
feature_dim_size = graphs[0].node_features.shape[1]
graph_labels = np.array([graph.label for graph in graphs])
# REDDIT datasets use a fixed feature width instead of the loaded one.
if "REDDIT" in args.dataset:
    feature_dim_size = 4

### Train HBcompare

run_HBCompare_training() function runs the training of graph features in the GCN. 

Overview:
- Input: The function expects that a dataset has already been loaded, the graphs have been loaded as a graph_pool, and a batch loader created as batch_nodes.

- Output: Performance scores. More scores can be added to be appended to an array while running through iterations, and then printed at the end.

1. The GCN first creates a session and sets up the GCN architecture
2. An outer for loop goes through all 50 epochs
3. 2 inner for loops:
  - First loop does unsupervised training through all graph batches and updates the loss function
  - 2nd loop does a 5-fold CV and gives the average and stdev of all folds

*** Warning: Google Colab times out after a period without interaction, and 10 iterations for one dataset can take about 2 hrs (650 s × 10 iterations), so all 5 datasets can take 7–8 hrs. It is recommended to run one dataset at a time and to check for the captcha every once in a while, or to run in a local Jupyter notebook.

In [None]:
import tensorflow as tf
import numpy as np
np.random.seed(123)
tf.compat.v1.set_random_seed(123)

import os
import time
import datetime
import pickle as cPickle
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
from scipy.sparse import coo_matrix
import statistics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import learning_curve
from sklearn.svm import SVC

from tensorflow import keras
from tensorflow.keras import datasets, layers, models
import matplotlib.pyplot as plt
import numpy as np
import scipy.io
from tensorflow.keras import backend as K
from keract import get_activations
import keract

### Datasets to loop through for training in the list - 
### Warning: google colab times out after a while, and 10 iteration loops for a dataset can take 2 hrs (650s * 10 iterations) -> for all 5 datasets can take 7-8 hrs

# all_datasets = ["dataset1","dataset2","dataset3","dataset4","dataset5"]
all_datasets = ["dataset2"]


# Training
# ==================================================
def run_HBCompare_training():
    """Train the unsupervised GCN and score its embeddings every epoch.

    Reads the module-level ``args``, ``feature_dim_size``, ``graphs``,
    ``graph_labels``, ``graph_pool`` and ``batch_nodes``.

    Each epoch first runs the unsupervised training batches, then pools the
    learned node embeddings into graph embeddings and evaluates them with a
    5-fold cross-validated logistic regression. On the last epoch the
    per-fold accuracy, AUC and F1 are also recorded.

    Returns:
        (acc_all, auc_all, f1score_all, mean_each_epoch, std_each_epoch,
        time_each_epoch, total_time). ``mean_each_epoch`` and
        ``std_each_epoch`` hold percentages formatted as strings.
    """
    num_folds = 5

    predicted_labels = []
    acc_all = []
    auc_all = []
    f1score_all = []

    mean_each_epoch = []
    std_each_epoch = []
    time_each_epoch = []

    with tf.Graph().as_default():
        # Setup tensorflow session
        session_conf = tf.compat.v1.ConfigProto(allow_soft_placement=args.allow_soft_placement, log_device_placement=args.log_device_placement)
        session_conf.gpu_options.allow_growth = True
        # Entering the session as a context manager makes it the default
        # session and closes it on exit, so repeated runs do not accumulate
        # open sessions.
        with tf.compat.v1.Session(config=session_conf) as sess:
            global_step = tf.Variable(0, name="global_step", trainable=False)
            print("Feature_dim_size = ", feature_dim_size)
            print("Hidden_size = ", args.hidden_size)
            print("Graph_pool_shape = ", graph_pool.shape[0], " ... ", graph_pool.shape[1])
            print("num_sampled = ", args.num_sampled)

            unsup_gcn = GCN_graph_cls(feature_dim_size=feature_dim_size,
                                      hidden_size=args.hidden_size,
                                      num_GNN_layers=args.num_GNN_layers,
                                      vocab_size=graph_pool.shape[1],
                                      num_sampled=args.num_sampled,
                                      )

            # Define Training procedure
            optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=args.learning_rate)
            grads_and_vars = optimizer.compute_gradients(unsup_gcn.total_loss)
            train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

            # Folder to save run logs
            out_dir = os.path.abspath(os.path.join(args.run_folder, "../runs_GCN_UnSup", args.model_name))
            print("Writing to {}\n".format(out_dir))

            # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
            checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
            os.makedirs(checkpoint_dir, exist_ok=True)

            # Initialize all variables
            sess.run(tf.compat.v1.global_variables_initializer())
            graph = tf.compat.v1.get_default_graph()
            # The embedding variable created inside the "embedding" name scope.
            embedding_tensor = graph.get_tensor_by_name('embedding/node_embeddings:0')

            # Setup training step for unsupervised training of graph batches
            def train_step(Adj_block, X_concat, num_features_nonzero, idx_nodes):
                feed_dict = {
                    unsup_gcn.Adj_block: Adj_block,
                    unsup_gcn.X_concat: X_concat,
                    unsup_gcn.num_features_nonzero: num_features_nonzero,
                    unsup_gcn.dropout: args.dropout,
                    unsup_gcn.input_y: idx_nodes
                }
                _, _, loss = sess.run([train_op, global_step, unsup_gcn.total_loss], feed_dict)
                return loss

            num_batches_per_epoch = int((len(graphs) - 1) / args.batch_size) + 1

            start_time = time.time()

            # The fold splits depend only on the labels and a fixed seed, so
            # compute them once instead of once per epoch.
            fold_splits = [separate_data_idx(graphs, fold_idx) for fold_idx in range(num_folds)]

            for epoch in range(1, args.num_epochs+1):

                # feature update using loss function - loss update is unsupervised
                loss = 0
                for _ in range(num_batches_per_epoch):
                    Adj_block, X_concat, num_features_nonzero, idx_nodes = batch_nodes()
                    loss += train_step(Adj_block, X_concat, num_features_nonzero, idx_nodes)
                print(loss)

                # Pool node embeddings into one embedding per graph.
                node_embeddings = sess.run(embedding_tensor)
                graph_embeddings = graph_pool.dot(node_embeddings)

                # Accuracy of every CV fold of this epoch.
                fold_accuracies = []
                for train_idx, test_idx in fold_splits:
                    train_graph_embeddings = graph_embeddings[train_idx]
                    test_graph_embeddings = graph_embeddings[test_idx]
                    train_labels = graph_labels[train_idx]
                    test_labels = graph_labels[test_idx]

                    # Logistic regression on the graph embeddings
                    cls = LogisticRegression(tol=0.001, max_iter = 2000)
                    cls.fit(train_graph_embeddings, train_labels)

                    predicted = cls.predict(test_graph_embeddings)
                    fold_accuracies.append(cal_acc_only(predicted, test_labels))

                    # On last epoch, save final scores
                    if epoch == args.num_epochs:
                        predicted_labels.append(predicted)

                        acc, auc, precision, recall, f1score = calculate_acc(predicted, test_labels)
                        acc_all.append(acc)
                        auc_all.append(auc)
                        f1score_all.append(f1score)

                # After CV folds, get mean accuracy and append to list - organized per epoch
                mean_folds = statistics.mean(fold_accuracies)
                std_folds = statistics.stdev(fold_accuracies)

                print('epoch ', epoch, ' mean: ', str(mean_folds*100), ' std: ', str(std_folds*100))

                mean_each_epoch.append(str(mean_folds*100))
                std_each_epoch.append(str(std_folds*100))
                time_each_epoch.append(time.time()-start_time)

    end_time = time.time()
    total_time = end_time - start_time
    print("--- %s seconds ---" % total_time)

    print('acc_all', acc_all)
    print('f1_all', f1score_all)
    print('auc_all', auc_all)

    return acc_all, auc_all, f1score_all, mean_each_epoch, std_each_epoch, time_each_epoch, total_time


# Run the training several times per dataset and collect the results.
# all_data receives 7 consecutive entries per dataset, in this order:
# acc, auc, f1score, mean per epoch, std per epoch, time per epoch, total time.
all_data = []
# NOTE: `iter` shadows the builtin; the name is kept because the results cell
# below reads it.
iter = 5
for dataset_name in all_datasets:
  all_acc = []
  all_auc = []
  all_f1score = []
  all_mean_each_epoch = []
  all_std_each_epoch = []
  all_time_each_epoch = []
  all_time = []

  # Decide the tagging mode from the dataset that is actually being trained.
  degree_as_tag = dataset_name in ('COLLAB', 'IMDBBINARY', 'IMDBMULTI')

  # The loaded data is the same for every repetition, so read it once.
  graphs, num_classes = load_data(dataset_name, degree_as_tag)
  feature_dim_size = graphs[0].node_features.shape[1]
  graph_labels = np.array([graph.label for graph in graphs])

  graph_pool = get_graphpool(graphs)
  batch_nodes = Batch_Loader()

  print("Loading data... finished!")
  print(dataset_name, " loaded")

  for _ in range(iter):
    acc_all, auc_all, f1score_all, mean_each_epoch, std_each_epoch, time_each_epoch, total_time = run_HBCompare_training()
    all_acc.append(acc_all)
    all_auc.append(auc_all)
    all_f1score.append(f1score_all)
    all_mean_each_epoch.append(mean_each_epoch)
    all_std_each_epoch.append(std_each_epoch)
    all_time_each_epoch.append(time_each_epoch)
    all_time.append(total_time)

  all_data.extend([all_acc, all_auc, all_f1score, all_mean_each_epoch,
                   all_std_each_epoch, all_time_each_epoch, all_time])

In [None]:
# Organizing results from HBcompare
# all_data holds 7 consecutive entries per dataset (acc, auc, f1score,
# mean per epoch, std per epoch, time per epoch, total time), so the entries
# of dataset k start at index 7 * k.
for dataset_idx in range(len(all_datasets)):
  base = 7 * dataset_idx
  (all_acc, all_auc, all_f1score, all_mean_each_epoch,
   all_std_each_epoch, all_time_each_epoch, all_time) = all_data[base:base + 7]

  print("ACC:", np.mean(all_acc), " STD:", np.std(all_acc))
  print("AUC:", np.mean(all_auc), " STD:", np.std(all_auc))
  print("F1:", np.mean(all_f1score), " STD:", np.std(all_f1score))

  print(all_mean_each_epoch)

  # Average every per-epoch series over the repeated runs.
  num_runs = len(all_mean_each_epoch)
  array_acc = []
  array_std = []
  array_time = []
  for epoch_idx in range(args.num_epochs):
    array_acc.append(sum(float(run[epoch_idx]) for run in all_mean_each_epoch)/num_runs)
    array_std.append(sum(float(run[epoch_idx]) for run in all_std_each_epoch)/num_runs)
    array_time.append(sum(float(run[epoch_idx]) for run in all_time_each_epoch)/num_runs)

  # Plot time vs accuracy
  plt.figure(2*dataset_idx + 1)
  plt.plot(array_time, array_acc)
  plt.xlabel("Time (s)")
  plt.ylabel("Accuracy")
  plt.yticks(np.arange(75,105,5))
  plt.show()

  # Plot epoch vs accuracy; the x axis follows the configured epoch count.
  plt.figure(2*dataset_idx + 2)
  plt.plot(range(args.num_epochs), array_acc)
  plt.xlabel("Epoch")
  plt.ylabel("Accuracy")
  plt.yticks(np.arange(75,105,5))
  plt.show()


Plot time vs accuracy and epoch vs accuracy figures

Save arrays to a csv file

- rewrite_array() rewrites an embedded array to a single array format for writing to csv

In [None]:
import csv

# Write data to a csv file

def rewrite_array(all_array):
  """Flatten a nested sequence into one flat list of floats.

  Args:
    all_array: a sequence of sequences; every element must be accepted by
      float() (numbers or numeric strings).

  Returns:
    A new list with all elements in row-major order, each converted to float.
  """
  return [float(value) for row in all_array for value in row]
# Dump the per-epoch averages and the flattened F1 / AUC lists, one CSV row each.
with open('time_' + args.dataset + '_with_svm.csv', 'w', newline='') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter=',', quoting=csv.QUOTE_MINIMAL)
    # Row order: accuracy, elapsed time, std, then F1 and AUC.
    spamwriter.writerows([array_acc, array_time, array_std])
    spamwriter.writerow(rewrite_array(all_f1score))
    spamwriter.writerow(rewrite_array(all_auc))


### HBCompare coord data setup



In [None]:
def Vec2Matrix(X):
  """Reshape each flattened row of X into a square matrix.

  Args:
    X: 2-D array of shape (num_graphs, M*M); each row is a flattened
      M-by-M matrix in row-major order.

  Returns:
    A new float64 array of shape (num_graphs, M, M).

  Raises:
    ValueError: if the row length is not a perfect square.
  """
  num_graphs, M_square = X.shape
  # Compute the side length once and verify it, instead of truncating the
  # square root and failing later inside reshape.
  M_matrix = int(round(np.sqrt(M_square)))
  if M_matrix * M_matrix != M_square:
    raise ValueError(
        "row length %d is not a perfect square; cannot reshape to a square matrix" % M_square)

  # np.array copies, so the result never aliases the caller's X.
  return np.array(X, dtype=np.float64).reshape(num_graphs, M_matrix, M_matrix)


def load_RBF_mat(dataset):
  """Load the feature matrices and labels of one dataset from .mat files.

  Args:
    dataset: dataset name; an underscore is inserted before its last
      character to form the file stem (e.g. 'dataset1' -> 'dataset_1').

  Returns:
    (X_feature, label_all): the 'X_feature_tensor' entry of the feature file
    (passed through Vec2Matrix when it is 2-D) and the first row of the
    'label_all' entry of the label file.
  """
  dataset_ = '_'.join((dataset[:-1], dataset[-1]))
  dataset_name = '/content/gdrive/My Drive/Graph_based_methods/Graph_Transformer/dataset/Features_Omar_new/' + dataset_ + '_RBF_plus_connectivity_manual_matrix.mat'

  X = scipy.io.loadmat(dataset_name)['X_feature_tensor']
  # A 2-D array holds one flattened matrix per row; any other rank is used as is.
  X_feature = Vec2Matrix(X) if np.ndim(X) == 2 else X

  label_name = '/content/gdrive/My Drive/Graph_based_methods/Graph_Transformer/dataset/Features_Omar_new/label_'+ dataset_ + '.mat'
  label_all = scipy.io.loadmat(label_name)['label_all'][0]

  print('*********** dataset_name =', dataset_name,' *******************')
  num_classes = np.unique(label_all).size
  print('number of classes: ', num_classes)

  return X_feature, label_all

# Load the feature matrices and labels for 'dataset1' once, when this cell
# runs; Batch_Loader_coord reads X_feature as a global.
X_feature, label_all = load_RBF_mat('dataset1')
# print(X_feature)
# Passed to load_data(...) as its degree_as_tag argument by the driver loops.
use_degree_as_tag = False


def get_coord_Adj_matrix(selected_idx, X_feature, batch_graph):
  """Build one block-diagonal adjacency matrix from the selected feature matrices.

  The previous loop overwrote its result on every iteration, so only the
  matrix of the last selected graph was returned. The matrices are now placed
  along the diagonal in the order given by selected_idx, mirroring the
  per-graph layout that get_Adj_matrix produces.

  Args:
    selected_idx: iterable of indices into X_feature.
    X_feature: indexable collection of 2-D matrices, one per graph.
      NOTE(review): assumes each matrix has one row/column per node of the
      corresponding graph -- confirm against the data files.
    batch_graph: unused; kept so existing callers keep working.

  Returns:
    The result of sparse_to_tuple applied to the block-diagonal COO matrix.

  Raises:
    ValueError: if selected_idx is empty.
  """
  # Local import: block_diag is not imported at file level in this notebook.
  from scipy.sparse import block_diag, coo_matrix

  blocks = [coo_matrix(X_feature[idx]) for idx in selected_idx]
  if not blocks:
    raise ValueError("selected_idx is empty; cannot build an adjacency matrix")

  Adj_block = block_diag(blocks, format='coo')
  return sparse_to_tuple(Adj_block)


# Smoke test: draw one random batch of graph indices (this consumes NumPy RNG
# state) and build its coordinate adjacency; the return value is discarded.
# Relies on the globals `graphs`, `args` and `X_feature` already being defined.
selected_idx = np.random.permutation(len(graphs))[:args.batch_size]
batch_graph = [graphs[idx] for idx in selected_idx]
print(selected_idx)
get_coord_Adj_matrix(selected_idx, X_feature, batch_graph)

def get_Adj_matrix(batch_graph):
    """Assemble a single adjacency matrix covering every graph in the batch.

    Each graph's edge list (graph.edge_mat, a 2 x num_edges index array) is
    shifted by the number of nodes that precede it in the batch, so the graphs
    occupy disjoint index ranges. A self-loop is then added for every node.

    Args:
        batch_graph: sequence of graph objects exposing ``g`` (sized by node
            count) and ``edge_mat``.

    Returns:
        A scipy COO matrix of shape (num_node, num_node) with a 1 for every
        listed edge and every self-loop.
    """
    shifted_edges = []
    offset = 0
    for graph in batch_graph:
        node_count = len(graph.g)
        shifted_edges.append(graph.edge_mat + offset)
        offset += node_count
    num_node = offset

    # self-loop
    diagonal = np.array([range(num_node), range(num_node)])
    Adj_block_idx = np.concatenate([np.concatenate(shifted_edges, 1), diagonal], 1)
    # Every edge and every self-loop carries weight 1.
    Adj_block_elem = np.ones(Adj_block_idx.shape[1])

    return coo_matrix((Adj_block_elem, Adj_block_idx), shape=(num_node, num_node))

def get_batch_data_coord(batch_graph):
    """Build the sparse feature and adjacency inputs for one batch.

    Args:
        batch_graph: sequence of graph objects exposing ``node_features``
            (stacked along axis 0) plus whatever get_Adj_matrix reads.

    Returns:
        (Adj_block, X_concat, num_features_nonzero): the adjacency and the
        stacked features, each passed through sparse_to_tuple, and the shape
        of the second element of the feature tuple.
    """
    # features
    stacked = np.concatenate([graph.node_features for graph in batch_graph], 0)
    if "REDDIT" in args.dataset:
        # Repeat the features feature_dim_size times, then scale them down.
        stacked = np.tile(stacked, feature_dim_size) #[1,1,1,1]
        stacked = stacked * 0.01

    X_concat = sparse_to_tuple(coo_matrix(stacked))

    # adj
    Adj_block = sparse_to_tuple(get_Adj_matrix(batch_graph))

    num_features_nonzero = X_concat[1].shape
    return Adj_block, X_concat, num_features_nonzero


class Batch_Loader_coord(object):
    """Callable that samples a random batch and returns its training inputs.

    Reads the notebook globals ``graphs``, ``args`` and ``X_feature``.
    """

    def __call__(self):
        """Sample up to args.batch_size graphs and build the feed values.

        Returns:
            (Adj_block_coord, X_concat, num_features_nonzero, idx_nodes): the
            coordinate-based adjacency from get_coord_Adj_matrix, the sparse
            feature tuple, the non-zero feature shape, and the value returned
            by get_idx_nodes for the selected graphs.
        """
        selected_idx = np.random.permutation(len(graphs))[:args.batch_size]
        batch_graph = [graphs[idx] for idx in selected_idx]
        Adj_block, X_concat, num_features_nonzero = get_batch_data_coord(batch_graph)
        idx_nodes = get_idx_nodes(selected_idx)

        Adj_block_coord = get_coord_Adj_matrix(selected_idx, X_feature, batch_graph)

        # Debug output comparing the edge-list adjacency with the coordinate
        # one. The former `print(X_concat[])` was a syntax error and is removed.
        print(Adj_block[0].shape)
        print(Adj_block_coord[0].shape)
        print(X_concat[0])
        print(X_concat[1])
        print(num_features_nonzero)

        # The original returned the undefined name `Adj_block_`; the coordinate
        # adjacency is what distinguishes this loader, so that is returned.
        # NOTE(review): the adjacency must cover every node in X_concat --
        # verify that get_coord_Adj_matrix yields a batch-wide matrix.
        return Adj_block_coord, X_concat, num_features_nonzero, idx_nodes

### Run HBCompare Coord

Method for HBcompare coordinate features

TODO: Fix some bugs

In [None]:
import tensorflow as tf
import numpy as np
np.random.seed(123)
tf.compat.v1.set_random_seed(123)

import os
import time
import datetime
import pickle as cPickle
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
from scipy.sparse import coo_matrix
import statistics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import learning_curve
from sklearn.svm import SVC

from tensorflow import keras
from tensorflow.keras import datasets, layers, models
import matplotlib.pyplot as plt
import numpy as np
import scipy.io
from tensorflow.keras import backend as K
from keract import get_activations
import keract


# Training
# ==================================================
def run_HBCompare_training():
  """Train the unsupervised GCN on coordinate batches and score its graph embeddings.

  Takes no parameters; reads the notebook globals ``args``,
  ``feature_dim_size``, ``graph_pool``, ``graphs``, ``graph_labels`` and the
  batch loader ``batch_nodes``.

  Each epoch runs ``num_batches_per_epoch`` unsupervised training steps, pools
  the node embeddings into graph embeddings with ``graph_pool`` and evaluates
  them with an SVC over 5 cross-validation folds.

  Returns:
    A 7-tuple ``(acc_all, auc_all, f1score_all, mean_each_epoch,
    std_each_epoch, time_each_epoch, total_time)``. The first three hold one
    value per fold, taken on the last epoch that was run; ``mean_each_epoch``
    and ``std_each_epoch`` hold the per-epoch fold mean / standard deviation
    as percentage strings; ``time_each_epoch`` holds the seconds elapsed
    since training started; ``total_time`` is the overall duration in seconds.
  """
  with tf.Graph().as_default():
      # Setup tensorflow session
      session_conf = tf.compat.v1.ConfigProto(allow_soft_placement=args.allow_soft_placement, log_device_placement=args.log_device_placement)
      session_conf.gpu_options.allow_growth = True
      # Used as a context manager the session becomes the default session and
      # is closed on exit, so repeated runs do not accumulate open sessions.
      with tf.compat.v1.Session(config=session_conf) as sess:
          global_step = tf.Variable(0, name="global_step", trainable=False)
          print("Feature_dim_size = ", feature_dim_size)
          print("Hidden_size = ", args.hidden_size)
          print("Graph_pool_shape = ", graph_pool.shape[0], " ... ", graph_pool.shape[1])
          print("num_sampled = ", args.num_sampled)

          unsup_gcn = GCN_graph_cls(feature_dim_size=feature_dim_size,
                        hidden_size=args.hidden_size,
                        num_GNN_layers=args.num_GNN_layers,
                        vocab_size=graph_pool.shape[1],
                        num_sampled=args.num_sampled,
                        )

          # Define Training procedure
          optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=args.learning_rate)
          grads_and_vars = optimizer.compute_gradients(unsup_gcn.total_loss)
          train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

          # Folder to save run logs
          out_dir = os.path.abspath(os.path.join(args.run_folder, "../runs_GCN_UnSup", args.model_name))
          print("Writing to {}\n".format(out_dir))

          # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
          checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
          if not os.path.exists(checkpoint_dir):
              os.makedirs(checkpoint_dir)

          # Initialize all variables
          sess.run(tf.compat.v1.global_variables_initializer())
          graph = tf.compat.v1.get_default_graph()

          def train_step(Adj_block, X_concat, num_features_nonzero, idx_nodes):
              """Run one optimizer step on a batch and return its loss."""
              feed_dict = {
                  unsup_gcn.Adj_block: Adj_block,
                  unsup_gcn.X_concat: X_concat,
                  unsup_gcn.num_features_nonzero: num_features_nonzero,
                  unsup_gcn.dropout: args.dropout,
                  unsup_gcn.input_y:idx_nodes
              }
              _, _, loss = sess.run([train_op, global_step, unsup_gcn.total_loss], feed_dict)
              return loss

          num_batches_per_epoch = int((len(graphs) - 1) / args.batch_size) + 1
          predicted_labels = []
          acc_all = []
          auc_all = []
          f1score_all = []

          mean_each_epoch = []
          std_each_epoch = []
          time_each_epoch = []

          start_time = time.time()

          # The run is currently capped at a single epoch; the full schedule
          # is kept in the commented line below.
          # epochs = range(1, args.num_epochs+1)
          epochs = range(1, 2)
          # Final scores are taken on the last epoch actually run. Comparing
          # against args.num_epochs never matched under the one-epoch cap, so
          # acc_all / auc_all / f1score_all were returned empty.
          last_epoch = epochs[-1]

          for epoch in epochs:

              # feature update using loss function - loss update is unsupervised
              loss = 0
              for _ in range(num_batches_per_epoch):
                  Adj_block, X_concat, num_features_nonzero, idx_nodes = batch_nodes()
                  loss += train_step(Adj_block, X_concat, num_features_nonzero, idx_nodes)
              print(loss)

              # Fetch the learned node embeddings and pool them per graph
              node_embeddings = graph.get_tensor_by_name('embedding/node_embeddings:0')
              node_embeddings = sess.run(node_embeddings)
              graph_embeddings = graph_pool.dot(node_embeddings)

              # Keep acc for all CV folds to get avg
              acc_folds = []

              # Do 5 fold cross validation per epoch
              for fold_idx in range(0,5):

                  # Setting up train/test split for 5fold-CV
                  train_idx, test_idx = separate_data_idx(graphs, fold_idx)
                  train_graph_embeddings = graph_embeddings[train_idx]
                  test_graph_embeddings = graph_embeddings[test_idx]
                  train_labels = graph_labels[train_idx]
                  test_labels = graph_labels[test_idx]

                  # SVM classifier trained on the graph embeddings
                  # cls = LogisticRegression(tol=0.001, max_iter = 2000)
                  cls = SVC()
                  cls.fit(train_graph_embeddings, train_labels)

                  predicted = cls.predict(test_graph_embeddings)
                  ACC = cal_acc_only(predicted, test_labels)
                  acc_folds.append(ACC)

                  # On last epoch, save final scores (the prediction is reused)
                  if epoch == last_epoch:
                    predicted_labels.append(predicted)

                    acc, auc, precision, recall, f1score = calculate_acc(predicted, test_labels)
                    acc_all.append(acc)
                    auc_all.append(auc)
                    f1score_all.append(f1score)

              # After CV folds, get mean accuracy and append to list - organized per epoch
              mean_folds = statistics.mean(acc_folds)
              std_folds = statistics.stdev(acc_folds)

              print('epoch ', epoch, ' mean: ', str(mean_folds*100), ' std: ', str(std_folds*100))

              mean_each_epoch.append(str(mean_folds*100))
              std_each_epoch.append(str(std_folds*100))
              time_each_epoch.append(time.time()-start_time)

  end_time = time.time()
  total_time = end_time - start_time
  print("--- %s seconds ---" % total_time)

  print('acc_all', acc_all)   
  print('f1_all', f1score_all)
  print('auc_all', auc_all)  

  return acc_all, auc_all, f1score_all, mean_each_epoch, std_each_epoch, time_each_epoch, total_time


# all_datasets = ["dataset1","dataset2","dataset3","dataset4","dataset5"]
all_datasets = ["dataset1"]

# One accumulator per value returned by run_HBCompare_training; each gets one
# entry per (dataset, repetition) run.
all_acc, all_auc, all_f1score = [], [], []
all_mean_each_epoch, all_std_each_epoch, all_time_each_epoch = [], [], []
all_time = []
iter = 1  # number of repeated runs per dataset (name is read by other cells)

for d, dataset_name in enumerate(all_datasets):
  for i in range(iter):
    # The training function reads these as globals, so they are rebound here.
    graphs, num_classes = load_data(dataset_name, use_degree_as_tag)
    feature_dim_size = graphs[0].node_features.shape[1]
    graph_labels = np.array([graph.label for graph in graphs])

    graph_pool = get_graphpool(graphs)
    # print("Graph_pool :", graph_pool)
    batch_nodes = Batch_Loader_coord()

    # X, label_all = load_RBF_mat(all_datasets[d])

    print("Loading data... finished!")
    print(dataset_name, " loaded")

    print("# of iterations:", i)
    run_results = run_HBCompare_training()
    acc_all, auc_all, f1score_all, mean_each_epoch, std_each_epoch, time_each_epoch, total_time = run_results

    accumulators = (all_acc, all_auc, all_f1score, all_mean_each_epoch,
                    all_std_each_epoch, all_time_each_epoch, all_time)
    for accumulator, value in zip(accumulators, run_results):
      accumulator.append(value)

### HBCompare - Parameter Grid

This function was used to run a parameter grid search for the logistic regression classifier.
It was only run on dataset1, which yielded the following parameters:
- tol=0.001, max_iter = 2000, C=0.1



In [None]:
import tensorflow as tf
import numpy as np
np.random.seed(123)
tf.compat.v1.set_random_seed(123)

import os
import time
import datetime
import pickle as cPickle
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
from scipy.sparse import coo_matrix
import statistics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import learning_curve
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

from sklearn import svm
from tensorflow import keras
from tensorflow.keras import datasets, layers, models
import matplotlib.pyplot as plt
import numpy as np
import scipy.io
from tensorflow.keras import backend as K
from keract import get_activations
import keract


# Training
# ==================================================
def run_HBCompare_training():
  """Train the unsupervised GCN and grid-search logistic-regression settings.

  Takes no parameters; reads the notebook globals ``args``,
  ``feature_dim_size``, ``graph_pool``, ``graphs``, ``graph_labels`` and the
  batch loader ``batch_nodes``.

  Runs ``args.num_epochs + 50`` epochs. After each epoch the graph embeddings
  are scored with a fixed LogisticRegression over 5 cross-validation folds,
  and on every fold a GridSearchCV over C / max_iter / tol is fitted on the
  training split and its best parameters are printed.

  Returns:
    ``(acc_test_all, acc_train_all)``: per-epoch lists with the mean test and
    mean training accuracy over the folds.
  """
  with tf.Graph().as_default():
      session_conf = tf.compat.v1.ConfigProto(allow_soft_placement=args.allow_soft_placement, log_device_placement=args.log_device_placement)
      session_conf.gpu_options.allow_growth = True
      # Used as a context manager the session becomes the default session and
      # is closed on exit, so repeated runs do not accumulate open sessions.
      with tf.compat.v1.Session(config=session_conf) as sess:
          global_step = tf.Variable(0, name="global_step", trainable=False)
          print("Feature_dim_size = ", feature_dim_size)
          print("Hidden_size = ", args.hidden_size)
          print("Graph_pool_shape = ", graph_pool.shape[0], " ... ", graph_pool.shape[1])
          print("num_sampled = ", args.num_sampled)

          unsup_gcn = GCN_graph_cls(feature_dim_size=feature_dim_size,
                        hidden_size=args.hidden_size,
                        num_GNN_layers=args.num_GNN_layers,
                        vocab_size=graph_pool.shape[1],
                        num_sampled=args.num_sampled,
                        )

          # Define Training procedure
          optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=args.learning_rate)
          grads_and_vars = optimizer.compute_gradients(unsup_gcn.total_loss)
          train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

          # Folder to save run logs
          out_dir = os.path.abspath(os.path.join(args.run_folder, "../runs_GCN_UnSup", args.model_name))
          print("Writing to {}\n".format(out_dir))
          # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
          checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
          if not os.path.exists(checkpoint_dir):
              os.makedirs(checkpoint_dir)

          # Initialize all variables
          sess.run(tf.compat.v1.global_variables_initializer())
          graph = tf.compat.v1.get_default_graph()

          def train_step(Adj_block, X_concat, num_features_nonzero, idx_nodes):
              """Run one optimizer step on a batch and return its loss."""
              feed_dict = {
                  unsup_gcn.Adj_block: Adj_block,
                  unsup_gcn.X_concat: X_concat,
                  unsup_gcn.num_features_nonzero: num_features_nonzero,
                  unsup_gcn.dropout: args.dropout,
                  unsup_gcn.input_y:idx_nodes
              }
              _, _, loss = sess.run([train_op, global_step, unsup_gcn.total_loss], feed_dict)
              return loss

          num_batches_per_epoch = int((len(graphs) - 1) / args.batch_size) + 1
          predicted_labels = []
          acc_all = []
          acc_test_all = []
          acc_train_all = []

          # Epochs run from 1 to args.num_epochs + 50 inclusive
          for epoch in range(1, args.num_epochs+51):

              # feature update using loss function
              loss = 0
              for _ in range(num_batches_per_epoch):
                  Adj_block, X_concat, num_features_nonzero, idx_nodes = batch_nodes()
                  loss += train_step(Adj_block, X_concat, num_features_nonzero, idx_nodes)
              print(loss)

              # Fetch the learned node embeddings and pool them per graph
              node_embeddings = graph.get_tensor_by_name('embedding/node_embeddings:0')
              node_embeddings = sess.run(node_embeddings)
              graph_embeddings = graph_pool.dot(node_embeddings)

              acc_folds = []
              trainacc_folds = []

              # Do 5 fold cross validation per epoch
              for fold_idx in range(0,5):
                  # Setting up train/test split for 5fold-CV
                  train_idx, test_idx = separate_data_idx(graphs, fold_idx)
                  train_graph_embeddings = graph_embeddings[train_idx]
                  test_graph_embeddings = graph_embeddings[test_idx]
                  train_labels = graph_labels[train_idx]
                  test_labels = graph_labels[test_idx]

                  # Logistic regression model for training
                  cls = LogisticRegression(tol=0.001, max_iter = 2000, C=0.01,)
                  # cls = svm.SVC()
                  cls.fit(train_graph_embeddings, train_labels)

                  predicted_train = cls.predict(train_graph_embeddings)
                  ACC_train = cal_acc_only(predicted_train, train_labels)
                  trainacc_folds.append(ACC_train)

                  predicted_test = cls.predict(test_graph_embeddings)
                  ACC = cal_acc_only(predicted_test, test_labels)
                  acc_folds.append(ACC)

                  # Print the fitted classifier's parameters. The original
                  # printed the unbound get_params function object instead.
                  print(cls.get_params())
                  print("Starting pipeline")
                  lr_pipe = Pipeline([('mms', MinMaxScaler()),('lr', LogisticRegression())])
                  params = [{'lr__C': [0.001, 0.01, 0.1, 1.0, 10], 'lr__max_iter':[100,500,1000,2000], 'lr__tol':[0.0001, 0.001, 0.01]}]

                  gs_gcn = GridSearchCV(lr_pipe, param_grid=params, scoring='accuracy', cv=2)
                  gs_gcn.fit(train_graph_embeddings, train_labels)
                  print(gs_gcn.best_params_)
                  print(gs_gcn.score(train_graph_embeddings, train_labels))

                  # These values are not returned. NOTE(review): kept because
                  # calculate_acc is defined elsewhere and may report metrics
                  # itself -- confirm before removing.
                  if epoch == args.num_epochs:
                    predicted_labels.append(predicted_test)

                    acc, auc, precision, recall, f1score = calculate_acc(predicted_test, test_labels)
                    acc_all.append(acc)

              print("Epoch:", epoch)
              mean_folds = statistics.mean(acc_folds)
              print("Test ACC:", mean_folds)
              mean_trainacc_folds = statistics.mean(trainacc_folds)
              print("Train ACC:", mean_trainacc_folds)

              acc_test_all.append(mean_folds)
              acc_train_all.append(mean_trainacc_folds)

  return acc_test_all, acc_train_all

# all_datasets = ["dataset1","dataset2","dataset3","dataset4","dataset5"]
all_datasets = ["dataset1"]
# Per-run accuracy curves, one entry per (dataset, repetition).
all_test_acc, all_train_acc = [], []
iter = 1  # number of repeated runs per dataset (name is read by other cells)
for d, dataset_name in enumerate(all_datasets):
  for i in range(iter):
    # The training function reads these as globals, so they are rebound here.
    graphs, num_classes = load_data(dataset_name, use_degree_as_tag)
    feature_dim_size = graphs[0].node_features.shape[1]
    graph_labels = np.array([graph.label for graph in graphs])

    graph_pool = get_graphpool(graphs)
    # print("Graph_pool :", graph_pool)
    batch_nodes = Batch_Loader()

    print("Loading data... finished!")
    print(dataset_name, " loaded")

    acc_test_all, acc_train_all = run_HBCompare_training()
    all_test_acc += [acc_test_all]
    all_train_acc += [acc_train_all]

## Learning Curve for HBcompare

This function gets the training and testing accuracy of HBcompare and plots the values.

Differences between this and the main HBcompare training function:

- Adds 50 more epochs to show learning curve after maximum saturation
- Adds predictions on training set to get training accuracy
- Loops through all datasets in the "all_datasets" list
- Displays learning curve figures for all datasets

In [None]:
import tensorflow as tf
import numpy as np
np.random.seed(123)
tf.compat.v1.set_random_seed(123)

import os
import time
import datetime
import pickle as cPickle
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
from scipy.sparse import coo_matrix
import statistics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import learning_curve

from sklearn import svm
from tensorflow import keras
from tensorflow.keras import datasets, layers, models
import matplotlib.pyplot as plt
import numpy as np
import scipy.io
from tensorflow.keras import backend as K
from keract import get_activations
import keract

# all_datasets = ["dataset1","dataset2","dataset3","dataset4","dataset5"]
# all_datasets = ["dataset4"]


# Training
# ==================================================
def run_HBCompare_training():
  """Train the unsupervised GCN and record train/test accuracy per epoch.

  Takes no parameters; reads the notebook globals ``args``,
  ``feature_dim_size``, ``graph_pool``, ``graphs``, ``graph_labels`` and the
  batch loader ``batch_nodes``.

  Runs ``args.num_epochs + 50`` epochs. After each epoch the graph embeddings
  are scored with a LogisticRegression over 5 cross-validation folds, on both
  the training and the test split.

  Returns:
    ``(acc_test_all, acc_train_all)``: per-epoch lists with the mean test and
    mean training accuracy over the folds.
  """
  with tf.Graph().as_default():
      session_conf = tf.compat.v1.ConfigProto(allow_soft_placement=args.allow_soft_placement, log_device_placement=args.log_device_placement)
      session_conf.gpu_options.allow_growth = True
      # Used as a context manager the session becomes the default session and
      # is closed on exit, so repeated runs do not accumulate open sessions.
      with tf.compat.v1.Session(config=session_conf) as sess:
          global_step = tf.Variable(0, name="global_step", trainable=False)
          print("Feature_dim_size = ", feature_dim_size)
          print("Hidden_size = ", args.hidden_size)
          print("Graph_pool_shape = ", graph_pool.shape[0], " ... ", graph_pool.shape[1])
          print("num_sampled = ", args.num_sampled)

          unsup_gcn = GCN_graph_cls(feature_dim_size=feature_dim_size,
                        hidden_size=args.hidden_size,
                        num_GNN_layers=args.num_GNN_layers,
                        vocab_size=graph_pool.shape[1],
                        num_sampled=args.num_sampled,
                        )

          # Define Training procedure
          optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=args.learning_rate)
          grads_and_vars = optimizer.compute_gradients(unsup_gcn.total_loss)
          train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

          # Folder to save run logs
          out_dir = os.path.abspath(os.path.join(args.run_folder, "../runs_GCN_UnSup", args.model_name))
          print("Writing to {}\n".format(out_dir))
          # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
          checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
          if not os.path.exists(checkpoint_dir):
              os.makedirs(checkpoint_dir)

          # Initialize all variables
          sess.run(tf.compat.v1.global_variables_initializer())
          graph = tf.compat.v1.get_default_graph()

          def train_step(Adj_block, X_concat, num_features_nonzero, idx_nodes):
              """Run one optimizer step on a batch and return its loss."""
              feed_dict = {
                  unsup_gcn.Adj_block: Adj_block,
                  unsup_gcn.X_concat: X_concat,
                  unsup_gcn.num_features_nonzero: num_features_nonzero,
                  unsup_gcn.dropout: args.dropout,
                  unsup_gcn.input_y:idx_nodes
              }
              _, _, loss = sess.run([train_op, global_step, unsup_gcn.total_loss], feed_dict)
              return loss

          num_batches_per_epoch = int((len(graphs) - 1) / args.batch_size) + 1
          predicted_labels = []
          acc_all = []
          acc_test_all = []
          acc_train_all = []

          # Epochs run from 1 to args.num_epochs + 50 inclusive
          for epoch in range(1, args.num_epochs+51):

              # feature update using loss function
              loss = 0
              for _ in range(num_batches_per_epoch):
                  Adj_block, X_concat, num_features_nonzero, idx_nodes = batch_nodes()
                  loss += train_step(Adj_block, X_concat, num_features_nonzero, idx_nodes)
              print(loss)

              # Fetch the learned node embeddings and pool them per graph
              node_embeddings = graph.get_tensor_by_name('embedding/node_embeddings:0')
              node_embeddings = sess.run(node_embeddings)
              graph_embeddings = graph_pool.dot(node_embeddings)

              acc_folds = []
              trainacc_folds = []

              # Do 5 fold cross validation per epoch
              for fold_idx in range(0,5):
                  # Setting up train/test split for 5fold-CV
                  train_idx, test_idx = separate_data_idx(graphs, fold_idx)
                  train_graph_embeddings = graph_embeddings[train_idx]
                  test_graph_embeddings = graph_embeddings[test_idx]
                  train_labels = graph_labels[train_idx]
                  test_labels = graph_labels[test_idx]

                  # Logistic regression model for training
                  cls = LogisticRegression(tol=0.001, max_iter = 2000, C=0.1)
                  # cls = svm.SVC()
                  cls.fit(train_graph_embeddings, train_labels)

                  predicted_train = cls.predict(train_graph_embeddings)
                  ACC_train = cal_acc_only(predicted_train, train_labels)
                  trainacc_folds.append(ACC_train)

                  predicted_test = cls.predict(test_graph_embeddings)
                  ACC = cal_acc_only(predicted_test, test_labels)
                  acc_folds.append(ACC)

                  # These values are not returned. NOTE(review): kept because
                  # calculate_acc is defined elsewhere and may report metrics
                  # itself -- confirm before removing.
                  if epoch == args.num_epochs:
                    predicted_labels.append(predicted_test)

                    acc, auc, precision, recall, f1score = calculate_acc(predicted_test, test_labels)
                    acc_all.append(acc)

              print("Epoch:", epoch)
              mean_folds = statistics.mean(acc_folds)
              print("Test ACC:", mean_folds)
              mean_trainacc_folds = statistics.mean(trainacc_folds)
              print("Train ACC:", mean_trainacc_folds)

              acc_test_all.append(mean_folds)
              acc_train_all.append(mean_trainacc_folds)

  return acc_test_all, acc_train_all


# Per-run accuracy curves; runs are stored dataset by dataset, `iter` runs each.
all_test_acc = []
all_train_acc = []
iter = 2  # number of repeated runs per dataset (name is read by other cells)
for d, dataset_name in enumerate(all_datasets):
  for i in range(iter):
    # The training function reads these as globals, so they are rebound here.
    graphs, num_classes = load_data(dataset_name, use_degree_as_tag)
    feature_dim_size = graphs[0].node_features.shape[1]
    graph_labels = np.array([graph.label for graph in graphs])

    graph_pool = get_graphpool(graphs)
    # print("Graph_pool :", graph_pool)
    batch_nodes = Batch_Loader()

    print("Loading data... finished!")
    print(dataset_name, " loaded")

    acc_test_all, acc_train_all = run_HBCompare_training()
    for store, curve in ((all_test_acc, acc_test_all), (all_train_acc, acc_train_all)):
      store.append(curve)

In [None]:
# label = ["P1", "A1", "P2", "A2", "P3"]
# plt.figure(0)
# Average the per-run accuracy curves of each dataset and plot training vs
# testing accuracy per epoch.
fig_test_acc_per_epoch = []
fig_train_acc_per_epoch = []

# all_test_acc / all_train_acc hold `iter` consecutive runs per dataset.
start = 0
inc = iter
# The learning-curve training runs args.num_epochs + 50 epochs.
num_curve_epochs = args.num_epochs + 50

for idx in range(len(all_datasets)):
  runs = range(start, start + inc)
  # Mean over this dataset's runs, epoch by epoch.
  test_list = [sum(float(all_test_acc[i][x]) for i in runs) / inc
               for x in range(num_curve_epochs)]
  train_list = [sum(float(all_train_acc[i][x]) for i in runs) / inc
                for x in range(num_curve_epochs)]

  fig_test_acc_per_epoch.append(test_list)
  fig_train_acc_per_epoch.append(train_list)
  start = start + inc

print(fig_test_acc_per_epoch)
print(fig_train_acc_per_epoch)

for idx in range(len(all_datasets)):
  plt.figure(idx, dpi=1000, figsize=[10,5])
  plt.xlabel("Epoch")
  plt.ylabel("Accuracy")

  plt.yticks(np.arange(0.0,1.05, 0.1))
  plt.ylim(0.5,1.05)

  # The x axis follows the curve length; the former hard-coded range(100)
  # only matched when args.num_epochs was 50.
  epoch_axis = range(len(fig_train_acc_per_epoch[idx]))
  plt.plot(epoch_axis,fig_train_acc_per_epoch[idx], label = "Training")
  plt.plot(epoch_axis,fig_test_acc_per_epoch[idx], label = "Testing")
  plt.legend()
  plt.show()


# for i in range(len(all_datasets)):
#   plt.figure(i)
#   plt.xlabel("Epoch")
#   plt.ylabel("Accuracy")
#   plt.plot(range(50),all_train_acc[i], label = "Training")
#   plt.plot(range(50),all_test_acc[i], label = "Testing")
#   plt.legend()
#   plt.show()


In [None]:
import csv

# Print the flattened test / train accuracy lists.
# NOTE: rewrite_array is called here before the definition further down in
# this cell, so these lines rely on the identical definition from the earlier
# "Save arrays to a csv file" cell having been run already.
print(rewrite_array(all_test_acc))
print(rewrite_array(all_train_acc))

def rewrite_array(all_array):
  """Flatten a list of lists into a single list of floats.

  Args:
    all_array: a sequence of sequences whose elements float() accepts.

  Returns:
    A new flat list holding every element, in order, converted to float.
  """
  flat = []
  for row in all_array:
    flat.extend(float(value) for value in row)
  return flat

# Write the flattened test and train accuracies as two CSV rows. The file is
# opened in a with-block so it is flushed and closed even if a write fails;
# the original handle was never closed.
with open('training_vs_testing_all_datasets.csv', 'w', newline='') as csvfile:
  spamwriter = csv.writer(csvfile, delimiter=',')
  spamwriter.writerow(rewrite_array(all_test_acc))
  spamwriter.writerow(rewrite_array(all_train_acc))