In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

import os
from google.colab import drive
drive.mount('/content/gdrive')
!pwd
os.chdir('gdrive/My Drive/Graph_based_methods/Graph_Transformer/')
!pwd

### Util

In [None]:
from __future__ import print_function
import networkx as nx
import numpy as np
import random
import scipy.sparse as sp
from sklearn.model_selection import StratifiedKFold
from scipy.sparse import isspmatrix

"""Adapted from https://github.com/weihua916/powerful-gnns/blob/master/util.py"""

class S2VGraph(object):
    def __init__(self, g, label, node_tags=None, node_features=None):
        '''
        Container for one graph sample plus the data derived from it.

            g: a networkx graph
            label: graph label as given by the caller
            node_tags: a list of integer node tags (stored as passed)
            node_features: accepted for interface compatibility only; the
                argument is NOT stored. The attribute of the same name starts
                as the placeholder 0 and is filled in later by load_data.

        Attributes that start as placeholders and are populated by load_data:
            neighbors: list of per-node neighbor lists (starts empty)
            node_features: one-hot tag matrix (starts as 0)
            edge_mat: edge list as a 2 x E array (starts as 0)
            max_neighbor: largest node degree (starts as 0)
        '''
        self.label, self.g = label, g
        self.node_tags = node_tags
        self.neighbors = list()
        # Placeholders only; ints are immutable so sharing the literal is safe.
        self.node_features = self.edge_mat = self.max_neighbor = 0


def load_data(dataset, degree_as_tag,
              data_root='/content/gdrive/My Drive/Graph_based_methods/Graph_Transformer/dataset'):
    '''
    Load a graph-classification dataset from a text file.

    File layout, as read below: the first line holds the number of graphs.
    Each graph starts with a line "n l" (number of nodes, graph label) and is
    followed by n node lines "tag k nb_1 ... nb_k [extra values ...]", where
    k is the number of neighbor ids listed on that line.

        dataset: name of dataset; the file read is
            <data_root>/<dataset>/<dataset>.txt
        degree_as_tag: if true, node tags are replaced by node degrees
        data_root: directory containing one sub-folder per dataset (defaults
            to the Google Drive location this notebook was written for)

    Returns (g_list, num_classes): a list of S2VGraph objects whose
    neighbors, max_neighbor, edge_mat and one-hot node_features are filled
    in, and the number of distinct graph labels. Graph labels are remapped
    to 0..num_classes-1 in order of first appearance.
    '''

    print('loading data')
    g_list = []
    label_dict = {}
    feat_dict = {}

    with open('%s/%s/%s.txt' % (data_root, dataset, dataset), 'r') as f:
        n_g = int(f.readline().strip())
        for _ in range(n_g):
            n, l = [int(w) for w in f.readline().strip().split()]
            if l not in label_dict:
                label_dict[l] = len(label_dict)
            g = nx.Graph()
            node_tags = []
            for j in range(n):
                g.add_node(j)
                row = f.readline().strip().split()
                # Only the integer prefix [tag, k, nb_1 .. nb_k] is used. Any
                # values after it were parsed by the previous version but
                # never stored anywhere, so they are skipped here.
                n_fields = int(row[1]) + 2
                row = [int(w) for w in row[:n_fields]]
                if row[0] not in feat_dict:
                    feat_dict[row[0]] = len(feat_dict)
                node_tags.append(feat_dict[row[0]])

                for k in range(2, len(row)):
                    g.add_edge(j, row[k])

            # add_edge creates missing nodes, so this catches neighbor ids
            # outside 0..n-1.
            assert len(g) == n

            g_list.append(S2VGraph(g, l, node_tags))

    # add labels and edge_mat
    for g in g_list:
        g.neighbors = [[] for _ in range(len(g.g))]
        for i, j in g.g.edges():
            g.neighbors[i].append(j)
            g.neighbors[j].append(i)
        # default=0 keeps a graph without nodes from raising in max().
        g.max_neighbor = max((len(nbrs) for nbrs in g.neighbors), default=0)

        g.label = label_dict[g.label]

        # Store every edge in both directions.
        edges = [list(pair) for pair in g.g.edges()]
        edges.extend([[i, j] for j, i in edges])

        # reshape(-1, 2) keeps the array 2-D when the graph has no edges, so
        # the transpose to shape (2, E) also works for E == 0.
        g.edge_mat = np.transpose(np.array(edges, dtype=np.int32).reshape(-1, 2), (1, 0))

    if degree_as_tag:
        for g in g_list:
            g.node_tags = list(dict(g.g.degree).values())

    # Extracting unique tag labels
    tagset = set([])
    for g in g_list:
        tagset = tagset.union(set(g.node_tags))

    tagset = list(tagset)
    tag2index = {tagset[i]: i for i in range(len(tagset))}

    # One-hot encode each node's tag.
    for g in g_list:
        g.node_features = np.zeros((len(g.node_tags), len(tagset)), dtype=np.float32)
        g.node_features[range(len(g.node_tags)), [tag2index[tag] for tag in g.node_tags]] = 1

    print('# classes: %d' % len(label_dict))
    print('# maximum node tag: %d' % len(tagset))

    print("# data: %d" % len(g_list))

    return g_list, len(label_dict)


def sparse_to_tuple(sparse_mx):
    """Convert sparse matrix to tuple representation.

    A single matrix is returned as (coords, values, shape), where coords is
    an (nnz, 2) array of (row, col) pairs taken from the COO form. When a
    list is passed, each element is replaced IN PLACE by its tuple and the
    same list object is returned.
    """
    def _as_triplet(matrix):
        coo = matrix if sp.isspmatrix_coo(matrix) else matrix.tocoo()
        return np.vstack((coo.row, coo.col)).transpose(), coo.data, coo.shape

    if not isinstance(sparse_mx, list):
        return _as_triplet(sparse_mx)

    for position, matrix in enumerate(sparse_mx):
        sparse_mx[position] = _as_triplet(matrix)
    return sparse_mx


### Set up dataset data

In [None]:
# Two spellings of the same dataset id: the form without underscore is passed
# to load_data (text dataset), the form with underscore is the prefix of the
# .mat file names built below.
data_with_ = "dataset_2"
data_without_ = "dataset2"
data_name = data_without_
use_degree_as_tag = False

# Parse the text dataset into S2VGraph objects.
graphs, num_classes = load_data(data_name, use_degree_as_tag)

# Later cells open the .mat files by bare file name, so they rely on this
# working directory.
os.chdir('/content/gdrive/My Drive/Graph_based_methods/Graph_Transformer/dataset/Features_Omar_new')

dataset = data_with_
# Alternative feature files (enable exactly one dataset_name):
# dataset_name = dataset + '_RBF_plus_connectivity_manual_matrix.mat'
# dataset_name = dataset + '_RBF_manual_matrix.mat'
# dataset_name = dataset + '_label_embedding_matrix.mat'
dataset_name = dataset + '_connectivity_matrix.mat'

### KNN Classifier

In [None]:
def Knn_classifier(X, label_all, train_idx, test_idx):
  """Label every test row with the label of its single nearest training row.

  Distances are Euclidean norms of row differences (np.linalg.norm); ties
  are resolved by np.argmin, i.e. the first training row wins.

  Args:
    X: 2-D array, one sample per row.
    label_all: array of labels indexable by train_idx.
    train_idx: indices of the training rows in X.
    test_idx: indices of the rows to classify.

  Returns:
    A list with one predicted label per entry of test_idx.
  """
  train_rows, test_rows = X[train_idx,:], X[test_idx,:]
  train_label = label_all[train_idx]
  n_train = len(list(train_idx))

  predict_labels = []
  for i in range(len(list(test_idx))):
    query = test_rows[i,:]
    distances = [np.linalg.norm(query - train_rows[j,:]) for j in range(n_train)]
    nearest = np.argmin(np.array(distances))
    predict_labels.append(train_label[nearest])

  return predict_labels


### Visualize Feature

In [None]:
from matplotlib import pyplot as plt
import time
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
%matplotlib inline
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
from sklearn.datasets import fetch_openml
from sklearn import datasets
import matplotlib.pylab as plt
import scipy.sparse as sparse

def visualize_with_tSNE(X, labels):
  """Embed X in 2-D with t-SNE and draw one scatter series per class.

  Args:
    X: 2-D array of samples (one per row) passed to TSNE.fit_transform.
    labels: one class label per row of X.

  The previous version selected points with `y == i` for i in
  range(num_classes), which only works when the labels are exactly
  0..num_classes-1, and it reused the last marker/legend text for any class
  index above 3. Classes are now taken from np.unique(labels), and markers
  and colors cycle. For labels 0..3 the markers and legend text are unchanged.
  """
  y = np.asarray(labels)
  classes = np.unique(y)

  tsne = TSNE(n_components=2, random_state=0)
  X_2d = tsne.fit_transform(X)

  markers = ('^', 'o', '+', '*')
  # 'w' (white) from the old palette is left out: it is invisible on the
  # default white axes background.
  colors = ('r', 'g', 'b', 'c', 'm', 'y', 'k', 'orange', 'purple')

  plt.figure(figsize=(6, 5))
  for position, cls in enumerate(classes):
      mask = (y == cls)
      plt.scatter(X_2d[mask, 0], X_2d[mask, 1],
                  marker=markers[position % len(markers)],
                  c=colors[position % len(colors)],
                  label='Class: ' + str(cls))
  plt.legend()
  plt.show()


# # create a sparse diagonal matrix with ones on the diagonal
# A = sparse.eye(100)
# # visualize the sparse matrix with Spy
# plt.spy(A, markersize=2)

# num_graphs,_,_ = X_feature.shape

# for i in range(num_graphs):
#   X_feature_i = X_feature[10,:,:]
#   plt.spy(X_feature_i, markersize=2)

### FCN Classify

In [None]:
!pip install keract

import tensorflow as tf
from tensorflow.keras import datasets, layers, models
import matplotlib.pyplot as plt
import numpy as np
import scipy.io
from keras import backend as K
from keract import get_activations
import keract

def FCN_classify(X_train, X_test, label_train, label_test, num_classes,
                 epochs=19, hidden_units=64, hidden_activation='softmax'):
  """Train a two-layer dense network and return predicted class indices.

  Args:
    X_train: training features, one sample per row.
    X_test: features to classify.
    label_train: integer class ids for X_train (the loss is
      SparseCategoricalCrossentropy, so labels are not one-hot).
    label_test: unused; kept so existing calls keep working.
    num_classes: width of the output (logit) layer.
    epochs: number of training epochs (default is the previous constant).
    hidden_units: width of the hidden layer (default is the previous constant).
    hidden_activation: activation of the hidden layer. The default keeps the
      previous behavior.
      NOTE(review): 'softmax' is an unusual choice for a hidden layer;
      'relu' may have been intended -- confirm before changing the default,
      since it alters reported results.

  Returns:
    1-D array with the argmax class index for every row of X_test.
  """
  model = models.Sequential()
  model.add(layers.Dense(hidden_units, activation=hidden_activation))
  # Output layer emits raw logits; the loss below is told so via from_logits.
  model.add(layers.Dense(num_classes))

  model.compile(optimizer='adam',
                loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                metrics=['accuracy'])

  model.fit(X_train, label_train, epochs=epochs, verbose=0)

  predictions = model.predict(X_test)
  return np.argmax(predictions, axis=1)

  
  


### Cal_metrics

In [None]:
from scipy.sparse import isspmatrix
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold


def separate_data_idx(label_all, fold_idx, seed=0, n_splits=5):
    """Get indexes of train and test sets for one fold of a stratified split.

    Args:
        label_all: class label of every sample; used for stratification.
        fold_idx: which fold to return, 0 <= fold_idx < n_splits.
        seed: random_state of the shuffled StratifiedKFold, so the same
            (seed, n_splits) always yields the same folds.
        n_splits: number of folds (default 5, the value previously
            hard-coded).

    Returns:
        (train_idx, test_idx) index arrays for the requested fold.

    The previous assertion accepted fold_idx up to 9 although only 5 folds
    exist, so values 5..9 failed later with an IndexError.
    """
    assert 0 <= fold_idx < n_splits, "fold_idx must be from 0 to %d." % (n_splits - 1)
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)

    # Only the labels matter for the split; the feature argument is a dummy.
    folds = list(skf.split(np.zeros(len(label_all)), label_all))
    train_idx, test_idx = folds[fold_idx]

    return train_idx, test_idx


def calculate_acc(predicted, gt):
  """Compute classification metrics for hard label predictions.

  Args:
    predicted: predicted labels, indexable, same length as gt.
    gt: ground-truth labels.

  Returns:
    (acc, auc, precision, recall, f1score). acc is the fraction of exact
    matches; the other values are macro averages computed on the
    label-binarized predictions, so auc is based on hard labels, not scores.

  Also prints sklearn's classification_report. The report is now called as
  (y_true, y_pred); the previous call had the two arguments swapped, which
  exchanges precision and recall in the printed table.
  """
  correct = sum(1 for i in range(len(predicted)) if predicted[i] == gt[i])
  acc = correct/len(predicted)

  # Binarize with the classes present in gt so both arrays share columns.
  lb = preprocessing.LabelBinarizer()
  lb.fit(gt)

  gt_binary = lb.transform(gt)
  predicted_binary = lb.transform(predicted)

  auc = roc_auc_score(gt_binary, predicted_binary, average = 'macro')
  precision, recall, f1score, support = precision_recall_fscore_support(gt_binary, predicted_binary, average = 'macro')

  print(classification_report(gt, predicted, digits = 4))

  return acc, auc, precision, recall, f1score

### Data Preprocessing

In [None]:
### PCA
import numpy as np

def Feature_reduction_PCA(X, R):
  """Project the rows of X onto their first R principal directions.

  X is centred by subtracting the column means, the right singular vectors
  of the centred matrix are obtained from a thin SVD, and the centred data
  is multiplied by the first R of them.

  Args:
    X: 2-D array, one sample per row.
    R: number of components to keep.

  Returns:
    Array with one row per sample and at most R columns.
  """
  centered = X - np.mean(X,axis=0)
  _, _, right_vectors_t = np.linalg.svd(centered, full_matrices=False)
  basis = np.transpose(right_vectors_t)[:,0:R]
  return np.matmul(centered,basis)

def Vec2Matrix(X):
  """Reshape each row of X into a square matrix.

  Args:
    X: 2-D array of shape (num_graphs, M*M).

  Returns:
    float64 array of shape (num_graphs, M, M) with M = int(sqrt(M*M)); row i
    of X is laid out row-major in slice i. Raises ValueError (from reshape)
    when a row length is not a perfect square.
  """
  num_graphs, flat_len = X.shape
  side = int(np.sqrt(flat_len))
  X_matrix = np.zeros([num_graphs, side, side])
  X_matrix[...] = np.reshape(X, [num_graphs, side, side])
  return X_matrix

### 2DPCA
def Feature_reduction_2DPCA(X,R):
  """Two-sided 2DPCA-style reduction of a stack of square matrices.

  Steps: subtract the mean matrix from every slice, accumulate
  sum_i Xc_i @ Xc_i.T, take the first R left singular vectors U of that sum,
  and return U.T @ Xc_i @ U for every slice.

  Args:
    X: array of shape (N, M, M).
    R: number of leading singular vectors to keep (R <= M).

  Returns:
    float64 array of shape (N, R, R).

  The centred data used to be written back into a copy of X, so integer
  input (e.g. 0/1 adjacency matrices) silently lost the fractional part of
  X - mean, and the running sum of slices was accumulated in the input
  dtype. Centring is now done in the arithmetic result type instead.
  """
  N_graphs,_,_ = X.shape

  # np.mean accumulates in floating point for integer input, and the
  # subtraction yields a new floating-point array instead of casting back.
  centered = X - np.mean(X, axis=0)

  # sum_i Xc_i Xc_i^T via a batched matmul over the first axis.
  scatter = np.matmul(centered, np.transpose(centered, (0, 2, 1))).sum(axis=0)

  U,S,V_t = np.linalg.svd(scatter)
  U = U[:,0:R]

  # Assigning into a pre-sized buffer keeps the (N, R, R) float64 output and
  # still raises if R exceeds the matrix size, as before.
  X_new_feature = np.zeros([N_graphs,R,R])
  X_new_feature[...] = np.matmul(np.matmul(U.transpose(), centered), U)

  return X_new_feature

def Matrix2Vec(X):
  """Flatten every matrix in a stack into one row vector.

  Args:
    X: array of shape (N, M1, M2).

  Returns:
    float64 array of shape (N, M1*M2); slice i is flattened row-major into
    row i.
  """
  n_items, n_rows, n_cols = X.shape
  X_vector = np.zeros([n_items, n_rows*n_cols])
  X_vector[...] = np.reshape(X, [n_items, n_rows*n_cols])
  return X_vector

def permute_matrix(X, ratio):
  """Randomly relabel a fraction of the rows/columns of a square matrix.

  int(size * ratio) indices are drawn with np.random.permutation, the
  matching rows of an identity matrix are shuffled among themselves to form
  a permutation matrix A, and A @ X @ A.T is returned.

  Args:
    X: 2-D square array.
    ratio: fraction of indices that take part in the shuffle.

  Returns:
    (permuted_X, A). Uses the global NumPy random state.
  """
  n, _ = X.shape
  perm = np.eye(n)
  chosen = np.random.permutation(n)[0:int(n * ratio)]
  # The right-hand side is a copy, so rows are exchanged, never duplicated.
  perm[chosen,:] = perm[np.random.permutation(chosen),:]

  permuted_X = np.matmul(np.matmul(perm, X), perm.transpose())

  return permuted_X, perm

def load_and_process_node_features(data_name, use_degree_as_tag):
    """Build one fixed-length node-feature vector per graph.

    Loads the dataset with load_data, keeps only the first `min_num_nodes`
    rows of each graph's node_features matrix (min_num_nodes = smallest node
    count in the dataset) and flattens them row-major.

    Args:
        data_name: dataset name passed to load_data.
        use_degree_as_tag: passed to load_data as degree_as_tag.

    Returns:
        float64 array of shape (num_graphs, min_num_nodes * feature_size).

    Raises:
        ValueError: if the dataset contains no graphs (previously this
            surfaced as an unbound-variable error).

    The smallest node count is now computed directly; the previous version
    started from the sentinel 100000, which silently capped the count.
    """
    graphs, _ = load_data(data_name, use_degree_as_tag)
    if not graphs:
        raise ValueError("dataset '%s' contains no graphs" % data_name)

    min_num_nodes = min(graph.node_features.shape[0] for graph in graphs)
    # As before, the feature width is read from the last graph.
    feature_size = graphs[-1].node_features.shape[1]

    feature_all_selected = np.zeros([len(graphs), min_num_nodes*feature_size])
    for i, graph in enumerate(graphs):
        # Nodes beyond min_num_nodes are dropped so all vectors align.
        selected = graph.node_features[0:min_num_nodes,:]
        feature_all_selected[i,:] = np.reshape(selected, [min_num_nodes*feature_size])

    return feature_all_selected

# data_name = "dataset1"
# use_degree_as_tag = False

# feature_all_selected = load_and_process_node_features(data_name, use_degree_as_tag)

### Print Results

In [None]:
from numpy import std
def print_results(total_acc, total_auc, total_f1score):
  """Print the raw accuracy/AUC values and mean and std of each metric.

  Args:
    total_acc: sequence of runs, each a sequence of per-fold accuracies.
    total_auc: same layout, per-fold AUC values.
    total_f1score: same layout, per-fold F1 scores.

  The mean is taken over every individual fold value of every run; the
  standard deviation is numpy's std over the whole nested structure.
  """
  def _report(mean_caption, runs):
    running_sum = 0
    n_values = 0
    for run in runs:
      for value in run:
        running_sum += value
        n_values += 1
    print(mean_caption, running_sum/n_values)
    print("STD = ", std(runs))

  print("Accuracy from all runs: ", total_acc)
  print("AUC from all runs: ",total_auc)
  _report("Average acc = ", total_acc)
  _report("Average auc = ", total_auc)
  _report("Average f1score = ", total_f1score)

### Graph Kernel Setup

In [None]:
!pip install grakel
!pip install sklearn

import networkx as nx
import numpy as np
from scipy.sparse import spmatrix

from warnings import warn
from collections import Counter, Iterable
from grakel import Kernel, Graph

### Graph Kernel Method

**This method does not work at this time.**
Applies grakel methods to a list of adjacency matrix representations of graphs.

In [None]:
from grakel.kernels.graphlet_sampling import GraphletSampling
, train_test_split
import numpy as np
import scipy.io
import scipy.sparse
import os
from sklearn import svm
from sklearn.model_selection import train_test_split
from grakel import GraphKernel, ShortestPath, Graph
from grakel.kernels import WeisfeilerLehman, VertexHistogram
from grakel.datasets import fetch_dataset
from grakel.utils import cross_validate_Kfold_SVM

# MUTAG = fetch_dataset("MUTAG", verbose=False)
# G = MUTAG.data
# y = MUTAG.target
# print(G)
# print(y)

# wl_kernel = WeisfeilerLehman(n_iter=5, normalize=True, base_graph_kernel=VertexHistogram)
# G_train, G_test, y_train, y_test = train_test_split(G, y, test_size=0.1, random_state=42)

# print(G_train)

# The .mat files below are opened by bare file name, so the working directory
# must be the feature folder.
os.chdir('/content/gdrive/My Drive/Graph_based_methods/Graph_Transformer/dataset/Features_Omar_new')

# Dataset id with underscore = prefix of the .mat file names; the spelling
# without underscore is kept in data_name.
data_with_ = "dataset_3"
data_without_ = "dataset3"

dataset = data_with_
data_name = data_without_
use_degree_as_tag = False
# Alternative feature files (enable exactly one dataset_name):
# dataset_name = dataset + '_RBF_plus_connectivity_manual_matrix.mat'
# dataset_name = dataset + '_RBF_manual_matrix.mat'
# dataset_name = dataset + '_label_embedding_matrix.mat'
dataset_name = dataset + '_connectivity_matrix.mat'

# Load the per-graph matrices; iterated below one matrix per graph.
mat = scipy.io.loadmat(dataset_name)
X = mat['X_feature_tensor']
ndims_X = np.ndim(X)

# if(ndims_X == 3):
#   X = Matrix2Vec(X)

# X_feature = X

# Labels are stored as a 1 x N array; [0] takes the single row.
label_name = 'label_'+ dataset + '.mat'
mat = scipy.io.loadmat(label_name)
label_all = mat['label_all'][0]
classes = np.unique(label_all)
num_classes = classes.size

print("Labels")
print(label_all)
print("Example matrix in X_feature")
print(X[5])

# Wrap every matrix in a grakel Graph.
# `sparse` is not imported in this cell; it is the `scipy.sparse as sparse`
# alias imported in the visualisation cell, which must have been run first.
# NOTE(review): label_all holds one class label per GRAPH, yet it is passed
# as node_labels for every graph -- confirm this is what grakel expects.
all_graphs = []
for i in X:
  sparse_mat = sparse.csr_matrix(i)
  graph = Graph(sparse_mat, node_labels = label_all)
  all_graphs.append(graph)

def run_gk(gk=None, test_size=0.1):
  """Evaluate a graph kernel + SVM on one random train/test split.

  Reads the module-level `all_graphs` (grakel Graph objects), `label_all`
  and `dataset_name`.

  Args:
    gk: a grakel kernel object providing fit_transform/transform. Defaults
      to ShortestPath(normalize=True, with_labels=False), one of the options
      that was commented out in the previous version.
    test_size: fraction of graphs held out for testing (previously fixed at
      0.1).

  Returns:
    (acc_all, auc_all, f1score_all): length-1 arrays with the metrics of
    the single split.

  The previous version fitted the SVM directly on the Graph objects, which
  an SVC cannot consume. The graphs are now turned into kernel matrices and
  the SVM is trained with kernel="precomputed".
  """
  if gk is None:
    gk = ShortestPath(normalize=True, with_labels=False)

  print('*********** dataset_name =', dataset_name,' *******************')

  G_train, G_test, y_train, y_test = train_test_split(all_graphs, label_all, test_size=test_size)

  # K_train is (n_train x n_train); K_test is (n_test x n_train), i.e. the
  # kernel between every test graph and every training graph.
  K_train = gk.fit_transform(G_train)
  K_test = gk.transform(G_test)

  cls = svm.SVC(kernel="precomputed")
  cls.fit(K_train, y_train)

  predicted = cls.predict(K_test)
  print(predicted)

  acc, auc, precision, recall, f1score = calculate_acc(predicted, y_test)

  print(' acc ', acc, ' auc ', auc, ' precision ', precision, ' recall ', recall, ' f1score ', f1score) 

  return np.array([acc]), np.array([auc]), np.array([f1score])

# Collect the metric arrays returned by each repetition of run_gk.
total_acc = []
total_auc = []
total_f1score = []
# Number of repetitions. Note: this rebinds the built-in name `iter` for the
# rest of the session.
iter = 1
for i in range(iter):
  # run_gk()
  acc_all,auc_all, f1score_all = run_gk()
  total_acc.append(acc_all)
  total_auc.append(auc_all)
  total_f1score.append(f1score_all)

# print_results(total_acc, total_auc, total_f1score)

In [None]:
# Mean over every value in total_acc, which must have been filled by a
# previously executed driver cell.
print("ACC:", np.mean(total_acc))

### GK Linear Kernel

This method applies a linear kernel to the adjacency matrix representing the overall graph.

In [None]:
import numpy as np
import scipy.io
import os
from sklearn import svm

# The .mat files are opened by bare file name inside run_PCA, so the working
# directory must be the feature folder.
os.chdir('/content/gdrive/My Drive/Graph_based_methods/Graph_Transformer/dataset/Features_Omar_new')

# Dataset id with underscore = prefix of the .mat file names; the spelling
# without underscore is kept in data_name.
data_with_ = "dataset_5"
data_without_ = "dataset5"
dataset = data_with_
data_name = data_without_
use_degree_as_tag = False
# Alternative feature files (enable exactly one dataset_name):
# dataset_name = dataset + '_RBF_plus_connectivity_manual_matrix.mat'
# dataset_name = dataset + '_RBF_manual_matrix.mat'
# dataset_name = dataset + '_label_embedding_matrix.mat'
dataset_name = dataset + '_connectivity_matrix.mat'

def run_PCA():
  """5-fold cross-validation of a linear-kernel SVM on the raw features.

  Despite its name this variant applies no PCA: the matrices stored under
  'X_feature_tensor' in `dataset_name` are flattened (when 3-D) and used
  directly. Reads the module-level `dataset_name` and `dataset`.

  Returns:
    (acc_all, auc_all, f1score_all): arrays with one value per fold.
  """
  X_feature = scipy.io.loadmat(dataset_name)['X_feature_tensor']
  if np.ndim(X_feature) == 3:
    X_feature = Matrix2Vec(X_feature)

  # Labels are stored as a 1 x N array; [0] takes the single row.
  label_all = scipy.io.loadmat('label_'+ dataset + '.mat')['label_all'][0]

  print('*********** dataset_name =', dataset_name,' *******************')

  fold_scores = []
  for fold_idx in range(5):
    train_idx, test_idx = separate_data_idx(label_all, fold_idx)
    test_embeddings, test_labels = X_feature[test_idx,:], label_all[test_idx]

    cls = svm.SVC(kernel="linear")
    cls.fit(X_feature[train_idx,:], label_all[train_idx])
    # The printed accuracy comes from the classifier's own score().
    ACC = cls.score(test_embeddings, test_labels)

    predicted = cls.predict(test_embeddings)
    acc, auc, precision, recall, f1score = calculate_acc(predicted, test_labels)

    print('fold ', fold_idx, ' acc ', ACC, ' auc ', auc, ' precision ', precision, ' recall ', recall, ' f1score ', f1score) 

    fold_scores.append((acc, auc, f1score))

  acc_all, auc_all, f1score_all = (np.array(column) for column in zip(*fold_scores))

  return acc_all, auc_all, f1score_all

# Collect the per-fold metric arrays returned by each repetition of run_PCA.
total_acc = []
total_auc = []
total_f1score = []
# Number of repetitions. Note: this rebinds the built-in name `iter` for the
# rest of the session.
iter = 1
for i in range(iter):
  acc_all,auc_all, f1score_all = run_PCA()
  total_acc.append(acc_all)
  total_auc.append(auc_all)
  total_f1score.append(f1score_all)

# Print raw values plus mean/std of accuracy, AUC and F1.
print_results(total_acc, total_auc, total_f1score)

### GK Kernel Pre-computed

**This method does not work at this time.**

This cell takes a pre-computed kernel matrix and feeds it into a 5-fold cross-validation with an SVM classifier.


In [None]:
from grakel.kernels.graphlet_sampling import GraphletSampling
, train_test_split
import numpy as np
import scipy.io
import scipy.sparse
import os
from sklearn import svm
from sklearn.model_selection import train_test_split
from grakel import GraphKernel, ShortestPath, Graph
from grakel.kernels import WeisfeilerLehman, VertexHistogram
from grakel.datasets import fetch_dataset
from grakel.utils import cross_validate_Kfold_SVM


# Select the dataset FIRST. The previous ordering built the kernel file name
# from `dataset` before this cell assigned it, so the name depended on
# whichever cell had run earlier (or failed on a fresh kernel).
data_with_ = "dataset_1"
data_without_ = "dataset1"

dataset = data_with_
data_name = data_without_
use_degree_as_tag = False
# Alternative feature files (enable exactly one dataset_name):
# dataset_name = dataset + '_RBF_plus_connectivity_manual_matrix.mat'
# dataset_name = dataset + '_RBF_manual_matrix.mat'
# dataset_name = dataset + '_label_embedding_matrix.mat'
dataset_name = dataset + '_connectivity_matrix.mat'

# Load the pre-computed kernel matrix from the dataset's own folder.
# NOTE(review): the file name is built from the underscore spelling
# ("dataset_1_SPkernel.mat") -- confirm it matches the file on disk.
os.chdir('/content/gdrive/My Drive/Graph_based_methods/Graph_Transformer/dataset/' + data_name)
kernel_name = dataset + '_SPkernel.mat'
kernel_mat = scipy.io.loadmat(kernel_name)
kernel = kernel_mat['Kernel']

# Feature and label .mat files are opened by bare name from this folder.
os.chdir('/content/gdrive/My Drive/Graph_based_methods/Graph_Transformer/dataset/Features_Omar_new')

# The connectivity features are still loaded into X, but the classifier
# input of this cell is the kernel matrix (see X_feature below).
mat = scipy.io.loadmat(dataset_name)
X = mat['X_feature_tensor']
ndims_X = np.ndim(X)

if(ndims_X == 3):
  X = Matrix2Vec(X)

X_feature = kernel

# Labels are stored as a 1 x N array; [0] takes the single row.
label_name = 'label_'+ dataset + '.mat'
mat = scipy.io.loadmat(label_name)
label_all = mat['label_all'][0]
classes = np.unique(label_all)
num_classes = classes.size

# X_feature = my_kernel(X_feature,label_all)

# def my_kernel(X, Y):
#         """
#         We create a custom kernel:

#                     (2  0)
#         k(X, Y) = X  (    ) Y.T
#                     (0  1)
#         """
#         # print("X shape",X.shape)
#         # print("Kernel shape", kernel.shape)
        
#         # s1 = np.dot(X.T,kernel)
#         # print(s1.shape)
#         # print("label shape", Y.shape)
#         # s2 = np.dot(s1, Y)
#         s2 = np.dot(X,X.T)
#         # print("return shape",s2.shape)
#         return s2

# X_feature = my_kernel(X_feature, label_all)
# print("X_feature final shape",X_feature.shape)

def run_gk():
  """5-fold cross-validation of an SVM on a pre-computed kernel matrix.

  Reads the module-level `X_feature` (the square graph-by-graph kernel
  matrix assigned from `kernel` in the setup code), `label_all` and
  `dataset_name`.

  Returns:
    (acc_all, auc_all, f1score_all): arrays with one value per fold.

  Fixes over the previous version:
    * the metric lists were converted to ndarrays inside the fold loop, so
      the second fold failed on `.append`;
    * the kernel matrix itself was passed as SVC's `kernel` argument; a
      pre-computed Gram matrix requires kernel="precomputed";
    * the Gram matrix was sliced by rows only; training needs
      K[train, train] and prediction needs K[test, train].
  """
  print('*********** dataset_name =', dataset_name,' *******************')

  acc_all = []
  auc_all = []
  f1score_all = []

  for fold_idx in range(5):
      train_idx, test_idx = separate_data_idx(label_all, fold_idx)
      # Rows = samples to fit/predict, columns = training samples.
      G_train = X_feature[np.ix_(train_idx, train_idx)]
      G_test = X_feature[np.ix_(test_idx, train_idx)]
      y_train = label_all[train_idx]
      y_test = label_all[test_idx]

      cls = svm.SVC(kernel="precomputed")
      print("training...")
      cls.fit(G_train, y_train)
      print("done.")

      predicted = cls.predict(G_test)
      print(predicted)

      acc, auc, precision, recall, f1score = calculate_acc(predicted, y_test)

      print(' acc ', acc, ' auc ', auc, ' precision ', precision, ' recall ', recall, ' f1score ', f1score) 

      acc_all.append(acc)
      auc_all.append(auc)
      f1score_all.append(f1score)

  # Convert once, after all folds have been collected.
  acc_all = np.array(acc_all)
  auc_all = np.array(auc_all)
  f1score_all = np.array(f1score_all)

  return acc_all, auc_all, f1score_all

# Collect the metric arrays returned by each repetition of run_gk.
total_acc = []
total_auc = []
total_f1score = []
# Number of repetitions. Note: this rebinds the built-in name `iter` for the
# rest of the session.
iter = 1
for i in range(iter):
  # run_gk()
  acc_all,auc_all, f1score_all = run_gk()
  total_acc.append(acc_all)
  total_auc.append(auc_all)
  total_f1score.append(f1score_all)

# print_results(total_acc, total_auc, total_f1score)

In [None]:
# Mean over every value in total_acc, which must have been filled by a
# previously executed driver cell.
print("ACC:", np.mean(total_acc))

### PCA

In [None]:
import numpy as np
import scipy.io
import os

# run_PCA opens the .mat files by bare file name, so the working directory
# must be the feature folder. `dataset` and `dataset_name` are NOT set in
# this cell (the assignments below are commented out); they come from
# whichever configuration cell was executed last.
os.chdir('/content/gdrive/My Drive/Graph_based_methods/Graph_Transformer/dataset/Features_Omar_new')

# dataset = data_with_
# data_name = data_without_
# use_degree_as_tag = False
# dataset_name = dataset + '_RBF_plus_connectivity_manual_matrix.mat'
# dataset_name = dataset + '_RBF_manual_matrix.mat'
# dataset_name = dataset + '_label_embedding_matrix.mat'
# dataset_name = dataset + '_connectivity_matrix.mat'

def run_PCA():
  """5-fold cross-validation of logistic regression on PCA-reduced features.

  The matrices stored under 'X_feature_tensor' in `dataset_name` are
  flattened (when 3-D) and reduced with Feature_reduction_PCA(X, 60) on the
  full data set before the folds are split. Reads the module-level
  `dataset_name` and `dataset`.

  Returns:
    (acc_all, auc_all, f1score_all): arrays with one value per fold.
  """
  X = scipy.io.loadmat(dataset_name)['X_feature_tensor']
  if np.ndim(X) == 3:
    X = Matrix2Vec(X)

  X_feature = Feature_reduction_PCA(X, 60)
  print(X_feature.shape)

  # Labels are stored as a 1 x N array; [0] takes the single row.
  label_all = scipy.io.loadmat('label_'+ dataset + '.mat')['label_all'][0]

  print('*********** dataset_name =', dataset_name,' *******************')

  fold_scores = []
  for fold_idx in range(5):
    train_idx, test_idx = separate_data_idx(label_all, fold_idx)
    test_embeddings, test_labels = X_feature[test_idx,:], label_all[test_idx]

    cls = LogisticRegression(tol=0.001, max_iter = 2000)
    cls.fit(X_feature[train_idx,:], label_all[train_idx])
    # The printed accuracy comes from the classifier's own score().
    ACC = cls.score(test_embeddings, test_labels)

    predicted = cls.predict(test_embeddings)
    acc, auc, precision, recall, f1score = calculate_acc(predicted, test_labels)

    print('fold ', fold_idx, ' acc ', ACC, ' auc ', auc, ' precision ', precision, ' recall ', recall, ' f1score ', f1score) 

    fold_scores.append((acc, auc, f1score))

  acc_all, auc_all, f1score_all = (np.array(column) for column in zip(*fold_scores))

  return acc_all, auc_all, f1score_all

# Collect the per-fold metric arrays returned by each repetition of run_PCA.
total_acc = []
total_auc = []
total_f1score = []
# Number of repetitions. Note: this rebinds the built-in name `iter` for the
# rest of the session.
iter = 1
for i in range(iter):
  acc_all,auc_all, f1score_all = run_PCA()
  total_acc.append(acc_all)
  total_auc.append(auc_all)
  total_f1score.append(f1score_all)

# Print raw values plus mean/std of accuracy, AUC and F1.
print_results(total_acc, total_auc, total_f1score)
##**** FCN Classify **** #####
# for fold_idx in range(5):
#     train_idx, test_idx = separate_data_idx(label_all, fold_idx)
#     train_embeddings = X_feature[train_idx,:]
#     test_embeddings = X_feature[test_idx,:]
#     train_labels = label_all[train_idx]
#     test_labels = label_all[test_idx] 

#     predicted = FCN_classify(train_embeddings, test_embeddings, train_labels, test_labels, num_classes)
#     predicted_labels.append(predicted)

#     acc, auc = calculate_acc(predicted, test_labels)

#     print('fold ', fold_idx, ' acc ', acc, ' auc ', auc) 

#     auc_all.append(auc)
#     acc_all.append(acc)

# acc_all = np.array(acc_all)
# auc_all = np.array(auc_all)

# print('acc_all = ', acc_all)   

# print('auc_all = ', auc_all) 


##**** Repeat FCN Classify **** #####
# iter = 20
# acc_all = []
# for i in range(iter):
#   acc_i = 0
#   for fold_idx in range(5):
#       train_idx, test_idx = separate_data_idx(label_all, fold_idx)
#       train_embeddings = X_feature[train_idx,:]
#       test_embeddings = X_feature[test_idx,:]
#       train_labels = label_all[train_idx]
#       test_labels = label_all[test_idx] 

#       predicted = FCN_classify(train_embeddings, test_embeddings, train_labels, test_labels, num_classes)
#       predicted_labels.append(predicted)

#       acc, auc = calculate_acc(predicted, test_labels)

#       # print('fold ', fold_idx, ' acc ', acc, ' auc ', auc) 

#       acc_i += acc

#   avg_acc_i = acc_i/(fold_idx + 1) 
#   acc_all.append(avg_acc_i)   



# acc_all = np.array(acc_all)
# # auc_all = np.array(auc_all)

# print('acc_all = ', acc_all)   

# # print('auc_all = ', auc_all) 

### PCA + node features


In [None]:
import numpy as np
import scipy.io
import os

# dataset = data_with_
# data_name = data_without_
# use_degree_as_tag = False
# dataset_name = dataset + '_RBF_plus_connectivity_manual_matrix.mat'
# dataset_name = dataset + '_RBF_manual_matrix.mat'
# dataset_name = dataset + '_label_embedding_matrix.mat'
# dataset_name = dataset + '_connectivity_matrix.mat'

def run_PCA_NF():
  """5-fold CV of logistic regression on PCA features plus node features.

  The PCA-reduced matrices (Feature_reduction_PCA(X, 60)) are concatenated
  column-wise with the flattened one-hot node features returned by
  load_and_process_node_features. Reads the module-level `dataset_name`,
  `dataset`, `data_name` and `use_degree_as_tag`.

  Returns:
    (acc_all, auc_all, f1score_all): arrays with one value per fold.
    f1score_all is now an ndarray like the other two, matching the other
    run_* functions; it was previously returned as a plain list.
  """
  mat = scipy.io.loadmat(dataset_name)
  X = mat['X_feature_tensor']

  if np.ndim(X) == 3:
    X = Matrix2Vec(X)

  X_feature = Feature_reduction_PCA(X, 60)
  X_node_feature = load_and_process_node_features(data_name, use_degree_as_tag)

  print(X_feature.shape)
  print(X_node_feature.shape)

  # Both blocks must have one row per graph, in the same order.
  X_feature = np.concatenate((X_feature, X_node_feature), axis = 1)

  print(X_feature.shape)

  # Labels are stored as a 1 x N array; [0] takes the single row.
  label_name = 'label_'+ dataset + '.mat'
  label_all = scipy.io.loadmat(label_name)['label_all'][0]

  print('*********** dataset_name =', dataset_name,' *******************')

  acc_all = []
  auc_all = []
  f1score_all = []

  ##**** Logistic Regression **** #####
  for fold_idx in range(5):
      train_idx, test_idx = separate_data_idx(label_all, fold_idx)
      print(test_idx)
      train_embeddings = X_feature[train_idx,:]
      test_embeddings = X_feature[test_idx,:]
      train_labels = label_all[train_idx]
      test_labels = label_all[test_idx]

      cls = LogisticRegression(tol=0.001, max_iter = 2000)
      cls.fit(train_embeddings, train_labels)
      # The printed accuracy comes from the classifier's own score().
      ACC = cls.score(test_embeddings, test_labels)

      predicted = cls.predict(test_embeddings)

      acc, auc, precision, recall, f1score = calculate_acc(predicted, test_labels)

      print('fold ', fold_idx, ' acc ', ACC, ' auc ', auc, ' precision ', precision, ' recall ', recall, ' f1score ', f1score) 

      acc_all.append(acc)
      auc_all.append(auc)
      f1score_all.append(f1score)

  acc_all = np.array(acc_all)
  auc_all = np.array(auc_all)
  f1score_all = np.array(f1score_all)

  return acc_all, auc_all, f1score_all


# Collect the per-fold metrics returned by each repetition of run_PCA_NF.
total_acc = []
total_auc = []
total_f1score = []
# Number of repetitions. Note: this rebinds the built-in name `iter` for the
# rest of the session. The fold assignment is identical in every repetition
# because separate_data_idx is called with its default seed.
iter = 10
for i in range(iter):
  acc_all, auc_all, f1score_all = run_PCA_NF()
  total_acc.append(acc_all)
  total_auc.append(auc_all)
  total_f1score.append(f1score_all)

# Print raw values plus mean/std of accuracy, AUC and F1.
print_results(total_acc, total_auc, total_f1score)

### 2DPCA

In [None]:
import numpy as np
import scipy.io
import os

# os.chdir('/content/gdrive/My Drive/Graph_based_methods/Graph_Transformer/dataset/Features_Omar_new')

# dataset = data_with_
# dataset_name = dataset + '_RBF_plus_connectivity_manual_matrix.mat'
# dataset_name = dataset + '_RBF_manual_matrix.mat'
# dataset_name = dataset + '_connectivity_matrix.mat'

def run_2DPCA():
  """Classify 2DPCA-reduced features with logistic regression over 5 folds.

  Reads two notebook-level names: ``dataset_name`` (path of the .mat file that
  holds ``X_feature_tensor``) and ``dataset`` (used to build the label file
  name ``'label_' + dataset + '.mat'``).

  Returns:
    (acc_all, auc_all, f1score_all): three NumPy arrays holding one value per
    fold, as reported by ``calculate_acc``.
  """
  mat = scipy.io.loadmat(dataset_name)
  X = mat['X_feature_tensor']

  # A 2-D input is converted with Vec2Matrix; any other rank is used as is.
  # The branches must be exclusive: an unconditional `X_feature = X` after the
  # `if` would silently discard the converted value.
  if np.ndim(X) == 2:
    X_feature = Vec2Matrix(X)
  else:
    X_feature = X

  # NOTE(review): 60 is presumably the number of retained components -
  # confirm against Feature_reduction_2DPCA.
  X_feature_r = Feature_reduction_2DPCA(X_feature, 60)

  # Convert the reduced features with Matrix2Vec so they can be row-indexed
  # and fed to the classifier below.
  X_feature = Matrix2Vec(X_feature_r)

  label_name = 'label_'+ dataset + '.mat'
  mat = scipy.io.loadmat(label_name)
  label_all = mat['label_all'][0]

  print('*********** dataset_name =', dataset_name,' *******************')

  #### Logistic Regression #####
  acc_all = []
  auc_all = []
  f1score_all = []

  for fold_idx in range(5):
      train_idx, test_idx = separate_data_idx(label_all, fold_idx)
      train_embeddings = X_feature[train_idx,:]
      test_embeddings = X_feature[test_idx,:]
      train_labels = label_all[train_idx]
      test_labels = label_all[test_idx]

      cls = LogisticRegression(tol=0.001, max_iter = 2000)
      cls.fit(train_embeddings, train_labels)
      # Accuracy as computed by scikit-learn; this is the value printed below.
      ACC = cls.score(test_embeddings, test_labels)

      predicted = cls.predict(test_embeddings)

      acc, auc, precision, recall, f1score = calculate_acc(predicted, test_labels)

      print('fold ', fold_idx, ' acc ', ACC, ' auc ', auc, ' precision ', precision, ' recall ', recall, ' f1score ', f1score) 

      auc_all.append(auc)
      acc_all.append(acc)
      f1score_all.append(f1score)

  acc_all = np.array(acc_all)
  auc_all = np.array(auc_all)
  f1score_all = np.array(f1score_all)

  return acc_all, auc_all, f1score_all

# Call run_2DPCA several times and collect the per-run metric arrays (one
# entry per fold in each array) for print_results.
total_acc = []
total_auc = []
total_f1score = []
# Named n_runs rather than `iter` so the builtin iter() is not shadowed for
# every cell executed afterwards in the same kernel.
n_runs = 10
for _ in range(n_runs):
  acc_all, auc_all, f1score_all = run_2DPCA()
  total_acc.append(acc_all)
  total_auc.append(auc_all)
  total_f1score.append(f1score_all)

print_results(total_acc, total_auc, total_f1score)
  # print('acc_all = ', acc_all)   

  # print('auc_all = ', auc_all) 

##**** Repeat FCN Classify **** #####
# iter = 20
# acc_all = []
# for i in range(iter):
#   acc_i = 0
#   for fold_idx in range(5):
#       train_idx, test_idx = separate_data_idx(label_all, fold_idx)
#       train_embeddings = X_feature[train_idx,:]
#       test_embeddings = X_feature[test_idx,:]
#       train_labels = label_all[train_idx]
#       test_labels = label_all[test_idx] 

#       predicted = FCN_classify(train_embeddings, test_embeddings, train_labels, test_labels, num_classes)
#       predicted_labels.append(predicted)

#       acc, auc = calculate_acc(predicted, test_labels)

#       # print('fold ', fold_idx, ' acc ', acc, ' auc ', auc) 

#       acc_i += acc

#   avg_acc_i = acc_i/(fold_idx + 1) 
#   acc_all.append(avg_acc_i)   



# acc_all = np.array(acc_all)
# # auc_all = np.array(auc_all)

# print('acc_all = ', acc_all)   

# print('auc_all = ', auc_all) 

### CNN classifier

In [None]:

!pip install keract

%load_ext tensorboard
import tensorflow as tf
import datetime, os
import tensorflow.keras
from tensorflow.keras import datasets, layers, models
import matplotlib.pyplot as plt
import numpy as np
import scipy.io
from keras import backend as K
from keract import get_activations
import keract



def initialize_model(num_classes):
    """Assemble the small convolutional classifier used by this notebook.

    Architecture: two Conv2D (6 filters, 6x6 kernel, ReLU) + 2x2 max-pooling
    stages on a 600x600 single-channel input, followed by Flatten,
    Dense(64, ReLU) and a softmax layer with ``num_classes`` units.

    Args:
        num_classes: number of units in the final softmax layer.

    Returns:
        An uncompiled Keras ``Sequential`` model.
    """
    layer_stack = [
        layers.Conv2D(6, (6, 6), activation='relu', input_shape = (600,600,1)),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(6, (6, 6), activation='relu'),
        layers.MaxPooling2D((2, 2)),
        # Four further identical Conv2D/MaxPooling2D stages are currently disabled.
        layers.Flatten(),
        layers.Dense(64, activation='relu'),
        # The output is already softmax-normalised, i.e. probabilities, not logits.
        layers.Dense(num_classes, activation='softmax'),
    ]
    return models.Sequential(layer_stack)

# In the cells visible here this is only referenced by the commented-out
# permutation step inside run_CNN (permute_matrix(X_i, permute_ratio)).
permute_ratio = 0


# os.chdir('/content/gdrive/My Drive/Graph_based_methods/Graph_Transformer/dataset/Features_Omar_new')

# dataset = data_with_
# dataset_name = dataset + '_RBF_manual_matrix.mat'
# dataset_name = dataset + '_RBF_plus_connectivity_manual_matrix.mat'
# dataset_name = dataset + '_RBF_based_on_connectivity.mat'
# dataset_name = dataset + '_connectivity_matrix.mat'

def run_CNN():
  """Train and evaluate the CNN from ``initialize_model`` over 5 folds.

  Reads two notebook-level names: ``dataset_name`` (path of the .mat file that
  holds ``X_feature_tensor``) and ``dataset`` (used to build the label file
  name ``'label_' + dataset + '.mat'``).  A fresh model is built and trained
  for every fold; TensorBoard logs are written under ``logs/<timestamp>``.

  Returns:
    (acc_all, auc_all, f1score_all): three NumPy arrays holding one value per
    fold, as reported by ``calculate_acc``.

  Raises:
    ValueError: if ``X_feature_tensor`` is neither 2-D nor 3-D.
  """
  mat = scipy.io.loadmat(dataset_name)
  X = mat['X_feature_tensor']

  ndims_X = np.ndim(X)
  if ndims_X == 2:
    X_feature = Vec2Matrix(X)
  elif ndims_X == 3:
    X_feature = X
  else:
    # Fail with a clear message instead of an unbound-variable error later on.
    raise ValueError(
        "X_feature_tensor must be 2-D or 3-D, got %d dimensions" % ndims_X)

  label_name = 'label_'+ dataset + '.mat'
  mat = scipy.io.loadmat(label_name)
  label_all = mat['label_all'][0]

  print('*********** dataset_name =', dataset_name,' *******************')
  classes = np.unique(label_all)
  num_classes = classes.size
  print('number of classes: ', num_classes)

  acc_all = []
  auc_all = []
  f1score_all = []

  for fold_idx in range(5):
      train_idx, test_idx = separate_data_idx(label_all, fold_idx)
      # Sanity check: prints the indices shared by the train and test split.
      intersect = np.intersect1d(train_idx, test_idx)
      print(intersect)

      train_labels = label_all[train_idx]
      test_labels = label_all[test_idx]

      # Append a trailing channel axis: (N, H, W) -> (N, H, W, 1).
      train_embeddings = X_feature[train_idx,:,:][..., np.newaxis]
      test_embeddings = X_feature[test_idx,:,:][..., np.newaxis]

      model = initialize_model(num_classes)

      # initialize_model ends in a softmax layer, so the network outputs
      # probabilities; the loss must therefore be built with from_logits=False.
      # NOTE(review): the sparse loss expects integer labels in
      # [0, num_classes) - confirm that label_all is encoded that way.
      model.compile(optimizer='adam',
                    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
                    metrics=['accuracy'])

      logdir = os.path.join("logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
      tensorboard_callback = tf.keras.callbacks.TensorBoard(logdir, histogram_freq=1)

      # The test fold is passed as validation data for per-epoch monitoring.
      model.fit(x=train_embeddings,
                y=train_labels,
                epochs = 30,
                batch_size = 6,
                validation_data=(test_embeddings, test_labels),
                callbacks=[tensorboard_callback], verbose = 1)

      predictions = model.predict(test_embeddings)

      # Most probable class index for every test sample.
      predicted_labels = np.argmax(predictions, axis = -1)

      acc, auc, precision, recall, f1score = calculate_acc(predicted_labels, test_labels)

      auc_all.append(auc)
      acc_all.append(acc)
      f1score_all.append(f1score)

      print('fold ', fold_idx, ' acc ', acc, ' auc ', auc, ' precision ', precision, ' recall ', recall, ' f1score ', f1score) 

      # Disabled alternatives (not executed): classifying intermediate-layer
      # activations of `model` with logistic regression or KNN.

  acc_all = np.array(acc_all)
  auc_all = np.array(auc_all)
  f1score_all = np.array(f1score_all)

  print('avg_acc = ',np.mean(acc_all), 'avg_auc = ',np.mean(auc_all))
  print('acc_all = ', acc_all)   
  print('auc_all = ', auc_all)   

  return acc_all, auc_all, f1score_all

# Call run_CNN several times (each call trains one model per fold) and collect
# the per-run metric arrays for print_results.
total_acc = []
total_auc = []
total_f1score = []
# Named n_runs rather than `iter` so the builtin iter() is not shadowed for
# every cell executed afterwards in the same kernel.
n_runs = 2
for _ in range(n_runs):
  acc_all, auc_all, f1score_all = run_CNN()
  total_acc.append(acc_all)
  total_auc.append(auc_all)
  total_f1score.append(f1score_all)

print_results(total_acc, total_auc, total_f1score)



In [None]:
import tensorflow.keras

# Build a model that returns the output of every layer of `model`.
# NOTE(review): in the cells visible here, `model`, `X_feature` and `label_all`
# are only assigned as local variables inside run_CNN, so they would not exist
# at notebook level on a fresh kernel - confirm where they are defined before
# running this cell.
extractor = tensorflow.keras.Model(inputs=model.inputs,
                        outputs=[layer.output for layer in model.layers])

# One output per layer, in layer order.
features = extractor(X_feature)

# Keep the output of the layer at index 5.
# NOTE(review): for the architecture built by initialize_model that would be
# the 64-unit Dense layer - verify that `model` has that architecture.
X = features[5]

print(X.shape)


# Presumably plots a t-SNE embedding of the extracted features coloured by
# label; the helper is defined outside the cells visible here.
visualize_with_tSNE(X, label_all)

# X_feature_new = np.array(X)
# print(X.shape)
# auc_all = []
# acc_all = []
# # ##**** Logistic Regression **** #####
# for fold_idx in range(5):
#     train_idx, test_idx = separate_data_idx(label_all, fold_idx)
#     train_embeddings = X_feature_new[train_idx,:]
#     test_embeddings = X_feature_new[test_idx,:]
#     train_labels = label_all[train_idx]
#     test_labels = label_all[test_idx]



#     cls = LogisticRegression(tol=0.001, max_iter = 2000)
#     cls.fit(train_embeddings, train_labels)
#     ACC = cls.score(test_embeddings, test_labels)

#     predicted = cls.predict(test_embeddings)
#     predicted_labels.append(predicted)

#     acc, auc = calculate_acc(predicted, test_labels)

#     print('fold ', fold_idx, ' acc ', ACC, ' auc ', auc) 

#     auc_all.append(auc)
#     acc_all.append(acc)

# acc_all = np.array(acc_all)
# auc_all = np.array(auc_all)

# print('mean_acc = ', np.mean(acc_all))

# print('acc_all = ', acc_all)   

# print('auc_all = ', auc_all) 


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Seed NumPy's global random generator so the np.random.rand draws later in
# this cell produce the same points on every run.
np.random.seed(19680801)


# N = 10
# r0 = 0.6
# x = 0.9 * np.random.rand(N)
# y = 0.9 * np.random.rand(N)
# area = (20 * np.random.rand(N))**2  # 0 to 10 point radii
# c = np.sqrt(area)
# r = np.sqrt(x ** 2 + y ** 2)
# area1 = np.ma.masked_where(r < r0, area)
# area2 = np.ma.masked_where(r >= r0, area)
# plt.scatter(x, y, s=area1, marker='^', c=c)
# plt.scatter(x, y, s=area2, marker='o', c=c)
# # Show the boundary between the regions:
# theta = np.arange(0, np.pi / 2, 0.01)
# plt.plot(r0 * np.cos(theta), r0 * np.sin(theta))

# plt.show()


# Draw 10 random (x, y) points and plot them twice: once as drawn with
# triangle markers, and once shifted by +1 on both axes with circle markers.
x, y = np.random.rand(10,1), np.random.rand(10,1)
for offset, marker in ((0, '^'), (1, 'o')):
    plt.scatter(x + offset, y + offset, marker=marker)
plt.show()
