In [1]:
# import dependencies
import spektral
import tensorflow as tf
from spektral.layers import GraphConv
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dropout
import numpy as np

### Perform GCN on the Cora dataset ###

In [2]:
## load cora data set
cora_A, cora_X, cora_L, cora_train_mask, cora_val_mask, cora_test_mask = spektral.datasets.citation.load_data(dataset_name='cora',
                                                                                                              normalize_features=False, 
                                                                                                              random_split=False)

Loading cora dataset


cora_A - adjacency matrix

cora_X - feature matrix (doc by term)

cora_L - label

In [3]:
# define number of nodes, features and classes
num_cora_nodes = cora_A.shape[0]
num_cora_features = cora_X.shape[1]
num_cora_classes = cora_L.shape[1]

In [4]:
# print out attributes
print('shape of cora Adjacency Matrix: {} x {}'.format(num_cora_nodes, num_cora_nodes))
print('number of cora features (number of termrs): ', num_cora_features)
print('number of cora classes: ', num_cora_classes)

shape of cora Adjacency Matrix: 2708 x 2708
number of cora features (number of termrs):  1433
number of cora classes:  7


In [5]:
# check the distribution of each class for balance
cora_L.sum(axis = 0)

array([351, 217, 418, 818, 426, 298, 180])

In [6]:
# define feature and adjacency input to the first GCN layer
cora_X_in = Input(shape = (num_cora_features, ))
cora_A_in = Input(shape = (num_cora_nodes, ), sparse = True)
# construct 3 layers of GCN, features beuing reduced to
# 64, 32 and 7 (number of classes)
# use drop out of 0.5 to minimize overfitting
cora_X_1 = GraphConv(64, 'relu')([cora_X_in, cora_A_in])
cora_X_1 = Dropout(0.5)(cora_X_1)
cora_X_2 = GraphConv(32, 'relu')([cora_X_1, cora_A_in])
cora_X_2 = Dropout(0.5)(cora_X_2)
cora_X_3 = GraphConv(num_cora_classes, 'softmax')([cora_X_2, cora_A_in])
# use keras functional API to construct a GCN model
cora_model = Model(inputs = [cora_X_in, cora_A_in], outputs = cora_X_3, name = 'cora_GCN_model')

In [7]:
# preprocess adjacency matrix to add self-loops and scale edge weights
cora_A = GraphConv.preprocess(cora_A).astype('f4')

In [8]:
# compile model
cora_model.compile(optimizer = 'adam',
             loss = 'categorical_crossentropy',
             weighted_metrics = ['acc', tf.keras.metrics.AUC()])
cora_model.summary()

Model: "cora_GCN_model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 1433)]       0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 2708)]       0                                            
__________________________________________________________________________________________________
graph_conv (GraphConv)          (None, 64)           91776       input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
dropout (Dropout)               (None, 64)           0           graph_conv[0][0]    

In [9]:
# define number of epochs
cora_epochs = 50
# Prepare data
cora_X = cora_X.toarray()

In [10]:
# prepare val data
cora_val_data = ([cora_X, cora_A], cora_L, cora_val_mask)

# Train model
cora_model.fit([cora_X, cora_A], cora_L,
               sample_weight = cora_train_mask,
               validation_data = cora_val_data,
               epochs = cora_epochs,
               batch_size = num_cora_nodes,
               shuffle = False)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x1422f9510>

In [11]:
# Evaluate model
eval_results = cora_model.evaluate([cora_X, cora_A], 
                                   cora_L,
                                   sample_weight = cora_test_mask,
                                   batch_size = num_cora_nodes)
print('Done.\n'
      'Test loss: {}\n'
      'Test accuracy: {}'.format(*eval_results))

Done.
Test loss: 0.38267821073532104
Test accuracy: 0.8040000200271606


### Use Naive Bayes Classifier for Cora dataset (Multinomial)

In [12]:
# import dependencies
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [13]:
# instantiate a Multinominal NB classifier class
nb_multinomial = MultinomialNB()

In [14]:
# convert label from hot-encoding back to single value encoding
cora_y = [list(label).index(1) + 1 for label in cora_L]
cora_y = np.asarray(cora_y)

In [15]:
# train_test split the original data
cora_X_train, cora_X_test, cora_y_train, cora_y_test = train_test_split(cora_X, cora_y,
                                                                       test_size = 0.3,
                                                                       random_state = 44)
print('train size: ', cora_X_train.shape[0])
print('test size: ', cora_X_test.shape[0])

train size:  1895
test size:  813


In [16]:
# fit classiifer on training data
nb_multinomial.fit(cora_X_train, cora_y_train)
# make predition and evaluate on training data
print('mean accuracy on training set: ', nb_multinomial.score(cora_X_train, cora_y_train))

mean accuracy on training set:  0.8981530343007915


In [17]:
# make predition and evaluate on testing data
print('mean accuracy on testing set: ', nb_multinomial.score(cora_X_test, cora_y_test))

mean accuracy on testing set:  0.7712177121771218


### Use Logistic Regression Classifier for Cora dataset

In [18]:
# import dependencies
from sklearn.linear_model import LogisticRegression

In [19]:
# instantiate a logistic regressor class
LR_model = LogisticRegression(penalty = 'l2', 
                              solver = 'lbfgs')

In [20]:
LR_model.fit(cora_X_train, cora_y_train)
# make predition and evaluate on training data
print('mean accuracy on training set: ', LR_model.score(cora_X_train, cora_y_train))

mean accuracy on training set:  0.9920844327176781


In [21]:
# make predition and evaluate on testing data
print('mean accuracy on testing set: ', LR_model.score(cora_X_test, cora_y_test))

mean accuracy on testing set:  0.7724477244772447


### Perform GCN on the PubMed dataset ###

In [22]:
## load PubMed data set
med_A, med_X, med_L, med_train_mask, med_val_mask, med_test_mask = spektral.datasets.citation.load_data(dataset_name ='pubmed',
                                                                                                        normalize_features = True, 
                                                                                                        random_split = False)

num_med_nodes = med_A.shape[0]
num_med_features = med_X.shape[1]
num_med_classes = med_L.shape[1]

# print out attributes
print('shape of pubmed Adjacency Matrix: {} x {}'.format(num_med_nodes, num_med_nodes))
print('number of pubmed features (number of termrs): ', num_med_features)
print('number of pubmed classes: ', num_med_classes)

med_L.sum(axis = 0)

Loading pubmed dataset
Pre-processing node features
shape of pubmed Adjacency Matrix: 19717 x 19717
number of pubmed features (number of termrs):  500
number of pubmed classes:  3


array([4103, 7739, 7875])

In [23]:
med_X_in = Input(shape = (num_med_features, ))
med_A_in = Input(shape = (num_med_nodes, ), sparse = True)

med_X_1 = GraphConv(32, 'relu')([med_X_in, med_A_in])
med_X_1 = Dropout(0.5)(med_X_1)
med_X_2 = GraphConv(8, 'relu')([med_X_1, med_A_in])
med_X_2 = Dropout(0.5)(med_X_2)
med_X_3 = GraphConv(num_med_classes, 'softmax')([med_X_2, med_A_in])

med_model = Model(inputs = [med_X_in, med_A_in], outputs = med_X_3, name = 'med_GCN_model')

med_A = GraphConv.preprocess(med_A).astype('f4')
# compile model
med_model.compile(optimizer = 'adam',
                  loss = 'categorical_crossentropy',
                  weighted_metrics = ['acc', tf.keras.metrics.AUC()])
med_model.summary()

Model: "med_GCN_model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, 500)]        0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            [(None, 19717)]      0                                            
__________________________________________________________________________________________________
graph_conv_3 (GraphConv)        (None, 32)           16032       input_3[0][0]                    
                                                                 input_4[0][0]                    
__________________________________________________________________________________________________
dropout_2 (Dropout)             (None, 32)           0           graph_conv_3[0][0]   

In [24]:
# define number of epochs
med_epochs = 50
# Prepare data
med_X = med_X.toarray()

med_val_data = ([med_X, med_A], med_L, med_val_mask)

med_model.fit([med_X, med_A], med_L,
             sample_weight = med_train_mask,
             validation_data = med_val_data,
             epochs = med_epochs,
             batch_size = num_med_nodes,
             shuffle = False)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x1422bf390>

In [25]:
# Evaluate model
med_eval_results = med_model.evaluate([med_X, med_A], 
                                      med_L,
                                      sample_weight = med_test_mask,
                                      batch_size = num_med_nodes)
print('Done.\n'
      'Test loss: {}\n'
      'Test accuracy: {}'.format(*med_eval_results))

Done.
Test loss: 0.05528887361288071
Test accuracy: 0.4519999921321869


### Perform GCN on the citeseer dataset ###

In [29]:
## load Citeseer data set
cs_A, cs_X, cs_L, cs_train_mask, cs_val_mask, cs_test_mask = spektral.datasets.citation.load_data(dataset_name ='citeseer',
                                                                                                        normalize_features = True, 
                                                                                                        random_split = False)

num_cs_nodes = cs_A.shape[0]
num_cs_features = cs_X.shape[1]
num_cs_classes = cs_L.shape[1]

# print out attributes
print('shape of citeseer Adjacency Matrix: {} x {}'.format(num_cs_nodes, num_cs_nodes))
print('number of citeseer features (number of termrs): ', num_cs_features)
print('number of citeseer classes: ', num_cs_classes)

cs_L.sum(axis = 0)

Loading citeseer dataset
Pre-processing node features
shape of citeseer Adjacency Matrix: 3327 x 3327
number of citeseer features (number of termrs):  3703
number of citeseer classes:  6


  r_inv = np.power(rowsum, -1).flatten()


array([249., 590., 668., 701., 596., 508.])

In [30]:
cs_X_in = Input(shape = (num_cs_features, ))
cs_A_in = Input(shape = (num_cs_nodes, ), sparse = True)

cs_X_1 = GraphConv(64, 'relu')([cs_X_in, cs_A_in])
cs_X_1 = Dropout(0.5)(cs_X_1)
cs_X_2 = GraphConv(32, 'relu')([cs_X_1, cs_A_in])
cs_X_2 = Dropout(0.5)(cs_X_2)
cs_X_3 = GraphConv(num_cs_classes, 'softmax')([cs_X_2, cs_A_in])

cs_model = Model(inputs = [cs_X_in, cs_A_in], outputs = cs_X_3, name = 'cs_GCN_model')

cs_A = GraphConv.preprocess(cs_A).astype('f4')
# compile model
cs_model.compile(optimizer = 'adam',
                  loss = 'categorical_crossentropy',
                  weighted_metrics = ['acc', tf.keras.metrics.AUC()])
cs_model.summary()

Model: "cs_GCN_model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_7 (InputLayer)            [(None, 3703)]       0                                            
__________________________________________________________________________________________________
input_8 (InputLayer)            [(None, 3327)]       0                                            
__________________________________________________________________________________________________
graph_conv_9 (GraphConv)        (None, 64)           237056      input_7[0][0]                    
                                                                 input_8[0][0]                    
__________________________________________________________________________________________________
dropout_6 (Dropout)             (None, 64)           0           graph_conv_9[0][0]    

In [31]:
# define number of epochs
cs_epochs = 50
# Prepare data
cs_X = cs_X.toarray()

cs_val_data = ([cs_X, cs_A], cs_L, cs_val_mask)

cs_model.fit([cs_X, cs_A], cs_L,
             sample_weight = cs_train_mask,
             validation_data = cs_val_data,
             epochs = cs_epochs,
             batch_size = num_cs_nodes,
             shuffle = False)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x144c22150>

In [32]:
# Evaluate model
cs_eval_results = cs_model.evaluate([cs_X, cs_A], 
                                    cs_L,
                                    sample_weight = cs_test_mask,
                                    batch_size = num_cs_nodes)
print('Done.\n'
      'Test loss: {}\n'
      'Test accuracy: {}'.format(*cs_eval_results))

Done.
Test loss: 0.5254522562026978
Test accuracy: 0.6460000276565552
