In [132]:
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.cluster import SpectralClustering, KMeans
from sklearn.metrics import pairwise_distances
from sklearn import metrics
import os
import networkx as nx
import numpy as np
import pandas as pd

# from sklearn.linear_model import LogisticRegressionCV
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import f1_score

from spektral.layers import GraphConv
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dropout

%matplotlib inline

In [133]:
# Locate the dataset files relative to the current working directory.
data_dir = os.getcwd()
dataset_dir = os.path.join(data_dir, "fb-CMU-Carnegie49")
edge_location = os.path.join(dataset_dir, "fb-CMU-Carnegie49.edges")
class_data_location = os.path.join(dataset_dir, "fb-CMU-Carnegie49.node_labels")

# Build the graph from the edge list.
g_nx = nx.read_edgelist(path=edge_location)

# Raw label table with default integer column names:
# first column = node id, last column = class label.
node_attr = pd.read_csv(class_data_location, sep=',', header=None)

# Map node id (as a string, to match the keys used for the graph's nodes) -> class label.
# Built column-wise rather than with iterrows(): a per-row Series does not keep each
# column's dtype, so a float column would turn an integer id such as 1 into the key "1.0".
values = dict(zip(node_attr.iloc[:, 0].astype(str), node_attr.iloc[:, -1]))
nx.set_node_attributes(g_nx, values, 'class')

# The same label file again, this time with named columns for later lookups.
column_names = ["node_id", "class"]
node_data = pd.read_csv(class_data_location, header=None, names=column_names)



In [134]:
# Restrict the graph to its largest connected component.
# Pick the largest node set first and copy only that subgraph, instead of
# materialising a copy of every component just to compare their sizes.
largest_cc = max(nx.connected_components(g_nx), key=len)
g_nx = g_nx.subgraph(largest_cc).copy()
node_ids = list(g_nx.nodes())
print("Largest subgraph statistics: {} nodes, {} edges".format(
    g_nx.number_of_nodes(), g_nx.number_of_edges()))

# Class label of every remaining node, in the same order as node_ids.
node_targets = [g_nx.nodes[node_id]['class'] for node_id in node_ids]

print(f"There are {len(np.unique(node_targets))} unique labels on the nodes.")

print(f"There are {len(g_nx.nodes())} nodes in the network.")


Largest subgraph statistics: 6621 nodes, 249959 edges
There are 3 unique labels on the nodes.
There are 6621 nodes in the network.


In [149]:
s = set(node_data["class"])
# Build a dictionary that converts each class label to an integer index.
# The labels are sorted before enumeration so the label -> index mapping is the
# same on every run; iterating the set directly gives an order that is not
# guaranteed (and changes between interpreter sessions for string labels).
convert_table = {e: idx for idx, e in enumerate(sorted(s))}

def word2idx(word):
    """Return the integer class index assigned to the label `word`.

    Raises KeyError if `word` is not one of the labels in `convert_table`.
    """
    return convert_table[word]

# Integer class index for every node, aligned with node_targets.
ground_truth = [word2idx(i) for i in node_targets]
print(len(ground_truth))

6621


In [136]:
# Dense adjacency matrix of the graph (presumably ordered like g_nx.nodes() -- confirm
# against the networkx documentation if the ordering matters).
A = nx.to_numpy_array(g_nx)
# No node attributes are available, so the feature matrix is the identity:
# one indicator feature per node.
X = np.eye(len(g_nx.nodes()))
# One-hot targets: row i carries a single 1 in column ground_truth[i].
y = np.eye(max(ground_truth) + 1)[ground_truth]



In [137]:
# Problem dimensions, read off the adjacency, feature and target arrays:
#   N         -- number of nodes (rows of A)
#   F         -- number of input features per node (columns of X)
#   n_classes -- number of target classes (columns of y)
N, F, n_classes = A.shape[0], X.shape[-1], y.shape[-1]

print(N, ' ', F, ' ', n_classes)
print(type(y))

6621   6621   3
<class 'numpy.ndarray'>


In [138]:
import matplotlib.pyplot as plt

# (degree, node) pairs ordered from highest to lowest degree.
degree_sequence = [(d, n) for n, d in g_nx.degree()]
degree_sequence.sort(reverse=True)
# Show the six best-connected nodes.
print(degree_sequence[:6])


[(840, '2110'), (785, '2491'), (777, '3076'), (742, '2785'), (711, '3104'), (642, '6592')]


In [139]:
# Split boundaries, expressed as positions in the node ordering:
# [0, TRAIN_END) trains, [TRAIN_END, VAL_END) validates, [VAL_END, N) tests.
TRAIN_END = 400
VAL_END = 2000

# One 0/1 mask per split, each of length N; they are used as per-node sample weights.
train_mask, val_mask, test_mask = (np.zeros(N) for _ in range(3))

# Scalar assignment broadcasts over whatever the slice actually covers, so this
# also works when N is smaller than a boundary (assigning a fixed-length list
# raised ValueError in that case).
train_mask[:TRAIN_END] = 1
val_mask[TRAIN_END:VAL_END] = 1
test_mask[VAL_END:] = 1

print(type(train_mask))

<class 'numpy.ndarray'>


In [140]:
# Model definition: a 128-unit ReLU graph convolution, dropout (rate 0.3), and a
# softmax graph convolution with one output per class.
# Both layers receive the same adjacency input A_in alongside their node features.
X_in = Input(shape=(F, ))  # Node-feature input: each row is an F-dimensional vector (F = X.shape[-1]; X is the identity here, so F equals N).
A_in = Input((N, ), sparse=True)  # Adjacency input, declared sparse: each row has N entries (N = A.shape[0], the number of nodes).

graph_conv_1 = GraphConv(128, activation='relu')([X_in, A_in])
dropout1 = Dropout(0.3)(graph_conv_1)

# Optional second hidden layer, currently disabled:
# graph_conv_2 = GraphConv(12, activation='relu')([dropout1, A_in])
# dropout2 = Dropout(0.3)(graph_conv_2)

graph_conv_3 = GraphConv(n_classes, activation='softmax')([dropout1, A_in])
# Build the model from the two inputs to the per-node class probabilities.
model = Model(inputs=[X_in, A_in], outputs=graph_conv_3)

In [141]:
from spektral import utils
from scipy import sparse

# Convert the adjacency matrix to CSR, apply spektral's local-pooling filter,
# and store the result as float32.
# NOTE(review): A is rebuilt from its own previous value, so running this cell
# a second time applies the filter to the already-filtered matrix.
A = utils.localpooling_filter(sparse.csr_matrix(A)).astype(np.float32)

In [142]:
# Sanity check: show the container type of every array passed to the model.
for _array in (A, X, y, train_mask, val_mask, test_mask):
    print(type(_array))

<class 'scipy.sparse.csr.csr_matrix'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [143]:
# Configure training: Adam optimiser, categorical cross-entropy on the one-hot
# targets, and accuracy reported as a weighted metric (weighted by the sample
# weights supplied at fit/evaluate time).
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    weighted_metrics=['acc'],
)
model.summary()

Model: "model_6"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_13 (InputLayer)           [(None, 6621)]       0                                            
__________________________________________________________________________________________________
input_14 (InputLayer)           [(None, None)]       0                                            
__________________________________________________________________________________________________
graph_conv_13 (GraphConv)       (None, 128)          847616      input_13[0][0]                   
                                                                 input_14[0][0]                   
__________________________________________________________________________________________________
dropout_6 (Dropout)             (None, 128)          0           graph_conv_13[0][0]        

In [144]:
from tensorflow.keras.callbacks import EarlyStopping

# Validation uses the same full-graph inputs; val_mask selects which nodes count.
validation_data = ([X, A], y, val_mask)

# Stop once the monitored quantity has not improved for 10 epochs and roll back
# to the best weights seen.
early_stopping = EarlyStopping(patience=10, restore_best_weights=True)

# Train on the whole graph at once: a single batch of N nodes, never shuffled
# (shuffling the rows would shuffle the graph itself). train_mask acts as the
# per-node sample weight, so only training nodes contribute to the loss.
model.fit(
    x=[X, A],
    y=y,
    sample_weight=train_mask,
    batch_size=N,
    epochs=300,
    shuffle=False,
    validation_data=validation_data,
    callbacks=[early_stopping],
)


0.5200
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300
Epoch 75/300
Epoch 76/300
Epoch 77/300
Epoch 78/300
Epoch 79/300
Epoch 80/300
Epoch 81/300
Epoch 82/300
Epoch 83/300
Epoch 84/300
Epoch 85/300
Epoch 86/300
Epoch 87/300
Epoch 88/300
Epoch 89/300
Epoch 90/300
Epoch 91/300
Epoch 92/300
Epoch 93/300
Epoch 94/300
Epoch 95/300
Epoch 96/300
Epoch 97/300
Epoch 98/300
Epoch 99/300
Epoch 100/300
Epoch 101/300
Epoch 102/300
Epoch 103/300
Epoch 104/300
Epoch 105/300
Epoch 106/300
Epoch 107/300
Epoch 108/300
Epoch 109/300
Epoch 110/300
Epoch 111/300
Epoch 112/300
Epoch 113/300
Epoch 114/300
Epoch 115/300
Epoch 116/300
Epoch 117/300
Epoch 118/300
Epoch 119/300
Epoch 120/300
Epoch 121/300
Epoch 122/300
Epoch 123/300
Epoch 124/300
Epoch 125/300
Epoch 126/300
Epoch 127/300
Epoch 128/300
Epoch 129/300
Epoch 130/300
Epoch 131/300
Epoch 132/300
Epoch 133/300
Epoch 134/300
Epoch 135/300
Epoch 136/300
Epoch 137/300
Epoch 138/300
Epoch 139/300
Epoch 140/300
Epoch 141/300
Epoch 142/300
Epoch 143/300


<tensorflow.python.keras.callbacks.History at 0x229554022e8>

In [145]:
# Evaluate on the full graph in one batch; test_mask restricts the weighted
# loss and accuracy to the test nodes.
eval_results = model.evaluate(
    x=[X, A],
    y=y,
    sample_weight=test_mask,
    batch_size=N,
)
report_template = 'Done.\nTest loss: {}\nTest accuracy: {}'
print(report_template.format(*eval_results))

  ...
    to  
  ['...']
Done.
Test loss: 0.6190159320831299
Test accuracy: 0.5700064897537231


In [146]:

# Per-node class scores for the whole graph, computed in a single batch.
y_result = model.predict([X, A], batch_size=N)
# Predicted class of each node = index of its highest score.
y_group = y_result.argmax(axis=-1)

In [147]:
from sklearn import metrics

# Compare the predicted classes with the true ones over every node in the graph
# (training and validation nodes included): adjusted Rand index, adjusted mutual
# information, and plain accuracy, printed in that order.
for _score in (
    metrics.adjusted_rand_score,
    metrics.adjusted_mutual_info_score,
    metrics.accuracy_score,
):
    print(_score(ground_truth, y_group))
#print(ground_truth)
#print(y_group)

0.042012659002485554
0.022234073748231345
0.5760459145144238
