<a href="https://colab.research.google.com/github/JAYANTHNITW/Graph_Neural_Networks/blob/main/Node_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import networkx as nx
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

##Cora dataset consists of cora.cites and cora.content download


In [2]:
zip_file = keras.utils.get_file(
fname="cora.tgz",
origin="https://linqs-data.soe.ucsc.edu/public/lbc/cora.tgz",
extract=True,
)
data_dir = os.path.join(os.path.dirname(zip_file), "cora")


Downloading data from https://linqs-data.soe.ucsc.edu/public/lbc/cora.tgz


In [8]:
citations = pd.read_csv(
os.path.join(data_dir, "cora.cites"),
sep="\t",
header=None,
names=["target", "source"],)
print("Citations shape:", citations.shape)

Citations shape: (5429, 2)


In [7]:
citations.head(5)

Unnamed: 0,target,source
0,35,1033
1,35,103482
2,35,103515
3,35,1050679
4,35,1103960


##Loaded data into pandas dataframe

In [9]:
column_names = ["paper_id"] + [f"term_{idx}" for idx in range(1433)] + ["subject"]
papers = pd.read_csv(
os.path.join(data_dir, "cora.content"), sep="\t", header=None, names=column_names,
)
print("Papers shape:", papers.shape)

Papers shape: (2708, 1435)


In [None]:
papers.columns

Index(['paper_id', 'term_0', 'term_1', 'term_2', 'term_3', 'term_4', 'term_5',
       'term_6', 'term_7', 'term_8',
       ...
       'term_1424', 'term_1425', 'term_1426', 'term_1427', 'term_1428',
       'term_1429', 'term_1430', 'term_1431', 'term_1432', 'subject'],
      dtype='object', length=1435)

In [None]:
papers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2708 entries, 0 to 2707
Columns: 1435 entries, paper_id to subject
dtypes: int64(1434), object(1)
memory usage: 29.6+ MB


In [None]:
print(papers.sample(5).T)

                      979                    278              2538    1986  \
paper_id            131318                1095507           684531  167656   
term_0                   0                      0                0       0   
term_1                   0                      0                0       0   
term_2                   0                      0                0       0   
term_3                   0                      0                0       0   
...                    ...                    ...              ...     ...   
term_1429                0                      0                0       0   
term_1430                0                      0                0       0   
term_1431                0                      0                0       0   
term_1432                0                      0                0       0   
subject    Neural_Networks  Probabilistic_Methods  Neural_Networks  Theory   

                      1023  
paper_id             14083  
term_

##Conversion of paper_id and subject_id into zero based indeces


In [None]:
class_values = sorted(papers["subject"].unique())
class_idx = {name: id for id, name in enumerate(class_values)}
paper_idx = {name: idx for idx, name in enumerate(sorted(papers["paper_id"].unique()))}
papers["paper_id"] = papers["paper_id"].apply(lambda name: paper_idx[name])
citations["source"] = citations["source"].apply(lambda name: paper_idx[name])
citations["target"] = citations["target"].apply(lambda name: paper_idx[name])
papers["subject"] = papers["subject"].apply(lambda value: class_idx[value])
print(papers.subject.value_counts())

2    818
3    426
1    418
6    351
0    298
4    217
5    180
Name: subject, dtype: int64


##Spilit data into train and test

In [12]:
train_data, test_data = [], []
for _, group_data in papers.groupby("subject"):
# Select around 50% of the dataset for training.
  random_selection = np.random.rand(len(group_data.index)) <= 0.5
  train_data.append(group_data[random_selection])
  test_data.append(group_data[~random_selection])
train_data = pd.concat(train_data).sample(frac=1)
test_data = pd.concat(test_data).sample(frac=1)
print("Train data shape:", train_data.shape)
print("Test data shape:", test_data.shape)

Train data shape: (1337, 1435)
Test data shape: (1371, 1435)


In [13]:
train_data

Unnamed: 0,paper_id,term_0,term_1,term_2,term_3,term_4,term_5,term_6,term_7,term_8,...,term_1424,term_1425,term_1426,term_1427,term_1428,term_1429,term_1430,term_1431,term_1432,subject
2125,735311,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Neural_Networks
1762,753264,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Theory
872,696342,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Neural_Networks
306,91975,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Reinforcement_Learning
954,52515,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Probabilistic_Methods
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2359,1102567,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Case_Based
714,646809,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Genetic_Algorithms
169,282700,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Neural_Networks
1335,3233,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Probabilistic_Methods


##Implement train and test experiment

In [15]:
hidden_units = [32, 32]
learning_rate = 0.01
dropout_rate = 0.5
num_epochs = 300
batch_size = 256

def run_experiment(model, x_train, y_train):
  # Compile the model.
  model.compile(
  optimizer=keras.optimizers.Adam(learning_rate),
  loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
  metrics=[keras.metrics.SparseCategoricalAccuracy(name="acc")],
  )
  # Create an early stopping callback.
  early_stopping = keras.callbacks.EarlyStopping(
  monitor="val_acc", patience=50, restore_best_weights=True)

  # Fit the model.
  history = model.fit(
  x=x_train,
  y=y_train,
  epochs=num_epochs,
  batch_size=batch_size,
  validation_split=0.15,
  callbacks=[early_stopping],
  )
  return history


In [16]:
def display_learning_curves(history):
  fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
  ax1.plot(history.history["loss"])
  ax1.plot(history.history["val_loss"])
  ax1.legend(["train", "test"], loc="upper right")
  ax1.set_xlabel("Epochs")
  ax1.set_ylabel("Loss")
  ax2.plot(history.history["acc"])
  ax2.plot(history.history["val_acc"])
  ax2.legend(["train", "test"], loc="upper right")
  ax2.set_xlabel("Epochs")
  ax2.set_ylabel("Accuracy")
  plt.show()


##Feed forword network Implementation

In [17]:
def create_ffn(hidden_units, dropout_rate, name=None):
  fnn_layers = []
  for units in hidden_units:
    fnn_layers.append(layers.BatchNormalization())
    fnn_layers.append(layers.Dropout(dropout_rate))
    fnn_layers.append(layers.Dense(units, activation=tf.nn.gelu))
  return keras.Sequential(fnn_layers, name=name)

##Prepare dataset for GNN model


In [19]:
feature_names = set(papers.columns) - {"paper_id", "subject"}
num_features = len(feature_names)
#num_classes = len(class_idx)
# Create train and test features as a numpy array.
x_train = train_data[feature_names].to_numpy()
x_test = test_data[feature_names].to_numpy()
# Create train and test targets as a numpy array.
y_train = train_data["subject"]
y_test = test_data["subject"]


  x_train = train_data[feature_names].to_numpy()
  x_test = test_data[feature_names].to_numpy()


In [21]:
num_features

1433

In [23]:
train_data[feature_names]

  train_data[feature_names]


Unnamed: 0,term_14,term_1237,term_1243,term_1264,term_1234,term_137,term_619,term_1189,term_195,term_1430,...,term_961,term_1034,term_866,term_1320,term_1188,term_885,term_387,term_581,term_585,term_507
2125,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1762,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
872,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
306,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
954,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2359,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
714,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
169,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1335,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0


In [22]:
x_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [31]:
edges = citations[["source", "target"]].to_numpy().T
# Create an edge weights array of ones.
edge_weights = tf.ones(shape=edges.shape[1])
# Create a node features array of shape [num_nodes, num_features].
node_features = tf.cast(
papers.sort_values("paper_id")[feature_names].to_numpy(), dtype=tf.dtypes.float32
)
# Create graph info tuple with node_features, edges, and edge_weights.
graph_info = (node_features, edges, edge_weights)
print("Edges shape:", edges.shape)
print("Nodes shape:", node_features.shape)

  papers.sort_values("paper_id")[feature_names].to_numpy(), dtype=tf.dtypes.float32


Edges shape: (2, 5429)
Nodes shape: (2708, 1433)


In [33]:
citations[["source", "target"]].to_numpy().T

array([[   1033,  103482,  103515, ..., 1140289,  853118, 1155073],
       [     35,      35,      35, ...,  853118,  853155,  954315]])

##Implement a graph convolution layer

In [24]:
class GraphConvLayer(layers.Layer):
  def __init__(
    self,
    hidden_units,
    dropout_rate=0.2,
    aggregation_type="mean",
    combination_type="concat",
    normalize=False,
    *args,
    **kwargs,):
    super().__init__(*args, **kwargs)
    self.aggregation_type = aggregation_type
    self.combination_type = combination_type
    self.normalize = normalize

    self.ffn_prepare = create_ffn(hidden_units, dropout_rate)
    if self.combination_type == "gated":
      self.update_fn = layers.GRU(
      units=hidden_units,
      activation="tanh",
      recurrent_activation="sigmoid",
      dropout=dropout_rate,
      return_state=True,
      recurrent_dropout=dropout_rate,
      )
    else:
      self.update_fn = create_ffn(hidden_units, dropout_rate)


In [25]:
def prepare(self, node_repesentations, weights=None):
  # node_repesentations shape is [num_edges, embedding_dim].
  messages = self.ffn_prepare(node_repesentations)
  if weights is not None:
    messages = messages * tf.expand_dims(weights, -1)
  return messages

In [26]:
def aggregate(self, node_indices, neighbour_messages, node_repesentations):
  # node_indices shape is [num_edges].
  # neighbour_messages shape: [num_edges, representation_dim].
  # node_repesentations shape is [num_nodes, representation_dim]
  num_nodes = node_repesentations.shape[0]
  if self.aggregation_type == "sum":
    aggregated_message = tf.math.unsorted_segment_sum(
    neighbour_messages, node_indices, num_segments=num_nodes
  )
  elif self.aggregation_type == "mean":
    aggregated_message = tf.math.unsorted_segment_mean(
    neighbour_messages, node_indices, num_segments=num_nodes
  )
  elif self.aggregation_type == "max":
    aggregated_message = tf.math.unsorted_segment_max(
    neighbour_messages, node_indices, num_segments=num_nodes
  )
  else:
    raise ValueError(f"Invalid aggregation type: {self.aggregation_type}.")
  return aggregated_message


In [27]:
def update(self, node_repesentations, aggregated_messages):
  # node_repesentations shape is [num_nodes, representation_dim].
  # aggregated_messages shape is [num_nodes, representation_dim].
  if self.combination_type == "gru":
    # Create a sequence of two elements for the GRU layer.
    h = tf.stack([node_repesentations, aggregated_messages], axis=1)
  elif self.combination_type == "concat":
    # Concatenate the node_repesentations and aggregated_messages.
    h = tf.concat([node_repesentations, aggregated_messages], axis=1)
  elif self.combination_type == "add":
    # Add node_repesentations and aggregated_messages.
    h = node_repesentations + aggregated_messages
  else:
    raise ValueError(f"Invalid combination type: {self.combination_type}.")
    # Apply the processing function.
    node_embeddings = self.update_fn(h)
  if self.combination_type == "gru":
    node_embeddings = tf.unstack(node_embeddings, axis=1)[-1]
  if self.normalize:
    node_embeddings = tf.nn.l2_normalize(node_embeddings, axis=-1)
  return node_embeddings


In [28]:
def call(self, inputs):
  """Process the inputs to produce the node_embeddings.
  inputs: a tuple of three elements: node_repesentations, edges, edge_weights.
  Returns: node_embeddings of shape [num_nodes, representation_dim].
  """
  node_repesentations, edges, edge_weights = inputs
  # Get node_indices (source) and neighbour_indices (target) from edges.
  node_indices, neighbour_indices = edges[0], edges[1]
  # neighbour_repesentations shape is [num_edges, representation_dim].
  neighbour_repesentations = tf.gather(node_repesentations, neighbour_indices)
  # Prepare the messages of the neighbours.
  neighbour_messages = self.prepare(neighbour_repesentations, edge_weights)
  # Aggregate the neighbour messages.
  aggregated_messages = self.aggregate(
  node_indices, neighbour_messages, node_repesentations
  )
  # Update the node embedding with the neighbour messages.
  return self.update(node_repesentations, aggregated_messages)

##implement a GNN classifier

In [29]:
class GNNNodeClassifier(tf.keras.Model):
  def __init__(
    self,
    graph_info,
    num_classes,
    hidden_units,
    aggregation_type="sum",
    combination_type="concat",
    dropout_rate=0.2,
    normalize=True,
    *args,
    **kwargs,
    ):
    super().__init__(*args, **kwargs)
    # Unpack graph_info to three elements: node_features, edges, and edge_weight.
    node_features, edges, edge_weights = graph_info
    self.node_features = node_features
    self.edges = edges
    self.edge_weights = edge_weights
    # Set edge_weights to ones if not provided.
    if self.edge_weights is None:
      self.edge_weights = tf.ones(shape=edges.shape[1])
  # Scale edge_weights to sum to 1.
    self.edge_weights = self.edge_weights / tf.math.reduce_sum(self.edge_weights)
  # Create a process layer.
    self.preprocess = create_ffn(hidden_units, dropout_rate, name="preprocess")
  # Create the first GraphConv layer.
    self.conv1 = GraphConvLayer(
        hidden_units,
    dropout_rate,
    aggregation_type,
    combination_type,
    normalize,
    name="graph_conv1",
    )
    # Create the second GraphConv layer.
    self.conv2 = GraphConvLayer(
    hidden_units,
    dropout_rate,
    aggregation_type,
    combination_type,
    normalize,
    name="graph_conv2",
    )
    # Create a postprocess layer.
    self.postprocess = create_ffn(hidden_units, dropout_rate, name="postprocess")
    # Create a compute logits layer.
    self.compute_logits = layers.Dense(units=num_classes, name="logits")
  def call(self, input_node_indices):
    # Preprocess the node_features to produce node representations.
    x = self.preprocess(self.node_features)
    # Apply the first graph conv layer.
    x1 = self.conv1((x, self.edges, self.edge_weights))
    # Skip connection.
    x = tf.cast(x, tf.float32)
    x = x1 + x
    # Apply the second graph conv layer.
    x2 = self.conv2((x, self.edges, self.edge_weights))
    # Skip connection.
    x = x2 + x
    # Postprocess node embedding.
    x = self.postprocess(x)
    # Fetch node embeddings for the input node_indices.
    node_embeddings = tf.gather(x, input_node_indices)
    # Compute logits
    return self.compute_logits(node_embeddings)

##Train the GNN model

In [30]:
gnn_model = GNNNodeClassifier(
  graph_info=graph_info,
  num_classes=num_classes,
  hidden_units=hidden_units,
  dropout_rate=dropout_rate,
  name="gnn_model",
  )
print("GNN output shape:", gnn_model([1, 10, 100]))
gnn_model.summary()

NameError: name 'graph_info' is not defined

In [None]:
x_train = train_data.paper_id.to_numpy()
history = run_experiment(gnn_model, x_train, y_train)

In [None]:
from tensorflow.python.ops.math_ops import mean
import statistics
from statistics import mean
mean(history.history['acc'])

In [None]:
x_test = test_data.paper_id.to_numpy()
_, test_accuracy = gnn_model.evaluate(x=x_test, y=y_test, verbose=0)
print(f"Test accuracy: {round(test_accuracy * 100, 2)}%")

In [None]:
fig, ax1 = plt.subplots(1, figsize=(4, 2))
ax1.plot(history.history["loss"])
ax1.plot(history.history["val_loss"])
ax1.legend(["Train", "Test"], loc="upper right")
ax1.set_xlabel("Epochs")
ax1.set_ylabel("Loss")

In [None]:
y_pred=gnn_model.predict(x_test)
print(y_pred)
len(y_pred)

In [None]:
rounded_predictions=np.argmax(y_pred,axis=-1)
len(rounded_predictions)

In [None]:
for i in rounded_predictions:
print(i)

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test,rounded_predictions))

In [None]:
%matplotlib inline
from sklearn.metrics import confusion_matrix
import itertools
import matplotlib.pyplot as plt
cm=confusion_matrix(y_true=y_test,y_pred=rounded_predictions)
def plot_confusion_matrix(cm,classes,normalize=False,title='Confusion_matrix,cmap=plt.cm.blues'):
  plt.imshow(cm,interpolation='nearest',cmap=cmap)
  plot.title(title)
  plt.colorbar()
  tick_marks=np.arrange(len(classes))
  plt.xticks(tick_marks,classes,rotation=45)
  plt.ytick(tick_marks,classes)


In [None]:
def plot_confusion_matrix(cm, classes,
    normalize=False,
    title='Confusion matrix',
    cmap=plt.cm.Blues):
  """
  This function prints and plots the confusion matrix.
  Normalization can be applied by setting `normalize=True`.
  """
  if normalize:
    cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    print("Normalized confusion matrix")
  else:
    print('Confusion matrix, without normalization')
  print(cm)
  plt.imshow(cm, interpolation='nearest', cmap=cmap)
  plt.title(title)
  plt.colorbar()
  tick_marks = np.arange(len(classes))
  plt.xticks(tick_marks, classes, rotation=45)
  plt.yticks(tick_marks, classes)
  fmt = '.2f' if normalize else 'd'
  thresh = cm.max() / 2.
  for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
    plt.text(j, i, format(cm[i, j], fmt),
    horizontalalignment="center",
    color="white" if cm[i, j] > thresh else "black")
    plt.tight_layout()
  plt.ylabel('True label')
  plt.xlabel('Predicted label')
cnf_matrix = confusion_matrix(y_test, rounded_predictions, labels=[0,1,2,3,4,5,6,7])
np.set_printoptions(precision=2)
# Plot confusion matrix
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=['0','1','2','3','4','5','6'],normalize= False, title='Confusion matrix')


In [None]:
sensitivity = cm[0,0]/(cm[0,0]+cm[0,1])
print(sensitivity)

In [None]:
specificity = cm[1,1]/(cm[1,0]+cm[1,1])
print('Specificity : ', specificity)