In [1]:
%pip install torch
#importing libraries
import dgl
import torch
print(torch.__version__)
import numpy as np
import torch.nn as nn
import torch.nn.functional as F

import tqdm
import sklearn.metrics

import os

from numpy import array
from numpy import split
from numpy import dot
from numpy.linalg import norm
import pickle
import string
import random
import csv
import configparser

Note: you may need to restart the kernel to use updated packages.
1.11.0+cu102


# Defining Models

In [2]:
from dgl.nn import SAGEConv
from dgl.nn import GATConv
from dgl.nn import GraphConv


#defining the Model
class Model_SAGE(nn.Module):
    """Two-layer GraphSAGE classifier (mean aggregator) evaluated on sampled MFGs.

    Args:
        in_feats: size of the input node features.
        h_feats: size of the hidden representation produced by the first layer.
        num_classes: size of the output produced by the second layer.
    """

    def __init__(self, in_feats, h_feats, num_classes):
        super().__init__()
        self.h_feats = h_feats
        self.conv1 = SAGEConv(in_feats, h_feats, aggregator_type='mean')
        self.conv2 = SAGEConv(h_feats, num_classes, aggregator_type='mean')

    def forward(self, mfgs, x):
        """Run both layers, one per message-flow graph.

        Args:
            mfgs: indexable collection; ``mfgs[0]`` feeds the first layer and
                ``mfgs[1]`` feeds the second one.
            x: features of the source nodes of ``mfgs[0]``.

        Returns:
            The output of the second convolution (no activation applied).
        """
        # The destination features are taken as the leading rows of the source
        # features (assumes destination nodes come first — TODO confirm against DGL).
        block = mfgs[0]
        dst_count = block.num_dst_nodes()
        hidden = F.relu(self.conv1(block, (x, x[:dst_count])))

        block = mfgs[1]
        dst_count = block.num_dst_nodes()
        return self.conv2(block, (hidden, hidden[:dst_count]))

#defining the Model
class Model_GraphConv(nn.Module):
    """Two-layer GraphConv classifier evaluated on sampled MFGs.

    Args:
        in_feats: size of the input node features.
        h_feats: size of the hidden representation produced by the first layer.
        num_classes: size of the output produced by the second layer.
    """

    def __init__(self, in_feats, h_feats, num_classes):
        super().__init__()
        self.conv1 = GraphConv(in_feats, h_feats)
        self.conv2 = GraphConv(h_feats, num_classes)

    def forward(self, mfgs, x):
        """Run both layers, one per message-flow graph.

        Args:
            mfgs: indexable collection; ``mfgs[0]`` feeds the first layer and
                ``mfgs[1]`` feeds the second one.
            x: features of the source nodes of ``mfgs[0]``.

        Returns:
            The output of the second convolution (no activation applied).
        """
        # The destination features are taken as the leading rows of the source
        # features (assumes destination nodes come first — TODO confirm against DGL).
        block = mfgs[0]
        dst_count = block.num_dst_nodes()
        hidden = F.relu(self.conv1(block, (x, x[:dst_count])))

        block = mfgs[1]
        dst_count = block.num_dst_nodes()
        return self.conv2(block, (hidden, hidden[:dst_count]))


#@TODO: implement Model_GATConv

def Model(alg, num_features, num_classes, device, h_feats=128):
  """Build the network selected by ``alg`` and move it to ``device``.

  Args:
    alg: one of "SAGEConv", "GraphConv" or "GATConv".
    num_features: size of the input node features.
    num_classes: size of the model output.
    device: target passed to ``.to(...)``.
    h_feats: hidden layer size (defaults to 128, the value previously hard-coded).

  Returns:
    The freshly initialised model, already moved to ``device``.

  Raises:
    ValueError: if ``alg`` is not one of the supported names (previously this
      surfaced as an UnboundLocalError on ``return model``).
  """
  if alg == "SAGEConv":
    model = Model_SAGE(num_features, h_feats, num_classes)
  elif alg == "GraphConv":
    model = Model_GraphConv(num_features, h_feats, num_classes)
  elif alg == "GATConv":
    # Model_GATConv is still a TODO in this notebook; until it is defined
    # this branch fails with a NameError.
    model = Model_GATConv(num_features, h_feats, num_classes)
  else:
    raise ValueError("Unknown algorithm '" + str(alg) + "': expected SAGEConv, GraphConv or GATConv")
  return model.to(device)

# Utility Functions

In [3]:
from dgl.data import CiteseerGraphDataset
from dgl.data import CoraGraphDataset
from dgl.data import PubmedGraphDataset
from dgl.data import CoauthorCSDataset
from dgl.data import CoauthorPhysicsDataset

def load_dataset(dataset):
  """Instantiate the DGL dataset identified by ``dataset``.

  Args:
    dataset: one of 'Cora', 'Citeseer', 'Pubmed', 'CoauthorCS' or 'Physics'.

  Returns:
    A new instance of the matching DGL dataset class.

  Raises:
    ValueError: if ``dataset`` is not a supported name. Previously the function
      fell through and returned None, deferring the failure to the caller.
  """
  if dataset == 'Cora':
    return CoraGraphDataset()
  elif dataset == 'Citeseer':
    return CiteseerGraphDataset()
  elif dataset == 'Pubmed':
    return PubmedGraphDataset()
  elif dataset == 'CoauthorCS':
    return CoauthorCSDataset()
  elif dataset == 'Physics':
    return CoauthorPhysicsDataset()
  raise ValueError("Unknown dataset '" + str(dataset) + "': expected Cora, Citeseer, Pubmed, CoauthorCS or Physics")

def overlay_edges(dataset, cos, base_dir="/home/studio-lab-user/sagemaker-studiolab-notebooks/Overlay GNN/Overlay Edges/"):
  """Load the precomputed overlay edges of ``dataset`` for similarity threshold ``cos``.

  Background notes (translated from the original comments):
    1) Small graphs: pairwise cosine similarity between the initial node
       features already shipped with the dataset (e.g. article keywords).
    2) Large graphs: pairwise cosine similarity keeping only the pairs above a
       threshold. With threshold 0.5, a node n keeps as neighbours only the
       nodes whose similarity is >= 0.5. If n' is not a neighbour of n for a
       threshold s1, it is not one for any s2 > s1 either.
    There is therefore one node-similarity "matrix" per threshold
    (0.5, 0.6, 0.7, 0.8, 0.9).
    TODO: load the edge weights from disk and store them as edge features.

  Args:
    dataset: dataset name, used as the sub-directory of ``base_dir``.
    cos: similarity threshold as a float (e.g. 0.5); it is rounded to two
      decimals and stripped of punctuation to build the file suffix ("05").
    base_dir: directory containing one sub-directory per dataset. Defaults to
      the location previously hard-coded in this function.

  Returns:
    A tuple made of the first two items of the unpickled object; the caller
    uses them as (source node ids, destination node ids).
  """
  # cos is a float 0.x, but the file name needs the string "0x"
  suffix = str(round(cos, 2)).translate(str.maketrans("", "", string.punctuation))
  path = os.path.join(base_dir, dataset, "overlay_edges_" + suffix)
  with open(path, "rb") as fp:
    # NOTE(security): pickle can execute arbitrary code while loading; only
    # point base_dir at files generated by this project.
    loaded = pickle.load(fp)
  return (loaded[0], loaded[1])

### Edges are removed from nodes that will have new edges added from the overlay
## @TODO: would it be better to remove the edges with the lowest similarity instead of random ones?
def remove(graph, seed_list):
  """Randomly drop outgoing edges from the nodes that will receive overlay edges.

  For every distinct node in ``seed_list``, as many outgoing edges are removed
  as the number of times the node appears in the list, capped at the number of
  outgoing edges the node actually has. The edges to drop are picked uniformly
  at random.

  The previous implementation called ``dgl.remove_edges`` and discarded the
  graph it returned, so no edge was ever removed although the counter was
  incremented. Edges are now removed in place, with a single call, using edge
  ids that were all collected before the graph is modified.

  Args:
    graph: DGL graph; it is modified in place.
    seed_list: iterable of source node ids, one entry per overlay edge.

  Returns:
    (graph, total_removed): the same graph object and the number of edges removed.
  """
  # Number of overlay edges each seed node is going to receive.
  overlay_count = {}
  for seed_node in seed_list:
    overlay_count[seed_node] = overlay_count.get(seed_node, 0) + 1

  edge_ids_to_drop = []
  for node, n_new_edges in overlay_count.items():
    # form='eid' yields the ids of the outgoing edges directly.
    out_edge_ids = graph.out_edges(node, form='eid').tolist()
    n_drop = min(n_new_edges, len(out_edge_ids))
    edge_ids_to_drop.extend(random.sample(out_edge_ids, n_drop))

  if edge_ids_to_drop:
    graph.remove_edges(edge_ids_to_drop)
  return graph, len(edge_ids_to_drop)

# Training Function

In [4]:
def training(alg, dataset_str, graph, device, feat, labels, num_classes, num_features, train_nids, test_nids, valid_nids, total_removed, total_added, cos_sim, per_train, num_iterations, b_size, path_iter, path_avg, num_epochs=100):
  """Train a fresh model ``num_iterations`` times and append the accuracies to two CSV files.

  Each iteration builds a new model, trains it for ``num_epochs`` epochs on
  ``train_nids`` using a two-layer neighbour sampler (fan-out 4 per layer) and
  evaluates it on ``valid_nids`` after every epoch.

  Args:
    alg: model name forwarded to ``Model``.
    dataset_str: dataset name, only written to the CSV rows.
    graph: DGL graph; node data 'feat' and 'label' are read from the sampled MFGs.
    device: device for the model and the sampled MFGs.
    feat, labels, test_nids: accepted for interface compatibility; not used here.
    num_classes, num_features: model output and input sizes.
    train_nids, valid_nids: node ids used for training and validation.
    total_removed, total_added, cos_sim, per_train: only written to the CSV rows.
    num_iterations: number of independent training runs.
    b_size: mini-batch size of both dataloaders.
    path_iter: CSV file receiving one row per run (opened in append mode).
    path_avg: CSV file receiving one row with the average accuracy (append mode).
    num_epochs: epochs per run (defaults to 100, the value previously hard-coded).

  Raises:
    ValueError: if ``num_epochs`` is smaller than 1.

  Notes:
    - The accuracy recorded for a run is the validation accuracy of the LAST
      epoch, not ``best_accuracy``. NOTE(review): confirm this is intended.
    - The best weights of each run are saved to 'model.pt' in the working
      directory and overwritten by the following runs; they are never reloaded here.
    - When no run is executed (``num_iterations`` <= 0) the average row is
      skipped instead of failing with a ZeroDivisionError.
  """
  if num_epochs < 1:
    raise ValueError("num_epochs must be at least 1")

  sampler = dgl.dataloading.MultiLayerNeighborSampler([4, 4])

  # @TODO: my_sampler = custom sampler implementation

  # Training dataloader built on the MultiLayerNeighborSampler.
  train_dataloader = dgl.dataloading.NodeDataLoader(
      # The following arguments are specific to NodeDataLoader.
      graph,              # The graph
      train_nids,         # The node IDs to iterate over in minibatches
      sampler,            # The neighbor sampler
      device=device,      # Put the sampled MFGs on CPU or GPU
      # The following arguments are inherited from PyTorch DataLoader.
      batch_size=b_size,  # Batch size
      shuffle=True,       # Whether to shuffle the nodes for every epoch
      drop_last=False,    # Whether to drop the last incomplete batch
  )

  # Validation dataloader.
  valid_dataloader = dgl.dataloading.NodeDataLoader(
      graph, valid_nids, sampler,
      batch_size=b_size,
      shuffle=False,
      drop_last=False,
      num_workers=0,
      device=device
  )

  best_model_path = 'model.pt'
  acc_array = []
  for _ in range(num_iterations):
    # Every run starts from a freshly initialised model and optimizer.
    model = Model(alg, num_features, num_classes, device)
    opt = torch.optim.Adam(model.parameters())
    best_accuracy = 0
    for epoch in range(num_epochs):
      model.train()
      with tqdm.tqdm(train_dataloader) as tq:  # progress bar over the mini-batches
        for step, (input_nodes, output_nodes, mfgs) in enumerate(tq):
          inputs = mfgs[0].srcdata['feat']          # input features from the first MFG
          batch_labels = mfgs[-1].dstdata['label']  # ground truth from the last MFG
          predictions = model(mfgs, inputs)
          loss = F.cross_entropy(predictions, batch_labels)
          opt.zero_grad()
          loss.backward()
          opt.step()

          accuracy = sklearn.metrics.accuracy_score(batch_labels.cpu().numpy(), predictions.argmax(1).detach().cpu().numpy())
          tq.set_postfix({'loss': '%.03f' % loss.item(), 'acc': '%.03f' % accuracy}, refresh=False)

      model.eval()

      val_predictions = []
      val_labels = []
      with tqdm.tqdm(valid_dataloader) as tq, torch.no_grad():
        for input_nodes, output_nodes, mfgs in tq:
          inputs = mfgs[0].srcdata['feat']
          val_labels.append(mfgs[-1].dstdata['label'].cpu().numpy())
          val_predictions.append(model(mfgs, inputs).argmax(1).cpu().numpy())
      accuracy = sklearn.metrics.accuracy_score(np.concatenate(val_labels), np.concatenate(val_predictions))
      if epoch == num_epochs - 1:
        print('Epoch {} Validation Accuracy {}'.format(epoch, accuracy))
      if best_accuracy < accuracy:
        best_accuracy = accuracy
        torch.save(model.state_dict(), best_model_path)
    # Validation accuracy of the last epoch of this run.
    acc_array.append(round(accuracy, 3))

  # One row per run; rows are emitted from the last run to the first, as before.
  # newline='' is what the csv module requires to avoid blank lines on Windows.
  with open(path_iter, "a", newline='') as csvfile:
    fieldnames = ["Iteration", "Algorithm", "Dataset", "ThresholdSim", "Training%", "NumAddedEdges", "NumRemovedEdges", "Accuracy"]
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    for i in reversed(range(len(acc_array))):
      writer.writerow({"Iteration": i, "Algorithm": alg, "Dataset": dataset_str, "ThresholdSim": cos_sim, "Training%": str(round(per_train, 4)), "NumAddedEdges": total_added, "NumRemovedEdges": total_removed, "Accuracy": acc_array[i]})

  if not acc_array:
    # Nothing was trained: there is no average to report.
    return

  with open(path_avg, "a", newline='') as csvfile:
    fieldnames = ["Algorithm", "Dataset", "ThresholdSim", "Training%", "NumAddedEdges", "NumRemovedEdges", "AvgAccuracy"]
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writerow({"Algorithm": alg, "Dataset": dataset_str, "ThresholdSim": cos_sim, "Training%": str(round(per_train, 4)), "NumAddedEdges": total_added, "NumRemovedEdges": total_removed, "AvgAccuracy": round(sum(acc_array)/len(acc_array), 3)})
  return

# Execution Function

In [5]:
def execution(dataset_str, alg, training_values, cos_list, f_remove, num_iterations, batch_size, path_iter, path_avg, device):
  """Run ``training`` for every (similarity threshold, training fraction) pair.

  For each threshold in ``cos_list`` the dataset is reloaded; when the
  threshold is positive the matching overlay edges are added to the graph,
  optionally after removing random outgoing edges of the seed nodes. For each
  fraction in ``training_values`` a random train split is drawn and
  ``training`` is invoked.

  Args:
    dataset_str: 'Cora', 'Citeseer', 'Pubmed', 'Physics' or 'CoauthorCS'.
    alg: 'SAGEConv', 'GraphConv' or 'GATConv'.
    training_values: iterable of training fractions of the node count.
    cos_list: iterable of similarity thresholds; values <= 0 mean "no overlay".
    f_remove: when true, call ``remove`` before adding the overlay edges.
    num_iterations, batch_size, path_iter, path_avg, device: forwarded to ``training``.

  Returns:
    1 when ``alg`` or ``dataset_str`` is not supported, otherwise None.
  """
  if (alg not in ['SAGEConv', 'GraphConv', 'GATConv']):
    print("Error: algorithm must be either SAGEConv, GraphConv or GATConv, instead " + alg + " was given")
    return 1
  if (dataset_str not in ['Cora', 'Citeseer', 'Pubmed', 'Physics', 'CoauthorCS']):
    print("Error: dataset must be either Cora, Citeseer, Pubmed, Physics or CoauthorCS, instead " + dataset_str + " was given")
    return 1

  for cos in cos_list:
    # Reload the dataset so every threshold starts from the unmodified graph.
    dataset = load_dataset(dataset_str)
    graph = dataset[0]
    num_classes = dataset.num_classes

    feat = graph.ndata['feat']      # node features
    labels = graph.ndata['label']   # ground truth labels (for node classification)
    num_features = feat.shape[1]
    n_nodes = graph.num_nodes()

    total_removed = 0
    total_added = 0
    if cos > 0:
      # The original author notes these edges are NOT already present in the graph.
      seed_list, dest_list = overlay_edges(dataset_str, cos)

      if f_remove:  # removing random neighbor edges
        graph, total_removed = remove(graph, seed_list)
      # adding overlay edges
      total_added = len(seed_list)
      graph.add_edges(torch.tensor(seed_list), torch.tensor(dest_list))

    for training_value in training_values:
      n_train = round(n_nodes*training_value)  # effective number of train_nids
      sampled_ids = random.sample(range(n_nodes), n_train)

      # Boolean mask built in O(n) instead of testing list membership per node.
      all_nids = graph.nodes()
      train_mask = torch.zeros(n_nodes, dtype=torch.bool, device=all_nids.device)
      train_mask[torch.tensor(sampled_ids, dtype=torch.long, device=all_nids.device)] = True

      train_nids = all_nids[train_mask]
      test_nids = all_nids[~train_mask]

      # Validation nodes are the first half of the held-out nodes.
      # NOTE(review): test_nids still contains them, so the two sets overlap.
      half = round(len(test_nids)/2)
      valid_nids = test_nids[:half]
      training(alg, dataset_str, graph, device, feat, labels, num_classes, num_features, train_nids, test_nids, valid_nids, total_removed, total_added, cos, training_value, num_iterations, batch_size, path_iter, path_avg)
  return

# General Function

In [6]:
def general(dataset_list, algorithm, cos_sim, training, step_cos, step_train, num_iterations, learning_rate, batch_size, path_iter, path_avg, no_overlay, device):
  """Validate the sweep configuration, reset the result files and run every experiment.

  Args:
    dataset_list: dataset names to process.
    algorithm: algorithm names to process.
    cos_sim: [low, high] bounds of the cosine-similarity thresholds.
    training: [low, high] bounds of the training fractions.
    step_cos, step_train: positive increments used to enumerate the two ranges
      (both bounds included).
    num_iterations, batch_size, path_iter, path_avg, device: forwarded to ``execution``.
    learning_rate: accepted but not used yet (see TODO below).
    no_overlay: when equal to 'y', the threshold 0.0 (no overlay edges) is
      prepended to the list of thresholds.

  Returns:
    1 when a validation check fails (an error is printed), otherwise None.

  Side effects:
    ``path_iter`` and ``path_avg`` are truncated and receive their CSV header.
  """
  #@TODO:
  # Learning rate
  # batch_size
  # Adam: lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0, amsgrad=False, *, maximize=False

  # Safety checks
  if(len(dataset_list) == 0):
    print("Error: Dataset List is Empty.")
    return 1
  if(len(algorithm) == 0):
    print("Error: Algorithm List is Empty")
    return 1
  if(len(cos_sim) < 2):
    print("Error: give low and high value for cosine similarity: [low, high]")
    return 1
  if(len(training) < 2):
    print("Error: give low and high value for training: [low, high]")
    return 1
  if(cos_sim[0] > cos_sim[1]):
    print("Error: cos_sim[0] must be less or equal than cos_sim[1]")
    return 1
  if(training[0] > training[1]):
    print("Error: training[0] must be less or equal than training[1]")
    return 1
  if(device != "cuda" and device != "cpu"):
    print("Error: device must be cuda or cpu")
    return 1
  if(step_cos <= 0 or step_train <= 0):
    print("Error: step_cos and step_train must be greater than zero")
    return 1

  def inclusive_range(low, high, step):
    # np.arange(low, high + step, step) can overshoot `high` with float steps
    # (e.g. 0.1..0.3 by 0.1 also yields 0.4); count the steps explicitly instead.
    n_steps = int(np.floor((high - low) / step + 1e-9))
    return low + step * np.arange(n_steps + 1)

  # Parsing inputs
  datasets = list(dataset_list)
  algorithms = list(algorithm)

  training_values = inclusive_range(training[0], training[1], step_train)

  cos_list = inclusive_range(cos_sim[0], cos_sim[1], step_cos)
  if(no_overlay == 'y'):
    cos_list = np.insert(cos_list, 0, 0.0)

  # Start both result files from scratch with their header row.
  # newline='' is what the csv module requires to avoid blank lines on Windows.
  with open(path_iter, "w", newline='') as csvfile:
    fieldnames = ["Iteration", "Algorithm", "Dataset", "ThresholdSim", "Training%", "NumAddedEdges", "NumRemovedEdges", "Accuracy"]
    writer = csv.DictWriter(csvfile, fieldnames = fieldnames)
    writer.writeheader()

  with open(path_avg, "w", newline='') as csvfile:
    fieldnames = ["Algorithm", "Dataset", "ThresholdSim", "Training%", "NumAddedEdges", "NumRemovedEdges", "AvgAccuracy"]
    writer = csv.DictWriter(csvfile, fieldnames = fieldnames)
    writer.writeheader()

  for dataset in datasets:
    for alg in algorithms:
      # Every configuration is run with and without random edge removal.
      for f_remove_edges in [True, False]:
        execution(dataset, alg, training_values, cos_list, f_remove_edges, num_iterations, batch_size, path_iter, path_avg, device)

# Main Function

In [7]:
def main(path = '/home/studio-lab-user/sagemaker-studiolab-notebooks/Git/generalization_overlay/config_sage.ini'):
  """Read the experiment configuration from an INI file and launch ``general``.

  Sections and options read:
    [DatasetList]    one option per dataset name, enabled when equal to "y"
    [AlgList]        one option per algorithm name, enabled when equal to "y"
    [RangeValues]    CosLow, CosHigh, TrainLow, TrainHigh, StepCos, StepTrain (floats)
    [TrainingValues] NoOverlay, NumIterations (int), LearningRate, batch_size (int), device
    [Paths]          SingleIteration, Avg

  Args:
    path: location of the INI configuration file.

  Raises:
    FileNotFoundError: if the configuration file cannot be read. Previously a
      missing file was silently ignored and surfaced as a KeyError on the
      first section lookup.
  """
  parser = configparser.ConfigParser()
  # ConfigParser.read returns the list of files it managed to parse.
  if not parser.read(path):
    raise FileNotFoundError("Configuration file not found or unreadable: " + str(path))

  # A dataset/algorithm missing from the file is treated as disabled.
  dataset_list = [x for x in ["Cora", "Citeseer", "Pubmed", "Physics", "CoauthorCS"] if parser["DatasetList"].get(x, "n") == "y"]
  alg_list = [x for x in ["SAGEConv", "GraphConv", "GATConv"] if parser["AlgList"].get(x, "n") == "y"]

  ranges = parser["RangeValues"]
  cos_sim = [float(ranges["CosLow"]), float(ranges["CosHigh"])]
  train_sim = [float(ranges["TrainLow"]), float(ranges["TrainHigh"])]
  step_cos = float(ranges["StepCos"])
  step_train = float(ranges["StepTrain"])

  training_cfg = parser["TrainingValues"]
  no_overlay = training_cfg["NoOverlay"]
  num_iterations = int(training_cfg["NumIterations"])
  learning_rate = training_cfg["LearningRate"]  # forwarded as the raw string
  batch_size = int(training_cfg["batch_size"])
  device = training_cfg["device"]

  path_iter = parser["Paths"]["SingleIteration"]
  path_avg = parser["Paths"]["Avg"]

  general(dataset_list, alg_list, cos_sim, train_sim, step_cos, step_train, num_iterations, learning_rate, batch_size, path_iter, path_avg, no_overlay, device)
  return

In [None]:
# Entry point: run the whole experiment sweep using main()'s default configuration path.
main()