In [1]:
#%pip install torch
#importing libraries
import dgl
import torch
print(torch.__version__)
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import dgl.function as fn # for link prediction

import tqdm
import sklearn.metrics
from sklearn.metrics import roc_auc_score
import itertools
import scipy.sparse as sp

import os

from numpy import array
from numpy import split
from numpy import dot
from numpy.linalg import norm
import pickle
import string
import random
import csv
import configparser

1.11.0+cu102


# Dot Predictor Class for evaluating Link Prediction Results

In [2]:
class DotPredictor(nn.Module):
    def forward(self, g, h):
        with g.local_scope():
            g.ndata['h'] = h
            # Compute a new edge feature named 'score' by a dot-product between the
            # source node feature 'h' and destination node feature 'h'.
            g.apply_edges(fn.u_dot_v('h', 'h', 'score'))
            # u_dot_v returns a 1-element vector for each edge so you need to squeeze it.
            return g.edata['score'][:, 0]

# Defining Models

In [3]:
from dgl.nn import SAGEConv
from dgl.nn import GATConv
from dgl.nn import GraphConv
from dgl.nn import GatedGraphConv


#defining the Model
class Model_SAGE(nn.Module):
    def __init__(self, in_feats, h_feats, num_classes):
        super(Model_SAGE, self).__init__()
        self.conv1 = SAGEConv(in_feats, h_feats, aggregator_type='mean')
        self.conv2 = SAGEConv(h_feats, num_classes, aggregator_type='mean')
        self.h_feats = h_feats

    def forward(self, mfgs, x):
        h_dst = x[:mfgs[0].num_dst_nodes()] 
        h = self.conv1(mfgs[0], (x, h_dst))  
        h = F.relu(h)
        h_dst = h[:mfgs[1].num_dst_nodes()]  
        h = self.conv2(mfgs[1], (h, h_dst))  
        return h

#defining the Model
class Model_GraphConv(nn.Module):
    def __init__(self, in_feats, h_feats, num_classes):
        super(Model_GraphConv, self).__init__()
        self.conv1 = GraphConv(in_feats, h_feats, allow_zero_in_degree=True)
        self.conv2 = GraphConv(h_feats, num_classes, allow_zero_in_degree=True)

    def forward(self, mfgs, x):
        h_dst = x[:mfgs[0].num_dst_nodes()] 
        h = self.conv1(mfgs[0], (x, h_dst))  
        h = F.relu(h)
        h_dst = h[:mfgs[1].num_dst_nodes()]  
        h = self.conv2(mfgs[1], (h, h_dst))  
        return h


class Model_GATConv(nn.Module):
    def __init__(self, in_feats, h_feats, num_classes):
        super(Model_GATConv, self).__init__()
        self.conv1 = GATConv(in_feats, h_feats, num_heads = 1, feat_drop = 0.4, attn_drop = 0.2, allow_zero_in_degree=True)
        self.conv2 = GATConv(h_feats, num_classes, num_heads = 1, feat_drop = 0.4, attn_drop = 0.2, allow_zero_in_degree=True)
        
        #@TODO
        self.classify = nn.Sequential(nn.Linear(333 * 1, num_classes))
        
        #self.linear=nn.Linear(h_feats,num_classes)
    
    def forward(self, mfgs, x):
        h_dst = x[:mfgs[0].num_dst_nodes()] 
        h = self.conv1(mfgs[0], (x, h_dst))  
        h = F.relu(h)
        h = h.mean(dim=1) 
        h_dst = h[:mfgs[1].num_dst_nodes()]  
        h = self.conv2(mfgs[1], (h, h_dst))  
        h = h.mean(dim=1) 
        return h
    
class Model_Gated(nn.Module):
    def __init__(self, in_feats, h_feats, num_classes):
        super(Model_Gated, self).__init__()
        self.conv1 = GatedGraphConv(in_feats, h_feats, 2, 1)
        self.conv2 = GatedGraphConv(h_feats, num_classes, 2, 1)
        
    def forward(self, mfgs, x):
        h_dst = x[:mfgs[0].num_dst_nodes()] 
        h = self.conv1(mfgs[0], (x, h_dst), None)  
        h = F.relu(h)
        h_dst = h[:mfgs[1].num_dst_nodes()]  
        h = self.conv2(mfgs[1], (h, h_dst), None)  
        return h

class GraphSAGE(nn.Module):
    def __init__(self, in_feats, h_feats):
        super(GraphSAGE, self).__init__()
        self.conv1 = SAGEConv(in_feats, h_feats, 'mean')
        self.conv2 = SAGEConv(h_feats, h_feats, 'mean')

    def forward(self, g, in_feat):
        h = self.conv1(g, in_feat)
        h = F.relu(h)
        h = self.conv2(g, h)
        return h


class Link_GraphConv(nn.Module):
    def __init__(self, in_feats, h_feats):
        super(Link_GraphConv, self).__init__()
        self.conv1 = GraphConv(in_feats, h_feats, allow_zero_in_degree=True)
        self.conv2 = GraphConv(h_feats, h_feats, allow_zero_in_degree=True)

    def forward(self, g, in_feat):
        h = self.conv1(g, in_feat)
        h = F.relu(h)
        h = self.conv2(g, h)
        return h
    
    
class Link_GATConv(nn.Module):
    def __init__(self, in_feats, h_feats):
        super(Link_GATConv, self).__init__()
        self.conv1 = GATConv(in_feats, h_feats, num_heads = 1, feat_drop = 0.4, attn_drop = 0.2, allow_zero_in_degree=True)
        self.conv2 = GATConv(h_feats, h_feats, num_heads = 1, feat_drop = 0.4, attn_drop = 0.2, allow_zero_in_degree=True)
        
        #@TODO
        self.classify = nn.Sequential(nn.Linear(333 * 1, h_feats))
        
        #self.linear=nn.Linear(h_feats,num_classes)
    
    def forward(self, g, in_feat):
        h = self.conv1(g, in_feat)
        h = F.relu(h)
        h = h.mean(dim=1)
        h = self.conv2(g, h)
        h = h.mean(dim=1)
        return h
    
def Model(alg, num_features, num_classes, device):
  if (alg == "SAGEConv"):
    model = Model_SAGE(num_features, 128, num_classes).to(device)
  elif (alg == "GraphConv"):
    model = Model_GraphConv(num_features, 128, num_classes).to(device)
  elif (alg == "GATConv"):
    model = Model_GATConv(num_features, 128, num_classes).to(device)
  elif (alg == "GatedGraphConv"):
    model = Model_Gated(num_features, 128, num_classes).to(device)
  return model

def Model_link(alg, num_features):
    if (alg == "SAGEConv"):
        model = GraphSAGE(num_features, 16)
    elif (alg == "GraphConv"):
        model = Link_GraphConv(num_features, 16)
    elif (alg == "GATConv"):
        model = Link_GATConv(num_features, 16)
    return model

# Utility Functions

In [4]:
from dgl.data import CiteseerGraphDataset
from dgl.data import CoraGraphDataset
from dgl.data import PubmedGraphDataset
from dgl.data import CoauthorCSDataset
from dgl.data import CoauthorPhysicsDataset

def load_dataset(dataset):
  if(dataset == 'Cora'):
    return CoraGraphDataset()
  if(dataset == 'Citeseer'):
    return CiteseerGraphDataset()
  if(dataset == 'Pubmed'):
    return PubmedGraphDataset()
  if(dataset == 'CoauthorCS'):
    return CoauthorCSDataset()
  if(dataset == 'Physics'):
    return CoauthorPhysicsDataset()

def overlay_edges(dataset, cos):
  ###
  # 1) Per grafi piccoli: 
  #    pairwise cosine similarity tra le feature iniziali (gia' disponibili dal dataset)
  #    esempi di feature solo le keyword degli articoli, ecc.
  # 2) Per grafi grandi:
  #    pairwise cosine similarity ma tenendo solo quelli che superano una soglia
  #    - ad esempio se la soglia e' 0.5 per ogni nodo n, avro' solo i vicini  (tra tutti)
  #    per i quali c'e' una similarita' uguale o superiore a 0.5
  #    Se uno n' non e' vicino di n per una soglia s1 non lo sara' anche per s2>s1
  # - Abbiamo quindi "matrici" di similarita' tra nodi per 0.5, 0.6, 0.7, 0.8, 0.9
  #  
  ###


  # 1) Caricare gli edge weights dal disco e immetterli come edge feature nel grafo
  # 

  ##############################
  cos = str(round(cos, 2)).translate(str.maketrans("", "", string.punctuation)) #cos is a float 0.x, but I need the string "0x"
  path = "/home/studio-lab-user/sagemaker-studiolab-notebooks/Overlay GNN/Overlay Edges/" + dataset + "/overlay_edges_" + cos
  with open(path, "rb") as fp:
    overlay_edges = pickle.load(fp)
  return (overlay_edges[0], overlay_edges[1])
  ############################################################

### Edges are removed from nodes that will have new edges added from the overlay
## @TODOs: sarebbe utile non rimuovere random? ma quelli con similarita' minore?
def remove(graph, seed_list):
  n_overlay = {}
  for seed_node in seed_list:
    if (seed_node not in n_overlay):
      n_overlay[seed_node] = 1
    else:
      n_overlay[seed_node] = n_overlay[seed_node] + 1
  #defining an utility function inside remove
  def remove_edges(graph, node, n_overlay_edges, n_removable_edges, out_edges):
    if(n_removable_edges == 0):
      return 0
    remove_edges = []
    while(n_overlay_edges > 0 and n_removable_edges > 0):
      remove_edges.append(out_edges.pop(random.randrange(len(out_edges))))
      n_overlay_edges -= 1
      n_removable_edges -= 1
    tensor_removed = []
    for edge in remove_edges:
        tensor_removed.append(graph.find_edges(edge))
    graph = dgl.remove_edges(graph, remove_edges)
    return len(remove_edges), graph, tensor_removed
  #now back inside remove
  adj_matrix = graph.adj(scipy_fmt='coo') #getting the adj_matrix in the scipy coo sparse matrix
  total_removed = 0
  tensor_total = []
  n_overlay_after = {} #dictionary that helps us adding only a number of overlay edges equal to the number of removed edges
  for key in n_overlay:
        n_overlay_after[key] = 0
  for node in n_overlay:
    k = n_overlay[node]
    out_edges = graph.out_edges(node)
    n_removable = len(out_edges[0])
    out_edges = graph.edge_ids(out_edges[0], out_edges[1]) 
    #now out_edges is a tensor(id_1, id_2, ... , id_k) where id_x is an edge id 
    removed, graph, tensor_removed = remove_edges(graph, node, k, n_removable, out_edges.tolist())
    n_overlay_after[node] = removed
    total_removed += removed
    tensor_total.extend(tensor_removed)
  return graph, total_removed, n_overlay_after, n_overlay, tensor_total


def addEdges(graph, seeds, dests, dict_after, dict_before):
    dict_remove = {} #for each node, it stores the number of edges to be removed from seeds and dests
    for key in dict_before:
        dict_remove[key] = dict_before[key] - dict_after[key]
    indices_mask = []
    for i in range(len(seeds)):
        node = seeds[i]
        if (dict_remove[node] > 0):
            indices_mask.append(False)
            dict_remove[node] -= 1
        else:
            indices_mask.append(True)
    seeds = np.array(seeds) #change in numpy array required in order to perform masking
    dests = np.array(dests)
    seeds = seeds[indices_mask]
    dests = dests[indices_mask]
    return len(seeds.tolist()), seeds.tolist(), dests.tolist()

import dgl.function as fn

class DotPredictor(nn.Module):
    def forward(self, g, h):
        with g.local_scope():
            g.ndata['h'] = h
            # Compute a new edge feature named 'score' by a dot-product between the
            # source node feature 'h' and destination node feature 'h'.
            g.apply_edges(fn.u_dot_v('h', 'h', 'score'))
            # u_dot_v returns a 1-element vector for each edge so you need to squeeze it.
            return g.edata['score'][:, 0]
        
def remove_negatives(train_neg_u, train_neg_v, seed_list, dest_list):
    new_train_neg_u = []
    new_train_neg_v = []
    for i in range(len(train_neg_u)):
        ns = train_neg_u[i]
        nd = train_neg_v[i]
        yes = True
        for j in range(len(seed_list)):
            s = seed_list[j]
            d = dest_list[j]
            if(s == ns and d == nd):
                yes = False
                break
        if(yes):
            new_train_neg_u.append(ns)
            new_train_neg_v.append(nd)
    return new_train_neg_u, new_train_neg_v

# Training Function

In [5]:
def training(alg, dataset_str, graph, device, feat, labels, num_classes, num_features, train_nids, test_nids, valid_nids, total_removed, total_added, cos_sim, per_train, num_iterations, b_size, path_iter, path_avg, n_epoch):
  #Initialize Model 
  model = Model(alg, num_features, num_classes, device)

  
  sampler = dgl.dataloading.MultiLayerNeighborSampler([4, 4])

  # @TODO: my_sampler=ImplementazioneMySampler


  #Initialize the training dataloader with MultilayerNeighborSampler.
  train_dataloader = dgl.dataloading.NodeDataLoader(
      # The following arguments are specific to NodeDataLoader.
      graph,              # The graph
      train_nids,         # The node IDs to iterate over in minibatches
      sampler,            # The neighbor sampler
      device=device,      # Put the sampled MFGs on CPU or GPU
      # The following arguments are inherited from PyTorch DataLoader.
      batch_size=b_size,    # Batch size
      shuffle=True,       # Whether to shuffle the nodes for every epoch
      drop_last=False,    # Whether to drop the last incomplete batch
      #num_workers=0       # Number of sampler processes
  )

  #Initialize validation dataloader.
  valid_dataloader = dgl.dataloading.NodeDataLoader(
      graph, valid_nids, sampler,
      batch_size=b_size,
      shuffle=False,
      drop_last=False,
      num_workers=0,
      device=device
  )
  iterations = num_iterations
  acc_array = []*iterations
  while (iterations > 0):
    #Initialize model and define the optimizer.
    opt = torch.optim.Adam(model.parameters())
    #Training loop.
    best_accuracy = 0
    best_model_path = 'model.pt'
    for epoch in range(n_epoch):
        model.train() #we put our model in training mode. In training we train with something we already know.
        with tqdm.tqdm(train_dataloader) as tq: #this statement refers to the progress bar.
            for step, (input_nodes, output_nodes, mfgs) in enumerate(tq):
                # feature copy from CPU to GPU takes place here
                inputs = mfgs[0].srcdata['feat'] #we get the input features from the first layer.
                labels = mfgs[-1].dstdata['label'] #and the labels from the last one. These are the ones that we'll use to compare with our predictions.
                
                predictions = model(mfgs, inputs) #getting predictions.
                print("predictions.shape=",predictions.shape)
                print("labels.shape=",labels.shape)

                #print(predictions)
                #exit()
                #print(#######)
                
                #if(alg == "GATConv"):
                #predictions = torch.argmax(predictions, dim=1)
                
                #for p in predictions:
                #    print(p)
                #print(predictions)  
                
                loss = F.cross_entropy(predictions, labels) #measuring loss. Between predictions and what we get.
                opt.zero_grad()
                loss.backward()
                opt.step()

                ###########
                accuracy = sklearn.metrics.accuracy_score(labels.cpu().numpy(), predictions.argmax(1).detach().cpu().numpy())
                tq.set_postfix({'loss': '%.03f' % loss.item(), 'acc': '%.03f' % accuracy}, refresh=False)

        model.eval() #we switch to evaluation mode.

        predictions = []
        labels = []
        with tqdm.tqdm(valid_dataloader) as tq, torch.no_grad():
            for input_nodes, output_nodes, mfgs in tq:
                inputs = mfgs[0].srcdata['feat']
                labels.append(mfgs[-1].dstdata['label'].cpu().numpy())
                predictions.append(model(mfgs, inputs).argmax(1).cpu().numpy())
            predictions = np.concatenate(predictions)
            labels = np.concatenate(labels)
            accuracy = sklearn.metrics.accuracy_score(labels, predictions)
            if (epoch == 99):
              print('Epoch {} Validation Accuracy {}'.format(epoch, accuracy))
            if best_accuracy < accuracy:
                best_accuracy = accuracy
                torch.save(model.state_dict(), best_model_path)
    acc_array.append(round(accuracy, 3))
    #refresh model for next iteration
    model = Model(alg, num_features, num_classes, device)
    iterations -= 1
  
  #now writing on the result iter file all the accuracies saved in acc_array
  with open(path_iter, "a") as csvfile:
    fieldnames = ["Iteration", "Algorithm", "Dataset", "ThresholdSim", "Training%", "NumAddedEdges", "NumRemovedEdges", "Accuracy"]
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    for i in reversed(range(len(acc_array))):
      writer.writerow({"Iteration": i, "Algorithm": alg, "Dataset": dataset_str, "ThresholdSim": str(round(cos_sim,2)), "Training%": str(round(per_train, 4)), "NumAddedEdges": total_added, "NumRemovedEdges": total_removed, "Accuracy": acc_array[i]})

  
  
  with open(path_avg, "a") as csvfile:
      fieldnames = ["Algorithm", "Dataset", "ThresholdSim", "Training%", "NumAddedEdges", "NumRemovedEdges", "AvgAccuracy"]
      writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
      writer.writerow({"Algorithm": alg, "Dataset": dataset_str, "ThresholdSim": str(round(cos_sim,2)), "Training%": str(round(per_train, 4)), "NumAddedEdges": total_added, "NumRemovedEdges": total_removed, "AvgAccuracy": round(sum(acc_array)/len(acc_array), 3)})
  return

In [6]:
def training_link(alg, train_g, n_epoch, num_iterations, train_pos_g, train_neg_g, test_pos_g, test_neg_g, dataset_str, cos_sim, per_train, total_added, total_removed, path_iter, path_avg):
    model = Model_link(alg, train_g.ndata['feat'].shape[1])
    pred = DotPredictor()

    def compute_loss(pos_score, neg_score):
        scores = torch.cat([pos_score, neg_score])
        labels = torch.cat([torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])])
        return F.binary_cross_entropy_with_logits(scores, labels)

    def compute_auc(pos_score, neg_score):
        scores = torch.cat([pos_score, neg_score]).numpy()
        labels = torch.cat(
            [torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])]).numpy()
        return roc_auc_score(labels, scores)
    
    
    
    # ----------- training -------------------------------- #
    acc_array = []*num_iterations
    for i in range(num_iterations):
        optimizer = torch.optim.Adam(itertools.chain(model.parameters(), pred.parameters()), lr=0.01)
        for e in range(n_epoch):
            # forward
            h = model(train_g, train_g.ndata['feat'])
            pos_score = pred(train_pos_g, h)
            neg_score = pred(train_neg_g, h)
            loss = compute_loss(pos_score, neg_score)
            #print('In epoch {}, loss: {}'.format(e, loss))
            # backward
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # ----------- check results ------------------------ #
        with torch.no_grad():
            pos_score = pred(test_pos_g, h)
            neg_score = pred(test_neg_g, h)
            auc = compute_auc(pos_score, neg_score)
            print('AUC', auc)
        acc_array.append(auc)
        #refresh model for next iteration
        model = Model_link(alg, train_g.ndata['feat'].shape[1])
    #now writing on the result iter file all the accuracies saved in acc_array
    with open(path_iter, "a") as csvfile:
        fieldnames = ["Iteration", "Algorithm", "Dataset", "ThresholdSim", "Training%", "NumAddedEdges", "NumRemovedEdges", "Accuracy"]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        for i in range(len(acc_array)):
          writer.writerow({"Iteration": i, "Algorithm": alg, "Dataset": dataset_str, "ThresholdSim": str(round(cos_sim,2)), "Training%": str(round(per_train, 4)), "NumAddedEdges": total_added, "NumRemovedEdges": total_removed, "Accuracy": acc_array[i]})



    with open(path_avg, "a") as csvfile:
        fieldnames = ["Algorithm", "Dataset", "ThresholdSim", "Training%", "NumAddedEdges", "NumRemovedEdges", "AvgAccuracy"]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writerow({"Algorithm": alg, "Dataset": dataset_str, "ThresholdSim": str(round(cos_sim,2)), "Training%": str(round(per_train, 4)), "NumAddedEdges": total_added, "NumRemovedEdges": total_removed, "AvgAccuracy": round(sum(acc_array)/len(acc_array), 3)})
    return
    

# Execution Function

In [7]:
def execution(dataset_str, alg, mode, training_values, cos_list, num_iterations, batch_size, path_iter, path_avg, n_epochs, bool_remove, device):
    if (alg not in ['SAGEConv', 'GraphConv', 'GATConv', 'GatedGraphConv']):
        print("Error: algorithm must be either SAGEConv, GraphConv, GatedGraphConv or GATConv, instead " + alg + " was given")
        return 1
    if (dataset_str not in ['Cora', 'Citeseer', 'Pubmed', 'Physics', 'CoauthorCS']):
        print("Error: dataset must be either Cora, Citeseer, Pubmed, Physics or CoauthorCS, instead " + dataset_str + " was given")
        return 1


    dataset_base = load_dataset(dataset_str)
    graph = dataset_base[0]

    if(mode == "Node"):
        num_classes = dataset_base.num_classes 

        feat = graph.ndata['feat'] #node features
        labels = graph.ndata['label'] #ground truth labels (for node classification)
        num_features = feat.shape[1]
        n_nodes = graph.num_nodes() #number of nodes in the graph

        #pre processing train, test and valid nids in order to use the same ones for different cosine similarity and make accurate comparisons
        dict_train = {}
        for training_value in training_values:
            n_train = round(n_nodes*training_value) #effective number of train_nids
            train_list = [i for i in range(n_nodes)]
            train_list = random.sample(train_list, n_train) #sampling n_train values from train_list
            train_mask = [True if i in train_list else False for i in range(n_nodes)]
            train_nids = graph.nodes()[train_mask]
            test_mask = np.logical_not(train_mask) > 0
            test_nids = graph.nodes()[test_mask]
            half = round(len(test_nids)/2)
            val_mask = [True for _ in range(half)]
            val_mask.extend([False for _ in range(len(test_nids) - half)])
            valid_nids = test_nids[val_mask]
            dict_train[training_value] = [train_nids, test_nids, valid_nids]

        if(cos_list[0] < 0.1):
            standard_done = False

        for f_remove in bool_remove:
            for cos in cos_list:
                if(cos < 0.1 and standard_done == True):
                    continue
                elif(cos < 0.1 and standard_done == False):
                    standard_done = True
                #Loading Dataset
                dataset = dataset_base
                graph = dataset[0]

                total_removed = 0
                total_added = 0
                if(cos > 0):
                    #### these edges are NOT already present in the graph
                    seed_list, dest_list = overlay_edges(dataset_str, cos)

                    if (f_remove): #removing random neighbor edges
                        graph, total_removed, n_overlay_after, dict_overlay, tensor_total = remove(graph, seed_list)
                        #adding overlay edges
                        total_added, seed_list, dest_list = addEdges(graph, seed_list, dest_list, n_overlay_after, dict_overlay)
                    else:
                        total_added = len(seed_list)
                    graph.add_edges(torch.tensor(seed_list), torch.tensor(dest_list))
                    if(alg in ["GraphConv", "GATConv"]):
                        graph = dgl.add_self_loop(graph)
                for training_value in training_values:
                    train_nids, test_nids, valid_nids = dict_train[training_value]
                    training(alg, dataset_str, graph, device, feat, labels, num_classes, num_features, train_nids, test_nids, valid_nids, total_removed, total_added, cos, training_value, num_iterations, batch_size, path_iter, path_avg, n_epochs)
                
    elif(mode == "Link"):
        g = graph
        # Split edge set for training and testing
        u, v = g.edges()

        eids = np.arange(g.number_of_edges())
        eids = np.random.permutation(eids)
        
        
        for percentage in training_values:
            train_size = int(len(eids) * percentage)
            test_size = g.number_of_edges() - train_size
            test_pos_u, test_pos_v = u[eids[:test_size]], v[eids[:test_size]]
            train_pos_u, train_pos_v = u[eids[test_size:]], v[eids[test_size:]]

            # Find all negative edges and split them for training and testing
            adj = sp.coo_matrix((np.ones(len(u)), (u.numpy(), v.numpy())))
            adj_neg = 1 - adj.todense() - np.eye(g.number_of_nodes())
            neg_u, neg_v = np.where(adj_neg != 0)

            neg_eids = np.random.choice(len(neg_u), g.number_of_edges())
            test_neg_u, test_neg_v = neg_u[neg_eids[:test_size]], neg_v[neg_eids[:test_size]]
            train_neg_u, train_neg_v = neg_u[neg_eids[test_size:]], neg_v[neg_eids[test_size:]]

            train_g = dgl.remove_edges(g, eids[:test_size])

            train_pos_g = dgl.graph((train_pos_u, train_pos_v), num_nodes=g.number_of_nodes())
            train_neg_g = dgl.graph((train_neg_u, train_neg_v), num_nodes=g.number_of_nodes())

            test_pos_g = dgl.graph((test_pos_u, test_pos_v), num_nodes=g.number_of_nodes())
            test_neg_g = dgl.graph((test_neg_u, test_neg_v), num_nodes=g.number_of_nodes())
            if(cos_list[0] < 0.1):
                standard_done = False
            for f_remove in bool_remove:
                for cos in cos_list:
                    if(cos < 0.1 and standard_done == True):
                        continue
                    elif(cos < 0.1 and standard_done == False):
                        standard_done = True
                    total_added = 0
                    total_removed = 0
                    if(cos > 0.1):
                        seed_list, dest_list = overlay_edges(dataset_str, cos)

                        if (f_remove): #removing random neighbor edges
                            graph, total_removed, n_overlay_after, dict_overlay, tensor_total = remove(graph, seed_list)
                            remove_s = []
                            remove_d = []
                            for pair in tensor_total:
                                remove_s.append(pair[0].item())
                                remove_d.append(pair[1].item())
                            new_train_pos_u, new_train_pos_v = remove_negatives(train_pos_u, train_pos_v, remove_s, remove_d)
                            print("after remove negatives")
                            new_test_pos_u, new_test_pos_v = remove_negatives(test_pos_u, test_pos_v, remove_s, remove_d)
                            train_pos_g = dgl.graph((new_train_pos_u, new_train_pos_v), num_nodes=g.number_of_nodes())
                            test_pos_g = dgl.graph((new_test_pos_u, new_test_pos_v), num_nodes=g.number_of_nodes())
                            #adding overlay edges
                            total_added, seed_list, dest_list = addEdges(graph, seed_list, dest_list, n_overlay_after, dict_overlay)
                        else:
                            total_added = len(seed_list)
                        graph.add_edges(torch.tensor(seed_list), torch.tensor(dest_list))
                        new_train_neg_u, new_train_neg_v = remove_negatives(train_neg_u, train_neg_v, seed_list, dest_list)
                        new_test_neg_u, new_test_neg_v = remove_negatives(test_neg_u, test_neg_v, seed_list, dest_list)
                        train_neg_g = dgl.graph((new_train_neg_u, new_train_neg_v), num_nodes=g.number_of_nodes())
                        test_neg_g = dgl.graph((new_test_neg_u, new_test_neg_v), num_nodes=g.number_of_nodes())

                    training_link(alg, train_g, n_epochs, num_iterations, train_pos_g, train_neg_g, test_pos_g, test_neg_g, dataset_str, cos, percentage, total_added, total_removed, path_iter, path_avg)
                
    return

# General Function

In [8]:
def general(dataset_list, mode, algorithm, cos_sim, training, step_cos, step_train, num_iterations, learning_rate, batch_size, path_iter, path_avg, no_overlay, remove_edges, n_epochs, device):

#@TODO: 
# Learning rate
# Adam: lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0, amsgrad=False, *, maximize=False

  '''Safety Checks'''
  if(len(dataset_list) == 0):
    print("Error: Dataset List is Empty.")
    return 1
  if(len(algorithm) == 0):
    print("Error: Algorithm List is Empty")
    return 1
  if(len(cos_sim) < 2):
    print("Error: give low and high value for cosine similarity: [low, high]")
    return 1
  if(len(training) < 2):
    print("Error: give low and high value for training: [low, high]")
    return 1
  if(cos_sim[0] > cos_sim[1]):
    print("Error: cos_sim[0] must be less or equal than cos_sim[1]")
    return 1
  if(training[0] > training[1]):
    print("Error: training[0] must be less or equal than training[1]")
    return 1
  if(device != "cuda" and device != "cpu"):
    print("Error: device must be cuda or cpu")
    return 1

  #Parsing Inputs:
  datasets = [x for x in dataset_list]
  algorithms = [x for x in algorithm]


  train_low = training[0]
  train_high = training[1]
  training_values = np.arange(train_low, train_high+step_train, step_train)
  
  cos_low = cos_sim[0]
  cos_high = cos_sim[1]
  cos_list = np.arange(cos_low, cos_high+step_cos, step_cos)
  if(no_overlay == 'y'):
    cos_list = np.insert(cos_list, 0, 0.0)
    
  with open(path_iter, "w") as csvfile:
    fieldnames = ["Iteration", "Algorithm", "Dataset", "ThresholdSim", "Training%", "NumAddedEdges", "NumRemovedEdges", "Accuracy"]
    writer = csv.DictWriter(csvfile, fieldnames = fieldnames)
    writer.writeheader()

  with open(path_avg, "w") as csvfile:
    fieldnames = ["Algorithm", "Dataset", "ThresholdSim", "Training%", "NumAddedEdges", "NumRemovedEdges", "AvgAccuracy"]
    writer = csv.DictWriter(csvfile, fieldnames = fieldnames)
    writer.writeheader()

  if(remove_edges == 'y'):
    bool_remove = [True]
  elif(remove_edges == 'b'):
    bool_remove = [True, False]
  else:
    bool_remove = [False]
    
  for dataset in datasets:
      for alg in algorithms:
        execution(dataset, alg, mode, training_values, cos_list, num_iterations, batch_size, path_iter, path_avg, n_epochs, bool_remove, device)

# Main Function

In [9]:
def main(path = '/home/studio-lab-user/sagemaker-studiolab-notebooks/Git/generalization_overlay/config_sage.ini'):
  parser = configparser.ConfigParser()
  parser.read(path)

  dataset_list = [x for x in ["Cora", "Citeseer", "Pubmed", "Physics", "CoauthorCS"] if parser["DatasetList"][x] == "y"]
  alg_list = [x for x in ["SAGEConv", "GraphConv", "GATConv", "GatedGraphConv"] if parser["AlgList"][x] == "y"]
  cos_sim = [float(parser["RangeValues"]["CosLow"]), float(parser["RangeValues"]["CosHigh"])]
  train_sim = [float(parser["RangeValues"]["TrainLow"]), float(parser["RangeValues"]["TrainHigh"])]
  step_cos = float(parser["RangeValues"]["StepCos"])
  step_train = float(parser["RangeValues"]["StepTrain"])

  mode = parser["TrainingValues"]["Mode"]
  no_overlay = parser["TrainingValues"]["NoOverlay"]
  remove_edges = parser["TrainingValues"]["RemoveEdges"]
  n_epochs = int(parser["TrainingValues"]["NumEpochs"])
  num_iterations = int(parser["TrainingValues"]["NumIterations"])
  learning_rate = parser["TrainingValues"]["LearningRate"]
  batch_size = int(parser["TrainingValues"]["batch_size"])
  device = parser["TrainingValues"]["device"]

  path_iter = parser["Paths"]["SingleIteration"]
  path_avg = parser["Paths"]["Avg"]

  general(dataset_list, mode, alg_list, cos_sim, train_sim, step_cos, step_train, num_iterations, learning_rate, batch_size, path_iter, path_avg, no_overlay, remove_edges, n_epochs, device)
  return

In [10]:
main('/home/studio-lab-user/sagemaker-studiolab-notebooks/Git/generalization_overlay/config_link.ini')

  0%|          | 0/11 [00:00<?, ?it/s]

predictions.shape= torch.Size([1024, 5])
labels.shape= torch.Size([1024])


  9%|▉         | 1/11 [00:00<00:08,  1.15it/s, loss=1.689, acc=0.208]

predictions.shape= torch.Size([1024, 5])
labels.shape= torch.Size([1024])


 18%|█▊        | 2/11 [00:01<00:06,  1.41it/s, loss=0.897, acc=0.725]

predictions.shape= torch.Size([1024, 5])
labels.shape= torch.Size([1024])


 27%|██▋       | 3/11 [00:02<00:05,  1.55it/s, loss=0.580, acc=0.839]

predictions.shape= torch.Size([1024, 5])
labels.shape= torch.Size([1024])


 36%|███▋      | 4/11 [00:02<00:04,  1.63it/s, loss=0.421, acc=0.886]

predictions.shape= torch.Size([1024, 5])
labels.shape= torch.Size([1024])


 45%|████▌     | 5/11 [00:03<00:03,  1.68it/s, loss=0.317, acc=0.920]

predictions.shape= torch.Size([1024, 5])
labels.shape= torch.Size([1024])


 55%|█████▍    | 6/11 [00:03<00:02,  1.71it/s, loss=0.318, acc=0.911]

predictions.shape= torch.Size([1024, 5])
labels.shape= torch.Size([1024])


 64%|██████▎   | 7/11 [00:04<00:02,  1.72it/s, loss=0.235, acc=0.937]

predictions.shape= torch.Size([1024, 5])
labels.shape= torch.Size([1024])


 73%|███████▎  | 8/11 [00:04<00:01,  1.73it/s, loss=0.243, acc=0.939]

predictions.shape= torch.Size([1024, 5])
labels.shape= torch.Size([1024])


 82%|████████▏ | 9/11 [00:05<00:01,  1.74it/s, loss=0.223, acc=0.938]

predictions.shape= torch.Size([1024, 5])
labels.shape= torch.Size([1024])


100%|██████████| 11/11 [00:06<00:00,  1.79it/s, loss=0.124, acc=0.981]


predictions.shape= torch.Size([108, 5])
labels.shape= torch.Size([108])


100%|██████████| 12/12 [00:02<00:00,  4.59it/s]


RuntimeError: [enforce fail at inline_container.cc:300] . unexpected pos 4310144 vs 4310096