## Training the GNN with the 80% of the positive network and testing on the remaining

### Import packages

In [1]:
import numpy as np
import pandas as pd 


import torch
import torch_geometric.data as data
from torch_geometric.nn import GCNConv
import torch_geometric.transforms as T
import torch.nn.functional as F
from torch_geometric.utils import negative_sampling,train_test_split_edges,to_dense_adj
from sklearn.metrics import roc_auc_score
from torch_geometric.transforms import RandomLinkSplit
from sklearn import preprocessing

device = "cpu"

### Create networks and standarize data

In [2]:
import networkx as nx
import pandas as pd
## Just prepare the data
nodes = pd.read_csv(r"Nodes_t1.csv",sep=";",encoding = 'unicode_escape')
edges = pd.read_csv(r"Edges_t1.csv",sep=";",encoding = 'unicode_escape')
edges = edges.apply(lambda x: x - x.min(),axis = 0)
###Erase ESO 
nodes["Curso"] = nodes["Curso"].astype(str).str[0].astype("int64")
del nodes["Unnamed: 0"]
edges["weight"] = edges["weight"].apply(lambda x:x+1)
pos_edges = edges[edges["weight"]> 3]
neg_edges = edges[edges["weight"]< 3]
### One hot encode and normalize node attributes
nodes_dummy = pd.get_dummies(nodes[["Curso","Grupo"]])
rng = np.random.default_rng()
#nodes_dummy = pd.DataFrame(rng.integers(0, 2, size=(409, 10)), columns=list('ABCDEFGHIJ'))

x = nodes_dummy.values #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
nodes_norm = pd.DataFrame(x_scaled)

### Split G_positive into a G_positive_train and G_positive_test

In [3]:
shuffled_df = pos_edges.sample(frac=1)
len_train_set = int(len(shuffled_df)*0.8)

In [4]:
edges_train = pos_edges[:len_train_set]
edges_test = pos_edges[len_train_set:]

### Define the complementary network of the training network (for the null model)

In [5]:
edges_real = []
for elem in edges_train[["from","to"]].to_numpy():
    edges_real.append(tuple(elem))
chosen_edges = list(nx.complete_graph(409,create_using=nx.DiGraph()).edges())
for elem in edges_real:
    chosen_edges.remove(elem)


In [6]:
train_data = data.Data(x=torch.tensor(nodes_norm.to_numpy(),dtype=torch.float32),
                          edge_index=torch.tensor(edges_train[["from","to"]].to_numpy().T))
test_data = data.Data(x=torch.tensor(nodes_norm.to_numpy(),dtype=torch.float32),
                          edge_index=torch.tensor(edges_train[["from","to"]].to_numpy().T))

In [7]:
data = train_data.clone()
data.num_nodes = len(data._store["x"])
data = train_test_split_edges(data)



In [8]:
class Net(torch.nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = GCNConv(data.num_features, 128)
        self.conv2 = GCNConv(128, 64)

    def encode(self):
        x = self.conv1(data.x, data.train_pos_edge_index) # convolution 1
        x = x.relu()
        return self.conv2(x, data.train_pos_edge_index) # convolution 2

    def decode(self, z, pos_edge_index, neg_edge_index): # only pos and neg edges
        edge_index = torch.cat([pos_edge_index, neg_edge_index], dim=-1) # concatenate pos and neg edges
        logits = (z[edge_index[0]] * z[edge_index[1]]).sum(dim=-1)  # dot product 
        return logits

    def decode_all(self, z): 
        prob_adj = z @ z.t() # get adj NxN
        return (prob_adj > 1-10e-10).nonzero(as_tuple=False).t() # get predicted edge_list 

In [9]:
model, data = Net().to(device), data.to(device)
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.01)

In [10]:

def get_link_labels(pos_edge_index, neg_edge_index):
    # returns a tensor:
    # [1,1,1,1,...,0,0,0,0,0,..] with the number of ones is equel to the lenght of pos_edge_index
    # and the number of zeros is equal to the length of neg_edge_index
    E = pos_edge_index.size(1) + neg_edge_index.size(1)
    link_labels = torch.zeros(E, dtype=torch.float, device=device)
    link_labels[:pos_edge_index.size(1)] = 1.
    return link_labels


def train():
    model.train()

    neg_edge_index = negative_sampling(
        edge_index=data.train_pos_edge_index, #positive edges
        num_nodes=data.num_nodes, # number of nodes
        num_neg_samples=data.train_pos_edge_index.size(1)) # number of neg_sample equal to number of pos_edges

    optimizer.zero_grad()
    
    z = model.encode() #encode
    link_logits = model.decode(z, data.train_pos_edge_index, neg_edge_index) # decode
    
    link_labels = get_link_labels(data.train_pos_edge_index, neg_edge_index)
    loss = F.binary_cross_entropy_with_logits(link_logits, link_labels)
    loss.backward()
    optimizer.step()

    return loss


@torch.no_grad()
def test():
    model.eval()
    perfs = []
    for prefix in ["val", "test"]:
        pos_edge_index = data[f'{prefix}_pos_edge_index']
        neg_edge_index = data[f'{prefix}_neg_edge_index']

        z = model.encode() # encode train
        link_logits = model.decode(z, pos_edge_index, neg_edge_index) # decode test or val
        link_probs = link_logits.sigmoid() # apply sigmoid
        
        link_labels = get_link_labels(pos_edge_index, neg_edge_index) # get link
        
        perfs.append(roc_auc_score(link_labels.cpu(), link_probs.cpu())) #compute roc_auc score
    return perfs


In [11]:
best_val_perf = test_perf = 0
for epoch in range(1, 1001):
    train_loss = train()
    val_perf, tmp_test_perf = test()
    if val_perf > best_val_perf:
        best_val_perf = val_perf
        test_perf = tmp_test_perf
    log = 'Epoch: {:03d}, Loss: {:.4f}, Val: {:.4f}, Test: {:.4f}'
    if epoch % 100 == 0:
        print(log.format(epoch, train_loss, best_val_perf, test_perf))

Epoch: 100, Loss: 0.4510, Val: 0.9488, Test: 0.9189
Epoch: 200, Loss: 0.4283, Val: 0.9545, Test: 0.9193
Epoch: 300, Loss: 0.4297, Val: 0.9561, Test: 0.9185
Epoch: 400, Loss: 0.4288, Val: 0.9561, Test: 0.9185
Epoch: 500, Loss: 0.4287, Val: 0.9561, Test: 0.9185
Epoch: 600, Loss: 0.4150, Val: 0.9561, Test: 0.9185
Epoch: 700, Loss: 0.4161, Val: 0.9561, Test: 0.9185
Epoch: 800, Loss: 0.4113, Val: 0.9561, Test: 0.9185
Epoch: 900, Loss: 0.4125, Val: 0.9561, Test: 0.9185
Epoch: 1000, Loss: 0.4042, Val: 0.9561, Test: 0.9185


In [12]:
z = model.encode()
final_edge_index_1 = model.decode_all(z)
#Remove self loops
bool_mask = final_edge_index_1[0] != final_edge_index_1[1]
simulated_edges_1 = torch.empty((2,int(sum(bool_mask))))
for item in range(final_edge_index_1.size()[0]):
    simulated_edges_1[item] = final_edge_index_1[item][bool_mask]
    

In [13]:
coincidences = to_dense_adj(test_data["edge_index"]).squeeze()*to_dense_adj(final_edge_index_1).squeeze()
coin_GNN = coincidences.sum()

In [14]:
coin_GNN

Data(x=[409, 9], edge_index=[2, 5841])

In [15]:
print(f"The GNN obtains a {coin_GNN/test_data.edge_index.size()[1]*100:.2f}% of accuracy in predicting links, but as in the previous case producing a big amount of them.")

The GNN obtains a 86.87% of accuracy in predicting links, but as in the previous case producing a big amount of them.


In [17]:
G_test = nx.from_pandas_edgelist(edges_test, "from", "to",create_using=nx.DiGraph,edge_attr="weight")
import random as rd 
coincidences_total = 0
for sim in range(10):
    chosen_edges_2 = chosen_edges.copy()
    G_random = nx.DiGraph()
    G_random.add_nodes_from(range(409))
    for trial in range(final_edge_index_1.size()[1]):
        rd_sample = rd.choice(chosen_edges_2)
        G_random.add_edge(rd_sample[0],rd_sample[1]) 
        chosen_edges_2.remove(rd_sample)
    coincidences_random = len([(u,v) for (u,v) in G_random.edges() if G_test.has_edge(u,v)])
    coincidences_total += coincidences_random
coin_random = coincidences_total/10

In [18]:
print(f"The null model obtains a {coin_random/test_data.edge_index.size()[1]*100:.2f}% of accuracy in predicting links, but as in the previous case producing a big amount of them.")

The null model obtains a 3.50% of accuracy in predicting links, but as in the previous case producing a big amount of them.


In [23]:
coin_GNN,len(simulated_edges_1[1])

(tensor(5074.), 22112)