## Setup and data loading

In [1]:
import numpy as np
import pandas as pd 
import random as rd
import seaborn as sns
import matplotlib.pyplot as plt
import networkx as nx
import torch
import torch_geometric.data as data
from torch_geometric.nn import GCNConv
import torch_geometric.transforms as T
import torch.nn.functional as F
from torch_geometric.utils import negative_sampling,train_test_split_edges,to_dense_adj
from torch_geometric.loader import DataLoader
from sklearn.metrics import roc_auc_score
from torch_geometric.transforms import RandomLinkSplit
from sklearn import preprocessing
from torch_geometric.nn import Node2Vec

# Initial default device. A later cell reassigns `device` to 'cuda' when
# torch.cuda.is_available() is true, so this value only applies until then.
device = "cpu"

In [2]:
# Load the per-school node/edge CSVs, re-index every school's node ids into one
# contiguous global id space, and concatenate everything into `edges_total` /
# `nodes_total`.
len_datasets = 13  # number of (Nodes_t<k>.csv, Edges_t<k>.csv) pairs, k = 1..13

# The original cell labels edges with weight == 1 as "friends" and drops them.
# Set this to True to keep them instead (they are then labelled 1 by the sign step).
INCLUDE_FRIEND_EDGES = False

datasets_nodes = []
datasets_edges = []
next_node_id = 0  # first free global node id for the school being processed
for i in range(len_datasets):
    nodes_i = pd.read_csv(f"Coles/Nodes/Nodes_t{i + 1}.csv", sep=",", encoding="unicode_escape")
    edges_i = pd.read_csv(f"Coles/Edges/Edges_t{i + 1}.csv", sep=",", encoding="unicode_escape")
    nodes_i = nodes_i.drop(columns="ID")

    if not edges_i.empty:
        # Use the min/max over BOTH endpoint columns. Looking only at "from"
        # lets a node that appears solely in "to" fall outside the computed
        # range, which yields negative ids or ids that collide with the next
        # school's nodes.
        endpoints = edges_i[["from", "to"]]
        offset = next_node_id - endpoints.to_numpy().min()
        edges_i[["from", "to"]] = endpoints + offset
        next_node_id = edges_i[["from", "to"]].to_numpy().max() + 1
    # NOTE(review): ids are derived from the edge lists only; nodes without any
    # edge get no id here -- confirm this matches how `nodes_total` is indexed.

    # "Escuela" stores the index of the dataset each row came from.
    nodes_i["Escuela"] = i
    edges_i["Escuela"] = i
    datasets_nodes.append(nodes_i)
    datasets_edges.append(edges_i)

edges_total = pd.concat(datasets_edges, axis=0).reset_index(drop=True)
if not INCLUDE_FRIEND_EDGES:
    # Without friends: discard edges whose weight is exactly 1.
    edges_total = edges_total[edges_total["weight"] != 1].reset_index(drop=True)
# Collapse the weight to a binary label: negative -> 0, positive -> 1.
edges_total["weight"] = np.sign(edges_total["weight"]).replace({-1: 0})
nodes_total = pd.concat(datasets_nodes, axis=0).reset_index(drop=True)

### Build the data object

In [3]:
# Wrap the edge list in a PyG Data object. The (from, to) pairs are transposed
# so that the tensor has one row per endpoint and one column per edge.
total_data = data.Data(
    edge_index=torch.tensor(
        np.transpose(edges_total[["from", "to"]].to_numpy())
    )
)
total_data

Data(edge_index=[2, 36315])

In [4]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Seed every RNG this notebook imports before building the model, so that the
# embedding initialisation and the sampled walks are as repeatable as possible
# across "Restart & Run All".
# NOTE(review): multi-worker loading and GPU kernels may still introduce some
# run-to-run variation.
SEED = 42
rd.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Node2Vec over the full edge list; hyper-parameters are unchanged from the
# original cell.
model = Node2Vec(total_data.edge_index, embedding_dim=128, walk_length=20,
             context_size=10, walks_per_node=5,
             num_negative_samples=1, p=1, q=10, sparse=True).to(device)

# The loader yields the (positive, negative) walk batches consumed by train().
loader = model.loader(batch_size=128, shuffle=True, num_workers=4)
# SparseAdam is used because the model is built with sparse=True.
optimizer = torch.optim.SparseAdam(list(model.parameters()), lr=0.01)


In [5]:
# Convergence threshold for the training loop below: training stops once the
# losses recorded at two consecutive logging checkpoints differ by at most
# this amount.
tolerance = 1e-3

def train():
    """Run one pass over ``loader`` and return the mean per-batch loss.

    Uses the module-level ``model``, ``loader``, ``optimizer`` and ``device``.
    Every batch is a pair of positive / negative random walks that is moved to
    ``device`` before the loss is computed.
    """
    model.train()
    running_loss = 0
    for positive_walks, negative_walks in loader:
        positive_walks = positive_walks.to(device)
        negative_walks = negative_walks.to(device)

        optimizer.zero_grad()
        batch_loss = model.loss(positive_walks, negative_walks)
        batch_loss.backward()
        optimizer.step()

        running_loss = running_loss + batch_loss.item()
    n_batches = len(loader)
    return running_loss / n_batches


@torch.no_grad()
def test():
    """Score the current embeddings with ``model.test`` on a train/test node split.

    Reads ``y``, ``train_mask`` and ``test_mask`` from the module-level
    ``total_data`` and passes the masked embeddings and labels to
    ``model.test`` with ``max_iter=10``.

    Returns:
        Whatever ``model.test`` returns.

    Raises:
        AttributeError: if ``total_data`` lacks any of ``y``, ``train_mask`` or
            ``test_mask``. The Data object built in this notebook only carries
            ``edge_index``, so those attributes must be added before calling.
    """
    required = ("y", "train_mask", "test_mask")
    missing = [name for name in required if getattr(total_data, name, None) is None]
    if missing:
        # Fail early with an actionable message instead of an obscure indexing error.
        raise AttributeError(
            "test() needs node labels and split masks on total_data; missing: "
            + ", ".join(missing)
        )

    model.eval()
    z = model()
    acc = model.test(z[total_data.train_mask], total_data.y[total_data.train_mask],
                     z[total_data.test_mask], total_data.y[total_data.test_mask],
                     max_iter=10)
    return acc


# Train until the loss stops moving. Convergence is checked against the losses
# recorded at logging checkpoints (every LOG_EVERY epochs), not every epoch.
MAX_EPOCHS = 1000  # hard cap so a run whose loss keeps oscillating still terminates
LOG_EVERY = 10     # epochs between two logged / compared loss values

# Sentinel values that differ by more than `tolerance`, so the loop is entered.
pre_value_loss, curr_value_loss = 100, 0
epoch = 0
while abs(pre_value_loss - curr_value_loss) > tolerance and epoch < MAX_EPOCHS:
    loss = train()
    epoch += 1
    if epoch % LOG_EVERY == 0:
        print(f'Epoch: {epoch:02d}, Loss: {loss:.4f}')
        pre_value_loss = curr_value_loss
        curr_value_loss = loss

if abs(pre_value_loss - curr_value_loss) > tolerance:
    # Only reachable when the epoch cap ended the loop.
    print(f'Stopped after {epoch} epochs without reaching tolerance {tolerance}')
        

Epoch: 10, Loss: 0.9567
Epoch: 20, Loss: 0.7527
Epoch: 30, Loss: 0.7329
Epoch: 40, Loss: 0.7287
Epoch: 50, Loss: 0.7283


In [7]:
# Take the model's output tensor, then move it to host memory, detach it from
# autograd and expose it as a NumPy array.
z = model()
emb_128 = z.cpu().detach().numpy()

## Compute edge embeddings

In [8]:
# Edge feature = element-wise maximum of the two endpoint embeddings.
# Computed for all edges in one vectorised call instead of one Python
# iteration (and two 0-dim tensor conversions) per edge.
src_idx, dst_idx = total_data.edge_index.cpu().numpy()
# Kept as a list of per-edge row vectors so that code expecting the original
# list type keeps working.
edge_embedding = list(np.maximum(emb_128[src_idx], emb_128[dst_idx]))

In [9]:
# Assemble the per-edge table: one column per embedding dimension, followed by
# the "Escuela" column and the "weight" column taken from edges_total.
edge_attr_pd = pd.DataFrame(edges_total[["weight"]])
total_embeddings = pd.concat(
    [
        pd.DataFrame(edge_embedding).assign(Escuela=edges_total["Escuela"]),
        edge_attr_pd,
    ],
    axis=1,
)
total_embeddings

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,120,121,122,123,124,125,126,127,Escuela,weight
0,-0.189668,0.203217,0.355779,-0.034540,-0.012918,0.286540,0.059287,-0.079313,0.071313,0.250507,...,-0.097594,-0.425788,-0.203412,0.032890,-0.018438,-0.302787,-0.029621,-0.148618,0,0
1,-0.141516,0.139113,0.355779,-0.034540,0.119023,0.286540,0.059287,0.039288,0.071313,0.250507,...,-0.091875,-0.425788,-0.194938,0.114397,-0.018438,-0.311509,-0.143265,-0.256549,0,0
2,-0.189668,0.139113,0.380535,-0.034540,-0.014130,0.286540,0.164432,-0.081648,0.073726,0.250507,...,-0.091042,-0.425788,-0.203761,-0.016567,-0.018438,-0.311509,-0.106561,0.121386,0,1
3,-0.172409,0.139113,0.355779,-0.034540,-0.009878,0.286540,0.059287,-0.065555,0.132951,0.250507,...,-0.020902,-0.425788,-0.200017,0.031923,-0.018438,-0.275536,-0.064610,-0.145989,0,0
4,-0.189668,0.185653,0.355779,-0.034540,0.066967,0.380904,0.202636,0.083451,0.100583,0.250507,...,-0.216095,-0.425788,-0.253172,-0.036468,-0.018438,-0.275080,-0.107826,-0.029205,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36310,0.012754,0.088046,0.427042,-0.229838,-0.136650,-0.232579,0.113630,0.016369,0.142230,-0.184593,...,0.245022,-0.220599,0.270071,0.614381,-0.030958,0.151477,0.533545,0.080397,12,0
36311,0.013223,0.088046,0.427042,-0.208969,-0.028671,-0.227608,0.078690,-0.053410,0.120702,-0.260914,...,0.245022,-0.076185,0.201563,0.614381,-0.030958,0.059433,0.533545,0.032435,12,0
36312,0.012754,0.139265,0.474330,-0.223153,-0.066464,-0.174495,0.162155,-0.257779,0.120702,-0.199169,...,0.245022,0.033530,0.201563,0.614381,-0.030958,0.092037,0.533545,-0.062273,12,0
36313,0.012754,0.088046,0.427042,-0.213527,-0.084757,-0.098089,0.078690,-0.140847,0.120702,-0.246145,...,0.252059,-0.146984,0.299321,0.614381,-0.030958,0.005651,0.533545,-0.016839,12,0


In [10]:
# `index` must be the boolean False: the string "False" is truthy, so pandas
# still wrote the row index as an extra unnamed first column.
# NOTE(review): readers of this CSV that drop an "Unnamed: 0" column no longer
# need to.
total_embeddings.to_csv("total_embeddings_with_bf_q10.csv", index=False)