## Build the Data object

In [1]:
import numpy as np
import pandas as pd 
import random as rd
import networkx as nx

In [2]:
import torch
import torch_geometric.data as data
import torch_geometric.transforms as T
from torch_geometric.utils import negative_sampling,train_test_split_edges,to_dense_adj
from torch_geometric.loader import DataLoader
from sklearn import preprocessing
from torch_geometric.nn import Node2Vec

# Device string handed to `.to(device)` for the Node2Vec model and its random-walk batches.
device = "cpu"

In [3]:
def f_edges(nodes,edges):
    """Label every edge with the course ("Curso") shared by its two endpoints.

    Parameters
    ----------
    nodes : pandas.DataFrame
        Node table; must provide the columns "ID" and "Curso".
    edges : pandas.DataFrame
        Edge table; must provide the columns "from" and "to", whose values
        are looked up in ``nodes["ID"]``.

    Returns
    -------
    list of str
        One label per edge, in row order: ``str(Curso)`` when both endpoints
        have an equal "Curso", "Intergroup" when they differ, and "Missing"
        when either endpoint ID does not occur in ``nodes["ID"]``.

    Raises
    ------
    KeyError
        If one of the required columns is absent. Only an unknown endpoint ID
        is reported as "Missing"; a malformed table is not silently
        turned into a column of "Missing" labels.
    """
    # Build the ID -> course lookup once instead of rebuilding and scanning
    # the ID list twice per edge. setdefault keeps the first row of a
    # duplicated ID, matching a positional first-match lookup.
    course_by_id = {}
    for node_id, course in zip(nodes["ID"], nodes["Curso"]):
        course_by_id.setdefault(node_id, course)

    class_classi = []
    for source, target in zip(edges["from"], edges["to"]):
        if source not in course_by_id or target not in course_by_id:
            class_classi.append("Missing")
            continue
        course_from = course_by_id[source]
        course_to = course_by_id[target]
        if course_from == course_to:
            class_classi.append(str(course_from))
        else:
            class_classi.append("Intergroup")
    return class_classi

In [4]:
def rearrange_values(df_edges,df_nodes):
    """Relabel node IDs as consecutive integers starting at 0.

    The new labels follow the sorted order of every distinct ID that occurs in
    either endpoint column of ``df_edges``. Building the mapping from the
    union of "from" and "to" (rather than from only the column with more
    distinct values) guarantees that every endpoint is translated, so the
    relabelled edges always use exactly the IDs 0..n-1.

    Parameters
    ----------
    df_edges : pandas.DataFrame
        Edge table with the columns "from" and "to". Modified in place.
    df_nodes : pandas.DataFrame
        Node table with the column "ID". Modified in place.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The same ``df_edges`` and ``df_nodes`` objects, after relabelling.
    """
    # np.unique flattens both endpoint columns and returns the sorted distinct IDs.
    endpoint_ids = np.unique(df_edges[["from","to"]].to_numpy()).tolist()
    translation = {old_id: new_id for new_id, old_id in enumerate(endpoint_ids)}

    # .map performs a single lookup per value, so an old ID that equals some
    # new ID can never be translated twice.
    df_edges["from"] = df_edges["from"].map(translation)
    df_edges["to"] = df_edges["to"].map(translation)

    # Node IDs that never occur in an edge are left unchanged.
    # NOTE(review): such an ID may coincide with a freshly assigned label -
    # confirm whether nodes without edges can occur in the input files.
    df_nodes["ID"] = df_nodes["ID"].map(lambda node_id: translation.get(node_id, node_id))
    return df_edges,df_nodes

In [5]:
# Number of (Nodes_t<k>.csv, Edges_t<k>.csv) file pairs to read, k = 1..len_datasets.
len_datasets = 13
datasets_nodes = [0]*len_datasets
datasets_edges = [0]*len_datasets
data_list = [0]*len_datasets
graficas = [0]*len_datasets
for i in range(len_datasets):
    file_number = str(i+1)
    nodes_df = pd.read_csv(r"Coles/Nodes/Nodes_t"+file_number+".csv",sep=",",encoding = 'unicode_escape')
    edges_df = pd.read_csv(r"Coles/Edges/Edges_t"+file_number+".csv",sep=",",encoding = 'unicode_escape')

    # Tag each edge with the 0-based index of the dataset it came from.
    edges_df["Escuela"] = i

    # Reduce the weight to its sign, then map -1 to 0.
    weight_sign = edges_df["weight"].apply(lambda x: np.sign(x)).replace({-1:0})
    edges_df["weight"] = weight_sign.reset_index().drop("index",axis=1)

    # Relabel node IDs, then label each edge by the course of its endpoints.
    edges_df,nodes_df = rearrange_values(edges_df,nodes_df)
    edges_df["class_classif"] = f_edges(nodes_df,edges_df)

    datasets_nodes[i] = nodes_df
    datasets_edges[i] = edges_df

    # Directed networkx graph and PyG Data object built from the same edge list.
    graficas[i] = nx.from_pandas_edgelist(edges_df,source="from",target="to",create_using=nx.DiGraph())
    endpoint_matrix = edges_df[["from","to"]].to_numpy().T
    data_list[i] = data.Data(edge_index = torch.tensor(endpoint_matrix))

data_loader = DataLoader(data_list, batch_size=1)

In [7]:
# Display the processed edge table of the first dataset (index 0).
datasets_edges[0]

Unnamed: 0,from,to,weight,Escuela,class_classif
0,0,7,0,0,1
1,0,20,0,0,1
2,0,21,1,0,1
3,0,24,0,0,1
4,0,32,1,0,1
...,...,...,...,...,...
8552,408,403,1,0,3
8553,408,404,1,0,3
8554,408,405,1,0,3
8555,408,406,1,0,3


In [6]:
# Display the Data object built for the first dataset (index 0).
data_list[0]

Data(edge_index=[2, 8557])

### Final choices for the hyperparameters
The parameters that are not described are set as in the code below.

<u>Configurations without best friends</u>

- DeepWalk _(p=1, q=1)_
- BFS _(p=1, q=10)_
- DFS _(p=10, q=1)_

In [6]:
# Training stops once the mean epoch loss changes by no more than this
# amount between two consecutive epochs.
tolerance = 1e-3
# Notes kept from earlier hyperparameter experiments:
#_local p=0.5 q = 4 w_l = 40 ,c_s = 20 #same with local hyperparameters
#_local_2 p=0.5 q = 4, w_l = 10 c_s = 5 # same as former restricted to more beginnings
#should not be very trustworthy as c_s = 5 and c_s = 20 do not offer results
#0.25 with more nodes


def train(model, loader, optimizer):
    """Run one pass over the random-walk batches; return the mean batch loss.

    Parameters
    ----------
    model : Node2Vec
        Model whose ``loss`` is evaluated on each batch.
    loader : iterable
        Yields ``(pos_rw, neg_rw)`` batches, as produced by ``model.loader(...)``.
    optimizer : torch.optim.Optimizer
        Optimizer stepping the model parameters.
    """
    model.train()
    total_loss = 0
    for pos_rw, neg_rw in loader:
        optimizer.zero_grad()
        loss = model.loss(pos_rw.to(device), neg_rw.to(device))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)


# One frame of edge embeddings per dataset; concatenated once after the loop
# instead of growing a DataFrame with pd.concat on every iteration.
embedding_frames = []
# The loop variable is deliberately NOT called `data`: that name is the
# `torch_geometric.data` module alias used when the Data objects are built,
# and rebinding it would break a re-run of that cell.
for i, graph in enumerate(data_loader.dataset):
    print("Dataset: {}".format(i))
    model = Node2Vec(graph.edge_index, embedding_dim=128, walk_length=30,
             context_size=10, walks_per_node=10,
             num_negative_samples=1, p=1, q=4, sparse=True).to(device)
    loader = model.loader(batch_size=64, shuffle=True, num_workers=4)
    optimizer = torch.optim.SparseAdam(list(model.parameters()), lr=0.01)

    # Seed values chosen so that the first comparison (|100 - 0|) always
    # exceeds the tolerance and at least one epoch is run.
    pre_value_loss,curr_value_loss = 100,0
    epoch = 0
    while (abs(pre_value_loss - curr_value_loss) > tolerance):
        loss = train(model, loader, optimizer)
        epoch += 1
        pre_value_loss = curr_value_loss
        curr_value_loss = loss
        if epoch%5 == 0:
            print(f'Epoch: {epoch:02d}, with loss: {loss:.4f}')
    print(f'The Node2vec algorithm converged at epoch: {epoch:02d}, with loss: {loss:.4f}')

    z = model()
    # from tensor to numpy
    emb_128 = z.detach().cpu().numpy()

    # Edge embedding = element-wise maximum of the two endpoint embeddings,
    # computed for all edges at once via array indexing.
    source_ids, target_ids = graph.edge_index.cpu().numpy()
    edge_embedding = np.maximum(emb_128[source_ids], emb_128[target_ids])

    total_embeddings_temp = pd.DataFrame(edge_embedding)
    total_embeddings_temp["Escuela"] = datasets_edges[i]["Escuela"]
    total_embeddings_temp["weight"] = datasets_edges[i]["weight"]
    total_embeddings_temp["class_classif"] = datasets_edges[i]["class_classif"]
    embedding_frames.append(total_embeddings_temp)

total_embeddings = pd.concat(embedding_frames,axis=0)

# index=False must be the boolean: the string "False" is truthy, so passing it
# made to_csv write the row index as an extra leading column.
total_embeddings.to_csv("total_embeddings_with_bf_p"+str(model.p)+"q"+str(model.q)+"_courses_ftest.csv",index=False)

Dataset: 0
Epoch: 05, with loss: 4.7840
Epoch: 10, with loss: 3.0384
Epoch: 15, with loss: 2.1435
Epoch: 20, with loss: 1.6404
Epoch: 25, with loss: 1.3639
Epoch: 30, with loss: 1.2057
Epoch: 35, with loss: 1.0933
Epoch: 40, with loss: 1.0315
Epoch: 45, with loss: 0.9984
Epoch: 50, with loss: 0.9723
Epoch: 55, with loss: 0.9623
Epoch: 60, with loss: 0.9502
The Node2vec algorithm converged at epoch: 60, with loss: 0.9502
Dataset: 1
Epoch: 05, with loss: 5.3020
Epoch: 10, with loss: 3.9482
Epoch: 15, with loss: 3.1592
Epoch: 20, with loss: 2.6669
Epoch: 25, with loss: 2.3274
Epoch: 30, with loss: 2.0848
Epoch: 35, with loss: 1.8944
Epoch: 40, with loss: 1.7723
Epoch: 45, with loss: 1.6784
Epoch: 50, with loss: 1.5736
Epoch: 55, with loss: 1.5231
The Node2vec algorithm converged at epoch: 55, with loss: 1.5231
Dataset: 2
Epoch: 05, with loss: 4.1898
Epoch: 10, with loss: 2.4927
Epoch: 15, with loss: 1.7250
Epoch: 20, with loss: 1.3207
Epoch: 25, with loss: 1.1206
Epoch: 30, with loss: 1.0