## Build the Data object

In [2]:
import numpy as np
import pandas as pd 
import random as rd
import networkx as nx

In [3]:
import torch
import torch_geometric.data as data
import torch_geometric.transforms as T
from torch_geometric.utils import negative_sampling,train_test_split_edges,to_dense_adj
from torch_geometric.loader import DataLoader
from sklearn import preprocessing
from torch_geometric.nn import Node2Vec

# Device string handed to `.to(device)` for the Node2Vec model and the
# random-walk batches in the training cell further down.
device = "cpu"

In [4]:
def f_edges(nodes,edges):
    """Label every edge with the course ("Curso") shared by its two endpoints.

    Parameters
    ----------
    nodes : pandas.DataFrame
        Node table; the columns "ID" and "Curso" are read.
    edges : pandas.DataFrame
        Edge table; the columns "from" and "to" are read and matched
        against ``nodes["ID"]``.

    Returns
    -------
    list of str
        One label per edge, in edge order:
        * ``str(Curso)`` when both endpoints have the same "Curso",
        * ``"Intergroup"`` when the two "Curso" values differ,
        * ``"Missing"`` when either endpoint ID is absent from ``nodes["ID"]``.

    Raises
    ------
    KeyError
        If a required column is absent. Only unknown node IDs are mapped to
        "Missing"; schema problems are no longer silently swallowed.
    """
    # Build the ID -> row-position table once instead of rebuilding and
    # scanning list(nodes["ID"]) twice per edge. `setdefault` keeps the
    # first occurrence of a duplicated ID, matching `list.index`.
    first_row_of = {}
    for position, node_id in enumerate(nodes["ID"]):
        first_row_of.setdefault(node_id, position)

    class_classi = []
    for id_from, id_to in zip(edges["from"], edges["to"]):
        pos_from = first_row_of.get(id_from)
        pos_to = first_row_of.get(id_to)
        if pos_from is None or pos_to is None:
            # At least one endpoint is not described in the node table.
            class_classi.append("Missing")
            continue

        curso_from = nodes.iloc[pos_from]["Curso"]
        curso_to = nodes.iloc[pos_to]["Curso"]
        if curso_from == curso_to:
            class_classi.append(str(curso_from))
        else:
            class_classi.append("Intergroup")
    return class_classi

In [5]:
from functools import reduce

# Read the node table of every dataset (files Nodes_t1 ... Nodes_t13) and
# print how many rows each one contains.
len_datasets = 13
data_list = [0] * len_datasets
datasets_nodes = []
for i in range(len_datasets):
    nodes_path = r"Coles/Nodes/Nodes_t" + str(i + 1) + ".csv"
    nodes_table = pd.read_csv(nodes_path, sep=",", encoding='unicode_escape')
    datasets_nodes.append(nodes_table)
    print(len(nodes_table))

409
238
534
232
512
156
110
223
106
80
209
319
386


In [6]:
# Display the node table of the first dataset (index 0) as a sanity check.
datasets_nodes[0]

Unnamed: 0,ID,Curso,Grupo,Sexo
0,207,1,A,H
1,206,1,A,H
2,205,1,A,H
3,204,1,A,H
4,203,1,A,M
...,...,...,...,...
404,581,3,E,H
405,580,3,E,M
406,579,3,E,M
407,578,3,E,M


In [7]:
# Load the node and edge tables of every dataset, move their IDs into one
# global, non-overlapping range, binarise the edge weights, label each edge
# with f_edges, and wrap each edge list in a torch_geometric Data object.
len_datasets = 13
datasets_nodes = []
datasets_edges = []
data_list = []

# First ID that is still unused in the global numbering.
next_free_id = 0
for i in range(len_datasets):
    nodes_i = pd.read_csv(r"Coles/Nodes/Nodes_t"+str(i+1)+".csv",sep=",",encoding = 'unicode_escape')
    edges_i = pd.read_csv(r"Coles/Edges/Edges_t"+str(i+1)+".csv",sep=",",encoding = 'unicode_escape')

    # Use ONE shift for both tables. Previously edges were shifted by the
    # min/max of edges["from"] and nodes by the min/max of nodes["ID"]; when
    # those differed, edge endpoints no longer matched node IDs. Taking the
    # bounds over every ID that occurs also keeps "to" values non-negative
    # and prevents overlap with the next dataset. When all ranges coincide
    # the numbering is identical to the previous one.
    all_ids = pd.concat([nodes_i["ID"], edges_i["from"], edges_i["to"]])
    id_shift = next_free_id - all_ids.min()
    nodes_i["ID"] = nodes_i["ID"] + id_shift
    edges_i[["from","to"]] = edges_i[["from","to"]] + id_shift
    next_free_id = all_ids.max() + id_shift + 1

    # Tag every row with the index of the dataset it came from.
    nodes_i["Escuela"] = i
    edges_i["Escuela"] = i

    # sign(weight) is -1, 0 or 1; mapping -1 to 0 leaves 1 for positive
    # weights and 0 for everything else.
    edges_i["weight"] = np.sign(edges_i["weight"]).replace({-1:0})
    edges_i["class_clasif"] = f_edges(nodes_i,edges_i)

    datasets_nodes.append(nodes_i)
    datasets_edges.append(edges_i)
    # edge_index is the (2, num_edges) array of [from, to] pairs.
    data_list.append(data.Data(edge_index = torch.tensor(edges_i[["from","to"]].to_numpy().T)))

data_loader = DataLoader(data_list, batch_size=len_datasets)

### Final choices for the hyperparameters
The parameters that are not described are set as in the code below.

<u>Configurations without best friends</u>

- DeepWalk _(p=1, q=1)_
- BFS _(p=1, q=10)_
- DFS _(p=10, q=1)_

In [None]:
# Training stops once the mean epoch loss changes by at most this much
# between two consecutive epochs.
tolerance = 1e-3

# Notes from earlier experiments (they do NOT describe the settings used below):
#   _local:   p=0.5, q=4, w_l=40, c_s=20  -- same with local hyperparameters
#   _local_2: p=0.5, q=4, w_l=10, c_s=5   -- same as the former, restricted to
#             more beginnings
#   Should not be very trustworthy, as c_s=5 and c_s=20 do not offer results.
#   0.25 with more nodes.
# (w_l / c_s presumably stand for walk_length / context_size -- TODO confirm.)


def train():
    """Run one epoch over the current `loader` and return the mean batch loss.

    Uses the notebook-level `model`, `loader` and `optimizer` that the loop
    below rebinds for each dataset, so it is defined once rather than once
    per dataset.
    """
    model.train()
    total_loss = 0
    for pos_rw, neg_rw in loader:
        optimizer.zero_grad()
        loss = model.loss(pos_rw.to(device), neg_rw.to(device))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)


# One frame of edge embeddings per dataset; concatenated once after the loop
# instead of growing a DataFrame with pd.concat on every iteration.
embedding_frames = []

# The loop variable is named `graph`, not `data`, so that it does not shadow
# the `torch_geometric.data` module alias that the graph-building cell uses.
for i, graph in enumerate(data_loader.dataset):
    print("Dataset: {}".format(i))
    model = Node2Vec(graph.edge_index, embedding_dim=128, walk_length=30,
             context_size=10, walks_per_node=10,
             num_negative_samples=1, p=10, q=1, sparse=True).to(device)
    loader = model.loader(batch_size=128, shuffle=True, num_workers=4)
    optimizer = torch.optim.SparseAdam(list(model.parameters()), lr=0.01)

    # The 100 / 0 start values only guarantee that the first check passes;
    # from the second epoch on, two real epoch losses are compared.
    pre_value_loss,curr_value_loss = 100,0
    epoch = 0
    while (abs(pre_value_loss - curr_value_loss) > tolerance):
        loss = train()
        epoch += 1
        pre_value_loss = curr_value_loss
        curr_value_loss = loss
    print(f'The Node2vec algorithm converged at epoch: {epoch:02d}, with loss: {loss:.4f}')

    z = model()
    # from tensor to numpy
    emb_128 = z.detach().cpu().numpy()

    # Edge embedding = element-wise maximum of the two endpoint embeddings,
    # computed for all edges at once.
    src, dst = graph.edge_index.cpu().numpy()
    edge_embedding = np.maximum(emb_128[src], emb_128[dst])

    total_embeddings_temp = pd.DataFrame(edge_embedding)
    total_embeddings_temp["Escuela"] = datasets_edges[i]["Escuela"]
    total_embeddings_temp["weight"] = datasets_edges[i]["weight"]
    total_embeddings_temp["class_classif"] = datasets_edges[i]["class_clasif"]
    embedding_frames.append(total_embeddings_temp)

total_embeddings = pd.concat(embedding_frames,axis=0) if embedding_frames else pd.DataFrame()

# index=False (a boolean): the previous index="False" was a non-empty string,
# hence truthy, so the per-dataset row index was written as an extra column.
total_embeddings.to_csv("total_embeddings_with_bf_p10q1_courses.csv",index=False)

### Build the dataloader object