## Build the Data object

In [7]:
import numpy as np
import pandas as pd 
import random as rd
import networkx as nx

import torch
import torch_geometric.data as data
import torch_geometric.transforms as T
from torch_geometric.utils import negative_sampling,train_test_split_edges,to_dense_adj
from torch_geometric.loader import DataLoader
from sklearn import preprocessing
from torch_geometric.nn import Node2Vec
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.metrics import roc_auc_score,classification_report,confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
import tensorflow as tf
from itertools import product
from tqdm.notebook import tqdm

device = "cpu"

In [8]:
def f_edges(nodes,edges):
    class_classi = []
    for edge in edges.iterrows():
        try:
            agent_from = nodes.iloc[list(nodes["ID"]).index(edge[1]["from"])]
            agent_to = nodes.iloc[list(nodes["ID"]).index(edge[1]["to"])]
            if (agent_from["Curso"] == agent_to["Curso"]):
                    class_classi.append(str(agent_from["Curso"]))
            else:
                    class_classi.append("Intergroup")
        except:
            class_classi.append("Missing")
    return class_classi

In [9]:
def rearrange_values(df_edges,df_nodes):
    
    if df_edges["to"].nunique() > df_edges["from"].nunique():
        new_vect = np.arange(df_edges["to"].nunique())
        ###
        translation =  dict(zip(list(df_edges.sort_values(by="to")["to"].unique()),new_vect))
    else:
        new_vect = np.arange(df_edges["from"].nunique())
        ###
        translation =  dict(zip(list(df_edges.sort_values(by="from")["from"].unique()),new_vect))
        
        
    df_edges[["from","to"]] = df_edges[["from","to"]].replace(translation)
    df_nodes["ID"] = df_nodes["ID"].replace(translation)
    return df_edges,df_nodes

In [10]:
    len_datasets = 13
    datasets_nodes = [0]*len_datasets
    datasets_edges = [0]*len_datasets
    datasets_edges_train = [0]*len_datasets
    datasets_edges_test = [0]*len_datasets
    graficas_train = [0]*len_datasets
    graficas_test = [0]*len_datasets
    data_list_train = [0]*len_datasets
    data_list_test = [0]*len_datasets
    data_list_train = [0]*len_datasets
    data_list_test = [0]*len_datasets
    graficas = [0]*len_datasets
    for i in range(0,len_datasets):
        datasets_nodes[i] = pd.read_csv(r"Coles/Nodes/Nodes_t"+str(i+1)+".csv",sep=",",encoding = 'unicode_escape')
        datasets_edges[i] = pd.read_csv(r"Coles/Edges/Edges_t"+str(i+1)+".csv",sep=",",encoding = 'unicode_escape')
        #datasets_edges[i][["from","to"]] = datasets_edges[i][["from","to"]].apply(lambda x:x-min(datasets_edges[i]["to"].min(),datasets_edges[i]["from"].min()))    
        datasets_edges[i]["Escuela"] = i
        datasets_edges[i]["weight"] = datasets_edges[i]["weight"].apply(lambda x: np.sign(x)).replace({-1:0}).reset_index().drop("index",axis=1)
        datasets_edges[i],datasets_nodes[i] = rearrange_values(datasets_edges[i],datasets_nodes[i])
        datasets_edges[i]["class_classif"] = f_edges(datasets_nodes[i],datasets_edges[i])
        datasets_edges_train[i], datasets_edges_test[i] = train_test_split(datasets_edges[i], test_size=0.2)
        graficas_train[i] = nx.from_pandas_edgelist(datasets_edges_train[i],source="from",target="to",create_using=nx.DiGraph())
        graficas_test[i] = nx.from_pandas_edgelist(datasets_edges_test[i],source="from",target="to",create_using=nx.DiGraph())
        data_list_train[i] = data.Data(edge_index = torch.tensor(datasets_edges_train[i][["from","to"]].to_numpy().T))
        data_list_test[i] = data.Data(edge_index = torch.tensor(datasets_edges_test[i][["from","to"]].to_numpy().T))

    data_loader_train = DataLoader(data_list_train, batch_size=1)
    data_loader_test = DataLoader(data_list_test,batch_size = 1 )

In [15]:
sum([409,
238,
534,
232,
512,
156,
110,
223,
106,
80,
209,
319,
386])

3514

In [11]:
for data in datasets_nodes:
    print(len(data))

409
238
534
232
512
156
110
223
106
80
209
319
386


In [22]:
dir(data_loader_train)

['_DataLoader__initialized',
 '_DataLoader__multiprocessing_context',
 '_IterableDataset_len_called',
 '__annotations__',
 '__class__',
 '__class_getitem__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__orig_bases__',
 '__parameters__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_auto_collation',
 '_dataset_kind',
 '_get_iterator',
 '_index_sampler',
 '_is_protocol',
 '_iterator',
 'batch_sampler',
 'batch_size',
 'check_worker_number_rationality',
 'collate_fn',
 'dataset',
 'drop_last',
 'exclude_keys',
 'follow_batch',
 'generator',
 'multiprocessing_context',
 'num_workers',
 'persistent_workers',
 'pin_memory',
 'prefetch_factor',
 'sampler',
 'timeout',
 'worker_init_fn']

### Final choices for the hyperparameters
The parameters that are not described are set as in the code below.

 <u> Configurations without best friends </u> <br>
    -  DeepWalk _(p=1,q=1)_ <br>
    - BFS _(p=1,q=10)_ <br>
    - DFS _(p=10,q=1)_ <br>

In [5]:
def create_embedding(data_loader):

    tolerance = 1e-3
    total_embeddings = pd.DataFrame()
    i = 0
    for data in data_loader.dataset:
        print("Dataset: {}".format(i))
        model = Node2Vec(data.edge_index, embedding_dim=128, walk_length=30,
                 context_size=10, walks_per_node=10,
                 num_negative_samples=1, p=1, q=4, sparse=True).to(device)
        loader = model.loader(batch_size=64, shuffle=True, num_workers=4)
        optimizer = torch.optim.SparseAdam(list(model.parameters()), lr=0.01)

        #####
        def train():
            model.train()
            total_loss = 0
            for pos_rw, neg_rw in loader:
                optimizer.zero_grad()
                loss = model.loss(pos_rw.to(device), neg_rw.to(device))
                loss.backward()
                optimizer.step()
                total_loss += loss.item()
            return total_loss / len(loader)

        pre_value_loss,curr_value_loss = 100,0
        epoch = 0
        while (abs(pre_value_loss - curr_value_loss) > tolerance):
            loss = train()
            epoch += 1
            pre_value_loss = curr_value_loss
            curr_value_loss = loss
            #if epoch%5 == 0:
            #    print(f'Epoch: {epoch:02d}, with loss: {loss:.4f}')
        print(f'The Node2vec algorithm converged at epoch: {epoch:02d}, with loss: {loss:.4f}')



            ####

        z = model()
        # from tensor to numpy
        emb_128 = z.detach().cpu().numpy()

        edge_embedding = []
        for u,v in data.edge_index.t():
            edge_embedding.append(np.maximum(emb_128[u],emb_128[v]))
        total_embeddings_temp = pd.DataFrame(edge_embedding)
        total_embeddings_temp["Escuela"] = datasets_edges[i]["Escuela"]
        total_embeddings_temp["weight"] = datasets_edges[i]["weight"]
        total_embeddings_temp["class_classif"] = datasets_edges[i]["class_classif"]
        total_embeddings = pd.concat([total_embeddings,total_embeddings_temp],axis=0)
        i += 1
        
        return total_embeddings
    
#total_embeddings.to_csv("total_embeddings_with_bf_p"+str(model.p)+"q"+str(model.q)+"_courses_ftest.csv",index="False")

In [6]:
# Vamos a probar a crear los embeddings por separado. Embeddings de train y embeddings de test

#Tiene que ser una pipeline de un único archivo : 

# Dados los dataframes listos : 

    # * Define train y test acorde a un criterio 
    # * Crea los embeddings para cada uno 
    # * Entrena la red neuronal y obtiene un val_loss
    

In [9]:
#####Define train y test split
len_datasets = 13
datasets_nodes = [0]*len_datasets
datasets_edges = [0]*len_datasets
datasets_edges_train = [0]*len_datasets
datasets_edges_test = [0]*len_datasets
graficas_train = [0]*len_datasets
graficas_test = [0]*len_datasets
data_list_train = [0]*len_datasets
data_list_test = [0]*len_datasets
data_list_train = [0]*len_datasets
data_list_test = [0]*len_datasets
graficas = [0]*len_datasets
for i in range(0,len_datasets):
    datasets_nodes[i] = pd.read_csv(r"Coles/Nodes/Nodes_t"+str(i+1)+".csv",sep=",",encoding = 'unicode_escape')
    datasets_edges[i] = pd.read_csv(r"Coles/Edges/Edges_t"+str(i+1)+".csv",sep=",",encoding = 'unicode_escape')
    #datasets_edges[i][["from","to"]] = datasets_edges[i][["from","to"]].apply(lambda x:x-min(datasets_edges[i]["to"].min(),datasets_edges[i]["from"].min()))    
    datasets_edges[i]["Escuela"] = i
    datasets_edges[i]["weight"] = datasets_edges[i]["weight"].apply(lambda x: np.sign(x)).replace({-1:0}).reset_index().drop("index",axis=1)
    datasets_edges[i],datasets_nodes[i] = rearrange_values(datasets_edges[i],datasets_nodes[i])
    datasets_edges[i]["class_classif"] = f_edges(datasets_nodes[i],datasets_edges[i])
    datasets_edges_train[i], datasets_edges_test[i] = train_test_split(datasets_edges[i], test_size=0.2)
    graficas_train[i] = nx.from_pandas_edgelist(datasets_edges_train[i],source="from",target="to",create_using=nx.DiGraph())
    graficas_test[i] = nx.from_pandas_edgelist(datasets_edges_test[i],source="from",target="to",create_using=nx.DiGraph())
    data_list_train[i] = data.Data(edge_index = torch.tensor(datasets_edges_train[i][["from","to"]].to_numpy().T))
    data_list_test[i] = data.Data(edge_index = torch.tensor(datasets_edges_test[i][["from","to"]].to_numpy().T))

data_loader_train = DataLoader(data_list_train, batch_size=1)
data_loader_test = DataLoader(data_list_test,batch_size = 1 )

#Crea los embeddings para dicho train y test_split


embeddings_train = create_embedding(data_loader_train)
embeddings_test = create_embedding(data_loader_test)

##Entrena la red neuronal para cada caso particular 

Dataset: 0
The Node2vec algorithm converged at epoch: 65, with loss: 0.9233
Dataset: 0
The Node2vec algorithm converged at epoch: 58, with loss: 0.8473


In [None]:
def train_NN(embeddings_train,embeddings_test):

    unique_courses = len(total_embeddings.loc[(total_embeddings["class_classif"]!="Missing")&(total_embeddings["class_classif"]!="Intergroup"),["Escuela","class_classif"]].value_counts())
    unique_schools = total_embeddings["Escuela"].nunique()
    factor = 1
    n_sim = unique_courses*factor #


    acc_clf_auc = np.zeros((n_sim))
    acc_ann_auc = np.zeros((n_sim))
    for i in tqdm(range(n_sim)):
        #tr_label = i%unique_schools
        #X = total_embeddings[total_embeddings["Escuela"] == tr_label].drop(["Escuela","weight",'class_classif'],axis=1).values
        #y = total_embeddings[total_embeddings["Escuela"] == tr_label]["weight"].values
        #X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        sc = MinMaxScaler()
        sc.fit(X_train)
        emb_x_train = sc.transform(X_train)
        emb_y_train = y_train
        emb_x_test = sc.transform(X_test)
        emb_y_test = y_test
        ros = SMOTE(random_state=0,sampling_strategy="minority")
        emb_x_resampled, emb_y_resampled = ros.fit_resample(emb_x_train, emb_y_train)
        clf = RandomForestClassifier(max_depth=7,class_weight="balanced")
        clf.fit(emb_x_resampled,emb_y_resampled)
        acc_clf_auc[int(i)] = roc_auc_score(emb_y_test,clf.predict(emb_x_test))
        #######
        model = tf.keras.Sequential([
            tf.keras.layers.Dense(128,activation="relu",input_shape=(emb_x_train.shape[1],)),
            tf.keras.layers.Dropout(0.5),
            tf.keras.layers.Dense(64,activation="relu"),
            tf.keras.layers.Dropout(0.5),
            tf.keras.layers.Dense(32,activation="relu"),
            tf.keras.layers.Dropout(0.5),
            tf.keras.layers.Dense(8,activation="relu"),
            tf.keras.layers.Dropout(0.5),
            tf.keras.layers.Dense(1,activation="sigmoid")
        ])
        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=10e-5),
                 loss="binary_crossentropy",
                     metrics=["AUC"])
        model_history = model.fit(emb_x_resampled,emb_y_resampled,epochs=250,verbose=0,batch_size=128,
                                 #callbacks=[tf.keras.callbacks.EarlyStopping(monitor="auc",patience=50,)])
                                 )
        #######º
        acc_ann_auc[int(i)] = roc_auc_score(emb_y_test,model.predict(emb_x_test))
        
        return