In this notebook we produce an embedding using the technique of Node2vec similar to Word2vec (linguistic models). 
With this node embedding we compute the edge embedding and make a k-cross validation score against the weighted edges
of the network. 
The goal is to produce an embedding of edges that, given two nodes, provides the predicted sign of the link. 
For this purpose we have substituted the classes in the edges by only 2, positive and negative. 

In [1]:
import numpy as np
import pandas as pd 


import torch
import torch_geometric.data as data
from torch_geometric.nn import GCNConv
import torch_geometric.transforms as T
import torch.nn.functional as F
from torch_geometric.utils import negative_sampling,train_test_split_edges,to_dense_adj
from sklearn.metrics import roc_auc_score
from torch_geometric.transforms import RandomLinkSplit
from sklearn import preprocessing
from torch_geometric.nn import Node2Vec

device = "cpu"

In [30]:
nodes = pd.read_csv("Renacimiento_info_completo_1.csv",encoding="iso8859_7",delimiter=";")
edges = pd.read_csv("Renacimiento_edges_completo_1.csv",encoding="iso8859_7",delimiter=";")
edges["peso"] = edges["peso"].apply(lambda x: np.sign(x))
nodes["Curso"] = nodes["Curso"].apply(lambda x: x.split("Ί")[0])
edges.drop("relacion",axis=1,inplace=True)
clean_range = dict(nodes["ID"])
clean_range = {value:key for key,value in clean_range.items()}
nodes["ID"] = nodes["ID"].map(clean_range)
edges["from"] = edges["from"].map(clean_range)
edges["to"] = edges["to"].map(clean_range)

In [3]:
nodes_dummy = pd.get_dummies(nodes,['Curso', 'Grupo', 'Sexo', 'Procedencia', 'Repetidor'])
x = nodes_dummy.values 
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
nodes_norm = pd.DataFrame(x_scaled)

In [4]:
total_data = data.Data(x=torch.tensor(nodes_norm.to_numpy(),dtype=torch.float32),
                          edge_index=torch.tensor(edges[["from","to"]].to_numpy().T),
                          edge_attr=torch.tensor((pd.get_dummies(edges["peso"]).to_numpy())))

In [5]:
total_data

Data(x=[238, 36], edge_index=[2, 3755], edge_attr=[3755, 2])

In [6]:
np.random.seed(10)
# get the nodes
nodes = total_data.edge_index.t().numpy()
nodes = np.unique(list(nodes[:,0]) + list(nodes[:,1]))

np.random.shuffle(nodes) # shuffle node order
print(len(nodes))

238


In [7]:
# get train test and val sizes: (70% - 15% - 15%)
train_size = int(len(nodes)*0.7)
test_size = int(len(nodes)*0.85) - train_size
val_size = len(nodes) - train_size - test_size
print(train_size,test_size,val_size)

166 36 36


In [8]:
train_set = nodes[0:train_size]
test_set = nodes[train_size:train_size+test_size]
val_set = nodes[train_size+test_size:]


print(len(train_set),len(test_set),len(val_set))
print(len(train_set)+len(test_set)+len(val_set) == len(nodes))

print("train set\t",train_set[:10])
print("test set \t",test_set[:10])
print("val set  \t",val_set[:10])

166 36 36
True
train set	 [ 26 124 192  58 208 132  83 154 182 127]
test set 	 [ 44  96 211  28 135 145 181 215  71 118]
val set  	 [ 13 197  77 216 179 177 206 218 200  33]


In [9]:
# build test train val masks

train_mask = torch.zeros(len(nodes),dtype=torch.long, device=device)
for i in train_set:
    train_mask[i] = 1.

test_mask = torch.zeros(len(nodes),dtype=torch.long, device=device)
for i in test_set:
    test_mask[i] = 1.
    
val_mask = torch.zeros(len(nodes),dtype=torch.long, device=device)
for i in val_set:
    val_mask[i] = 1.
    
print("train mask \t",train_mask[0:15])
print("test mask  \t",test_mask[0:15])
print("val mask   \t",val_mask[0:15]) 

train mask 	 tensor([0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1])
test mask  	 tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
val mask   	 tensor([1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0])


In [10]:
# add masks
total_data.train_mask = train_mask
total_data.test_mask = test_mask
total_data.val_mask = val_mask

print("after\t\t",total_data)

after		 Data(x=[238, 36], edge_index=[2, 3755], edge_attr=[3755, 2], train_mask=[238], test_mask=[238], val_mask=[238])


In [11]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = Node2Vec(total_data.edge_index, embedding_dim=128, walk_length=70,
             context_size=50, walks_per_node=10,
             num_negative_samples=1, p=1, q=1, sparse=True).to(device)

loader = model.loader(batch_size=128, shuffle=True, num_workers=4)
optimizer = torch.optim.SparseAdam(list(model.parameters()), lr=0.01)


In [12]:
def train():
    model.train()
    total_loss = 0
    for pos_rw, neg_rw in loader:
        optimizer.zero_grad()
        loss = model.loss(pos_rw.to(device), neg_rw.to(device))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)


@torch.no_grad()
def test():
    model.eval()
    z = model()
    acc = model.test(z[total_data.train_mask], total_data.y[total_data.train_mask],
                     z[total_data.test_mask], total_data.y[total_data.test_mask],
                     max_iter=10)
    return acc


for epoch in range(1, 101):
    loss = train()
    #acc = test()
    if epoch % 10 == 0:
        print(f'Epoch: {epoch:02d}, Loss: {loss:.4f}')
        

Epoch: 10, Loss: 5.1354
Epoch: 20, Loss: 3.7564
Epoch: 30, Loss: 3.0064
Epoch: 40, Loss: 2.5038
Epoch: 50, Loss: 2.1744
Epoch: 60, Loss: 1.9389
Epoch: 70, Loss: 1.7833
Epoch: 80, Loss: 1.6703
Epoch: 90, Loss: 1.5857
Epoch: 100, Loss: 1.5080


In [13]:
z = model()

In [14]:
# from tensor to numpy
emb_128 = z.detach().cpu().numpy()

In [15]:
# convert edge attributes from categorical to numerical
edge_attr_cat = total_data.edge_attr.numpy()
print("Categorical edge attributes:\n",edge_attr_cat[:3])

edge_attr = []
for i in edge_attr_cat:
    edge_attr.append(np.nonzero(i)[0][0])

print("\n\nNumerical edge attributes:\n",edge_attr[:3])

Categorical edge attributes:
 [[0 1]
 [0 1]
 [0 1]]


Numerical edge attributes:
 [1, 1, 1]


In [16]:
# compute edge embedding

edge_embedding = []
for u,v in total_data.edge_index.t():
    edge_embedding.append(np.mean([emb_128[u],emb_128[v]],0))

In [17]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

In [18]:
clf = RandomForestClassifier(max_depth=7,random_state=10)


scores = cross_val_score(clf, edge_embedding, total_data.edge_attr, cv=10)
np.mean(scores)


0.9046702127659574

In [19]:
clf.fit(edge_embedding,total_data.edge_attr)

RandomForestClassifier(max_depth=7, random_state=10)

In [20]:
clf.predict([[1]*128])

array([[0, 1]], dtype=uint8)

In [21]:
sum([i[1] for i in total_data.edge_attr.numpy()])

3397

In [28]:
np.mean(scores)/(1-(358/3397))

1.01124209041328

In [33]:
nodes[nodes["Curso"]==1]

Unnamed: 0,ID,Curso,Grupo,Sexo,Procedencia,Repetidor
