## import necessary modules

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from torch.utils.data import DataLoader, Dataset
import os

In [None]:
class TensorDataset(Dataset):
    """
    @brief: This is a dataset class to load data from a tensor.

    Every item is the pair ``(idx, data[idx])`` so that a consumer iterating
    in shuffled order can still tell which row each sample came from.
    """

    def __init__(self, data: torch.Tensor):
        """
        parameters:
            @data: tensor whose first dimension enumerates the samples
        """
        super().__init__()
        self.data = data

    def __getitem__(self, idx: int):
        """Return ``(idx, data[idx])`` for the requested sample."""
        return idx, self.data[idx]

    def __len__(self) -> int:
        """Return the number of samples, i.e. the size of the first dimension."""
        return self.data.shape[0]

In [None]:
import random
import h5py

class SignNetData():
    """
    @brief: Load a signed network from a .mat file and cache it, together with
    its train / validate / test edge splits and a penalty matrix, as .pt files.

    All cache files are written next to the source file and are named
    ``<prefix>_<stem>.pt`` where ``<stem>`` is the source file name up to its
    first dot and ``<prefix>`` is one of full / train / validate / test /
    penalty.  Existing cache files are reused on later runs.
    """

    def __init__(self, file_path:str, train_ratio:float=0.7, beta:float=0.1, gamma:float=10):
        """
        parameters:
            @file_path: define the relative path of data to load
            @train_ratio: decide the percentage of data used for training; the
                remaining edges are divided between validation (rounded down
                half) and test (the rest)
            @beta: penalty written at positions whose value equals 1
            @gamma: every other non-zero position gets the penalty beta * gamma
        raises:
            FileNotFoundError: when neither the cached full matrix nor the
                source file exists
        """
        save_dir, file_name = os.path.split(file_path)
        stem = file_name.split(sep=".")[0]
        self.file_path = file_path
        self.train_ratio = train_ratio
        self.full_mat_save_path = os.path.join(save_dir, "full_" + stem + ".pt")
        self.train_mat_save_path = os.path.join(save_dir, "train_" + stem + ".pt")
        self.validate_mat_save_path = os.path.join(save_dir, "validate_" + stem + ".pt")
        self.test_mat_save_path = os.path.join(save_dir, "test_" + stem + ".pt")
        self.penalty_mat_save_path = os.path.join(save_dir, "penalty_" + stem + ".pt")

        if os.path.exists(self.full_mat_save_path):
            full_data = torch.load(self.full_mat_save_path, map_location="cpu")
        else:
            full_data = self.load_tensor_from_mat(self.file_path)
            torch.save(full_data, self.full_mat_save_path)

        # The test split is part of the cache too: regenerate when any file is missing.
        split_paths = (
            self.train_mat_save_path,
            self.validate_mat_save_path,
            self.test_mat_save_path,
            self.penalty_mat_save_path,
        )
        if not all(os.path.exists(path) for path in split_paths):
            self._split_and_cache(full_data, save_dir, beta, gamma)

    def _split_and_cache(self, full_data, save_dir, beta, gamma):
        """Shuffle the non-zero entries, split them and write all cache files."""
        print("begin to split dataset")
        nz_idx = torch.nonzero(full_data)
        num_edges = len(nz_idx)
        # random.shuffle() swaps row *views* of a 2-D tensor and thereby
        # duplicates/loses rows; permuting with an index tensor is exact.
        nz_idx = nz_idx[torch.randperm(num_edges)]
        num_of_train_sample = int(self.train_ratio * num_edges)
        num_of_val_sample = int((num_edges - num_of_train_sample) / 2)
        val_end = num_of_train_sample + num_of_val_sample
        print("There are {0} edges in the graph.".format(num_edges))

        train_data = self._scatter_edges(full_data, nz_idx[:num_of_train_sample])
        validate_data = self._scatter_edges(full_data, nz_idx[num_of_train_sample:val_end])
        test_data = self._scatter_edges(full_data, nz_idx[val_end:])

        # Penalty: 1 where there is no edge, beta where the entry is 1 and
        # beta * gamma for every other non-zero entry.
        all_edges = nz_idx.unbind(dim=1)
        edge_values = full_data[all_edges]
        edge_penalty = torch.full(edge_values.shape, float(beta * gamma))
        edge_penalty[edge_values == 1] = beta
        penalty_mat = torch.ones(full_data.shape)
        penalty_mat[all_edges] = edge_penalty

        # save_dir is "" when file_path has no directory component.
        if save_dir:
            os.makedirs(save_dir, exist_ok=True)

        torch.save(train_data, self.train_mat_save_path)
        torch.save(validate_data, self.validate_mat_save_path)
        torch.save(test_data, self.test_mat_save_path)
        print("self.penalty_mat_save_path = {0}".format(self.penalty_mat_save_path))
        torch.save(penalty_mat, self.penalty_mat_save_path)
        print("finish split dataset")

    @staticmethod
    def _scatter_edges(full_data, coords):
        """Return a zero matrix shaped like full_data holding only the entries at coords."""
        out = torch.zeros(full_data.shape)
        where = coords.unbind(dim=1)  # one index tensor per dimension
        out[where] = full_data[where].to(out.dtype)
        return out

    def load_tensor_from_mat(self, load_path:str)->torch.Tensor:
        """
        @brief: This is a function to read data from .mat file into tensor

        Reads the dataset "Gwl_ud" with h5py, transposes it and returns it as
        a float tensor.
        raises:
            FileNotFoundError: when load_path does not exist
        """
        if not os.path.exists(load_path):
            raise FileNotFoundError(
                "ERROR! The input argument file_location does not exist! ({0})".format(load_path)
            )
        with h5py.File(load_path, "r") as mat_file:
            # NOTE(review): the float16 round trip is kept from the original;
            # presumably entries are small integers such as +/-1 - confirm.
            np_data = np.transpose(np.array(mat_file["Gwl_ud"])).astype(np.float16)
        return torch.from_numpy(np_data).type(torch.float)  # transform array into tensor

    @staticmethod
    def _load_cached(path, label):
        """Load one cached .pt tensor, raising FileNotFoundError when it is missing."""
        if not os.path.exists(path):
            raise FileNotFoundError("Error! The {0} does not exist! ({1})".format(label, path))
        return torch.load(path, map_location="cpu")

    def get_full_data(self):
        """Return the cached full matrix."""
        return self._load_cached(self.full_mat_save_path, "full_mat_save_path")

    def get_train_data(self):
        """Return the cached training split."""
        return self._load_cached(self.train_mat_save_path, "train_mat_save_path")

    def get_validate_data(self):
        """Return the cached validation split."""
        return self._load_cached(self.validate_mat_save_path, "validate_mat_save_path")

    def get_test_data(self):
        """Return the cached test split."""
        return self._load_cached(self.test_mat_save_path, "test_mat_save_path")


## construct autoencoder class

In [None]:
class AutoEncoder(nn.Module):
    """
    @brief: A single-hidden-layer auto-encoder (one linear encoder, one linear
    decoder) whose parameters are persisted to
    ``<params_save_dir>/<name>-<in_dim>-<hidden_dim>.pth``.
    """

    def __init__(self,
                name,
                in_dim:int,
                hidden_dim:int,
                activate_func=torch.tanh,
                params_save_dir:str="params_cache"):
        """
        parameters:
            @name: model name, used as prefix of the parameter file name
            @in_dim: size of the input (and of the reconstruction)
            @hidden_dim: size of the encoded representation
            @activate_func: callable applied after the encoder and after the decoder
            @params_save_dir: directory in which the parameter file lives
        """
        super().__init__()
        self.model_name = name
        self.file_name = name + "-" + str(in_dim) + "-" + str(hidden_dim) + ".pth"
        self.params_path = os.path.join(params_save_dir, self.file_name)
        self.encoder = nn.Linear(in_features=in_dim, out_features=hidden_dim, bias=True)
        self.decoder = nn.Linear(in_features=hidden_dim, out_features=in_dim, bias=True)
        self.in_dim = in_dim
        self.hidden_dim = hidden_dim
        self.activate_func = activate_func
        # Plain string: a trailing comma here used to turn it into a 1-tuple.
        self.params_save_dir = params_save_dir
        self.params_init()

    def forward(self, x):
        """Encode then decode x, applying the activation after each linear layer."""
        return self.decode(self.encode(x))

    def encode(self, x):
        """Return the activated encoder output for x."""
        return self.activate_func(self.encoder(x))

    def decode(self, x):
        """Return the activated decoder output for x."""
        return self.activate_func(self.decoder(x))

    def save_params(self):
        """Write the state dict to params_path, creating the directory if needed."""
        if self.params_save_dir:
            os.makedirs(self.params_save_dir, exist_ok=True)
        torch.save(self.state_dict(), self.params_path)

    def params_init(self):
        """Load saved parameters when the file exists, otherwise initialise them.

        Fresh weights use Kaiming-normal initialisation and biases are zeroed.
        """
        if os.path.exists(self.params_path):
            # map_location keeps parameters saved from a GPU loadable on a
            # CPU-only machine; load_state_dict copies onto the current device.
            self.load_state_dict(torch.load(self.params_path, map_location="cpu"))
        else:
            for layer in (self.encoder, self.decoder):
                nn.init.kaiming_normal_(layer.weight)
                nn.init.constant_(layer.bias, 0.0)


## construct stacked autoencoder class

In [None]:
class StackedAutoEncoder():
    """
    @brief: A chain of AutoEncoder instances in which layer i maps
    layer_dim[i] -> layer_dim[i+1]. The auto-encoders are kept in ``self.sae``.
    """

    def __init__(self, *layer_dim, **kwargs):
        """
        parameters:
            @layer_dim: layer sizes, from the input size to the final code size
            @activate_func: (keyword) activation handed to every auto-encoder,
                defaults to torch.tanh
            @params_load_dir: (keyword) directory of the auto-encoder parameter
                files, defaults to "params_cache"
            @rep_save_dir: (keyword) stored as self.rep_save_dir, defaults to "rep_cache"
            @name: (keyword) prefix of the auto-encoder names, defaults to "untitled"
        Other keyword arguments are ignored.
        raises:
            ValueError: when no layer size is given
        """
        print("begin to init stacked auto encoder")
        if not layer_dim:
            raise ValueError("StackedAutoEncoder needs at least one layer dimension")

        self.sae = []
        self.activate_func = kwargs.get("activate_func", torch.tanh)
        # "params_load_dir" used to be stored but never passed on, so the
        # auto-encoders always used "params_cache"; it now selects the directory.
        self.params_dir = kwargs.get("params_load_dir", "params_cache")
        self.params_load_dir = self.params_dir
        self.rep_save_dir = kwargs.get("rep_save_dir", "rep_cache")
        self.name = kwargs.get("name", "untitled")

        # create autoencoders and store them into self.sae
        for idx, (in_dim, hidden_dim) in enumerate(zip(layer_dim, layer_dim[1:])):
            ae_name = self.name + "-AE" + str(idx)
            ae = AutoEncoder(ae_name, in_dim, hidden_dim, self.activate_func, self.params_dir)
            self.sae.append(ae)

        self.in_dim = layer_dim[0]
        self.out_dim = layer_dim[-1]

    def __len__(self):
        """ return the depth of the stacked auto-encoder """
        return len(self.sae)

    def forward(self, x):
        """Run x through the encoder of every layer, first to last."""
        for ae in self.sae:
            x = ae.encode(x)
        return x

    def reconstruct(self, x):
        """Encode x through all layers, then decode through them in reverse order."""
        x = self.forward(x)
        for ae in reversed(self.sae):
            x = ae.decode(x)
        return x

## layer-wise training 

In [None]:
class AETrainer():
    """
    @brief: Train one AutoEncoder on a representation stored on disk, then save
    the encoded representation of every row and the model parameters.
    """

    def __init__(self, ae:AutoEncoder, input_rep_path:str, penalty:torch.Tensor, rep_save_dir = "rep_cache"):
        """
        parameters:
            @ae: the auto-encoder to train
            @input_rep_path: path of the .pt tensor used as training input
            @penalty: per-row weights, indexed with the row indices of a batch;
                only applied when the indexed slice has the shape of the output
            @rep_save_dir: directory for the encoded representation
        """
        print("begin to init AE trainer")
        self.model = ae
        self.penalty = penalty
        self.input_rep_path = input_rep_path
        rep_out_name = ae.model_name + "-" +str(ae.hidden_dim) + "-rep" + ".pt"
        self.save_rep_path = os.path.join(rep_save_dir, rep_out_name)
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)

    def train(self, epochs:int):
        """Train for the given number of epochs and persist the results.

        Writes the encoded rows to self.save_rep_path (as a CPU tensor) and the
        model parameters via save_params(). The trainer drops its model
        reference at the end, so it is single-use.
        """
        device = self.device
        batch_size = 8
        # Keep the data on the CPU and move each batch: Tensor.to() is not
        # in-place, so the former bare `rep.to(device)` had no effect.
        rep = torch.load(self.input_rep_path, map_location="cpu").detach()
        penalty = self.penalty.to(device)
        dataset = TensorDataset(rep)
        dataloader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True)
        optimizer = torch.optim.Adam(self.model.parameters())

        for epoch in range(epochs):
            print("<< epochs: {0}/{1} >>".format(epoch+1, epochs))
            node_cnt = 0
            for batch_no, (idx, in_rep) in enumerate(dataloader, start=1):
                in_rep = in_rep.to(device)
                optimizer.zero_grad()
                out_rep = self.model(in_rep)
                weight = penalty[idx.to(device)]
                if weight.shape == out_rep.shape:
                    # the penalty is applied squared to both sides, as before
                    weight = weight * weight
                    out_rep = weight * out_rep
                    in_rep = weight * in_rep
                loss_score = F.l1_loss(out_rep, in_rep)
                loss_score.backward()
                optimizer.step()
                node_cnt = node_cnt + in_rep.shape[0]  # last batch may be smaller
                if batch_no % 100 == 0:
                    print("calculate nodes {0}/{1} ".format(node_cnt, len(dataset)))

        # Encoding is inference only: no autograd graph should be recorded.
        nodes_rep = torch.zeros(len(dataset), self.model.hidden_dim)
        with torch.no_grad():
            for idx, in_rep in dataloader:
                nodes_rep[idx] = self.model.encode(in_rep.to(device)).cpu()

        rep_dir = os.path.split(self.save_rep_path)[0]
        if rep_dir:
            os.makedirs(rep_dir, exist_ok=True)
        torch.save(nodes_rep, self.save_rep_path)

        print("Saving auto-encoder {0}'s parameters to file {1}".format(self.model.model_name, self.model.params_path))
        self.model.save_params()
        torch.cuda.empty_cache()
        del self.model

In [None]:
class SAETrainer():
    """
    @brief: Layer-wise trainer for a StackedAutoEncoder: every auto-encoder is
    trained on the representation produced by the previous one.
    """

    def __init__(self, sae:StackedAutoEncoder, data_path:str):
        """
        parameters:
            @sae: the stacked auto-encoder whose layers are trained
            @data_path: path of the source data file handed to SignNetData
        """
        self.models = sae
        self.data_path = data_path
        # train() overwrites data_path with the chosen split; remember the
        # source so that a second train() call still starts from it.
        self._source_path = data_path

    def train(self, epochs:int, mode:str="train"):
        """
        parameters:
            @epochs: number of epochs for every layer
            @mode: which split to use: "train", "validate" or "test"
        returns:
            path of the representation saved by the last layer
        raises:
            ValueError: when mode is not one of the three split names
        After the call, self.data_path holds the path of the split that was used.
        """
        if mode not in ("train", "validate", "test"):
            raise ValueError("Error! Argument mode is illegal!")

        sign_net = SignNetData(self._source_path)
        split_paths = {
            "train": sign_net.train_mat_save_path,
            "validate": sign_net.validate_mat_save_path,
            "test": sign_net.test_mat_save_path,
        }
        rep_path = split_paths[mode]

        self.data_path = rep_path
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        penalty = torch.load(sign_net.penalty_mat_save_path, map_location="cpu").to(device)

        for model in self.models.sae:
            print("<<<<<<<<<< begin to  train model: {0} >>>>>>>>>>".format(model.model_name))
            ae_trainer =  AETrainer(model, rep_path, penalty)
            ae_trainer.train(epochs)
            # the output of this layer is the input of the next one
            rep_path = ae_trainer.save_rep_path
        return rep_path


### Import DGL modules for further processing

In [None]:
import dgl
import dgl.nn as dglnn
import dgl.function as dglfn
from dgl.data import DGLDataset

In [None]:
class SignNetGraph(DGLDataset):
    """
    @brief: DGL dataset holding one graph built from a saved adjacency tensor.
    Edges carry the matrix value as "sign"; nodes carry a saved feature tensor
    as "feat". The processed graph is cached as ``<stem>_dgl_graph.bin`` in
    the directory of the matrix file.
    """

    def __init__(self, mat_path:str, feat_path:str)->None:
        """
        parameters:
            @mat_path: path of the torch-saved adjacency tensor
            @feat_path: path of the torch-saved node feature tensor
        """
        dataset_dir, mat_name = os.path.split(mat_path)
        self.feat_path = feat_path
        super().__init__(name=mat_name, raw_dir=dataset_dir, save_dir=dataset_dir)

    def _graph_path(self)->str:
        """Cache file location shared by save(), load() and has_cache()."""
        return os.path.join(self.save_dir, self.name.split(".")[0] + '_dgl_graph.bin')

    @staticmethod
    def _device():
        """Return the first CUDA device when available, otherwise the CPU."""
        return torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    def process(self) -> None:
        """Build the graph from the non-zero entries of the adjacency tensor."""
        # read data from file to generate Dataset
        torch_data = torch.load(self.raw_path, map_location="cpu")
        src, dst = torch.nonzero(torch_data, as_tuple=True)
        self.graph = dgl.graph((src,dst), num_nodes=torch_data.shape[0])
        self.graph.edata["sign"] = torch_data[src,dst]
        self.graph.ndata["feat"] = torch.load(self.feat_path, map_location="cpu")
        # DGLGraph.to() returns the moved graph; the result has to be kept.
        self.graph = self.graph.to(self._device())

    def __getitem__(self, idx)->torch.Tensor:
        """Return the "feat" entry of node idx."""
        return self.graph.ndata["feat"][idx]

    def __len__(self)->int:
        """Return the number of nodes of the graph."""
        return self.graph.num_nodes()

    def save(self)->None:
        """Write the graph to the cache file."""
        # save graph (from a CPU copy so the file does not depend on the device)
        dgl.save_graphs(self._graph_path(), self.graph.cpu())

    def load(self)->None:
        """Read the graph back from the cache file."""
        # load processed data from directory `self.save_dir`
        graphs, _ = dgl.load_graphs(self._graph_path())
        self.graph = graphs[0].to(self._device())

    def has_cache(self)->bool:
        """Tell whether the cache file written by save() exists."""
        # must look in the same place save() writes to (save_dir, not save_path)
        return os.path.exists(self._graph_path())

In [None]:
class Encoder2Vec(nn.Module):
    """
    @brief: Module that derives a per-edge "diff" feature from the node
    features "feat" of a DGL graph.
    """

    def __init__(self):
        super().__init__()

    def forward(self, graph):
        """Store the per-edge feature "diff" on the graph and return it.

        "diff" is produced by DGL's built-in v_sub_u message function applied
        to the "feat" node data of both end points. The edge data is still
        written to ``graph.edata["diff"]`` as before; returning it as well
        makes the module output usable by callers.
        """
        graph.apply_edges(func=dglfn.v_sub_u("feat", "feat", "diff"))
        return graph.edata["diff"]

    def comp_vec(self):
        """Placeholder, not implemented yet."""
        pass

In [None]:
class E2VTrainer():
    """
    @brief: Skeleton trainer for Encoder2Vec. The training loop body is still
    empty; the class only builds the dataset, iterates a data loader and saves
    the result.
    """
    def __init__(self, e2v:Encoder2Vec, graph_path:str, feat_path:str):
        """
        parameters:
            @e2v: the model to train
            @graph_path: path handed to SignNetGraph as the matrix file; also
                the path train() saves to
            @feat_path: path of the node features handed to SignNetGraph
        """
        self.model = e2v
        self.graph_path = graph_path
        # NOTE: this is the SignNetGraph dataset wrapper, not a DGLGraph; the
        # underlying graph lives in its `.graph` attribute.
        self.graph = SignNetGraph(graph_path, feat_path)

    def train(self, epochs:int):
        """Iterate the edge data loader for `epochs` epochs, save and return graph_path."""
        # NOTE(review): the loader is given the dataset wrapper as its only
        # argument - confirm the EdgeDataLoader signature of the DGL version
        # in use (it presumably expects a DGLGraph plus edge ids and a sampler).
        dataloader = dgl.dataloading.pytorch.EdgeDataLoader(self.graph)
        for epoch in range(epochs):
            for src, dst  in dataloader:
                # NOTE(review): local_scope() is called on the dataset wrapper;
                # presumably self.graph.graph was intended - TODO confirm.
                with self.graph.local_scope():
                    pass  # training step not implemented yet
        # NOTE(review): this writes to graph_path, the same path SignNetGraph
        # was constructed from, and passes the dataset wrapper rather than its
        # DGLGraph - verify both are intended.
        dgl.save_graphs(self.graph_path, self.graph)
        return self.graph_path

In [None]:
class FeatEvaluator():
    """
    @brief: Evaluation entry points for a graph stored on disk. Both metrics
    are placeholders that currently return zeros.
    """

    def __init__(self, graph_path):
        """
        parameters:
            @graph_path: file read with dgl.load_graphs; its first graph is kept
        """
        self.graph_path = graph_path
        loaded = dgl.load_graphs(graph_path)
        graph_list = loaded[0]
        self.graph = graph_list[0]

    def link_predict(self):
        """Return the pair (AUC, AP) for link prediction; both are 0.0 for now."""
        # target paper uses AUC(area under curve) and AP(average precision) as indicators to test model for link prediction
        return (0.0, 0.0)

    def node_cluster(self):
        """Return the node clustering error rate; 0.0 for now."""
        # target paper uses error rate to test model for node clustering task
        # the ground truth is the result of k-mean on the same dataset
        return 0.0

## Test the self-defined signed network models

In [None]:
if __name__ == "__main__":
    # End-to-end run: layer-wise SAE training -> Encoder2Vec training -> evaluation.
    original_data_path = "dataset/slashdot_UD.mat"
    model_name = "slash"
    sae = StackedAutoEncoder(7000, 1024, 128, 32, name=model_name)
    sae_trainer = SAETrainer(sae, original_data_path)
    feat_path = sae_trainer.train(1, mode="train")
    # E2VTrainer takes the model first. The graph is built with torch.load, so
    # it needs the torch-saved split that SAETrainer.train() leaves in
    # data_path, not the original .mat file.
    e2v_trainer = E2VTrainer(Encoder2Vec(), sae_trainer.data_path, feat_path)
    graph_path = e2v_trainer.train(10)
    evaluator = FeatEvaluator(graph_path)
    auc, ap = evaluator.link_predict()
    print("<<<<<<<<<< Final Result of the Encoder2Vec >>>>>>>>>>>")
    print("link prediction performance: AUC={0}, AP={1}".format(auc, ap))
    err_rate = evaluator.node_cluster()
    print("node clustering performance: error rate = {0}".format(err_rate))