In [1]:
import argparse
import time
import numpy as np
import scipy.sparse as sp
import torch
from torch import optim
from model import GCNModelAE, Regularizer
from optimizer import loss_function1
from utils import load_data, preprocess_graph, get_roc_score, load_data_with_labels
from sklearn.cluster import KMeans
from metrics import clustering_metrics

# Hyper-parameter Settings

Here in node clustering we only use half of the training iterations for link prediction (i.e. 100 epochs for Cora and Citeseer, and 750 epochs for PubMed).

In [2]:
parser = argparse.ArgumentParser()
parser.add_argument('--seed', type=int, default=0, help='Random seed.')
parser.add_argument('--epochs', type=int, default=100, help='Number of epochs to train.')
# We recommend 100 epochs for Cora and Citeseer, and 800 epochs for PubMed
parser.add_argument('--hidden1', type=int, default=32, help='Number of units in the first encoding layer.')
parser.add_argument('--hidden2', type=int, default=16, help='Number of units in the second embedding layer.')
parser.add_argument('--hidden3', type=int, default=16, help='Number of units in the first hidden layer of Regularizer.')
parser.add_argument('--hidden4', type=int, default=64, help='Number of units in the second hidden layer of Regularizer.')
parser.add_argument('--clamp', type=float, default=0.01, help='Weight clamp for Regularizer Parameters.')
parser.add_argument('--lr', type=float, default=0.001, help='Initial learning rate for Generator.')
parser.add_argument('--reglr', type=float, default=0.001, help='Initial learning rate for Regularizer.')
parser.add_argument('--dropout', type=float, default=0., help='Dropout rate (1 - keep probability).')
parser.add_argument('--dataset-str', type=str, default='cora', help='type of dataset.')

args,unknown = parser.parse_known_args()

torch.manual_seed(args.seed)
np.random.seed(args.seed)    
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


# Model for Node Clustering

In [3]:
def gae_for(args):
    print("Using {} dataset".format(args.dataset_str))
    adj, features,true_labels = load_data_with_labels(args.dataset_str)
    n_nodes, feat_dim = features.shape
    features = features.to(device)
    
    if args.dataset_str == 'cora':
        n_clusters = 7
    elif args.dataset_str == 'citeseer':
        n_clusters = 6
    else:
        n_clusters = 3

    # Store original adjacency matrix (without diagonal entries) for later
    adj_orig = adj
    adj_orig = adj_orig - sp.dia_matrix((adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
    adj_orig.eliminate_zeros()

    # Some preprocessing
    adj_norm = preprocess_graph(adj)
    adj_norm = adj_norm.to(device)
    
    adj_label = adj + sp.eye(adj.shape[0])
    adj_label = torch.FloatTensor(adj_label.toarray())
    adj_label = adj_label.to(device)

    pos_weight = float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum()
    norm = adj.shape[0] * adj.shape[0] / float((adj.shape[0] * adj.shape[0] - adj.sum()) * 2)

    model = GCNModelAE(feat_dim, args.hidden1, args.hidden2, args.dropout).to(device)
    regularizer = Regularizer(args.hidden3, args.hidden2, args.hidden4).to(device)
    optimizer = optim.Adam(model.parameters(), lr=args.lr)
    regularizer_optimizer = optim.Adam(regularizer.parameters(), lr=args.reglr)
    
    clustering_scores=[]
    for epoch in range(args.epochs):
        t = time.time()
        model.train()
        regularizer.train() 
        
        #Generate embeddings
        predicted_labels_prob, emb = model(features, adj_norm)
        
        #Wasserstein Regularizer
        for i in range(1):
            f_z = regularizer(emb).to(device)
            r = torch.normal(0.0, 1.0, [n_nodes, args.hidden2]).to(device)
            f_r = regularizer(r)          
            reg_loss = - f_r.mean() + f_z.mean() 
            
            regularizer_optimizer.zero_grad()
            reg_loss.backward(retain_graph=True)
            regularizer_optimizer.step()
            
            # weight clamp
            for p in regularizer.parameters():
                p.data.clamp_(-args.clamp, args.clamp)
        
        #GAE Update
        f_z = regularizer(emb)  
        generator_loss = -f_z.mean()
        loss = loss_function1(preds=predicted_labels_prob, labels=adj_label,
                             norm=norm, pos_weight=torch.tensor(pos_weight))
        loss = loss + generator_loss
        
        optimizer.zero_grad()
        loss.backward()
        cur_loss = loss.item()
        optimizer.step()
        if epoch%20==0:
            print("Epoch:", '%04d' % (epoch + 1), "train_loss=", "{:.5f}".format(cur_loss))
            print("time=", "{:.5f}".format(time.time() - t))
        
    np_emb = emb.cpu().detach().numpy()
    kmeans = KMeans(n_clusters= n_clusters, random_state=args.seed).fit(np_emb)
    predict_labels = kmeans.predict(np_emb)
    cm = clustering_metrics(true_labels, predict_labels)
    acc, nmi, f1_macro, precision_macro, adjscore = cm.evaluationClusterModelFromLabel()

    clustering_scores.append([acc, nmi, f1_macro, precision_macro, adjscore])
 
    return clustering_scores[-1]

# Run

In [4]:
once = False

if __name__ == '__main__':
    if once == True:
        gae_for(args)
    else:
        clustering_scores = []
        clustering_metrics_names = ['acc', 'nmi', 'f1_macro', 'precision_macro', 'adjscore']
        
        # using 10 different random seeds
        for seed in range(10):
            print('Seed',seed)
            args.seed = seed
            torch.manual_seed(args.seed)
            clustering_score = gae_for(args)
            clustering_scores.append(clustering_score)
        # show the results by mean and std
        clustering_scores = np.asarray(clustering_scores)
        for i in range(len(clustering_scores[0])):
            print(clustering_metrics_names[i],'=',np.mean(clustering_scores[:,i]),', std = ',np.std(clustering_scores[:,i]))

Seed 0
Using cora dataset
Epoch: 0001 train_loss= 0.77257
time= 0.47882
Epoch: 0021 train_loss= 0.72065
time= 0.01695
Epoch: 0041 train_loss= 0.58250
time= 0.01795
Epoch: 0061 train_loss= 0.53101
time= 0.01795
Epoch: 0081 train_loss= 0.51072
time= 0.01795
ACC=0.653250, f1_macro=0.629955, precision_macro=0.645058, recall_macro=0.652351, f1_micro=0.653250, precision_micro=0.653250, recall_micro=0.653250, NMI=0.498641, ADJ_RAND_SCORE=0.429786
Seed 1
Using cora dataset
Epoch: 0001 train_loss= 0.76904
time= 0.01895
Epoch: 0021 train_loss= 0.67864
time= 0.01695
Epoch: 0041 train_loss= 0.53863
time= 0.01596
Epoch: 0061 train_loss= 0.49334
time= 0.02094
Epoch: 0081 train_loss= 0.47483
time= 0.01596
ACC=0.608936, f1_macro=0.567735, precision_macro=0.579074, recall_macro=0.588276, f1_micro=0.608936, precision_micro=0.608936, recall_micro=0.608936, NMI=0.493074, ADJ_RAND_SCORE=0.416182
Seed 2
Using cora dataset
Epoch: 0001 train_loss= 0.77400
time= 0.02094
Epoch: 0021 train_loss= 0.68116
time= 0.