In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Network Science Analytics
Lab 4: Network representation learning
March 13, 2020
"""
%matplotlib inline
from helper import *
import os
import networkx as nx
from gensim.models import Word2Vec
from scipy.sparse import *
from scipy.stats.stats import pearsonr
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve


In [None]:
# Read network files
karate = nx.read_gml("./karate.gml")
cora = nx.read_gml("./cora.gml")

# Choose a network.
# The karate-club graph is the default; set `G = cora` to run the lab on Cora instead.
# NOTE(review): the link-prediction part (generate_samples) raises ValueError when the
# chosen graph is not connected -- check connectivity before switching networks.
G = karate
print("The number of nodes: {}".format(G.number_of_nodes()))
print("The number of edges: {}".format(G.number_of_edges()))

# Get the node community labels and the number of communities
node2comm, num_of_communities = get_node2community(g=G)

### Part I: Network Representation Learning

#### Exercise 1.1: Implementation of a random walking strategy

In [None]:
def perform_random_walks(graph, N, L):
    '''
    Generate uniform random walks starting from every node of the graph.

    The node list is reshuffled N times; after each shuffle one walk is started
    from every node, which yields N walks per node in total. At each step the
    walk moves to a neighbour of the current node chosen uniformly at random.
    A walk stops early if it reaches a node that has no neighbours.

    :param graph: networkx graph
    :param N: the number of walks for each node
    :param L: the walk length (maximum number of nodes in a walk)
    :return walks: the list of walks; each walk is a list of node identifiers
                   exactly as they are stored in the graph
    '''
    # Imported locally on purpose: the notebook's star imports
    # (e.g. `from scipy.sparse import *`) may rebind the module-level name `random`.
    import random

    nodelist = list(graph.nodes())
    walks = []

    # Nothing to generate for non-positive walk counts or lengths
    if N <= 0 or L <= 0:
        return walks

    # Materialise every neighbour list once instead of once per step
    neighbors = {node: list(graph.neighbors(node)) for node in nodelist}

    for _ in range(N):
        # A fresh visiting order on every pass avoids a fixed ordering of the walks
        random.shuffle(nodelist)
        for start in nodelist:
            walk = [start]
            while len(walk) < L:
                candidates = neighbors[walk[-1]]
                if not candidates:
                    # Dead end (isolated node or sink): keep the shorter walk
                    break
                walk.append(random.choice(candidates))
            walks.append(walk)

    return walks

#### Exercise 1.2: Learning representations of nodes

In [None]:
# Hyper-parameters of the random walks and of the Skip-Gram model.
# These are starting values only; tune them for the chosen network.
num_of_walks = 10       # N: walks started from each node
walk_length = 20        # L: length of each walk
embedding_size = 64     # dimension of the learned node vectors
window_size = 5         # context window used by Word2Vec
output_filename="./graph.embedding"

# Perform random walks
walks = perform_random_walks(graph=G, N=num_of_walks, L=walk_length)
# Learn representations of nodes
# NOTE: `size` is the gensim 3.x keyword; gensim >= 4.0 renamed it to `vector_size`.
model = Word2Vec(walks, size=embedding_size, sg=1, window=window_size, min_count=0, workers=3)
# Save the embedding vectors
model.wv.save_word2vec_format(output_filename)

#### Exercise 1.4: Visualization of embedding vectors

In [None]:
# Visualize the embeddings learned by Word2Vec for the nodes of G.
# `visualize` is not defined in this notebook (presumably it comes from
# `from helper import *` -- confirm).
visualize(graph=G, node2embedding=model.wv)

### Part II: Link Prediction

In [None]:
def generate_samples(graph, train_set_ratio):
    '''
    Split a connected graph into training and testing samples for link prediction.

    Positive test samples are edges removed from a copy of the graph one at a
    time, skipping any edge whose removal would disconnect the copy. The edges
    that remain form the positive training samples. Negative samples are node
    pairs that are not connected in the original graph.

    :param graph: connected, undirected, simple (non-multi) networkx graph
    :param train_set_ratio: fraction of the edges kept for training; the test set
                            gets int((1 - train_set_ratio) * number_of_edges) edges
    :return residual_g: copy of the graph without the positive test edges
    :return train_samples: positive training pairs followed by negative training pairs
    :return train_labels: 1 for each positive and 0 for each negative training pair
    :return test_samples: positive testing pairs followed by negative testing pairs
    :return test_labels: 1 for each positive and 0 for each negative testing pair
    :raises ValueError: if the graph is not connected, or if too few edges can be
                        removed without disconnecting the residual graph
    '''
    # Local import so the function does not depend on `np` leaking from a star import
    import numpy as np

    # --- Step 0: The graph must be connected ---
    # Check the graph that was passed in, not the notebook-level variable G
    if not nx.is_connected(graph):
        raise ValueError("The graph contains more than one connected component!")

    # --- Step 1: Generate positive edge samples for testing set ---
    residual_g = graph.copy()
    test_pos_samples = []

    # Shuffle the list of edges
    edges = list(residual_g.edges())
    np.random.shuffle(edges)

    test_set_size = int((1.0 - train_set_ratio) * graph.number_of_edges())
    num_of_pos_test_samples = 0
    for u, v in edges:
        # Stop as soon as enough test edges were collected (also covers test_set_size == 0)
        if num_of_pos_test_samples == test_set_size:
            break
        # Remember the attributes so a bridge can be restored unchanged
        edge_attrs = dict(residual_g.get_edge_data(u, v) or {})
        residual_g.remove_edge(u, v)
        # residual_g was connected before the removal, so it is still connected
        # exactly when the two endpoints can still reach each other
        if nx.has_path(residual_g, u, v):
            test_pos_samples.append((u, v))
            num_of_pos_test_samples += 1
        else:
            # The edge was a bridge: put it back and try the next candidate
            residual_g.add_edge(u, v, **edge_attrs)

    # Check if we have the desired number of positive samples for testing set
    if num_of_pos_test_samples != test_set_size:
        raise ValueError("Enough positive edge samples could not be found!")

    # --- Step 2: Generate positive edge samples for training set ---
    # The remaining edges are simply considered for positive samples of the training set
    train_pos_samples = list(residual_g.edges())

    # --- Step 3: Generate the negative samples for testing and training sets ---
    non_edges = list(nx.non_edges(graph))
    np.random.shuffle(non_edges)

    # Draw as many negatives as there are positives in each set, so both sets stay
    # balanced for any train_set_ratio (not only 0.5). The two slices are disjoint.
    train_neg_size = len(train_pos_samples)
    train_neg_samples = non_edges[:train_neg_size]
    test_neg_samples = non_edges[train_neg_size:train_neg_size + test_set_size]

    # --- Step 4: Combine sample lists and create corresponding labels ---
    # For training set
    train_samples = train_pos_samples + train_neg_samples
    train_labels = [1 for _ in train_pos_samples] + [0 for _ in train_neg_samples]
    # For testing set
    test_samples = test_pos_samples + test_neg_samples
    test_labels = [1 for _ in test_pos_samples] + [0 for _ in test_neg_samples]

    return residual_g, train_samples, train_labels, test_samples, test_labels
    
def edge_prediction(node2embedding, train_samples, test_samples, feature_func=None,
                    train_labels=None, test_labels=None):
    '''
    Fit a logistic-regression link predictor on edge features and plot its ROC curve.

    :param node2embedding: object indexable by node that returns the node's embedding vector
    :param train_samples: list of node pairs used to fit the classifier
    :param test_samples: list of node pairs used to evaluate the classifier
    :param feature_func: function turning the two endpoint embeddings into one edge
                         feature vector; defaults to abs(x - y)
    :param train_labels: labels aligned with train_samples (1 = edge, 0 = non-edge);
                         when omitted, the global variable `train_labels` is used
    :param test_labels: labels aligned with test_samples (1 = edge, 0 = non-edge);
                        when omitted, the global variable `test_labels` is used
    :return roc_auc: area under the ROC curve computed on the testing samples
    :raises ValueError: if labels are neither passed in nor defined as globals
    '''
    # `auc` is not among the names this notebook imports explicitly
    from sklearn.metrics import auc

    # Backward compatibility: earlier callers relied on the labels being notebook
    # globals. Prefer passing them explicitly; fall back to the globals otherwise.
    namespace = globals()
    if train_labels is None:
        if "train_labels" not in namespace:
            raise ValueError("train_labels was not passed and no global 'train_labels' is defined")
        train_labels = namespace["train_labels"]
    if test_labels is None:
        if "test_labels" not in namespace:
            raise ValueError("test_labels was not passed and no global 'test_labels' is defined")
        test_labels = namespace["test_labels"]

    # --- Construct feature vectors for edges ---
    if feature_func is None:
        feature_func = lambda x,y: abs(x-y)

    train_features = [feature_func(node2embedding[edge[0]], node2embedding[edge[1]]) for edge in train_samples]
    test_features = [feature_func(node2embedding[edge[0]], node2embedding[edge[1]]) for edge in test_samples]

    # --- Build the model and train it ---
    clf = LogisticRegression()
    clf.fit(train_features, train_labels)

    # Probability of the positive class (column 1) for every test sample
    test_preds = clf.predict_proba(test_features)[:, 1]

    # --- Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC) from predictions ---
    fpr, tpr, _ = roc_curve(test_labels, test_preds)
    roc_auc = auc(fpr, tpr)

    # NOTE(review): `plt` is expected to be in scope already (presumably via
    # `from helper import *`) -- confirm.
    plt.figure(figsize=(6,6))
    plt.plot(fpr, tpr, color='darkred', label='ROC curve (area = %0.3f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='lightgray', linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic Curve')
    plt.legend(loc="lower right")
    plt.show()

    return roc_auc

#### Exercise 2

In [None]:
# Construct the training and testing sets
residual_g, train_samples, train_labels, test_samples, test_labels = generate_samples(graph=G, train_set_ratio=0.5)
# Perform random walks over the residual network
# (starting values only; tune them for the chosen network)
num_of_walks, walk_length, window_size, embedding_size = (10, 20, 5, 64)
residual_walks = perform_random_walks(graph=residual_g, N=num_of_walks, L=walk_length)
# Learn representations of nodes from the residual walks only: walks over the full
# graph would traverse the held-out test edges and leak them into the embeddings.
# NOTE: `size` is the gensim 3.x keyword; gensim >= 4.0 renamed it to `vector_size`.
# NOTE: this rebinds `model`, so later cells that read `model.wv` get these embeddings.
model = Word2Vec(residual_walks, size=embedding_size, sg=1, window=window_size, min_count=0, workers=3)
# Perform the edge prediction and plot the ROC curve
edge_prediction(model.wv, train_samples, test_samples)

### Part III: Node Classification

#### Exercise 3

In [None]:
# Run the node-classification experiment on G and print whatever it returns.
# `classification` is not defined in this notebook (presumably it comes from
# `from helper import *` -- confirm).
# NOTE(review): `model` was last rebound in the Exercise 2 cell, so the embeddings
# used here are the ones trained there -- confirm that this is intended.
results = classification(graph=G, train_set_ratio=0.5, node2embedding=model.wv, number_of_shuffles=10)
print(results)

### Part IV: Spectral Clustering

#### Exercise 4.1: Implementation of spectral clustering

In [None]:
def spectral_clustering(G, k):
    '''
    Embed the nodes of a graph with the leading eigenvectors of its normalized Laplacian.

    The graph is treated as undirected and unweighted. The k eigenvectors that
    belong to the k smallest eigenvalues of the random-walk Laplacian
    L_rw = I - D^{-1} A are computed, and row i of the resulting (n x k) matrix
    is the embedding of the i-th node.

    :param G: given graph
    :param k: the number of clusters (also the embedding dimension)
    :return node2embedding: a dictionary whose keys are nodes and values are the corresponding embeddings
    :raises ValueError: if k is smaller than 1 or larger than the number of nodes
    '''
    # Local import so the function does not depend on `np` leaking from a star import
    import numpy as np

    nodes = list(G.nodes())
    num_nodes = len(nodes)
    if not 1 <= k <= num_nodes:
        raise ValueError("k must be between 1 and the number of nodes ({}), got {}".format(num_nodes, k))

    # Dense symmetric 0/1 adjacency matrix
    node2index = {node: idx for idx, node in enumerate(nodes)}
    adjacency = np.zeros((num_nodes, num_nodes))
    for u, v in G.edges():
        i, j = node2index[u], node2index[v]
        adjacency[i, j] = 1.0
        adjacency[j, i] = 1.0

    # D^{-1/2}, with 0 for isolated nodes to avoid dividing by zero
    degrees = adjacency.sum(axis=1)
    has_neighbors = degrees > 0
    inv_sqrt_deg = np.zeros(num_nodes)
    inv_sqrt_deg[has_neighbors] = 1.0 / np.sqrt(degrees[has_neighbors])

    # Work with the symmetric form L_sym = I - D^{-1/2} A D^{-1/2}: it has the same
    # eigenvalues as L_rw and lets us use the symmetric solver, which returns real
    # eigenpairs sorted by ascending eigenvalue.
    laplacian = np.eye(num_nodes) - inv_sqrt_deg[:, None] * adjacency * inv_sqrt_deg[None, :]
    _, eigenvectors = np.linalg.eigh(laplacian)

    # If u is an eigenvector of L_sym then D^{-1/2} u is an eigenvector of L_rw.
    # Rows of isolated nodes are left unscaled.
    row_scale = np.where(has_neighbors, inv_sqrt_deg, 1.0)
    embeddings = eigenvectors[:, :k] * row_scale[:, None]

    node2embedding = {node: embeddings[idx] for node, idx in node2index.items()}

    return node2embedding

#### Exercise 4.2: Visualization of embedding vectors

In [None]:
# Use one embedding dimension per community reported by get_node2community
# (`num_of_communities` is set in the network-loading cell).
embedding_size = num_of_communities
node2embedding = spectral_clustering(G, k=embedding_size)
visualize(graph=G, node2embedding=node2embedding)