In [22]:
import graphviz as gv
import pydot
import random
import numpy    
from sklearn.metrics import roc_auc_score
from  sklearn.ensemble import RandomForestClassifier as RF
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import NMF
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import coo_matrix
from time import gmtime, strftime
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.lines as mlines
from scipy.sparse import linalg
import numpy as np
import math

In [23]:
def add_nodes(graph, nodes):
    """Register every entry of `nodes` on `graph` and return the graph.

    An entry is either a bare node name, or a tuple whose first item is the
    node name and whose second item is a dict of keyword attributes that is
    forwarded to `graph.node`.
    """
    for entry in nodes:
        if not isinstance(entry, tuple):
            graph.node(entry)
            continue
        graph.node(entry[0], **entry[1])
    return graph

def add_edges(graph, edges):
    """Register every entry of `edges` on `graph` and return the graph.

    An entry is either a plain tuple of endpoints, which is unpacked into
    `graph.edge`, or a pair `(endpoints_tuple, attrs_dict)` whose dict is
    forwarded as keyword attributes.
    """
    for entry in edges:
        first = entry[0]
        if isinstance(first, tuple):
            # (endpoints, attributes) form
            graph.edge(*first, **entry[1])
        else:
            graph.edge(*entry)
    return graph

In [24]:
def dist(u, w, nodes_pos, A=[], H=[]):
    """Return the negated Euclidean distance between the positions of u and w.

    `nodes_pos` maps a node key to an indexable sequence of coordinates; the
    number of coordinates compared is the length of u's position. `A` and `H`
    are accepted only so that the signature matches the other scoring
    functions; they are not used.
    """
    pos_u = nodes_pos[u]
    pos_w = nodes_pos[w]
    squared = sum(((pos_u[k] - pos_w[k]) ** 2 for k in range(len(pos_u))), 0.0)
    # Negated so that closer pairs receive a larger score.
    return -(squared ** 0.5)

def mf_score(u, w, nodes_pos, A, H):
    """Score the pair (u, w) as the dot product of row u of A and column w of H.

    `u` and `w` are converted with `int()` before indexing. `nodes_pos` is
    unused; it is present so the signature matches the other scoring functions.
    """
    row_u = A[int(u)]
    col_w = H.T[int(w)]
    return numpy.dot(row_u, col_w)
    
def svd_score(u, w, nodes_pos, A, H):
    """Score the pair (u, w) as the dot product of rows u and w of A.

    `u` and `w` are converted with `int()` before indexing. `nodes_pos` and
    `H` are unused; they keep the signature uniform with the other scorers.
    """
    vec_u = A[int(u)]
    vec_w = A[int(w)]
    return numpy.dot(vec_u, vec_w)

In [25]:
def make_dataset(nodes_pos, pos_set, neg_set, functs, A=[], H=[]):
    """Build a feature matrix and label vector from positive and negative edges.

    Every edge `(u, w)` becomes one row holding `func(u, w, nodes_pos, A, H)`
    for each `func` in `functs`. Rows for `pos_set` come first and are
    labelled 1; rows for `neg_set` follow and are labelled 0.

    Returns:
        (X, Y): numpy arrays of the feature rows and of the labels.
    """
    def featurize(edge):
        u, w = edge
        return [func(u, w, nodes_pos, A, H) for func in functs]

    rows = []
    labels = []
    for label, group in ((1, pos_set), (0, neg_set)):
        for edge in group:
            rows.append(featurize(edge))
            labels.append(label)

    return numpy.array(rows), numpy.array(labels)

In [26]:
def make_sparse_matrix(train_set, n):
    """Build a symmetric n x n COO matrix from an edge list.

    Each edge contributes a 1 at (u, w) and a 1 at (w, u), where u and w are
    the `int()` values of the edge's first two items. Repeated entries are
    stored as separate COO elements.
    """
    rows = []
    cols = []
    for edge in train_set:
        a = int(edge[0])
        b = int(edge[1])
        rows.extend((a, b))
        cols.extend((b, a))
    values = [1] * len(rows)
    return coo_matrix((values, (rows, cols)), shape=(n, n))

In [27]:
def read_train(dataset_name):
    """Load the edge list stored in `<dataset_name>/train.in`.

    Every non-blank line must hold exactly two whitespace-separated node ids
    that are parseable as integers. Blank lines are skipped.

    Args:
        dataset_name: directory that contains `train.in`.

    Returns:
        (edges, nodes, max_id) where `edges` is a set of `(u, w)` tuples of
        id strings, `nodes` is the set of id strings seen, and `max_id` is
        the largest id as an int (0 when the file has no edges).

    Raises:
        IOError/OSError: if the file cannot be opened.
        ValueError: if a non-blank line does not hold exactly two integer ids.
    """
    max_id = 0
    edges = set()
    nodes = set()
    file_name = dataset_name + "/train.in"
    # `with` closes the handle even when a malformed line raises below.
    with open(file_name, 'r') as fin_train:
        for line in fin_train:
            fields = line.split()
            if not fields:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            u, w = fields
            max_id = max(max_id, int(u), int(w))
            edges.add((u, w))
            nodes.add(u)
            nodes.add(w)

    return edges, nodes, max_id

In [28]:
def get_sets(nodes, edges, division=10):
    """Split the edges into held-out positives, sampled negatives and a training set.

    Args:
        nodes: collection of node ids.
        edges: set of `(u, w)` tuples of id strings.
        division: percentage of `edges` to hold out as positive test edges.

    Returns:
        (pos_edges, neg_edges, edges_not_full):
        `pos_edges` is a random subset of `edges` of size
        `int(len(edges) * division / 100.0)`; `neg_edges` is a set of the same
        size holding `(u, w)` string pairs with `u != w` that are not edges in
        either orientation; `edges_not_full` is `edges - pos_edges`.

    Raises:
        ValueError: if the graph does not contain enough unlinked node pairs
            to draw the requested number of negatives.
    """
    nodes_size = len(nodes)
    edges_size = len(edges)
    print("Nodes size: " + str(nodes_size))
    print("Edges size: " + str(edges_size))
    test_size = int(edges_size * (division / 100.0))

    # random.sample() needs a sequence (sets are rejected by recent Python
    # versions); sorting also makes the draw repeatable under random.seed().
    pos_edges = set(random.sample(sorted(edges), test_size))

    # The adjacency matrix built from these edges is symmetrised and the
    # distance score is symmetric, so a pair counts as linked when either
    # orientation is listed. Checking only (u, w) would let the reverse of a
    # real edge be labelled as a negative example.
    linked = set()
    for a, b in edges:
        linked.add((a, b))
        linked.add((b, a))

    # Materialise the node ids once instead of on every draw.
    node_names = sorted(set(str(node) for node in nodes))
    known = set(node_names)

    # Refuse to loop forever when there are not enough unlinked ordered pairs.
    ordered_pairs = len(node_names) * (len(node_names) - 1)
    blocked = sum(1 for a, b in linked if a != b and a in known and b in known)
    if test_size > ordered_pairs - blocked:
        raise ValueError(
            "cannot sample " + str(test_size) + " negative edges: only "
            + str(ordered_pairs - blocked) + " unlinked node pairs exist")

    neg_edges = set()
    while len(neg_edges) < test_size:
        u = random.choice(node_names)
        w = random.choice(node_names)
        if u != w and (u, w) not in linked:
            neg_edges.add((u, w))

    # NOTE(review): if `edges` stores both orientations of an undirected edge,
    # the reverse of a held-out edge stays in the training set -- confirm the
    # input files list each edge once.
    edges_not_full = edges - pos_edges
    print("Edges not full: " + str(len(edges_not_full)))

    return pos_edges, neg_edges, edges_not_full

In [29]:
def render_graph(dataset_name, dimension, nodes, edges_not_full):
    """Lay out the graph with the sfdp engine and write it as dot and png.

    The Graphviz `dim` and `dimen` graph attributes are both set to
    `dimension`. Output is rendered to `<dataset_name>/graph<dimension>`,
    first in "dot" format and then in "png" format.
    """
    dim = str(dimension)

    graph = gv.Graph(format="dot")
    graph.engine = 'sfdp'
    graph.graph_attr['dim'] = dim
    graph.graph_attr['dimen'] = dim

    graph = add_nodes(graph, nodes)
    graph = add_edges(graph, edges_not_full)

    target = dataset_name + "/graph" + dim
    for output_format in ("dot", "png"):
        graph.format = output_format
        graph.render(target, view=False)

In [30]:
def read_edges_dot(dataset_name, dimension):
    """Read node coordinates from `<dataset_name>/graph<dimension>.dot`.

    Returns:
        dict mapping each node name (as reported by pydot) to a list of
        floats parsed from the node's comma-separated `pos` attribute.
        The `node`, `graph` and `edge` default-attribute entries, and any
        node without a `pos` attribute, are skipped.
    """
    file_name = dataset_name + "/graph" + str(dimension) + ".dot"
    dot_graph = pydot.graph_from_dot_file(file_name)[0]

    nodes_pos = {}
    for node in dot_graph.get_nodes():
        name = node.get_name()
        # Default-attribute statements show up in the node list; they are
        # not real nodes and carry no position.
        if name in ('node', 'graph', 'edge'):
            continue
        pos_attr = node.get('pos')
        if pos_attr is None:
            continue
        # Build a real list: on Python 3 `map` is a lazy iterator, which
        # cannot be indexed or measured with len() by the callers.
        nodes_pos[name] = [float(c) for c in pos_attr.strip('"').split(',')]
    return nodes_pos

In [31]:
def auc_sfdp(nodes_pos, pos_edges, neg_edges):
    """Print and return the ROC AUC of the layout-distance score.

    The single feature is `dist` (negated Euclidean distance between the two
    endpoints' positions), evaluated on the positive and negative edge sets.
    """
    features, labels = make_dataset(nodes_pos, pos_edges, neg_edges, [dist])
    auc = roc_auc_score(labels, features)
    print("SFDP " + str(auc))
    return auc

In [32]:
def auc_nmf(edges, nodes_pos, pos_edges, neg_edges, n_components, max_id):
    """Print and return the ROC AUC of an NMF-based edge score.

    A symmetric (max_id + 1)-square adjacency matrix is built from `edges`
    and factorised with randomly initialised NMF using `n_components`
    components. Each test pair is scored with `mf_score` on the two factors.
    """
    adjacency = make_sparse_matrix(edges, max_id + 1)

    nmf = NMF(n_components=n_components, init='random')
    node_factors = nmf.fit_transform(adjacency)
    component_factors = nmf.components_

    features, labels = make_dataset(
        nodes_pos, pos_edges, neg_edges, [mf_score],
        node_factors, component_factors)

    auc = roc_auc_score(labels, features)
    print("NMF " + str(auc))
    return auc

In [33]:
def auc_svd(edges, nodes_pos, pos_edges, neg_edges, n_components, max_id):
    """Print and return the ROC AUC of a truncated-SVD edge score.

    A symmetric (max_id + 1)-square adjacency matrix is built from `edges`
    and decomposed with `linalg.svds` keeping `n_components` singular
    triplets. The left factor is scaled by the singular values, and each test
    pair is scored with `mf_score` on the scaled left factor and `Vh`.
    """
    adjacency = make_sparse_matrix(edges, max_id + 1)

    left, singular_values, right_t = linalg.svds(adjacency.asfptype(), k=n_components)
    scaled_left = left * singular_values

    features, labels = make_dataset(
        nodes_pos, pos_edges, neg_edges, [mf_score],
        scaled_left, right_t)

    auc = roc_auc_score(labels, features)
    print("SVD " + str(auc))
    return auc

In [34]:
def compute_mean(dim, nodes_pos):
    """Print the mean and standard deviation of each of the first `dim` coordinates.

    For every coordinate index the mean is printed, then the standard
    deviation, then a blank line. Nothing is returned.
    """
    # Gather all columns before printing anything.
    columns = [[] for _ in range(dim)]
    for node in nodes_pos:
        position = nodes_pos[node]
        for axis in range(dim):
            columns[axis].append(position[axis])

    for column in columns:
        print(np.mean(column))
        print(np.std(column))
        print("")

In [35]:
def draw_graph(dataset_name, nodes_pos, edges):
    """Save a 3D plot of the laid-out graph to `<dataset_name>/3d.png`.

    Nodes are drawn as small blue points at the first three coordinates of
    their position; every edge is drawn as a line segment between its two
    endpoints. All open figures are closed afterwards.
    """
    xs = [nodes_pos[node][0] for node in nodes_pos]
    ys = [nodes_pos[node][1] for node in nodes_pos]
    zs = [nodes_pos[node][2] for node in nodes_pos]

    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(xs, ys, zs, s=2.0, c='b', marker='o')

    for edge in edges:
        start = nodes_pos[edge[0]]
        end = nodes_pos[edge[1]]
        ax.plot([start[0], end[0]], [start[1], end[1]],
                zs=[start[2], end[2]], linewidth=1.0)

    ax.set_xlabel('X Axis')
    ax.set_ylabel('Y Axis')
    ax.set_zlabel('Z Axis')

    fig.savefig(dataset_name + "/3d.png")
    plt.close("all")

In [36]:
def draw_graph_svd(dataset_name, A, edges):
    """Save a 3D plot of an embedding to `<dataset_name>/3d_svd.png`.

    Args:
        dataset_name: directory the image is written to.
        A: embedding indexable as `A[node_id][k]`; the first three
            components of each row are used as x, y and z.
        edges: iterable of edges whose first two items convert to int ids.

    Every node that appears as an endpoint of some edge is drawn as a small
    blue point, and every edge as a line segment. All open figures are
    closed afterwards.
    """
    # Collect both endpoints of every edge (the second endpoint used to be
    # dropped, so nodes appearing only on that side were never scattered).
    nodes = set()
    for edge in edges:
        nodes.add(int(edge[0]))
        nodes.add(int(edge[1]))

    xs = [A[node][0] for node in nodes]
    ys = [A[node][1] for node in nodes]
    zs = [A[node][2] for node in nodes]

    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(xs, ys, zs, s=2.0, c='b', marker='o')

    for edge in edges:
        u = int(edge[0])
        w = int(edge[1])
        ax.plot([A[u][0], A[w][0]], [A[u][1], A[w][1]],
                zs=[A[u][2], A[w][2]], linewidth=1.0)

    ax.set_xlabel('X Axis')
    ax.set_ylabel('Y Axis')
    ax.set_zlabel('Z Axis')

    file_name = dataset_name + "/3d_svd.png"
    fig.savefig(file_name)
    plt.close("all")

In [37]:
def run_exp(dataset_name, dimension, n_components, division):
    """Run one full experiment on a dataset, printing progress and AUC scores.

    Steps: read the edge list, split it (`division` percent held out), lay
    out the training graph in `dimension` dimensions, read the positions
    back, print per-axis mean/std, draw the 3D picture when `dimension` is 3,
    then evaluate the SFDP, NMF and SVD scores (`n_components` is used for
    both factorisations).
    """
    def stamp():
        # Current UTC time, printed before each long-running stage.
        print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))

    print("Read train")
    edges, nodes, max_id = read_train(dataset_name)

    print("Get sets")
    pos_edges, neg_edges, edges_not_full = get_sets(nodes, edges, division)

    print("Render graph")
    stamp()
    render_graph(dataset_name, dimension, nodes, edges_not_full)

    print("Read edges")
    stamp()
    nodes_pos = read_edges_dot(dataset_name, dimension)

    print("Compute mean and std")
    compute_mean(dimension, nodes_pos)

    if dimension == 3:
        print("Draw graph")
        draw_graph(dataset_name, nodes_pos, edges)

    print("SFDP")
    stamp()
    auc_sfdp(nodes_pos, pos_edges, neg_edges)

    print("NMF")
    stamp()
    auc_nmf(edges_not_full, nodes_pos, pos_edges, neg_edges, n_components, max_id)

    print("SVD")
    stamp()
    auc_svd(edges_not_full, nodes_pos, pos_edges, neg_edges, n_components, max_id)

In [38]:
def run_division_exp(dataset_name, num_exps, divisions, dimension,
                     nmf_components=10, svd_components=30):
    """Repeat the link-prediction experiment for several test-set sizes.

    Args:
        dataset_name: dataset directory passed to `read_train` and the
            rendering helpers.
        num_exps: number of independent repetitions per division.
        divisions: iterable of test-set sizes, in percent of the edges.
        dimension: layout dimension used for the SFDP embedding.
        nmf_components: number of NMF components (default 10, the value
            that used to be hard-coded).
        svd_components: number of singular triplets kept by the SVD
            (default 30, the value that used to be hard-coded).

    Returns:
        (div_scores_sfdp, div_scores_nmf, div_scores_svd): three dicts that
        map each division to a numpy array of the AUC scores of its runs.
    """
    collected_sfdp = dict((div, []) for div in divisions)
    collected_nmf = dict((div, []) for div in divisions)
    collected_svd = dict((div, []) for div in divisions)

    edges, nodes, max_id = read_train(dataset_name)

    for div in divisions:
        print("Division " + str(div) + "/%")
        for i in range(num_exps):
            print("Launch " + str(i))
            print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))

            # A fresh random split and a fresh layout for every repetition.
            pos_edges, neg_edges, edges_not_full = get_sets(nodes, edges, div)
            render_graph(dataset_name, dimension, nodes, edges_not_full)
            nodes_pos = read_edges_dot(dataset_name, dimension)

            collected_sfdp[div].append(
                auc_sfdp(nodes_pos, pos_edges, neg_edges))
            collected_nmf[div].append(
                auc_nmf(edges_not_full, nodes_pos, pos_edges, neg_edges,
                        nmf_components, max_id))
            collected_svd[div].append(
                auc_svd(edges_not_full, nodes_pos, pos_edges, neg_edges,
                        svd_components, max_id))

    div_scores_sfdp = dict((div, np.array(v, dtype=float)) for div, v in collected_sfdp.items())
    div_scores_nmf = dict((div, np.array(v, dtype=float)) for div, v in collected_nmf.items())
    div_scores_svd = dict((div, np.array(v, dtype=float)) for div, v in collected_svd.items())
    return div_scores_sfdp, div_scores_nmf, div_scores_svd

In [39]:
def save_scores_div(dataset_name, dimension, divisions, div_scores_sfdp, div_scores_nmf, div_scores_svd):
    """Plot mean AUC against test-set size and save the figure.

    For every division the mean of each method's scores is drawn as a point
    with an error bar of two standard deviations. NMF points are shifted
    left and SVD points right by a fixed offset so the three series do not
    overlap. The figure is written to
    `<dataset_name>/divisions<dimension>.png` and all figures are closed.

    Args:
        dataset_name: output directory.
        dimension: layout dimension, used only in the file name.
        divisions: test-set sizes in percent; keys of the score dicts.
        div_scores_sfdp, div_scores_nmf, div_scores_svd: dicts mapping a
            division to a numpy array of AUC scores.
    """
    shift = 2  # horizontal offset between the three series, in x units

    # (colour, x positions, scores per division) for SFDP, NMF and SVD.
    series = [
        ('r', divisions, div_scores_sfdp),
        ('g', [div - shift for div in divisions], div_scores_nmf),
        ('b', [div + shift for div in divisions], div_scores_svd),
    ]

    fig, ax = plt.subplots(nrows=1, ncols=1)
    for colour, xs, scores in series:
        means = [scores[div].mean() for div in divisions]
        errors = [scores[div].std() * 2 for div in divisions]
        ax.errorbar(x=xs, y=means, yerr=errors, c=colour, marker='o', linestyle=None,
                    markersize=5, linewidth=0, elinewidth=1)

    ax.set_xlim([0, 100])
    ax.set_xticks(np.arange(10, 100, 10))

    ax.set_xlabel('Test set size, %')
    ax.set_ylabel('AUC')

    # Proxy handles give the legend plain coloured lines. No second, argument-
    # less legend() call may follow: it would rebuild the legend from labelled
    # artists only (there are none) and discard this one.
    sfdp = mlines.Line2D([], [], color='red')
    nmf = mlines.Line2D([], [], color='green')
    svd = mlines.Line2D([], [], color='blue')
    ax.legend([sfdp, nmf, svd], ["sfdp", "nmf", "svd"], fontsize='small')

    file_name = dataset_name + "/divisions" + str(dimension) + ".png"
    fig.savefig(file_name)
    plt.close("all")

In [40]:
def run_dim_exp(dataset_name, num_exps, dimensions):
    """Repeat the link-prediction experiment for several embedding dimensions.

    For each dimension, `num_exps` runs are made; each run draws a fresh
    split with the default test size of `get_sets`, lays the training graph
    out in that many dimensions, and uses the same number as the component
    count for NMF and SVD. When the dimension is 3 the layout is also drawn.

    Returns:
        (dim_scores_sfdp, dim_scores_nmf, dim_scores_svd): three dicts that
        map each dimension to a numpy array of the AUC scores of its runs.
    """
    dim_scores_sfdp = {}
    dim_scores_nmf = {}
    dim_scores_svd = {}
    for dimension in dimensions:
        for table in (dim_scores_sfdp, dim_scores_nmf, dim_scores_svd):
            table[dimension] = np.array([])

    edges, nodes, max_id = read_train(dataset_name)

    for dimension in dimensions:
        print("Dimension " + str(dimension))

        for i in range(num_exps):
            print("Launch " + str(i))
            print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))

            pos_edges, neg_edges, edges_not_full = get_sets(nodes, edges)
            render_graph(dataset_name, dimension, nodes, edges_not_full)
            nodes_pos = read_edges_dot(dataset_name, dimension)

            sfdp_auc = auc_sfdp(nodes_pos, pos_edges, neg_edges)
            dim_scores_sfdp[dimension] = np.append(dim_scores_sfdp[dimension], sfdp_auc)

            nmf_auc = auc_nmf(edges_not_full, nodes_pos, pos_edges, neg_edges, dimension, max_id)
            dim_scores_nmf[dimension] = np.append(dim_scores_nmf[dimension], nmf_auc)

            svd_auc = auc_svd(edges_not_full, nodes_pos, pos_edges, neg_edges, dimension, max_id)
            dim_scores_svd[dimension] = np.append(dim_scores_svd[dimension], svd_auc)

            if dimension == 3:
                print("Draw graph")
                draw_graph(dataset_name, nodes_pos, edges)

    return dim_scores_sfdp, dim_scores_nmf, dim_scores_svd

In [41]:
def save_scores(dataset_name, dimensions, dim_scores_sfdp, dim_scores_nmf, dim_scores_svd):
    """Plot mean AUC against embedding dimension and save the figure.

    For every dimension the mean of each method's scores is drawn as a point
    with an error bar of two standard deviations. NMF points are shifted
    left and SVD points right by a fixed offset so the three series do not
    overlap. The x axis is fixed to the range 1..11 with ticks at 2..10.
    The figure is written to `<dataset_name>/dimensions.png` and all figures
    are closed.

    Args:
        dataset_name: output directory.
        dimensions: embedding dimensions; keys of the score dicts.
        dim_scores_sfdp, dim_scores_nmf, dim_scores_svd: dicts mapping a
            dimension to a numpy array of AUC scores.
    """
    shift = 0.15  # horizontal offset between the three series, in x units

    # (colour, x positions, scores per dimension) for SFDP, NMF and SVD.
    series = [
        ('r', dimensions, dim_scores_sfdp),
        ('g', [dim - shift for dim in dimensions], dim_scores_nmf),
        ('b', [dim + shift for dim in dimensions], dim_scores_svd),
    ]

    fig, ax = plt.subplots(nrows=1, ncols=1)
    for colour, xs, scores in series:
        means = [scores[dim].mean() for dim in dimensions]
        errors = [scores[dim].std() * 2 for dim in dimensions]
        ax.errorbar(x=xs, y=means, yerr=errors, c=colour, marker='o', linestyle=None,
                    markersize=5, linewidth=0, elinewidth=1)

    ax.set_xlim([1, 11])
    ax.set_xticks(np.arange(2, 11, 1))

    ax.set_xlabel('Dimensions')
    ax.set_ylabel('AUC')

    # Proxy handles give the legend plain coloured lines. No second, argument-
    # less legend() call may follow: it would rebuild the legend from labelled
    # artists only (there are none) and discard this one.
    sfdp = mlines.Line2D([], [], color='red')
    nmf = mlines.Line2D([], [], color='green')
    svd = mlines.Line2D([], [], color='blue')
    ax.legend([sfdp, nmf, svd], ["sfdp", "nmf", "svd"], fontsize='small')

    file_name = dataset_name + "/dimensions.png"
    fig.savefig(file_name)
    plt.close("all")

In [103]:
# Dimension sweep: for every dataset, evaluate SFDP / NMF / SVD for each
# embedding dimension and save the summary plot.
# (An earlier dataset list also contained "chicago"; it is not part of this run.)
dataset_names = ["airport", "Ca-HelpTh", "EuroSiS", "PowerGrid", "Conflict", "euroroad"]
dimensions = list(range(2, 11))

for dataset_name in dataset_names:
    print(dataset_name)
    # 10 repetitions per dimension.
    dim_scores_sfdp, dim_scores_nmf, dim_scores_svd = run_dim_exp(
        dataset_name, 10, dimensions)
    print("saving")
    save_scores(dataset_name, dimensions,
                dim_scores_sfdp, dim_scores_nmf, dim_scores_svd)

airport
Dimension 2
Launch 0
2017-05-12 17:18:43
Nodes size: 1574
Edges size: 28236
Edges not full: 25413
SFDP 0.950178804263
NMF 0.910925374018
SVD 0.943464248998
Launch 1
2017-05-12 17:21:21
Nodes size: 1574
Edges size: 28236
Edges not full: 25413
SFDP 0.945911192774
NMF 0.957923308223
SVD 0.94325394271
Launch 2
2017-05-12 17:23:55
Nodes size: 1574
Edges size: 28236
Edges not full: 25413
SFDP 0.947283893035
NMF 0.959476513016
SVD 0.94161735574
Launch 3
2017-05-12 17:26:30
Nodes size: 1574
Edges size: 28236
Edges not full: 25413
SFDP 0.944848054334
NMF 0.959399718596
SVD 0.942120974049
Launch 4
2017-05-12 17:29:05
Nodes size: 1574
Edges size: 28236
Edges not full: 25413
SFDP 0.950883443261
NMF 0.910181522183
SVD 0.941958099609
Dimension 3
Launch 0
2017-05-12 17:31:42
Nodes size: 1574
Edges size: 28236
Edges not full: 25413
SFDP 0.956585177999
NMF 0.962143362383
SVD 0.939225121714
Draw graph
Launch 1
2017-05-12 17:36:04
Nodes size: 1574
Edges size: 28236
Edges not full: 25413
SFDP 0.95

In [None]:
# Division sweep: for every dataset and layout dimension, evaluate
# SFDP / NMF / SVD for each test-set size and save the summary plot.
dataset_names = ["airport", "Ca-HelpTh", "EuroSiS", "PowerGrid", "Conflict", "euroroad"]
divisions = [10, 20, 30, 40, 50, 60, 70, 80, 90]
dimensions = [2, 3]

# `component` used to exist only as leftover kernel state, so this cell raised
# NameError on a fresh kernel. It is informational only: run_division_exp is
# called without a component count here, and the value mirrors the SVD rank
# that function uses for these runs.
component = 30

for dataset_name in dataset_names:
    for dimension in dimensions:
        print("Dataset: " + dataset_name)
        print("Dim: " + str(dimension))
        print("Components: " + str(component))
        print("")
        div_scores_sfdp, div_scores_nmf, div_scores_svd = run_division_exp(dataset_name, 10, divisions, dimension)
        save_scores_div(dataset_name, dimension, divisions, div_scores_sfdp, div_scores_nmf, div_scores_svd)
        print("---------------------------")
        print("")
        print("")

Dataset: EuroSiS
Dim: 2
Components: 30

Division 10/%
Launch 0
2017-05-12 22:52:00
Nodes size: 1285
Edges size: 6462
Edges not full: 5816
SFDP 0.908119985814
NMF 0.916542859608
SVD 0.889989839834
Launch 1
2017-05-12 22:53:22
Nodes size: 1285
Edges size: 6462
Edges not full: 5816
SFDP 0.897419461511
NMF 0.912315846984
SVD 0.888091997431
Launch 2
2017-05-12 22:55:06
Nodes size: 1285
Edges size: 6462
Edges not full: 5816
SFDP 0.894367817194
NMF 0.916808845096
SVD 0.883214398681
Division 30/%
Launch 0
2017-05-12 22:57:14
Nodes size: 1285
Edges size: 6462
Edges not full: 4524
SFDP 0.907782112356
NMF 0.912842892303
SVD 0.852092898427
Launch 1
2017-05-12 22:59:02
Nodes size: 1285
Edges size: 6462
Edges not full: 4524
SFDP 0.89147432641
NMF 0.900993757994
SVD 0.857288534881
Launch 2
2017-05-12 22:59:43
Nodes size: 1285
Edges size: 6462
Edges not full: 4524
