In [1]:
import graphviz as gv
import pydot
import random
import numpy    
from sklearn.metrics import roc_auc_score
from  sklearn.ensemble import RandomForestClassifier as RF
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import NMF
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import PCA
from scipy.sparse import coo_matrix
from time import gmtime, strftime
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
import math

In [2]:
def add_nodes(graph, nodes):
    for n in nodes:
        if isinstance(n, tuple):
            graph.node(n[0], **n[1])
        else:
            graph.node(n)
    return graph

def add_edges(graph, edges):
    for e in edges:
        if isinstance(e[0], tuple):
            graph.edge(*e[0], **e[1])
        else:
            graph.edge(*e)
    return graph

In [3]:
def dist(u, w, nodes_pos, A=[]):
    nodes_pos_u = nodes_pos[u]
    nodes_pos_w = nodes_pos[w]
    length = len(nodes_pos_u)
    dist = 0.0
    for i in range(length):
        dist = dist + (nodes_pos_u[i] - nodes_pos_w[i])**2
        
    return (dist)**0.5

def mf_score(u, w, nodes_pos, A):
    return numpy.dot(A[int(u)], A[int(w)])

In [4]:
def make_dataset(nodes_pos, pos_set, neg_set, functs, A=[]):
    X = []
    Y = []
    
    for edge in pos_set:
        u, w = edge
        x = []
        for func in functs:
            x.append(func(u, w, nodes_pos, A))
        X.append(x)
        Y.append(1)
        
    for edge in neg_set:
        u, w = edge
        x = []
        for func in functs:
            x.append(func(u, w, nodes_pos, A))
        X.append(x)
        Y.append(0)
        
    X = numpy.array(X)
    Y = numpy.array(Y)
    return X, Y

In [5]:
def make_sparse_matrix(train_set, n):
    row = []
    col = []
    data = []
    for edge in train_set:
        u = int(edge[0])
        w = int(edge[1])
        row.append(u)
        col.append(w)
        row.append(w)
        col.append(u)
        data.append(1)
        data.append(1)
    return coo_matrix((data, (row, col)), shape=(n, n))

In [6]:
def read_train(dataset_name):
    max_id = 0
    file_name = dataset_name + "/train.in"
    fin_train = open(file_name, 'r')
    edges = set()
    nodes = set()
    for line in fin_train:
        line = line.strip()
        u, w = line.split()
        max_id = max(max_id, int(u))
        max_id = max(max_id, int(w))
        edges.add((u,w))
        nodes.add(u)
        nodes.add(w)
    fin_train.close()
    
    return edges, nodes, max_id

In [7]:
def get_sets(nodes, edges, division=10):
    nodes_size = len(nodes)
    edges_size = len(edges)
    print "Nodes size: " + str(nodes_size)
    print "Edges size: " + str(edges_size)
    test_size = int(edges_size / division)
    pos_edges = random.sample(edges, test_size)
    pos_edges = set(pos_edges)

    neg_edges = set()
    while len(neg_edges) < test_size:
        u = random.sample(nodes, 1)[0]
        w = random.sample(nodes, 1)[0]
        edge = (str(u),str(w))
        if edge not in edges and u != w:
            neg_edges.add(edge)
        
    edges_not_full = edges - pos_edges
    
    return pos_edges, neg_edges, edges_not_full

In [8]:
def render_graph(dataset_name, dimension, nodes, edges_not_full):
    graph = gv.Graph(format="dot")
    graph.engine = 'sfdp'
    graph.graph_attr['dim'] = str(dimension)
    graph.graph_attr['dimen'] = str(dimension)
    graph = add_nodes(graph, nodes)
    graph = add_edges(graph, edges_not_full)
    file_name = dataset_name + "/graph" + str(dimension)
    graph.render(file_name, view=False)
    
    graph.format = "png"
    graph.render(file_name, view=False)
    
    return

In [9]:
def read_edges_dot(dataset_name, dimension):
    file_name = dataset_name + "/graph" + str(dimension) + ".dot"
    dot_graph = pydot.graph_from_dot_file(file_name)[0]
    dot_nodes = dot_graph.get_nodes()
    nodes_pos = {}
    for node in dot_nodes:
        name = node.get_name()
        if name != 'node' and name != 'graph':
            pos_str = node.get('pos').strip('"')        
            nodes_pos[name] = map(float, pos_str.split(','))
    return nodes_pos

In [10]:
def auc_sfdp(nodes_pos, pos_edges, neg_edges):
    X, Y = make_dataset(nodes_pos, pos_edges, neg_edges, [dist])
    clf = RF()
    scores = cross_val_score(clf, X, Y, cv=10, scoring='roc_auc')
    print("ROC AUC SFDP: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    return scores

In [11]:
def auc_nmf(edges, nodes_pos, pos_edges, neg_edges, n_components, max_id):
    G = make_sparse_matrix(edges, max_id + 1)
    model = NMF(n_components=n_components, init='random')
    A = model.fit_transform(G)

    X, Y = make_dataset(nodes_pos, pos_edges, neg_edges, [mf_score], A)
    clf = RF()
    scores = cross_val_score(clf, X, Y, cv=10, scoring='roc_auc')
    print("ROC AUC NMF: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    return scores

In [12]:
def auc_svd(edges, nodes_pos, pos_edges, neg_edges, n_components, max_id):
    G = make_sparse_matrix(edges, max_id + 1)
    model = TruncatedSVD(n_components=n_components, algorithm='arpack')
    A = model.fit_transform(G)

    X, Y = make_dataset(nodes_pos, pos_edges, neg_edges, [mf_score], A)
    clf = RF()
    scores = cross_val_score(clf, X, Y, cv=10, scoring='roc_auc')
    print("ROC AUC SVD: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    return scores

In [21]:
def pca_alg(data):
    X = np.array(data)
    pca = PCA()
    pca.fit(X)
    PCA(copy=True, iterated_power='auto', n_components='mle', random_state=None,
      svd_solver='auto', tol=0.0, whiten=False)
    print(pca.explained_variance_ratio_) 
    
    pca = PCA('mle')
    pca.fit(X)
    PCA(copy=True, iterated_power='auto', n_components='mle', random_state=None,
      svd_solver='full', tol=0.0, whiten=False)
    print(pca.explained_variance_ratio_) 

In [14]:
def compute_mean(dim, nodes_pos):
    values = []

    for i in range(dim):
        values.append([])

    for node in nodes_pos:
        for i in range(dim):
            values[i].append(nodes_pos[node][i])
        
    for i in range(dim):
        print np.mean(values[i])
        print np.std(values[i])
        print

In [15]:
def draw_graph(dataset_name, nodes_pos, edges):
    xs = []
    ys = []
    zs = []

    for node in nodes_pos:
        xs.append(nodes_pos[node][0])
        ys.append(nodes_pos[node][1])
        zs.append(nodes_pos[node][2])
    
#    print xs
#    print ys
#    print zs
    
#    print max(xs)
#    print min(xs)
    
#    print max(ys)
#    print min(ys)
    
#    print max(zs)
#    print min(zs)
    
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')

    ax.scatter(xs, ys, zs, s=2.0, c='b', marker='o')

    for edge in edges:
        u = edge[0]
        w = edge[1]
        ax.plot([nodes_pos[u][0], nodes_pos[w][0]], [nodes_pos[u][1], nodes_pos[w][1]],zs=[nodes_pos[u][2], nodes_pos[w][2]], linewidth=1.0)

    ax.set_xlabel('X Label')
    ax.set_ylabel('Y Label')
    ax.set_zlabel('Z Label')

    file_name = dataset_name + "/3d.png"
    #plt.show()
    fig.savefig(file_name)

In [16]:
def run_exp(dataset_name, dimension, n_components, division):
    print "Read train"
    edges, nodes, max_id = read_train(dataset_name)
    
    print "Get sets"
    pos_edges, neg_edges, edges_not_full = get_sets(nodes, edges, division)
    
    print "Render graph"
    print strftime("%Y-%m-%d %H:%M:%S", gmtime())
    render_graph(dataset_name, dimension, nodes, edges_not_full)
    
    print "Read edges"
    print strftime("%Y-%m-%d %H:%M:%S", gmtime())
    nodes_pos = read_edges_dot(dataset_name, dimension)
    
    print "Compute mean and std"
    compute_mean(dimension, nodes_pos)
    
    if dimension == 3:
        print "Draw graph"
        draw_graph(dataset_name, nodes_pos, edges)
    
    print "SFDP"
    print strftime("%Y-%m-%d %H:%M:%S", gmtime())
    auc_sfdp(nodes_pos, pos_edges, neg_edges)
    
    print "NMF"
    print strftime("%Y-%m-%d %H:%M:%S", gmtime())
    auc_nmf(edges, nodes_pos, pos_edges, neg_edges, n_components, max_id)
    
    print "SVD"
    print strftime("%Y-%m-%d %H:%M:%S", gmtime())
    auc_svd(edges, nodes_pos, pos_edges, neg_edges, n_components, max_id)
    
    return

In [22]:
def run_dim_exp(dataset_name, num_exps, dimensions, n_components_nmf, n_components_svd):
    dim_scores_sfdp = {}
    dim_scores_nmf = {}
    dim_scores_svd = {}
    for dimension in dimensions:
            dim_scores_sfdp[dimension] = np.array([])
            dim_scores_nmf[dimension] = np.array([])
            dim_scores_svd[dimension] = np.array([])
            
    edges, nodes, max_id = read_train(dataset_name)
    
    for i in range(num_exps):
        print "Launch " + str(i)
        print strftime("%Y-%m-%d %H:%M:%S", gmtime())
            
        pos_edges, neg_edges, edges_not_full = get_sets(nodes, edges)
    
        for dimension in dimensions:
            print "Dimension " + str(dimension)
            render_graph(dataset_name, dimension, nodes, edges_not_full)
            nodes_pos = read_edges_dot(dataset_name, dimension)
                        
            if dimension == 3:
                print "Draw graph"
                draw_graph(dataset_name, nodes_pos, edges)
            
            print "PCA" 
            data_for_pca = nodes_pos.values()
            pca_alg(data_for_pca)

            
            print "SFDP"
            sfdp_scores = auc_sfdp(nodes_pos, pos_edges, neg_edges)
            dim_scores_sfdp[dimension] = np.append(dim_scores_sfdp[dimension], sfdp_scores)
    
            print "NMF"
            nmf_scores = auc_nmf(edges, nodes_pos, pos_edges, neg_edges, n_components_nmf, max_id)
            dim_scores_nmf[dimension] = np.append(dim_scores_nmf[dimension], nmf_scores)
    
            print "SVD"
            svd_scores = auc_svd(edges, nodes_pos, pos_edges, neg_edges, n_components_svd, max_id)
            dim_scores_svd[dimension] = np.append(dim_scores_svd[dimension], svd_scores)
    
    return dim_scores_sfdp, dim_scores_nmf, dim_scores_svd

In [18]:
def save_scores(dataset_name, dimensions, dim_scores_sfdp, dim_scores_nmf, dim_scores_svd):
    for dim in dimensions:
        dim_scores_sfdp[dim] = dim_scores_sfdp[dim].mean()
        dim_scores_nmf[dim] = dim_scores_nmf[dim].mean()
        dim_scores_svd[dim] = dim_scores_svd[dim].mean()
    
    print dim_scores_sfdp        
        
    scores_sfdp = []
    scores_nmf = []
    scores_svd = []
    
    for dim in dimensions:
        scores_sfdp.append(dim_scores_sfdp[dim])
        scores_nmf.append(dim_scores_nmf[dim])
        scores_svd.append(dim_scores_svd[dim])
    
    fig, ax = plt.subplots( nrows=1, ncols=1 )
    ax.plot(dimensions, scores_sfdp, label='sfdp')
    ax.plot(dimensions, scores_nmf, label='nmf')
    ax.plot(dimensions, scores_svd, label='svd')
    ax.legend()
    file_name = dataset_name + "/dimensions.png"
    fig.savefig(file_name)

In [19]:
X = np.array([(1, 2, 0), [2, 1, 0], [8, 4, 1], [2, -1, 0], [4, 4, 0], [1, 7, 0]])
pca = PCA()
pca.fit(X)
PCA(copy=True, iterated_power='auto', n_components='mle', random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)
print(pca.explained_variance_ratio_) 

[ 0.581009    0.41722784  0.00176316]


In [23]:
#dataset_names = ["airport", "Ca-HelpTh", "chicago", "Conflict", "euroroad", "EuroSiS", "PowerGrid"]
#dimensions = [2, 3, 4, 5, 6, 7, 8 ,9, 10]

dataset_names = ["sphere"]
dimensions = [2, 3, 4, 5, 10]

for dataset_name in dataset_names:
    print dataset_name
    dim_scores_sfdp, dim_scores_nmf, dim_scores_svd = run_dim_exp(dataset_name, 1, dimensions, 10, 30)
    print "saving"
    save_scores(dataset_name, dimensions, dim_scores_sfdp, dim_scores_nmf, dim_scores_svd)

sphere
Launch 0
2017-04-21 08:29:32
Nodes size: 2798
Edges size: 36186
Dimension 2
PCA
[ 0.58244733  0.41755267]
[ 0.58244733]
SFDP
ROC AUC SFDP: 0.84 (+/- 0.03)
NMF
ROC AUC NMF: 0.79 (+/- 0.03)
SVD
ROC AUC SVD: 0.94 (+/- 0.02)
Dimension 3
Draw graph
PCA
[ 0.48366169  0.38688427  0.12945404]
[ 0.48366169  0.38688427]
SFDP
ROC AUC SFDP: 0.87 (+/- 0.03)
NMF
ROC AUC NMF: 0.87 (+/- 0.02)
SVD
ROC AUC SVD: 0.94 (+/- 0.02)
Dimension 4
PCA
[  9.91252871e-01   4.74421082e-03   3.88863684e-03   1.14280893e-04]
[ 0.99125287  0.00474421  0.00388864]
SFDP
ROC AUC SFDP: 0.87 (+/- 0.02)
NMF
ROC AUC NMF: 0.73 (+/- 0.04)
SVD
ROC AUC SVD: 0.94 (+/- 0.02)
Dimension 5
PCA
[  9.92589856e-01   3.93296666e-03   2.93359873e-03   4.17992265e-04
   1.25586185e-04]
[  9.92589856e-01   3.93296666e-03   2.93359873e-03   4.17992265e-04]
SFDP
ROC AUC SFDP: 0.87 (+/- 0.03)
NMF
ROC AUC NMF: 0.88 (+/- 0.03)
SVD
ROC AUC SVD: 0.94 (+/- 0.01)
Dimension 10
PCA
[  9.94538275e-01   1.94979037e-03   1.83334496e-03   8.6866333

In [None]:
#dataset_names = ["airport", "Ca-HelpTh", "chicago", "Conflict", "euroroad", "EuroSiS", "PowerGrid"]
#dimensions = [2, 3, 4, 10]
dataset_names = ["airport", "Ca-HelpTh", "PowerGrid"]
divisions = [5, 10, 15, 25, 35]
dimensions = [2, 3]
n_components = [30]

for dataset_name in dataset_names:
    for division in divisions:
        for dimension in dimensions:
            for component in n_components:
                print "Dataset: " + dataset_name
                print "Division: " + str(division) + "/%"
                print "Dim: " + str(dimension)
                print "Components: " + str(component)
                print
                run_exp(dataset_name, dimension, component, division)
                print "---------------------------"
                print
                print

Dataset: airport
Division: 5/%
Dim: 2
Components: 30

Read train
Get sets
Nodes size: 1574
Edges size: 28236
Render graph
2017-04-20 22:01:01
Read edges
2017-04-20 22:01:04
