In [8]:
#PageRank function
import networkx as nx
def pagerank_scipy(G, alpha=0.85, personalization=None,
                   max_iter=100, tol=1.0e-6, weight='weight',
                   dangling=None):

    import scipy.sparse

    N = len(G)
    if N == 0:
        return {}

    nodelist = G.nodes()
    M = nx.to_scipy_sparse_matrix(G, nodelist=nodelist, weight=weight,
                                  dtype=float)
    S = scipy.array(M.sum(axis=1)).flatten()
    S[S != 0] = 1.0 / S[S != 0]
    Q = scipy.sparse.spdiags(S.T, 0, *M.shape, format='csr')
    M = Q * M

    # initial vector
    x = scipy.repeat(1.0 / N, N)

    # Personalization vector
    if personalization is None:
        p = scipy.repeat(1.0 / N, N)
    else:
        missing = set(nodelist) - set(personalization)
        if missing:
            raise NetworkXError('Personalization vector dictionary '
                                'must have a value for every node. '
                                'Missing nodes %s' % missing)
        p = scipy.array([personalization[n] for n in nodelist],
                        dtype=float)
        p = p / p.sum()

    # Dangling nodes
    if dangling is None:
        dangling_weights = p
    else:
        missing = set(nodelist) - set(dangling)
        if missing:
            raise NetworkXError('Dangling node dictionary '
                                'must have a value for every node. '
                                'Missing nodes %s' % missing)
        # Convert the dangling dictionary into an array in nodelist order
        dangling_weights = scipy.array([dangling[n] for n in nodelist],
                                       dtype=float)
        dangling_weights /= dangling_weights.sum()
    is_dangling = scipy.where(S == 0)[0]

    # power iteration: make up to max_iter iterations
    for _ in range(max_iter):
        xlast = x
        x = alpha * (x * M + sum(x[is_dangling]) * dangling_weights) + \
            (1 - alpha) * p
        # check convergence, l1 norm
        err = scipy.absolute(x - xlast).sum()
        if err < N * tol:
            return dict(zip(nodelist, map(float, x)))
    raise NetworkXError('pagerank_scipy: power iteration failed to converge '
                        'in %d iterations.' % max_iter)


In [2]:
#Read the original true graph from file
import networkx as nx
import pickle
DG = pickle.load( open( '246_source/retweet_network_graph.p', "rb" ) )
true_nodes = set(DG.nodes())

In [4]:
#Add Sybils into the graph

d = 3
a = 3

import random
import copy
node_number = len(DG.nodes())
DG2 = copy.deepcopy(DG)
TrustSet = DG2.nodes()
TrustSet_number = len(TrustSet)
start = max(DG.nodes()) + 1
Sybil_number = int(node_number * 0.3)
new_node = []
for i in range(Sybil_number):
    DG2.add_node(start+i)
    new_node.append(start+i)
node_number2 = len(DG2.nodes())
print node_number
print node_number2

for node in new_node:
    for i in range(d):
        to = random.randint(start, start+Sybil_number)
        DG2.add_edge(node, to)
        DG2.add_edge(to, node)

for i in range(a):
    from_one = TrustSet[random.randint(0, TrustSet_number-1)]
    to_other = new_node[random.randint(0, Sybil_number-1)]
    DG2.add_edge(from_one, to_other)
    DG2.add_edge(to_other, from_one)

256491
333438


In [5]:

#After adding Sybils, run pagerank
##set up personalized vector
l = []
for n in DG2:
    numfollower = len(DG2.in_edges(n))
    #print G.in_edges(n), numfollower
    l.append([n, numfollower])

l = sorted(l, key=lambda s: s[1], reverse=True)

#Set the top 100 nodes as trust
t = []
for i in range(100):
    t.append(l[i][0])

Personalized_vector= DG2.in_degree()
TrustSet = set(t)
NumberofTrust = len(TrustSet)

In [6]:
for key in Personalized_vector:
    if key in TrustSet:
        Personalized_vector[key] = 0
    else:
        Personalized_vector[key] = 1.0 / NumberofTrust

In [9]:
#run trust_rank
result = pagerank_scipy(DG2,personalization=Personalized_vector)



In [12]:
#After getting the pagerank value we will go to find the ROC
### Construct y and result_prob
nodes = DG2.nodes()
y = []
result_prob = []
for i in nodes:
    if i in true_nodes:
        y.append(1)
    else:
        y.append(0)
    result_prob.append(result[i])

In [15]:

#Use package to draw and compute the ROC
from sklearn.metrics import roc_curve, auc
import numpy as np
import pylab as pl


# Compute ROC curve and area the curve
#### Here probas_[:,1] is a numpy array of probabilities
fpr, tpr, thresholds = roc_curve(y, result_prob)
roc_auc = auc(fpr, tpr)
print "Area under the ROC curve : %f" % roc_auc

# Plot ROC curve
pl.clf()
pl.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
pl.plot([0, 1], [0, 1], 'k--')
pl.xlim([0.0, 1.0])
pl.ylim([0.0, 1.0])
pl.xlabel('False Positive Rate')
pl.ylabel('True Positive Rate')
pl.title('Receiver operating characteristic example')
pl.legend(loc="lower right")
pl.show()

Area under the ROC curve : 0.036080
