In [1]:
#PageRank function
import networkx as nx
def pagerank_scipy(G, alpha=0.85, personalization=None,
                   max_iter=100, tol=1.0e-6, weight='weight',
                   dangling=None):

    import scipy.sparse

    N = len(G)
    if N == 0:
        return {}

    nodelist = G.nodes()
    M = nx.to_scipy_sparse_matrix(G, nodelist=nodelist, weight=weight,
                                  dtype=float)
    S = scipy.array(M.sum(axis=1)).flatten()
    S[S != 0] = 1.0 / S[S != 0]
    Q = scipy.sparse.spdiags(S.T, 0, *M.shape, format='csr')
    M = Q * M

    # initial vector
    x = scipy.repeat(1.0 / N, N)

    # Personalization vector
    if personalization is None:
        p = scipy.repeat(1.0 / N, N)
    else:
        missing = set(nodelist) - set(personalization)
        if missing:
            raise NetworkXError('Personalization vector dictionary '
                                'must have a value for every node. '
                                'Missing nodes %s' % missing)
        p = scipy.array([personalization[n] for n in nodelist],
                        dtype=float)
        p = p / p.sum()

    # Dangling nodes
    if dangling is None:
        dangling_weights = p
    else:
        missing = set(nodelist) - set(dangling)
        if missing:
            raise NetworkXError('Dangling node dictionary '
                                'must have a value for every node. '
                                'Missing nodes %s' % missing)
        # Convert the dangling dictionary into an array in nodelist order
        dangling_weights = scipy.array([dangling[n] for n in nodelist],
                                       dtype=float)
        dangling_weights /= dangling_weights.sum()
    is_dangling = scipy.where(S == 0)[0]

    # power iteration: make up to max_iter iterations
    for _ in range(max_iter):
        xlast = x
        x = alpha * (x * M + sum(x[is_dangling]) * dangling_weights) + \
            (1 - alpha) * p
        # check convergence, l1 norm
        err = scipy.absolute(x - xlast).sum()
        if err < N * tol:
            return dict(zip(nodelist, map(float, x)))
    raise NetworkXError('pagerank_scipy: power iteration failed to converge '
                        'in %d iterations.' % max_iter)


In [2]:
import random
import copy

def add_sybils(DG,d,a):
    
    true_nodes = DG.nodes()
    node_number = len(true_nodes)
    DG2 = copy.deepcopy(DG)
    true_nodes_number = len(true_nodes)
    start = max(DG.nodes()) + 1
    Sybil_number = int(node_number * 0.3)
    new_node = []
    
    for i in range(Sybil_number):
        DG2.add_node(start+i)
        new_node.append(start+i)
    node_number2 = len(DG2.nodes())
    
    print node_number
    print node_number2

    for node in new_node:
        for i in range(d):
            to = random.randint(start, start+Sybil_number)
            DG2.add_edge(node, to)
            DG2.add_edge(to, node)

    for i in range(a):
        from_one = true_nodes[random.randint(0, true_nodes_number-1)]
        to_other = new_node[random.randint(0, Sybil_number-1)]
        DG2.add_edge(from_one, to_other)
        DG2.add_edge(to_other, from_one)
        
    return DG2

In [3]:
def get_personalized_vector(DG2,TrustSet):
    NumberofTrust = len(TrustSet)
    Personalized_vector= DG2.in_degree()
    for key in Personalized_vector:
        if key in TrustSet:
            Personalized_vector[key] = 0
        else:
            Personalized_vector[key] = 1.0 / NumberofTrust
    return Personalized_vector

In [4]:
def initialize_trust_seeds(DG,numberofseed):
    l = []
    for n in DG:
        numfollower = len(DG.in_edges(n))
        #print G.in_edges(n), numfollower
        l.append([n, numfollower])

    l = sorted(l, key=lambda s: s[1], reverse=True)

    #Set the top 100 nodes as trust
    t = []
    for i in range(numberofseed):
        t.append(l[i][0])

    Personalized_vector= DG.in_degree()
    TrustSet = set(t)
    return TrustSet

In [5]:
def get_y_with_pagerank(DG2,true_nodes):
    #After getting the pagerank value we will go to find the ROC
    ### Construct y and result_prob
    nodes = DG2.nodes()
    y = []
    result_prob = []
    index = []
    for i in nodes:
        if i in true_nodes:
            y.append(1)
        else:
            y.append(0)
        result_prob.append(result[i])
        index.append(i)
    return y,result_prob,index

In [6]:
def compute_TPR_FPR(true_nodes_set,threshold,result_prob):
    TN = 0
    TP = 0
    FN = 0
    FP = 0
    
    for i in range(len(result_prob)):
        if result_prob[i] < threshold:
            if index[i] in true_nodes_set:
                FP += 1
            else:
                TP += 1
        else:
            if index[i] in true_nodes_set:
                TN += 1
            else:
                FN += 1
    if TP + FN == 0:  
        TPR = -1
    else:
        TPR = TP * 1.0 / (TP + FN)
    
    if FP + TN == 0:
        FPR = -1
    else:
        FPR = FP * 1.0 / (FP + TN)
    #print TPR,FPR
    return TPR,FPR
    
#True Positive Rate: Recall: (TP) / (TP + FN)
#False Positive Rate: FP / (FP + TN)

In [7]:
#Filter the FPR and TPR to ensure no negative slew
def filter_FPR_TPR(TPR_array,FPR_array):
    TPR_array2 = [0.0]
    FPR_array2 = [0.0]
    con = 0
    for i in range(1,len(TPR_array)):
        if TPR_array[i] > FPR_array[i] and TPR_array[i] >= TPR_array2[con] and FPR_array[i] >= FPR_array2[con]:
            TPR_array2.append(TPR_array[i])
            FPR_array2.append(FPR_array[i])
            con += 1
    return TPR_array2,FPR_array2

In [8]:
#Read the original true graph from file
import networkx as nx
import pickle
DG = pickle.load( open( '246_source/retweet_network_graph.p', "rb" ) )

true_nodes = set(DG.nodes())

numberofseed = 100
TrustSet = initialize_trust_seeds(DG,numberofseed)

In [9]:
#run trust_rank
d = 7
a = 10000
DG2 = add_sybils(DG,d,a)
Personalized_vector = get_personalized_vector(DG2,TrustSet)
result = pagerank_scipy(DG2,personalization=Personalized_vector)
y,result_prob,index = get_y_with_pagerank(DG2,true_nodes)

256491
333438




In [14]:
len(DG.edges())

328132

In [10]:
n = len(DG2.nodes())
max_abs_prob = max(result_prob) * n * 1.0
for i in xrange(len(result_prob)):
    result_prob[i] = result_prob[i] * n / max_abs_prob

threshold2 = []
sort_result_prob = sorted(result_prob)
for i in range(150):
    tmp_n = int((0.25 + 0.005 * i) * n)
    threshold2.append(sort_result_prob[tmp_n])
#threshold2 = threshold2[::-1]

In [29]:
TPR,FPR = compute_TPR_FPR(true_nodes,0.0002,result_prob)
print TPR, FPR

0.178601133233 0.95977246765


In [11]:
threshold = 0.01

TPR_array = []
FPR_array = []
for i in range(len(threshold2)):
    TPR,FPR = compute_TPR_FPR(true_nodes,threshold2[i],result_prob)
    if TPR == -1 or FPR == -1:
        continue
    TPR_array.append(TPR)
    FPR_array.append(FPR)

#TPR_array = TPR_array[::-1]
TPR_array.insert(0,0.0)
TPR_array.append(1.0)
FPR_array.insert(0,0.0)
FPR_array.append(1.0)

In [15]:
DG.order(),DG2.order()


(256491, 333439)

In [12]:
for i in range(len(TPR_array)):
    print TPR_array[i],FPR_array[i]

0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.0
0.0 0.812488547356
0.0 0.818999497058
0.0 0.825498750443
0.0 0.831994105056
0.0 0.838501155986
0.0 0.844181667193
0.0 0.850029825608
0.0 0.857998916141
0.0 0.858926823943
0.0 0.858926823943
0.0 0.877239357326
0.0 0.881395448573
0.0 0.881395448573
0.0 0.881395448573
0.0 0.881395448573
0.0 0.881395448573
0.0 0.881395448573
0.0 0.881395448573
0.0 0.881395448573
0.0 0.935724840248
0.0002859073

In [13]:

#Use package to draw and compute the ROC
from sklearn.metrics import roc_curve, auc
import numpy as np
import pylab as pl


# Compute ROC curve and area the curve
#### Here probas_[:,1] is a numpy array of probabilities
roc_auc = auc(FPR_array, TPR_array)
fpr, tpr, thresholds = roc_curve(y, result_prob)
roc_auc = auc(fpr, tpr)
print "Area under the ROC curve : %f" % roc_auc

# Plot ROC curve
pl.clf()
pl.plot(tpr, fpr, label='ROC curve (area = %0.2f)' % roc_auc)
pl.plot([0, 1], [0, 1], 'k--')
pl.xlim([0.0, 1.0])
pl.ylim([0.0, 1.0])
pl.xlabel('False Positive Rate')
pl.ylabel('True Positive Rate')
pl.title('Receiver operating characteristic example')
pl.legend(loc="lower right")
pl.show()

Area under the ROC curve : 0.031750


In [19]:
TPR_array2 = [0.0]
FPR_array2 = [0.0]

for i in range(1,len(TPR_array)):
    if TPR_array[i] >= TPR_array[i-1] and FPR_array[i] >= FPR_array[i-1]:
        TPR_array2.append(TPR_array[i])
        FPR_array2.append(FPR_array[i])

In [28]:
TPR_array2

[0.0,
 0.31979668508287296,
 0.28696659407867303,
 0.28696659407867303,
 0.28696659407867303,
 0.28696659407867303,
 0.28696659407867303,
 0.28696659407867303,
 0.28696659407867303,
 0.15555058981264774,
 0.15587968258085816,
 0.13583085497213407,
 0.1363085286948532,
 0.13652188653166603,
 0.1379025378545532,
 0.14065460692231144,
 0.1434324572893732,
 0.1439384202333519,
 0.14582693292370713,
 0.14839251439539347,
 0.1487041070808473,
 0.15215523530079272,
 0.15538153948683772,
 0.1591033726350425,
 0.1631648584281556,
 0.16703624268421244,
 0.1708121684691946,
 0.1747929726801331,
 0.17976247600767753,
 0.1836852882169228,
 0.1897947817145795,
 0.19589025880262134,
 0.20183168773645843,
 0.20928502879078695,
 0.20893965472278725,
 0.21633632323672947,
 0.22492288358277163,
 0.2348848368522073,
 0.24560406604160748,
 0.2584471842719094,
 0.27117806865892813,
 0.28336332283700705,
 0.31763849021035945,
 0.3377963226191022,
 0.35757484880291895,
 0.3865866957470011,
 0.4194457773512476

In [29]:
FPR_array

[0.7680689391176444,
 0.7669007343140344,
 0.7658272362736233,
 0.7672316972032059,
 0.7699159958536218,
 0.7733176681558891,
 0.7762798040824445,
 0.7800288034089241,
 0.7837887913979035,
 0.7876388638968075,
 0.7915004490652144,
 0.7955058832537839,
 0.7994223919391586,
 0.8032202723010146,
 0.8073831748429957,
 0.8114798720836607,
 0.8154974172063875,
 0.8197469589261409,
 0.824195889475812,
 0.8286009430347057,
 0.8330881589132351,
 0.8375657096643753,
 0.842036239422826,
 0.846673437255049,
 0.849221446467804,
 0.854014774674154,
 0.858711480320639,
 0.863556948914935,
 0.86853274404574,
 0.8732530528573899,
 0.8782701283029583,
 0.883215275199223,
 0.8882268219710439,
 0.8933637815532928,
 0.8986502451951565,
 0.9039788749135942,
 0.9093033353940357,
 0.9147860858618583,
 0.9203071329046487,
 0.924438896199077,
 0.929980685367657,
 0.9354481124912686,
 0.9408469219686866,
 0.9465207126983028,
 0.9525182361705009,
 0.9582197329677349,
 0.9636584600909519,
 0.9692517747708287,
 0.9

In [54]:
TPR_array.append(1.0)
FPR_array.append(1.0)
TPR_array,FPR_array = filter_FPR_TPR(TPR_array,FPR_array)

In [55]:
from sklearn.metrics import roc_curve, auc
import numpy as np
import pylab as pl
roc_auc = auc(FPR_array, TPR_array)

In [56]:
roc_auc,TPR_array,FPR_array2

(0.33402437969624155,
 [0.0, 0.8686459739091318, 1.0],
 [0.0, 0.7672001713743611, 0.7680689391176444])

In [1]:
#result_prob

In [16]:
import numpy as np
import pylab as pl
from sklearn import svm, datasets
from sklearn.utils import shuffle
from sklearn.metrics import roc_curve, auc

random_state = np.random.RandomState(0)

# Import some data to play with
iris = datasets.load_iris()
X = iris.data
y = iris.target

# Make it a binary classification problem by removing the third class
X, y = X[y != 2], y[y != 2]
n_samples, n_features = X.shape

# Add noisy features to make the problem harder
X = np.c_[X, random_state.randn(n_samples, 200 * n_features)]

# shuffle and split training and test sets
X, y = shuffle(X, y, random_state=random_state)
half = int(n_samples / 2)
X_train, X_test = X[:half], X[half:]
y_train, y_test = y[:half], y[half:]

# Run classifier
classifier = svm.SVC(kernel='linear', probability=True)
probas_ = classifier.fit(X_train, y_train).predict_proba(X_test)

# Compute ROC curve and area the curve
fpr, tpr, thresholds = roc_curve(y_test, probas_[:, 1])
roc_auc = auc(fpr, tpr)
print "Area under the ROC curve : %f" % roc_auc

# Plot ROC curve
pl.clf()
pl.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
pl.plot([0, 1], [0, 1], 'k--')
pl.xlim([0.0, 1.0])
pl.ylim([0.0, 1.0])
pl.xlabel('False Positive Rate')
pl.ylabel('True Positive Rate')
pl.title('Receiver operating characteristic example')
pl.legend(loc="lower right")
pl.show()

Area under the ROC curve : 0.793881
