In [1]:
from pyspark import SparkContext, SparkConf
conf = SparkConf().setAppName("pyspark")
# getOrCreate() reuses the context that is already running in this kernel, so
# re-executing this cell does not fail with "Cannot run multiple SparkContexts
# at once"; on a fresh kernel it creates a new context from `conf`.
sc = SparkContext.getOrCreate(conf=conf)

In [14]:
# Toy undirected edge list with two connected components
# ({0,1,2,3,4,5,6,7,8} and {10,11,12,13}) used to walk through the algorithm.
# To run on the real dataset instead, replace it with:
#   btc_raw = sc.textFile("graph_datasets/soc-sign-bitcoinalpha.csv")
btc_raw = sc.parallelize([
    (0, 1), (1, 2), (2, 5), (5, 8), (7, 8), (3, 7), (3, 4), (3, 6),
    (10, 11), (10, 12), (12, 13),
])

In [3]:
# Peek at the first 10 records of the input.
# NOTE(review): the output shown below consists of CSV strings, so it appears
# to predate the switch of btc_raw to the in-memory toy edge list - re-run to refresh.
btc_raw.take(10)

['7188,1,10,1407470400',
 '430,1,10,1376539200',
 '3134,1,10,1369713600',
 '3026,1,10,1350014400',
 '3010,1,10,1347854400',
 '804,1,10,1337572800',
 '160,1,10,1394683200',
 '95,1,9,1384578000',
 '377,1,7,1414728000',
 '888,1,7,1365652800']

In [15]:
# Build the adjacency "dictionary" RDD: (node, set_of_neighbours).
# Every edge is emitted in both directions so the graph is undirected.
# When btc_raw holds CSV lines instead of tuples, parse them first with
#   .map(lambda x: x.split(",")).map(lambda x: (int(x[0]), int(x[1])))
btc = (
    btc_raw
    .flatMap(lambda edge: [edge, (edge[1], edge[0])])
    .groupByKey()
    .map(lambda kv: (kv[0], set(kv[1])))
)

In [5]:
def Min_Selection_Step(G): #dictionary format RDD
    """Cracker min-selection step.

    For every node ``u`` the smallest id in its closed neighbourhood
    ``NN(u) | {u}`` is computed, and that minimum is announced to every member
    of the closed neighbourhood.

    Parameters
    ----------
    G : RDD of (node, set_of_neighbours)
        Adjacency in "dictionary" form; each node is expected to appear as a
        key exactly once (as produced by ``groupByKey``).

    Returns
    -------
    RDD of (node, set_of_minima)
        For each node, the set of minima announced to it by itself and by its
        neighbours.
    """
    def announce_minimum(entry):
        node, neighbours = entry
        closed = neighbours | {node}
        v_min = min(closed)
        # One (receiver, minimum) pair per member of the closed neighbourhood.
        return [(member, v_min) for member in closed]

    # Both the minimum and the closed neighbourhood derive from the same record,
    # so a single flatMap replaces the former cogroup of two RDDs built from G
    # (one shuffle instead of two).
    return G.flatMap(announce_minimum).groupByKey().map(lambda x: (x[0], set(x[1])))

def Pruning_Step(H, T):
    """Cracker pruning step.

    Parameters
    ----------
    H : RDD of (node, set_of_minima)
        Output of ``Min_Selection_Step``; each node is expected to appear as a
        key exactly once.
    T : RDD of (parent, child)
        Tree edges accumulated so far.

    Returns
    -------
    list ``[G, T]``
        ``G`` is the next, smaller graph as an RDD of (node, set_of_neighbours):
        for every node holding more than one minimum, the smallest minimum is
        linked (in both directions) to each of the others.
        ``T`` is the input tree extended with one edge ``(min, node)`` for every
        node that is deactivated in this round.
    """
    def pruned_edges(entry):
        _, minima = entry
        v_min = min(minima)
        edges = []
        for other in minima - {v_min}:
            # Emit both directions so the resulting graph stays undirected.
            edges.append((v_min, other))
            edges.append((other, v_min))
        return edges

    # The minimum and the remaining candidates come from the same record, so no
    # cogroup is needed to bring them together.
    G = (
        H.filter(lambda x: len(x[1]) > 1)
        .flatMap(pruned_edges)
        .groupByKey()
        .map(lambda x: (x[0], set(x[1])))
    )

    # deactivation: a node that is not among its own minima leaves the graph
    # and is attached in the tree below its smallest minimum. This used to be a
    # join of H with itself; a single map over H gives the same pairs.
    addEdge3 = H.filter(lambda x: x[0] not in x[1]).map(lambda x: (min(x[1]), x[0]))
    T = T.union(addEdge3)

    return [G, T]

def findSeeds(T):
    """Return the roots of the forest ``T``.

    ``T`` is an RDD of (parent, child) pairs. A seed is a node that occurs as a
    parent but never as a child. The result is an RDD of node ids.
    """
    parents = T.keys().distinct().map(lambda node: (node, 1))
    children = T.values().distinct().map(lambda node: (node, 1))
    # After the left outer join, a parent that never shows up as a child
    # carries None on the right-hand side.
    matched = parents.leftOuterJoin(children)
    roots = matched.filter(lambda row: not row[1][1])
    return roots.map(lambda row: row[0])

In [8]:
def Cracker(G):
    """Run the Cracker iterations until the graph is empty.

    Alternates ``Min_Selection_Step`` and ``Pruning_Step`` and prints the
    iteration number at the start of each round. Uses the notebook-level
    SparkContext ``sc``.

    Parameters
    ----------
    G : RDD of (node, set_of_neighbours)
        Undirected graph in "dictionary" form.

    Returns
    -------
    RDD of (parent, child)
        Tree edges collected from the nodes deactivated in each round.
    """
    n = 0
    T = sc.parallelize([])
    while not G.isEmpty():
        n += 1
        print(n)
        # H feeds two branches inside Pruning_Step (the new graph and the new
        # tree edges); caching it avoids computing it twice.
        H = Min_Selection_Step(G).cache()
        G, T = Pruning_Step(H, T)
        # G is evaluated by the emptiness check and again by the next round.
        # Without caching, every round would recompute all previous rounds.
        G.cache()

    return T

In [70]:
# Run Cracker on the toy graph; the numbers printed below are the iteration
# counter that Cracker prints at the start of each round.
T = Cracker(btc)

1
2
3


In [109]:
def Seed_Propragation(T, seed):
    """Label every edge of the forest ``T`` with the seed (root) of its tree.

    Parameters
    ----------
    T : RDD of (parent, child)
        Tree edges, e.g. the result of ``Cracker``.
    seed : RDD of node ids
        Roots of the trees, e.g. the result of ``findSeeds``.

    Returns
    -------
    RDD of (parent, (seed_label, child))
        One row per edge of ``T``. ``seed_label`` is ``None`` only for edges
        whose parent cannot be reached from any seed.
    """
    seed_labels = seed.map(lambda x: (x, x))
    labels = seed_labels
    previous_unresolved = None

    while True:
        T_seed = labels.rightOuterJoin(T)
        # Compare with `is None`: node id 0 is a valid label.
        unresolved = T_seed.filter(lambda x: x[1][0] is None).count()
        # Stop when every edge is labelled, or when a round made no progress
        # (parents unreachable from any seed would otherwise loop forever).
        if unresolved == 0 or unresolved == previous_unresolved:
            return T_seed
        previous_unresolved = unresolved

        # Only resolved labels are handed down to the children. Feeding
        # (child, None) rows back into the label set kept a None row alive for
        # every node deeper than one level, so the loop never finished on trees
        # of depth greater than 2 and the label set grew with duplicates.
        resolved = (
            T_seed.filter(lambda x: x[1][0] is not None)
            .map(lambda x: (x[1][1], x[1][0]))
        )
        # T_seed covers all of T, so `resolved` already holds every label found
        # so far; rebuilding from the seeds keeps the label set duplicate-free.
        labels = seed_labels.union(resolved)

In [110]:
# Seeds are the nodes that occur in T as a key but never as a value.
seed = findSeeds(T)
seed.collect()

[0, 10]

In [111]:
# Show all edges accumulated in T by Cracker.
T.collect()

[(3, 4),
 (3, 6),
 (3, 7),
 (2, 8),
 (10, 11),
 (10, 13),
 (0, 3),
 (0, 5),
 (10, 12),
 (0, 1),
 (0, 2)]

In [112]:
# Each row is (key of the T edge, (seed label, value of the T edge)).
Seed_Propragation(T, seed).collect()

[(0, (0, 3)),
 (0, (0, 5)),
 (0, (0, 1)),
 (0, (0, 2)),
 (2, (0, 8)),
 (3, (0, 4)),
 (3, (0, 6)),
 (3, (0, 7)),
 (10, (10, 11)),
 (10, (10, 13)),
 (10, (10, 12))]

## Big Example

In [118]:
# Same preprocessing as for the toy graph, but starting from CSV lines:
# the first two comma-separated fields of each line are the edge endpoints.
btc_big_raw = sc.textFile("graph_datasets/soc-sign-bitcoinalpha.csv")
btc_big = (
    btc_big_raw
    .map(lambda line: line.split(","))
    .map(lambda fields: (int(fields[0]), int(fields[1])))
    .flatMap(lambda edge: [edge, (edge[1], edge[0])])
    .groupByKey()
    .map(lambda kv: (kv[0], set(kv[1])))
)
btc_big.take(1)

[(7188, {1})]

In [119]:
# Run Cracker on the CSV graph; the printed numbers are the iteration counter.
T_big = Cracker(btc_big)

1
2
3


In [121]:
# Nodes that occur in T_big as a key but never as a value.
seed_big = findSeeds(T_big)
seed_big.collect()

[1, 1389, 3228, 1870, 5837]

In [126]:
# Sample 10 labelled edges without replacement (fixed sampling seed 123)
# instead of collecting the whole result.
Seed_Propragation(T_big, seed_big).takeSample(False, 10, 123)

[(2, (1, 2842)),
 (45, (1, 776)),
 (1, (1, 285)),
 (13, (1, 1591)),
 (1, (1, 2942)),
 (1, (1, 137)),
 (1, (1, 7558)),
 (1, (1, 1375)),
 (2, (1, 1416)),
 (1, (1, 2434))]

## Explanation

In [99]:
# Hard-coded seeds of the toy graph, turned into (node, label) pairs where
# every seed is labelled with itself.
seed = sc.parallelize([0,10]).map(lambda x: (x, x))
seed.collect()

[(0, 0), (10, 10)]

In [100]:
# The edges of T that the labels are joined against in the cells below.
T.collect()

[(3, 4),
 (3, 6),
 (3, 7),
 (2, 8),
 (10, 11),
 (10, 13),
 (0, 3),
 (0, 5),
 (10, 12),
 (0, 1),
 (0, 2)]

In [101]:
# First round: the right outer join keeps every edge of T; edges whose key is
# not a seed get None in place of a label.
initialTree = seed.rightOuterJoin(T)
initialTree.collect()

[(0, (0, 3)),
 (0, (0, 5)),
 (0, (0, 1)),
 (0, (0, 2)),
 (2, (None, 8)),
 (3, (None, 4)),
 (3, (None, 6)),
 (3, (None, 7)),
 (10, (10, 11)),
 (10, (10, 13)),
 (10, (10, 12))]

In [102]:
# Re-key each row by the edge's value, (value, label), and append the original
# seeds; values whose edge is still unlabelled carry None.
seed2 = initialTree.map(lambda x: (x[1][1], x[1][0])).union(seed)
seed2.collect()

[(3, 0),
 (5, 0),
 (1, 0),
 (2, 0),
 (8, None),
 (4, None),
 (6, None),
 (7, None),
 (11, 10),
 (13, 10),
 (12, 10),
 (0, 0),
 (10, 10)]

In [103]:
# Second round: join the extended label set against T again.
secondTree = seed2.rightOuterJoin(T)
secondTree.collect()

[(0, (0, 3)),
 (0, (0, 5)),
 (0, (0, 1)),
 (0, (0, 2)),
 (2, (0, 8)),
 (3, (0, 4)),
 (3, (0, 6)),
 (3, (0, 7)),
 (10, (10, 11)),
 (10, (10, 13)),
 (10, (10, 12))]

Stop condition: propagation is finished once no edge is left with `None` as its seed label.

In [104]:
# Re-key rows as (label, value) and look up the key None: a non-empty result
# lists the edge values that are still unlabelled.
initialTree.map(lambda x: (x[1])).lookup(None)

[8, 4, 6, 7]

In [105]:
# Same lookup on the second round; an empty list means no row has a None label.
secondTree.map(lambda x: (x[1])).lookup(None)

[]