In [1]:
from pyspark import SparkContext, SparkConf
# Reuse the SparkContext that is already running when this cell is executed again;
# constructing a second SparkContext in the same kernel raises an error.
conf = SparkConf().setAppName("pyspark")
sc = SparkContext.getOrCreate(conf=conf)

In [2]:
# Toy undirected graph, given as an edge list.
btc_raw = sc.parallelize([
    (0,1), (1,2), (2,5), (5,8), (7,8), (3,7),
    (3,4), (3,6), (10,11), (10,12), (12,13),
])
# Emit every edge in both directions, then gather each node's neighbours into a set.
btc = (
    btc_raw
    .flatMap(lambda edge: [edge, (edge[1], edge[0])])
    .groupByKey()
    .map(lambda kv: (kv[0], set(kv[1])))
)

In [3]:
def Min_Selection_Step(G):
    """Min-selection step: send each node's closed-neighbourhood minimum to that neighbourhood.

    Parameters
    ----------
    G : RDD of ``(node, set_of_neighbours)`` pairs ("dictionary format"), one
        record per node.

    Returns
    -------
    RDD of ``(node, set)`` pairs. For every record ``(u, N)`` of ``G`` the
    minimum of ``N | {u}`` is added to the set of every member of ``N | {u}``.

    Notes
    -----
    The minimum is computed from the record itself, so nothing is collected
    to the driver and no broadcast variable is created.
    """
    def _announce_minimum(entry):
        node, neighbours = entry
        closed = neighbours | {node}  # neighbourhood including the node itself
        smallest = min(closed)
        return [(member, smallest) for member in closed]

    return G.flatMap(_announce_minimum).groupByKey().mapValues(set)

def Pruning_Step(H, T, Seeds):
    """Pruning step: build the next graph, extend the tree and detect new seeds.

    Parameters
    ----------
    H : RDD of ``(node, set)`` pairs as returned by ``Min_Selection_Step``.
    T : RDD of tree edges accumulated so far.
    Seeds : RDD of seed records accumulated so far.

    Returns
    -------
    list ``[G, T, Seeds]`` where

    * ``G`` is the next graph as ``(node, set_of_neighbours)`` pairs: for every
      node whose set in ``H`` has more than one element, the minimum of that set
      is linked (in both directions) to every other element of the set;
    * ``T`` is the input ``T`` plus one edge ``(min(H[u]), u)`` for every node
      ``u`` that does not appear in its own set in ``H``;
    * ``Seeds`` is the input ``Seeds`` plus ``(node, set)`` records for nodes
      whose combined ``H`` / ``G`` neighbourhood has at most one element and
      contains the node itself.

    Notes
    -----
    The minimum of a node's set is computed from the record itself, so nothing
    is collected to the driver and no broadcast variable is created.
    """
    def _min_to_others(entry):
        _, members = entry
        smallest = min(members)
        return [(smallest, other) for other in members - {smallest}]

    #---------------G construction-------------------
    G = (
        H.filter(lambda kv: len(kv[1]) > 1)
        .flatMap(_min_to_others)
        .flatMap(lambda edge: [edge, (edge[1], edge[0])])
        .groupByKey()
        .mapValues(set)
    )

    #---------------Tree construction--------------
    # A node missing from its own set is deactivated: it does not appear in G_{t+1}
    # and is attached in the tree below the minimum of its set.
    addEdge3 = (
        H.filter(lambda kv: kv[0] not in kv[1])
        .map(lambda kv: (min(kv[1]), kv[0]))
    )
    T = T.union(addEdge3)

    #--------------Find Seed-----------------
    # Elements in H together with their neighbourhood from G_{t+1}
    NN_G_H = H.cogroup(G).mapValues(
        lambda groups: set_join((list(groups[0]), list(groups[1])))
    )
    seed = NN_G_H.filter(lambda kv: len(kv[1]) <= 1 and kv[0] in kv[1])
    Seeds = Seeds.union(seed)

    return [G, T, Seeds]


def set_join(value):
    """Merge the two sides of a cogroup result into a single set.

    Parameters
    ----------
    value : pair ``(left, right)`` where each side is a list of sets (either
        list may be empty).

    Returns
    -------
    A new set holding the union of every set found in ``left`` and ``right``.
    An empty side contributes nothing, so a key missing from either side no
    longer raises ``IndexError``.
    """
    return set().union(*value[0], *value[1])


def Cracker(G):
    """Alternate min-selection and pruning until the graph is empty.

    Parameters
    ----------
    G : RDD of ``(node, set_of_neighbours)`` pairs.

    Returns
    -------
    list ``[T, seeds]`` where ``T`` is the RDD of tree edges accumulated by
    ``Pruning_Step`` and ``seeds`` is the RDD of keys of the accumulated seed
    records.
    """
    # Use the context the input RDD belongs to instead of a notebook-level global.
    spark_context = G.context
    T = spark_context.parallelize([])
    Seeds = spark_context.parallelize([])
    while not G.isEmpty():
        H = Min_Selection_Step(G)
        G, T, Seeds = Pruning_Step(H, T, Seeds)

    return [T, Seeds.keys()]

# Run the algorithm on the toy graph: T receives the tree edges, Seeds the seed keys.
T, Seeds = Cracker(btc)
Seeds.collect()

[10, 0]

In [4]:
# Turn every seed into a (seed, None) pair so it can be joined against the edges of T.
seed = Seeds.map(lambda node: (node, None))
seed.collect()

[(10, None), (0, None)]

In [5]:
# Group the edges of T by key to show, for each key, the set of nodes attached to it.
T.groupByKey().mapValues(set).collect()

[(0, {1, 2, 3, 5}), (2, {8}), (3, {4, 6, 7}), (10, {11, 12, 13})]

## Output in (node, seed) format

In [6]:
# Edges of T whose key is not a seed; their targets cannot be labelled directly from `seed`.
needProp = T.subtractByKey(seed)
needProp.collect()

[(2, 8), (3, 4), (3, 6), (3, 7)]

In [7]:
# Edges whose key is a seed: re-emit them as (target, seed) pairs.
noProp = T.join(seed).map(lambda joined: (joined[1][0], joined[0]))
noProp.collect()

[(5, 0), (3, 0), (1, 0), (2, 0), (13, 10), (11, 10), (12, 10)]

In [8]:
# One propagation hop: look up each pending edge's key in noProp and keep the
# resulting (target, seed) pair (seed is None when the key has no entry in noProp).
prop = needProp.leftOuterJoin(noProp).values()
prop.collect()

[(8, 0), (4, 0), (6, 0), (7, 0)]

## 4-layer tree

In [9]:
# Hand-built input: two chains, 0-1-2-3-4 and 9-10-11, with 0 and 9 as the seeds.
T = sc.parallelize([(0,1), (1,2), (2,3), (3,4), (9,10), (10,11)])
Seeds = sc.parallelize([0,9])

In [10]:
# Turn every seed into a (seed, None) pair so it can be joined against the edges of T.
seed = Seeds.map(lambda node: (node, None))
seed.collect()

[(0, None), (9, None)]

In [27]:
# Edges of T whose key is not a seed; their targets still have to receive a seed.
needProp = T.subtractByKey(seed)
needProp.collect()

[(1, 2), (2, 3), (10, 11), (3, 4)]

In [28]:
# Edges whose key is a seed: re-emit them as (target, seed) pairs.
noProp = T.join(seed).map(lambda joined: (joined[1][0], joined[0]))
noProp.collect()

[(1, 0), (10, 9)]

In [32]:
# Round 2: pending edges whose key was labelled in round 1 yield (target, seed) pairs.
noProp2 = (
    needProp
    .leftOuterJoin(noProp)
    .values()
    .filter(lambda pair: pair[1] is not None)
)
noProp2.collect()

[(2, 0), (11, 9)]

In [33]:
# Key the pending edges by their target, drop the targets labelled in round 2,
# then restore the original (key, target) orientation.
needProp2 = (
    needProp
    .map(lambda edge: (edge[1], edge[0]))
    .subtractByKey(noProp2)
    .map(lambda edge: (edge[1], edge[0]))
)
needProp2.collect()

[(2, 3), (3, 4)]

In [34]:
# Round 3: pending edges whose key was labelled in round 2 yield (target, seed) pairs.
noProp3 = (
    needProp2
    .leftOuterJoin(noProp2)
    .values()
    .filter(lambda pair: pair[1] is not None)
)
noProp3.collect()

[(3, 0)]

In [37]:
# Key the pending edges by their target, drop the targets labelled in round 3,
# then restore the original (key, target) orientation.
needProp3 = (
    needProp2
    .map(lambda edge: (edge[1], edge[0]))
    .subtractByKey(noProp3)
    .map(lambda edge: (edge[1], edge[0]))
)
needProp3.collect()

[(3, 4)]

In [38]:
# Round 4: pending edges whose key was labelled in round 3 yield (target, seed) pairs.
noProp4 = (
    needProp3
    .leftOuterJoin(noProp3)
    .values()
    .filter(lambda pair: pair[1] is not None)
)
noProp4.collect()

[(4, 0)]

In [20]:
# Key the pending edges by their target before subtracting, as for needProp2 and
# needProp3: noProp4 is keyed by the newly labelled node, not by the edge's source,
# so subtracting on the source key would leave (3, 4) in place.
needProp4 = needProp3.map(lambda x: (x[1], x[0])).subtractByKey(noProp4).map(lambda x: (x[1], x[0]))
needProp4.collect()

[]

In [21]:
# Concatenate the (node, seed) pairs produced by the four rounds, in round order.
ans = [
    pair
    for labelled in (noProp, noProp2, noProp3, noProp4)
    for pair in labelled.collect()
]
print(ans)

[(1, 0), (10, 9), (2, 0), (11, 9), (3, 0), (4, 0)]


## Seed propagation as a function

In [39]:
def Seed_Propragation_lite(T, Seeds):
    """Propagate seed labels along the edges of ``T``, one hop per iteration.

    Parameters
    ----------
    T : RDD of ``(source, target)`` edges.
    Seeds : RDD of seed node ids.

    Returns
    -------
    RDD of ``(node, seed)`` pairs for every edge target that can be reached
    from a seed by following edges of ``T``.

    Notes
    -----
    Edges that cannot be reached from any seed are left unlabelled; the loop
    stops as soon as a round labels nothing instead of spinning forever.
    """
    seed = Seeds.map(lambda node: (node, None))
    # Edges leaving a non-seed node have to wait until their source is labelled.
    needProp = T.subtractByKey(seed)
    # Targets of edges leaving a seed are labelled immediately.
    noProp = T.join(seed).map(lambda joined: (joined[1][0], joined[0]))
    T_prop = noProp
    while not needProp.isEmpty():
        # (source, (target, seed)) -> (target, seed) for sources labelled last round.
        noProp = needProp.join(noProp).values()
        if noProp.isEmpty():
            # No progress: the remaining edges are unreachable from every seed.
            break
        # Key pending edges by target to drop the ones just labelled, then flip back.
        needProp = (
            needProp
            .map(lambda edge: (edge[1], edge[0]))
            .subtractByKey(noProp)
            .map(lambda edge: (edge[1], edge[0]))
        )
        T_prop = T_prop.union(noProp)

    return T_prop

In [40]:
# Run on the same T and Seeds as the manual walkthrough; compare the result with `ans`.
Seed_Propragation_lite(T,Seeds).collect()

[(1, 0), (10, 9), (2, 0), (11, 9), (3, 0), (4, 0)]