In [1]:
import time
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
from pyspark.sql.types import *
from pyspark.sql.functions import *

conf = SparkConf().setAppName("pyspark")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

# Some defined schemas

In [2]:
#Graph: (node, [NN])
schemaList = ["Node", "NN"]
schemaType = [ IntegerType() , ArrayType( IntegerType() )]
schemaNull = [False, True]

fields = [StructField(schemaList[0], schemaType[0], schemaNull[0]),\
          StructField(schemaList[1], schemaType[1], schemaNull[1])]

schemaG = StructType(fields)

#Tree with (Parent, Child) format
schemaList2 = ["Parent", "Child"]
schemaType2 = [ IntegerType() , IntegerType() ]
schemaNull2 = [False, True]

fields2 = [StructField(schemaList2[0], schemaType2[0], schemaNull2[0]),\
          StructField(schemaList2[1], schemaType2[1], schemaNull2[1])]

schemaT = StructType(fields2)

#Seeds
schemaList3 = ["Node"]
schemaType3 = [IntegerType()]
schemaNull3 = [True]

fields3 = [StructField(schemaList3[0], schemaType3[0], schemaNull3[0])]

schemaS = StructType(fields3)

Synthetic dataset

In [3]:
btc_raw = sc.parallelize([(0,1), (1,2), (2,5), (5,8), (7,8), (3,7), (3,4), (3,6), (10,11), (10,12), (12,13), (6,9), (9,15), (9,16)])
G = btc_raw.flatMap(lambda x: [x, (x[1], x[0])]).groupByKey().mapValues(lambda x: list(set(x)) )

dfG = sqlContext.createDataFrame(G, schemaG)
dfG.printSchema()
dfG.count()

root
 |-- Node: integer (nullable = false)
 |-- NN: array (nullable = true)
 |    |-- element: integer (containsNull = true)



16

Some random dataset

In [3]:
btc_raw = sc.textFile("soc-sign-bitcoinalpha.csv")
G = btc_raw.map(lambda x: x.split(",")).map(lambda x: (int(x[0]), int(x[1]))).flatMap(lambda x: [x, (x[1], x[0])]).groupByKey().mapValues(lambda x: list(set(x)))

dfG = sqlContext.createDataFrame(G, schemaG)
dfG.printSchema()
dfG.count()


root
 |-- Node: integer (nullable = false)
 |-- NN: array (nullable = true)
 |    |-- element: integer (containsNull = true)



3783

# Min_Selection_Step

In [4]:
def Min_Selection_Step(df_G):
    #Including node inside NN_H and finding the min_id inside the NN_H
    NN_min = df_G.select(array_union(col("NN"), array(col("Node"))).alias("NN")\
                        ).withColumn("v_min", array_min( col("NN") ))
    #All edges that needs to be added to G_{t+1}
    addEdge = NN_min.select(explode(NN_min.NN).alias("Node"), "v_min")
    #Grouping all node_id
    dfH = addEdge.groupBy("Node").agg(collect_set("v_min").alias("NN"))
    return dfH

In [None]:
%%time

dfH = Min_Selection_Step(dfG)
dfH.show()

# Pruning_Step

In [5]:
#Function to check if int is inside an array 
#Used in deactiveNodes
def isInside(node, NN):
    return (node in NN)
    
isInside_udf = udf(isInside, BooleanType())

#Function to join two list 
#Used to find seeds
def joinList(H_NN, G_NN):
    #I need to think if H_NN can be empty
    if G_NN != None:
        return list(set(G_NN + H_NN))
    else:
        return H_NN
    
joinList_udf = udf(joinList, ArrayType(IntegerType()))

In [6]:
def Pruning_Step(dfH, T, Seeds):
    
    #---------------G construction-------------------
    H_filtered = dfH.filter(size(col("NN")) > 1) #NN with more than 1 element
    NN_H_min = H_filtered.select("NN", array_min(col("NN")).alias("v_min")) #NN and min_id
    NN_H_u = NN_H_min.select(array_except(col("NN"), array(col("v_min"))).alias("NN_u"), "v_min") #NN-min_id, min_id

    addEdge = NN_H_u.select(explode(NN_H_u.NN_u).alias("Node"), "v_min") #New edges
    addEdge_inv = addEdge.select(col("v_min").alias("Node"), col("Node").alias("v_min")) #Inverse direction of edges
    allEdges = addEdge.union(addEdge_inv) #All edges that need to be in the new graph

    G = allEdges.groupBy("Node").agg(collect_set("v_min").alias("NN"))
    
    #---------------Tree construction--------------
    #The deactivated Nodes do not appear in G_{t+1}
    deactiveNodes = dfH.select("Node", array_min(col("NN")).alias("v_min"), \
                               isInside_udf(col("Node"), col("NN")).alias("Active")).filter(col("Active") == False)
    #Tree in (Parent, Child) format; the one used in RDD
    addEdge = deactiveNodes.select(col("v_min").alias("Parent"), col("Node").alias("Child"))
    T = T.union(addEdge)
    
    #--------------Find Seed-----------------
    #Without broadcasting
    NN_H_G = dfH.join(G, dfH.Node == G.Node, how="left").select(dfH.Node, dfH.NN.alias("H_NN"), G.NN.alias("G_NN"))
    #With broadcasting
    #NN_H_G = dfH.join(broadcast(G), dfH.Node == G.Node, how="left").select(dfH.Node, dfH.NN.alias("H_NN"), G.NN.alias("G_NN"))
    joined_NN = NN_H_G.select("Node", joinList_udf(col("H_NN"), col("G_NN")).alias("NN"))
    seed = joined_NN.filter( (size(col("NN"))<= 1) & ( isInside_udf(col("Node"), col("NN")) ) ) 
    Seeds = Seeds.union(seed.select("Node"))
    
    return G, T, Seeds
    

In [12]:
empty = sc.parallelize([])
T = sqlContext.createDataFrame(empty, schemaT)
Seeds = sqlContext.createDataFrame(empty, schemaS)

In [None]:
%%time

G, T, S = Pruning_Step(dfH, T, Seeds)
G.show()

# Cracker: Main function

In [7]:
def cracker(G):
    n = 0
    empty = sc.parallelize([])
    T = sqlContext.createDataFrame(empty, schemaT)
    Seeds = sqlContext.createDataFrame(empty, schemaS)

    while G.count() != 0:
        n += 1
        H = Min_Selection_Step(G)
        G, T, Seeds = Pruning_Step(H, T, Seeds)
    
    return T, Seeds

In [8]:
%%time
#Without broadcasting in pruning
T, S = cracker(dfG)
T.show()

+------+-----+
|Parent|Child|
+------+-----+
|     2|  463|
|     1|  471|
|     1| 1088|
|    91| 1238|
|     1| 1342|
|     1| 1580|
|     3| 1959|
|     1| 2122|
|     2| 2142|
|     5| 2366|
|     2| 2866|
|     2| 3175|
|  3228| 6336|
|     1| 7340|
|     2| 7554|
|     1|  833|
|    13| 1591|
|   239| 1645|
|     1| 1829|
|    23| 2659|
+------+-----+
only showing top 20 rows

CPU times: user 382 ms, sys: 128 ms, total: 510 ms
Wall time: 34 s


# Seed Propagation

In [9]:
def Seed_Propagation_lite(Tree, Seeds):
    T_seed = Tree.join(Seeds, Tree.Parent == Seeds.Node, how = "left")
    
    needProp = T_seed.filter(T_seed.Node.isNull()).select("Parent", "Child")
    noProp = T_seed.filter(T_seed.Node.isNotNull()).select(col("Parent").alias("Seed"), col("Child").alias("Node"))
    
    result = noProp
    while needProp.count() != 0:
        T_seed = needProp.join(noProp, needProp.Parent == noProp.Node, how = "left")
        
        noProp = T_seed.filter(T_seed.Seed.isNotNull() ).select("Seed", col("Child").alias("Node"))
        needProp = T_seed.filter(T_seed.Seed.isNull() ).select("Parent", "Child")
        result = result.union(noProp)
    return result.select("Node", "Seed")

In [10]:
%%time

Seed_Propagation_lite(T,S).show()

+----+----+
|Node|Seed|
+----+----+
|6336|3228|
| 471|   1|
|1088|   1|
|1342|   1|
|1580|   1|
|2122|   1|
|7340|   1|
| 833|   1|
|1829|   1|
| 243|   1|
| 623|   1|
| 897|   1|
|1025|   1|
|1127|   1|
|1483|   1|
|1507|   1|
|1522|   1|
|1721|   1|
|2235|   1|
|2387|   1|
+----+----+
only showing top 20 rows

CPU times: user 589 ms, sys: 228 ms, total: 817 ms
Wall time: 1min 12s


In [14]:
T = sc.parallelize([(0,1), (1,2), (2,3), (3,4), (9,10), (10,11)])
Seeds = sc.parallelize([0,9]).map(lambda x: (x, ))

T = sqlContext.createDataFrame(T, schemaT)
T.show()
Seeds = sqlContext.createDataFrame(Seeds, schemaS)
Seeds.show()

+------+-----+
|Parent|Child|
+------+-----+
|     0|    1|
|     1|    2|
|     2|    3|
|     3|    4|
|     9|   10|
|    10|   11|
+------+-----+

+----+
|Node|
+----+
|   0|
|   9|
+----+



In [63]:
%%time

Seed_Propagation_lite(T,Seeds).show()

+----+----+
|Node|Seed|
+----+----+
|  10|   9|
|   1|   0|
|   2|   0|
|  11|   9|
|   3|   0|
|   4|   0|
+----+----+

CPU times: user 189 ms, sys: 79.8 ms, total: 269 ms
Wall time: 9.61 s


In [57]:
allq = T.join(Seeds, T.Parent == Seeds.Node, how = "left")
allq.show()

+------+-----+----+
|Parent|Child|Node|
+------+-----+----+
|     1|    2|null|
|     3|    4|null|
|     9|   10|   9|
|    10|   11|null|
|     2|    3|null|
|     0|    1|   0|
+------+-----+----+



In [45]:
needProp = allq.filter(allq.Node.isNull()).drop("Node")
needProp.show()
noProp = allq.filter(allq.Node.isNotNull()).select(col("Parent").alias("Seed"), col("Child").alias("Node"))
noProp.show()

+------+-----+
|Parent|Child|
+------+-----+
|     1|    2|
|     3|    4|
|    10|   11|
|     2|    3|
+------+-----+

+----+----+
|Seed|Node|
+----+----+
|   9|  10|
|   0|   1|
+----+----+



In [52]:
a = needProp.join(noProp, needProp.Parent == noProp.Node, how = "left")
a.show()

noProp2 = a.filter(a.Seed.isNotNull() ).select("Seed", col("Child").alias("Node"))
noProp2.show()
needProp2 = a.filter(a.Seed.isNull() ).select("Parent", "Child")
needProp2.show()

+------+-----+----+----+
|Parent|Child|Seed|Node|
+------+-----+----+----+
|     1|    2|   0|   1|
|     3|    4|null|null|
|    10|   11|   9|  10|
|     2|    3|null|null|
+------+-----+----+----+

+----+----+
|Seed|Node|
+----+----+
|   0|   2|
|   9|  11|
+----+----+

+------+-----+
|Parent|Child|
+------+-----+
|     3|    4|
|     2|    3|
+------+-----+



In [54]:
b = needProp2.join(noProp2, needProp2.Parent == noProp2.Node, how = "left")
b.show()

noProp3 = b.filter(b.Seed.isNotNull() ).select("Seed", col("Child").alias("Node"))
noProp3.show()
needProp3 = b.filter(b.Seed.isNull() ).select("Parent", "Child")
needProp3.show()

+------+-----+----+----+
|Parent|Child|Seed|Node|
+------+-----+----+----+
|     3|    4|null|null|
|     2|    3|   0|   2|
+------+-----+----+----+

+----+----+
|Seed|Node|
+----+----+
|   0|   3|
+----+----+

+------+-----+
|Parent|Child|
+------+-----+
|     3|    4|
+------+-----+



In [55]:
c = needProp3.join(noProp3, needProp3.Parent == noProp3.Node, how = "left")
c.show()

noProp4 = c.filter(c.Seed.isNotNull() ).select("Seed", col("Child").alias("Node"))
noProp4.show()
needProp4 = c.filter(c.Seed.isNull() ).select("Parent", "Child")
needProp4.show()

+------+-----+----+----+
|Parent|Child|Seed|Node|
+------+-----+----+----+
|     3|    4|   0|   3|
+------+-----+----+----+

+----+----+
|Seed|Node|
+----+----+
|   0|   4|
+----+----+

+------+-----+
|Parent|Child|
+------+-----+
+------+-----+



In [58]:
needProp4.count()

0

In [27]:
needProp = T.join(Seeds, T.Parent == Seeds.Node, how="left_anti")
needProp.show()

+------+-----+
|Parent|Child|
+------+-----+
|     1|    2|
|     3|    4|
|    10|   11|
|     2|    3|
+------+-----+



Older versions

In [22]:
def seedPropagation(Tree, Seeds):
    n = 0
    T_seed = Tree.join(Seeds, Tree.Parent == Seeds.Node, how= "left").select(col("Child").alias("Node"), col("Node").alias("Seed"))
    Seeds = Seeds.select(Seeds.Node, Seeds.Node)
    while T_seed.filter(T_seed.Seed.isNull()).count() != 0:
        n += 1
        Seeds = T_seed.filter(T_seed.Seed.isNotNull()).union(Seeds)
        T_seed = Tree.join(Seeds, Tree.Parent == Seeds.Node, how= "left").select(col("Child").alias("Node"), col("Seed"))
        
    return T_seed


In [23]:
%%time

T_prop = seedPropagation(T, S)
T_prop.show()

+----+----+
|Node|Seed|
+----+----+
|2599|   1|
|1898|   1|
| 985|   1|
| 789|   1|
|2751|   1|
|2606|   1|
|2013|   1|
|2045|   1|
|2048|   1|
|1121|   1|
|2030|   1|
|2669|   1|
| 790|   1|
|1262|   1|
|2658|   1|
|2644|   1|
|2484|   1|
|2680|   1|
|2612|   1|
|1626|   1|
+----+----+
only showing top 20 rows

CPU times: user 1.01 s, sys: 416 ms, total: 1.43 s
Wall time: 1min 45s


In [9]:
def seedPropagation(Tree, Seeds):
    n = 0
    T_seed = Tree.join(broadcast(Seeds), Tree.Parent == Seeds.Node, how= "left").select(col("Child").alias("Node"), col("Node").alias("Seed"))
    Seeds = Seeds.select(Seeds.Node, Seeds.Node)
    while T_seed.filter(T_seed.Seed.isNull()).count() != 0:
        n += 1
        Seeds = T_seed.filter(T_seed.Seed.isNotNull()).union(Seeds)
        T_seed = Tree.join(broadcast(Seeds), Tree.Parent == Seeds.Node, how= "left").select(col("Child").alias("Node"), col("Seed"))
        
    return T_seed


In [10]:
%%time

T_prop = seedPropagation(T, S)
T_prop.show()

+----+----+
|Node|Seed|
+----+----+
|  13|  10|
|  16|   0|
|  15|   0|
|   4|   0|
|   4|   0|
|   8|   0|
|   8|   0|
|   7|   0|
|   7|   0|
|  11|  10|
|  12|  10|
|   6|   0|
|   6|   0|
|   5|   0|
|   9|   0|
|   9|   0|
|   1|   0|
|   3|   0|
|   2|   0|
+----+----+

CPU times: user 1.34 s, sys: 546 ms, total: 1.89 s
Wall time: 1min 57s
