In [1]:
import time
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
from pyspark.sql.types import *
from pyspark.sql.functions import *

conf = SparkConf().setAppName("pyspark")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

# Some defined schemas

In [2]:
#Graph: (node, [NN])
schemaList = ["Node", "NN"]
schemaType = [ IntegerType() , ArrayType( IntegerType() )]
schemaNull = [False, True]

fields = [StructField(schemaList[0], schemaType[0], schemaNull[0]),\
          StructField(schemaList[1], schemaType[1], schemaNull[1])]

schemaG = StructType(fields)

#Tree with (Parent, Child) format
schemaList2 = ["Parent", "Child"]
schemaType2 = [ IntegerType() , IntegerType() ]
schemaNull2 = [False, True]

fields2 = [StructField(schemaList2[0], schemaType2[0], schemaNull2[0]),\
          StructField(schemaList2[1], schemaType2[1], schemaNull2[1])]

schemaT = StructType(fields2)

#Seeds
schemaList3 = ["Node"]
schemaType3 = [IntegerType()]
schemaNull3 = [True]

fields3 = [StructField(schemaList3[0], schemaType3[0], schemaNull3[0])]

schemaS = StructType(fields3)

Synthetic dataset

In [3]:
btc_raw = sc.parallelize([(0,1), (1,2), (2,5), (5,8), (7,8), (3,7), (3,4), (3,6), (10,11), (10,12), (12,13), (6,9), (9,15), (9,16)])
G = btc_raw.flatMap(lambda x: [x, (x[1], x[0])]).groupByKey().mapValues(lambda x: list(set(x)) )

dfG = sqlContext.createDataFrame(G, schemaG)
dfG.printSchema()
dfG.count()

root
 |-- Node: integer (nullable = false)
 |-- NN: array (nullable = true)
 |    |-- element: integer (containsNull = true)



16

Some random dataset

In [None]:
btc_raw = sc.textFile("soc-sign-bitcoinalpha.csv")
G = btc_raw.map(lambda x: x.split(",")).map(lambda x: (int(x[0]), int(x[1]))).flatMap(lambda x: [x, (x[1], x[0])]).groupByKey().mapValues(lambda x: list(set(x)))

dfG = sqlContext.createDataFrame(G, schemaG)
dfG.printSchema()
dfG.count()


# Min_Selection_Step

In [4]:
def Min_Selection_Step(df_G):
    #Including node inside NN_H and finding the min_id inside the NN_H
    NN_min = df_G.select(array_union(col("NN"), array(col("Node"))).alias("NN")\
                        ).withColumn("v_min", array_min( col("NN") ))
    #All edges that needs to be added to G_{t+1}
    addEdge = NN_min.select(explode(NN_min.NN).alias("Node"), "v_min")
    #Grouping all node_id
    dfH = addEdge.groupBy("Node").agg(collect_set("v_min").alias("NN"))
    return dfH

In [None]:
%%time

dfH = Min_Selection_Step(dfG)
dfH.show()

# Pruning_Step

In [5]:
#Function to check if int is inside an array 
#Used in deactiveNodes
def isInside(node, NN):
    return (node in NN)
    
isInside_udf = udf(isInside, BooleanType())

#Function to join two list 
#Used to find seeds
def joinList(H_NN, G_NN):
    #I need to think if H_NN can be empty
    if G_NN != None:
        return list(set(G_NN + H_NN))
    else:
        return H_NN
    
joinList_udf = udf(joinList, ArrayType(IntegerType()))

In [6]:
def Pruning_Step(dfH, T, Seeds):
    
    #---------------G construction-------------------
    H_filtered = dfH.filter(size(col("NN")) > 1) #NN with more than 1 element
    NN_H_min = H_filtered.select("NN", array_min(col("NN")).alias("v_min")) #NN and min_id
    NN_H_u = NN_H_min.select(array_except(col("NN"), array(col("v_min"))).alias("NN_u"), "v_min") #NN-min_id, min_id

    addEdge = NN_H_u.select(explode(NN_H_u.NN_u).alias("Node"), "v_min") #New edges
    addEdge_inv = addEdge.select(col("v_min").alias("Node"), col("Node").alias("v_min")) #Inverse direction of edges
    allEdges = addEdge.union(addEdge_inv) #All edges that need to be in the new graph

    G = allEdges.groupBy("Node").agg(collect_set("v_min").alias("NN"))
    
    #---------------Tree construction--------------
    #The deactivated Nodes do not appear in G_{t+1}
    deactiveNodes = dfH.select("Node", array_min(col("NN")).alias("v_min"), \
                               isInside_udf(col("Node"), col("NN")).alias("Active")).filter(col("Active") == False)
    #Tree in (Parent, Child) format; the one used in RDD
    addEdge = deactiveNodes.select(col("v_min").alias("Parent"), col("Node").alias("Child"))
    T = T.union(addEdge)
    
    #--------------Find Seed-----------------
    #Without broadcasting
    #NN_H_G = dfH.join(G, dfH.Node == G.Node, how="left").select(dfH.Node, dfH.NN.alias("H_NN"), G.NN.alias("G_NN"))
    #With broadcasting
    NN_H_G = dfH.join(broadcast(G), dfH.Node == G.Node, how="left").select(dfH.Node, dfH.NN.alias("H_NN"), G.NN.alias("G_NN"))
    joined_NN = NN_H_G.select("Node", joinList_udf(col("H_NN"), col("G_NN")).alias("NN"))
    seed = joined_NN.filter( (size(col("NN"))<= 1) & ( isInside_udf(col("Node"), col("NN")) ) ) 
    Seeds = Seeds.union(seed.select("Node"))
    
    return G, T, Seeds
    

In [None]:
empty = sc.parallelize([])
T = sqlContext.createDataFrame(empty, schemaT)
Seeds = sqlContext.createDataFrame(empty, schemaS)

In [None]:
%%time

G, T, S = Pruning_Step(dfH, T, Seeds)
G.show()

# Cracker: Main function

In [7]:
def cracker(G):
    n = 0
    empty = sc.parallelize([])
    T = sqlContext.createDataFrame(empty, schemaT)
    Seeds = sqlContext.createDataFrame(empty, schemaS)

    while G.count() != 0:
        n += 1
        H = Min_Selection_Step(G)
        G, T, Seeds = Pruning_Step(H, T, Seeds)
    
    return T, Seeds

In [8]:
%%time

T, S = cracker(dfG)
T.show()

+------+-----+
|Parent|Child|
+------+-----+
|    10|   13|
|     6|   16|
|     6|   15|
|     3|    4|
|     2|    8|
|     3|    7|
|    10|   11|
|    10|   12|
|     2|    6|
|     0|    5|
|     2|    9|
|     0|    1|
|     0|    3|
|     0|    2|
+------+-----+

CPU times: user 421 ms, sys: 143 ms, total: 564 ms
Wall time: 21.1 s


In [None]:
%%time

S.show()

# Seed Propagation

In [9]:
def seedPropagation(Tree, Seeds):
    n = 0
    T_seed = Tree.join(Seeds, Tree.Parent == Seeds.Node, how= "left").select(col("Child").alias("Node"), col("Node").alias("Seed"))
    Seeds = Seeds.select(Seeds.Node, Seeds.Node)
    while T_seed.filter(T_seed.Seed.isNull()).count() != 0:
        n += 1
        Seeds = T_seed.filter(T_seed.Seed.isNotNull()).union(Seeds)
        T_seed = Tree.join(Seeds, Tree.Parent == Seeds.Node, how= "left").select(col("Child").alias("Node"), col("Seed"))
        
    return T_seed


In [10]:
%%time

T_prop = seedPropagation(T, S)
T_prop.show()

+----+----+
|Node|Seed|
+----+----+
|  16|   0|
|  15|   0|
|   4|   0|
|   4|   0|
|   7|   0|
|   7|   0|
|  13|  10|
|  11|  10|
|  12|  10|
|   8|   0|
|   8|   0|
|   6|   0|
|   6|   0|
|   9|   0|
|   9|   0|
|   5|   0|
|   1|   0|
|   3|   0|
|   2|   0|
+----+----+

CPU times: user 1.48 s, sys: 571 ms, total: 2.05 s
Wall time: 1min 56s


In [11]:
def seedPropagation(Tree, Seeds):
    n = 0
    T_seed = Tree.join(Seeds, Tree.Parent == Seeds.Node, how= "left").select(col("Child").alias("Node"), col("Node").alias("Seed"))
    Seeds = Seeds.select(Seeds.Node, Seeds.Node)
    while T_seed.filter(T_seed.Seed.isNull()).count() != 0:
        n += 1
        Seeds = T_seed.filter(T_seed.Seed.isNotNull()).union(broadcast(Seeds))
        T_seed = Tree.join(Seeds, Tree.Parent == Seeds.Node, how= "left").select(col("Child").alias("Node"), col("Seed"))
        
    return T_seed


In [12]:
%%time

T_prop = seedPropagation(T, S)
T_prop.show()

+----+----+
|Node|Seed|
+----+----+
|  16|   0|
|  15|   0|
|   4|   0|
|   4|   0|
|   7|   0|
|   7|   0|
|  13|  10|
|  11|  10|
|  12|  10|
|   8|   0|
|   8|   0|
|   6|   0|
|   6|   0|
|   9|   0|
|   9|   0|
|   5|   0|
|   1|   0|
|   3|   0|
|   2|   0|
+----+----+

CPU times: user 1.39 s, sys: 547 ms, total: 1.94 s
Wall time: 1min 46s
