In [1]:
import time
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Reuse an already-running SparkContext when this cell is executed again:
# constructing SparkContext(conf=conf) a second time in the same kernel raises
# a ValueError because only one context may be active at a time.
conf = SparkConf().setAppName("pyspark")
sc = SparkContext.getOrCreate(conf=conf)
sqlContext = SQLContext(sc)

In [20]:
# Spark SQL has no set column type, so each neighbourhood is stored as a
# de-duplicated list (ArrayType) instead of a set.
btc_raw = sc.parallelize([(0,1), (1,2), (2,5), (5,8), (7,8), (3,7), (3,4), (3,6), (10,11), (10,12), (12,13)])

# Emit every edge in both directions, then collect the distinct neighbours of each node.
G = (
    btc_raw
    .flatMap(lambda edge: (edge, (edge[1], edge[0])))
    .groupByKey()
    .mapValues(lambda neighbours: list(set(neighbours)))
)

DataFrame Schema

In [21]:
# Graph schema: a node id ("Node") and the array of its neighbours ("NN").
schemaList = ["Node", "NN"]
schemaType = [IntegerType(), ArrayType(IntegerType())]
schemaNull = [False, True]

# Pair each column name with its type and nullability flag.
fields = [
    StructField(name, dtype, nullable)
    for name, dtype, nullable in zip(schemaList, schemaType, schemaNull)
]

schemaG = StructType(fields)

# Tree schema in edge form: one (Parent, Child) pair per row.
schemaList2 = ["Parent", "Child"]
schemaType2 = [IntegerType(), IntegerType()]
schemaNull2 = [False, True]

# Pair each column name with its type and nullability flag.
fields2 = [
    StructField(name, dtype, nullable)
    for name, dtype, nullable in zip(schemaList2, schemaType2, schemaNull2)
]

schemaT = StructType(fields2)

# Tree schema in adjacency form: a Parent and the array of its Children.
schemaList21 = ["Parent", "Children"]
schemaType21 = [IntegerType(), ArrayType(IntegerType())]
schemaNull21 = [False, True]

# Pair each column name with its type and nullability flag.
fields21 = [
    StructField(name, dtype, nullable)
    for name, dtype, nullable in zip(schemaList21, schemaType21, schemaNull21)
]

schemaT1 = StructType(fields21)

# Seeds schema: a single nullable integer column holding a node id.
schemaList3 = ["Node"]
schemaType3 = [IntegerType()]
schemaNull3 = [True]

fields3 = [
    StructField(name, dtype, nullable)
    for name, dtype, nullable in zip(schemaList3, schemaType3, schemaNull3)
]

schemaS = StructType(fields3)

Applying schema to RDD

In [22]:
# Turn the (node, neighbours) RDD into a DataFrame using the explicit graph schema.
dfG = sqlContext.createDataFrame(data=G, schema=schemaG)
dfG.printSchema()

root
 |-- Node: integer (nullable = false)
 |-- NN: array (nullable = true)
 |    |-- element: integer (containsNull = true)



# Min_Selection_Step

In [23]:
#Nodes as int
def Min_Selection_Step(df_G):
    """Min-selection step for a graph whose node ids are plain integers.

    Parameters
    ----------
    df_G : DataFrame
        Columns ``Node`` (int) and ``NN`` (array<int>), as built with ``schemaG``.

    Returns
    -------
    DataFrame
        Columns ``Node`` and ``NN``. For every input row, the row's node and each
        of its neighbours receive the minimum id of that closed neighbourhood;
        ``NN`` is the set of minima collected per node.
    """
    # Closed neighbourhood: the neighbours plus the node itself.
    closed_nn = array_union(col("NN"), array(col("Node")))

    with_min = df_G.select(
        closed_nn.alias("NN"),
        array_min(closed_nn).alias("v_min"),
    )

    # One (member, v_min) row per member of each closed neighbourhood.
    edges = with_min.select(explode(col("NN")).alias("Node"), col("v_min"))

    # Gather the distinct minima received by each node.
    return edges.groupBy("Node").agg(collect_set("v_min").alias("NN"))

In [24]:
# Run one min-selection step on the example graph and inspect the result.
dfH = Min_Selection_Step(df_G=dfG)
dfH.show()

+----+---------+
|Node|       NN|
+----+---------+
|  12| [12, 10]|
|   1|   [0, 1]|
|  13| [12, 10]|
|   6|      [3]|
|   3|      [3]|
|   5|[1, 5, 2]|
|   4|      [3]|
|   8|[5, 2, 3]|
|   7|   [5, 3]|
|  10|     [10]|
|  11|     [10]|
|   2|[0, 1, 2]|
|   0|      [0]|
+----+---------+



# Pruning_Step

In [25]:
#Function to check if int is inside an array 
#Used in deactiveNodes
def isInside(node, NN):
    """Return True when ``node`` is an element of ``NN``, otherwise False.

    Parameters
    ----------
    node : value to look for.
    NN : list of values, or None. A NULL array column reaches a Python UDF as
        None; it is treated as containing nothing instead of raising TypeError.

    Returns
    -------
    bool
    """
    if NN is None:
        return False
    return node in NN
    
# Column-level wrapper around isInside; the resulting column is boolean.
isInside_udf = udf(f=isInside, returnType=BooleanType())

#Function to join two list 
#Used to find seeds
def joinList(H_NN, G_NN):
    """Merge two neighbour lists into one list without duplicates.

    Parameters
    ----------
    H_NN : list or None
        Neighbour list coming from H.
    G_NN : list or None
        Neighbour list coming from G; None when the left join found no match.

    Returns
    -------
    list or None
        ``H_NN`` unchanged when ``G_NN`` is None (so None if both are None);
        otherwise the distinct elements of both lists, in no particular order.
    """
    if G_NN is None:
        return H_NN
    if H_NN is None:
        # A missing H_NN is treated as empty instead of failing on `G_NN + None`.
        return list(set(G_NN))
    return list(set(G_NN + H_NN))
    
# Column-level wrapper around joinList; the resulting column is array<int>.
joinList_udf = udf(f=joinList, returnType=ArrayType(IntegerType()))

In [26]:
def Pruning_Step(dfH, T, Seeds):
    """Pruning step: derive the next graph, extend the tree and collect seeds.

    Parameters
    ----------
    dfH : DataFrame
        Columns ``Node`` and ``NN`` as returned by ``Min_Selection_Step``.
    T : DataFrame
        Tree edges accumulated so far, columns ``Parent`` and ``Child``.
    Seeds : DataFrame
        Seed nodes accumulated so far, single column ``Node``.

    Returns
    -------
    tuple
        ``(G, T, Seeds)``: the graph for the next iteration (``Node``, ``NN``),
        and the two accumulators extended with this iteration's rows.
    """
    # ---------------- Next graph ----------------
    # Only rows whose NN holds more than one element contribute edges.
    multi = dfH.filter(size(col("NN")) > 1)
    with_min = multi.select("NN", array_min(col("NN")).alias("v_min"))
    # NN without its minimum, next to that minimum.
    non_min = with_min.select(
        array_except(col("NN"), array(col("v_min"))).alias("NN_u"),
        "v_min",
    )

    # Link every non-minimum element to the minimum, in both directions.
    forward = non_min.select(explode(col("NN_u")).alias("Node"), "v_min")
    backward = forward.select(col("v_min").alias("Node"), col("Node").alias("v_min"))
    G = forward.union(backward).groupBy("Node").agg(collect_set("v_min").alias("NN"))

    # ---------------- Tree ----------------
    # A node that is not contained in its own NN is deactivated; it does not
    # appear in the next graph and becomes a child of the minimum of its NN.
    flagged = dfH.select(
        "Node",
        array_min(col("NN")).alias("v_min"),
        isInside_udf(col("Node"), col("NN")).alias("Active"),
    )
    deactivated = flagged.filter(~col("Active"))
    tree_edges = deactivated.select(col("v_min").alias("Parent"), col("Node").alias("Child"))
    # T stays in (Parent, Child) edge form; a (Parent, Children) view can be
    # derived from it later with groupBy / collect_set.
    T = T.union(tree_edges)

    # ---------------- Seeds ----------------
    # Left-join H with the new graph so nodes missing from G keep a NULL G_NN.
    h_and_g = dfH.join(broadcast(G), dfH.Node == G.Node, how="left") \
                 .select(dfH.Node, dfH.NN.alias("H_NN"), G.NN.alias("G_NN"))
    merged = h_and_g.select("Node", joinList_udf(col("H_NN"), col("G_NN")).alias("NN"))
    # Seed: the merged neighbourhood has at most one element and contains the node itself.
    seed = merged.filter((size(col("NN")) <= 1) & isInside_udf(col("Node"), col("NN")))
    Seeds = Seeds.union(seed.select("Node"))

    return G, T, Seeds
    

# Cracker: Main function

In [27]:
def cracker(G):
    """Alternate min-selection and pruning until the graph is empty; return the tree.

    Parameters
    ----------
    G : DataFrame
        Input graph with columns ``Node`` and ``NN`` (see ``schemaG``).

    Returns
    -------
    DataFrame
        Columns ``Parent`` and ``Children``: the tree edges accumulated over
        all iterations, grouped by parent.

    Notes
    -----
    The ``Seeds`` frame is accumulated by ``Pruning_Step`` but is not returned.
    """
    # Empty accumulators with explicit schemas (nothing to infer a schema from).
    empty = sc.parallelize([])
    T = sqlContext.createDataFrame(empty, schemaT)
    Seeds = sqlContext.createDataFrame(empty, schemaS)

    while G.count() != 0:
        H = Min_Selection_Step(G)
        G, T, Seeds = Pruning_Step(H, T, Seeds)
        # Cache the new graph so the count() at the top of the loop and the next
        # iteration reuse one materialisation instead of recomputing the whole
        # lineage from the original input each time. The caller's input frame is
        # deliberately not cached here.
        G = G.cache()

    # Optional: turn (Parent, Child) edges into a (Parent, Children) adjacency view.
    Tree = T.groupBy(col("Parent")).agg(collect_set("Child").alias("Children"))

    return Tree

In [28]:
%%time
# Run the full pipeline on the example graph and display the (Parent, Children) tree.
Tree = cracker(dfG)
Tree.show()
# TODO: cracker() accumulates a Seeds frame but returns only the tree; presumably
# Seeds.count() would give the number of connected components (CC) -- confirm.

+------+------------+
|Parent|    Children|
+------+------------+
|     3|   [6, 7, 4]|
|    10|[12, 13, 11]|
|     2|         [8]|
|     0|[1, 5, 2, 3]|
+------+------------+

CPU times: user 774 ms, sys: 289 ms, total: 1.06 s
Wall time: 30.7 s


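# Other possibilities (alternative representation: node ids stored as arrays; the cells below redefine `G`, `Min_Selection_Step` and the schema variables defined above)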

In [12]:
# Variant: node ids stored as single-element lists.
# NOTE: this cell rebinds G, schemaList, schemaType, schemaNull and fields
# that were defined by earlier cells.

G = (
    btc_raw
    .flatMap(lambda edge: (edge, (edge[1], edge[0])))
    .groupByKey()
    .map(lambda kv: ([kv[0]], list(set(kv[1]))))
)


schemaList = ["Node", "NN"]
schemaType = [ArrayType(IntegerType()), ArrayType(IntegerType())]
schemaNull = [False, True]

# Pair each column name with its type and nullability flag.
fields = [
    StructField(name, dtype, nullable)
    for name, dtype, nullable in zip(schemaList, schemaType, schemaNull)
]

schema = StructType(fields)

In [13]:
#Nodes as lists
def Min_Selection_Step(df_G):
    """Min-selection step for a graph whose node ids are stored as arrays.

    NOTE: running this cell replaces the earlier integer-node function of the
    same name.

    Parameters
    ----------
    df_G : DataFrame
        Columns ``Node`` (array<int>) and ``NN`` (array<int>).

    Returns
    -------
    DataFrame
        Columns ``Node`` (each member wrapped in a one-element array) and ``NN``
        (the set of neighbourhood minima received by that member).
    """
    # Closed neighbourhood: both columns are arrays, so they are unioned directly.
    closed_nn = array_union(col("NN"), col("Node"))

    with_min = df_G.select(
        closed_nn.alias("NN"),
        array_min(closed_nn).alias("v_min"),
    )

    # One (member, v_min) row per member of each closed neighbourhood.
    edges = with_min.select(explode(col("NN")).alias("Node"), col("v_min"))

    # Group by the member re-wrapped as an array and gather its distinct minima.
    return edges.groupBy(array("Node").alias("Node")).agg(collect_set("v_min").alias("NN"))

In [14]:
# Tree stored as edges: one (Parent, Child) pair per row.
# NOTE: this cell rebinds schemaList2, schemaType2, schemaNull2, fields2 and
# schemaT that were defined by an earlier cell.
schemaList2 = ["Parent", "Child"]
schemaType2 = [IntegerType(), IntegerType()]
schemaNull2 = [False, True]

# Pair each column name with its type and nullability flag.
fields2 = [
    StructField(name, dtype, nullable)
    for name, dtype, nullable in zip(schemaList2, schemaType2, schemaNull2)
]

schemaT = StructType(fields2)


# Empty tree DataFrame; the schema is given explicitly because there are no rows to infer it from.
empty = sc.parallelize([])
T = sqlContext.createDataFrame(empty, schemaT)