In [2]:
import time
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
from pyspark.sql.types import *

conf = SparkConf().setAppName("pyspark")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

In [21]:
#I think it is not possible to use sets inside dataframes, so I had to use ArrayType
btc_raw = sc.parallelize([(0,1), (1,2), (2,5), (5,8), (7,8), (3,7), (3,4), (3,6), (10,11), (10,12), (12,13)])
G = btc_raw.flatMap(lambda x: [x, (x[1], x[0])]).groupByKey().mapValues(lambda x: list(set(x)))

In [22]:
G.collect()

[(0, [1]),
 (8, [5, 7]),
 (4, [3]),
 (12, [10, 13]),
 (1, [0, 2]),
 (5, [8, 2]),
 (13, [12]),
 (2, [1, 5]),
 (6, [3]),
 (10, [11, 12]),
 (7, [8, 3]),
 (3, [4, 6, 7]),
 (11, [10])]

DataFrame Schema

In [5]:
schemaList = ["Node", "NN"]
schemaType = [IntegerType(), ArrayType( IntegerType() )]
schemaNull = [False, True]

fields = [StructField(schemaList[0], schemaType[0], schemaNull[0]),\
          StructField(schemaList[1], schemaType[1], schemaNull[1])]

schema = StructType(fields)

Applying schema to RDD

In [28]:
dfG = sqlContext.createDataFrame(G, schema)
dfG.createOrReplaceTempView("graph")
dfG.printSchema()

root
 |-- Node: integer (nullable = false)
 |-- NN: array (nullable = true)
 |    |-- element: integer (containsNull = true)



In [35]:
sqlContext.sql("SELECT * FROM graph").show()
dfG.select("Node").show()

+----+---------+
|Node|       NN|
+----+---------+
|   0|      [1]|
|   8|   [5, 7]|
|   4|      [3]|
|  12| [10, 13]|
|   1|   [0, 2]|
|   5|   [8, 2]|
|  13|     [12]|
|   2|   [1, 5]|
|   6|      [3]|
|  10| [11, 12]|
|   7|   [8, 3]|
|   3|[4, 6, 7]|
|  11|     [10]|
+----+---------+

+----+
|Node|
+----+
|   0|
|   8|
|   4|
|  12|
|   1|
|   5|
|  13|
|   2|
|   6|
|  10|
|   7|
|   3|
|  11|
+----+



In [52]:
from pyspark.sql.functions import col, pandas_udf

def set_func(node, nn):
    return min(nn + [node])


v_min = dfG.select(set_func(col("Node"), col("NN")))
v_min.show()

TypeError: Column is not iterable

In [None]:
def Min_Selection_Step(G): #dictionary format RDD
    v_min = G.map(lambda x: (x[0], min(x[1] | {x[0]})))
    NN_G_u = G.map(lambda x: (x[0], x[1] | {x[0]}))
    #Broadcasting
    v_min_bc = sc.broadcast(dict(v_min.collect()))
    addEdge = NN_G_u.map(lambda x: (x[0], (x[1], v_min_bc.value[x[0]])) )
    addEdge1 = addEdge.flatMap(lambda x: [(y, x[1][1]) for y in x[1][0]])
    #Without broadcasting
    #addEdge1 = NN_G_u.join(v_min).flatMap(lambda x: [(y, x[1][1]) for y in x[1][0]])

    H = addEdge1.groupByKey().mapValues(lambda x: set(x))
    return H