In [1]:
# findspark.init() has to run BEFORE pyspark is imported: it locates the Spark
# installation and puts its Python libraries on sys.path.
import findspark
findspark.init()

import pyspark
from pyspark import SparkContext, SparkConf

# Local master with 4 worker threads.
conf = SparkConf().setAppName('test5').setMaster('local[4]')
# getOrCreate() reuses an already-running context, so re-running this cell does
# not fail with "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)

In [2]:
# glom: turn every partition into a list, so collect() returns one list per
# partition. A plain collect() would return all elements in one flat list.
rdd = sc.parallelize(range(10), numSlices=2)
rdd.glom().collect()

[[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]

In [3]:
# coalesce: change the number of partitions of an RDD.
# Start from three partitions and look at how the elements are laid out.
rdd = sc.parallelize(range(10), numSlices=3)
rdd.glom().collect()

[[0, 1, 2], [3, 4, 5], [6, 7, 8, 9]]

In [4]:
# Without a shuffle, coalesce can only reduce the partition count;
# increasing it requires shuffle=True.
rdd_new = rdd.coalesce(numPartitions=2, shuffle=False)
rdd_new.glom().collect()

[[0, 1, 2], [3, 4, 5, 6, 7, 8, 9]]

In [5]:
# repartition: similar to coalesce, but it always shuffles, so there is no
# shuffle argument and the partition count can go up as well as down.
rdd1 = sc.parallelize(range(10), numSlices=3)
# Key/value pairs reused by the repartition and partitionBy cells below.
rdd2 = sc.parallelize(
    [("a", 1), ("a", 2), ("a", 3), ("c", 4)]
)
rdd1.repartition(numPartitions=4).glom().collect()

[[6, 7, 8, 9], [3, 4, 5], [], [0, 1, 2]]

In [6]:
# repartition does not group by key: equal keys may end up in different partitions.
rdd2.repartition(numPartitions=2).glom().collect()

[[('a', 1), ('a', 3), ('c', 4)], [('a', 2)]]

In [7]:
# partitionBy: distribute key/value pairs by key (default hash partitioner),
# so all records with the same key share one partition.
rdd2.partitionBy(numPartitions=2).glom().collect()

[[('c', 4)], [('a', 1), ('a', 2), ('a', 3)]]

In [8]:
# mapPartitions: call a function once per partition (here: a sum).
# The function receives an iterator over the partition's elements and has to
# return an iterable; a generator built with yield is one convenient way.
rdd = sc.parallelize(range(10), numSlices=2)
def func(x):
    """Yield the sum of the items of one partition iterator ``x``."""
    total = sum(x)
    yield total
# Every partition contributes exactly one element: its sum.
partition_sums = rdd.mapPartitions(func)
partition_sums.collect()

[10, 35]

In [9]:
# mapPartitionsWithIndex: like mapPartitions, but the function also receives
# the index of the partition it is processing.
# NOTE: this rebinds the name `func` used by the mapPartitions cell above.
def func(i, x):
    """Yield one ``(partition index, sum of the partition)`` pair."""
    total = sum(x)
    yield i, total
# One (index, sum) pair per partition.
indexed_sums = rdd.mapPartitionsWithIndex(func)
indexed_sums.collect()

[(0, 10), (1, 35)]

In [11]:
# repartitionAndSortWithinPartitions: redistribute key/value pairs with a
# custom partitioner and sort the records of each resulting partition by key.
rdd = sc.parallelize([(0, 1), (3, 2), (1, 3), (0, 4), (3, 5), (2, 6)])
rdd_new = rdd.repartitionAndSortWithinPartitions(
    numPartitions=2,
    partitionFunc=lambda key: key % 2,  # even keys -> partition 0, odd keys -> partition 1
    ascending=True,
)
rdd_new.glom().collect()

[[(0, 1), (0, 4), (2, 6)], [(1, 3), (3, 2), (3, 5)]]

In [12]:
# foreachPartition: run a function once per partition for its side effects.
# First build a two-partition RDD and inspect its layout.
rdd = sc.parallelize(range(10), numSlices=2)
rdd.glom().collect()

[[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]

In [13]:
# Shared accumulator that every partition adds its local sum to.
acc = sc.accumulator(0)

def func(x):
    """Add the sum of one partition iterator ``x`` to the accumulator ``acc``."""
    acc.add(sum(x))

# foreachPartition is an action, so func runs on every partition right away.
rdd.foreachPartition(func)
acc.value

45

In [14]:
# aggregate: fold each partition with seqOp, then merge the per-partition
# results with combOp. Start from the numbers 1..9 in three partitions.
rdd = sc.parallelize(range(1, 10), numSlices=3)
rdd.glom().collect()

[[1, 2, 3], [4, 5, 6], [7, 8, 9]]

In [15]:
def seqOp(x, y):
    """Fold element ``y`` into the running ``(sum, count)`` pair ``x``."""
    total = x[0] + y
    count = x[1] + 1
    return (total, count)

def combOp(x, y):
    """Merge two partial ``(sum, count)`` pairs into one."""
    total = x[0] + y[0]
    count = x[1] + y[1]
    return (total, count)
# Result is (sum of all elements, number of elements).
sum_and_count = rdd.aggregate((0, 0), seqOp, combOp)
sum_and_count

(45, 9)

In [17]:
# aggregateByKey: per-key aggregation. Values are first folded inside each
# partition (seqFunc) and the partial results are then merged (combFunc), which
# keeps the amount of shuffled data small.
rdd = sc.parallelize(
    [("orange", 1), ("orange", 2), ("orange", 3),
     ("banana", 1), ("banana", 4), ("banana", 5)],
    2,
)
# The zero value must be the identity of the operation. For max that is -inf,
# not 0: starting from 0 would report 0 for a key whose values are all negative.
rdd_new = rdd.aggregateByKey(
    zeroValue=float("-inf"),
    seqFunc=max,
    combFunc=max,
)
rdd_new.collect()

[('orange', 3), ('banana', 5)]