In [1]:
# findspark must run BEFORE pyspark is imported: init() locates the local
# Spark installation and puts it on sys.path, which is what makes the
# `import pyspark` below succeed when pyspark is not pip-installed.
import findspark
findspark.init()

import pyspark
from pyspark import SparkContext, SparkConf

# Local master with 4 worker threads; the app name is what the Spark UI shows.
conf = SparkConf().setAppName('test3').setMaster('local[4]')
# getOrCreate reuses an already-running context, so re-running this cell does
# not fail with "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)

# pairRDD变换算子

#### 元素为 (key, value) 二元组的RDD，类似于字典，但同一个key可以出现多次

In [2]:
# keyBy: turn each element into a (key, element) pair, where the key is the
# return value of the given function. The function ignores its argument here,
# so every element ends up under the same key, 1.
rdd = sc.parallelize(["a", "b", "c"])
(
    rdd
    .keyBy(lambda _elem: 1)
    .collect()
)

[(1, 'a'), (1, 'b'), (1, 'c')]

In [3]:
# Sample pair RDD with repeated keys; the keys / values / lookup / *ByKey
# cells below all operate on it.
rdd_1 = sc.parallelize([("python", 1), ("python", 2), ("pandas", 3), ("pandas", 4)])

In [4]:
# keys: keep only the key of every pair; repeated keys are not de-duplicated.
rdd_1.keys().collect()

['python', 'python', 'pandas', 'pandas']

In [5]:
# values: keep only the value of every pair, in the same order as the pairs.
rdd_1.values().collect()

[1, 2, 3, 4]

In [6]:
# lookup is an action, not a transformation: it directly returns a Python list
# with all values stored under the given key (no collect() needed).
rdd_1.lookup("python")

[1, 2]

In [7]:
# reduceByKey: group by key and merge the values of each key pairwise with a
# binary function; collect() then returns a list of (key, merged_value) tuples.
(
    rdd_1
    .reduceByKey(lambda acc, value: acc + value)
    .collect()
)

[('python', 3), ('pandas', 7)]

In [12]:
# reduceByKeyLocally: the same per-key reduction as reduceByKey, but the merged
# result is handed back to the driver as a plain Python dict.
dct = rdd_1.reduceByKeyLocally(lambda acc, value: acc + value)
dct

{'python': 3, 'pandas': 7}

In [13]:
# foldByKey: group by key and merge the values with the given function
# (operator.add here). Being a fold, it requires an initial zeroValue that
# each key's accumulation starts from.
from operator import add
(
    rdd_1
    .foldByKey(zeroValue=0, func=add)
    .collect()
)

[('python', 3), ('pandas', 7)]

In [16]:
# combineByKey: group by key and merge the values with three user-supplied
# functions; in this demo the merged result for each key is a list.
#   createCombiner - turns the first value seen for a key into a list
#   mergeValue     - adds a further value to an existing list
#   mergeCombiners - merges several lists into a single list
def to_list(x):
    """createCombiner: wrap a single value in a new one-element list."""
    combiner = [x]
    return combiner
def append(x, y):
    """mergeValue: add value ``y`` to list ``x`` in place and return ``x``."""
    x += [y]  # in-place list growth; the same list object is returned
    return x
def extend(x, y):
    """mergeCombiners: add every item of ``y`` to list ``x`` in place and return ``x``."""
    x += y  # in-place concatenation; the same list object is returned
    return x
# Collect the values of every key into one list using the three helpers above.
(
    rdd_1
    .combineByKey(
        createCombiner=to_list,
        mergeValue=append,
        mergeCombiners=extend,
    )
    .collect()
)

[('python', [1, 2]), ('pandas', [3, 4])]

In [18]:
# subtractByKey: set difference that compares keys only. ("a", 1) is removed
# because key "a" exists in y, even though the values differ.
x = sc.parallelize([
    ("a", 1),
    ("b", 2),
    ("c", 3),
])
y = sc.parallelize([
    ("a", 2),
])
x.subtractByKey(y).collect()

[('b', 2), ('c', 3)]

In [19]:
# toLocalIterator hands back a Python generator (see the displayed type),
# which can be consumed like any other iterator.
# Note: `rdd` defined here is reused by the groupBy cell that follows.
rdd = sc.parallelize(range(10))
iterator = rdd.toLocalIterator()
type(iterator)

generator

In [21]:
# groupBy: the function's return value becomes the key, and all elements that
# produced that key are gathered into an iterable stored as the value.
# `rdd` is the range(10) RDD created in the toLocalIterator cell above.
rdd_new = rdd.groupBy(lambda n: n % 3).collect()
# Materialise each group's iterable as a list so the groups display readably.
[
    [remainder, list(members)]
    for remainder, members in rdd_new
]

[[0, [0, 3, 6, 9]], [1, [1, 4, 7]], [2, [2, 5, 8]]]

In [24]:
# groupByKey: group an existing pair RDD by its keys; each key maps to an
# iterable holding that key's values.
rdd_new = rdd_1.groupByKey().collect()
# Materialise each group's iterable as a list so the groups display readably.
[
    [key, list(grouped)]
    for key, grouped in rdd_new
]

[['python', [1, 2]], ['pandas', [3, 4]]]

In [26]:
# mapValues: apply a function to the value of every pair, leaving keys as-is.
# Each value is a list here, so sum() collapses it to a single number.
rdd = sc.parallelize([
    ("python", [1, 2]),
    ("pandas", [3, 4]),
])
(
    rdd
    .mapValues(sum)
    .collect()
)

[('python', 3), ('pandas', 7)]

In [27]:
# groupBy + mapValues: group the numbers by parity, then turn each group's
# iterable into a list inside Spark instead of in a driver-side comprehension.
rdd = sc.parallelize(range(10))
(
    rdd
    .groupBy(lambda n: n % 2)
    .mapValues(list)
    .collect()
)

[(0, [0, 2, 4, 6, 8]), (1, [1, 3, 5, 7, 9])]

In [28]:
# groupByKey + mapValues: group by key, then convert each group's iterable of
# values into a list. `rdd` is reused by the next two cells.
rdd = sc.parallelize([
    ("python", 1),
    ("python", 2),
    ("pandas", 3),
    ("pandas", 4),
])
(
    rdd
    .groupByKey()
    .mapValues(list)
    .collect()
)

[('python', [1, 2]), ('pandas', [3, 4])]

In [29]:
# Sum the grouped values of every key.
rdd.groupByKey().mapValues(sum).collect()

[('python', 3), ('pandas', 7)]

In [30]:
# Largest value among the grouped values of every key.
rdd.groupByKey().mapValues(max).collect()

[('python', 2), ('pandas', 4)]

In [34]:
# countByKey: count the number of pairs per key. It returns a defaultdict
# directly, so no collect() is needed.
rdd.countByKey()  # append .items() to get (key, count) pairs instead

defaultdict(int, {'python': 2, 'pandas': 2})

In [36]:
# countByValue: count how often each distinct element occurs in the RDD.
# rdd2 is defined here and used by the next cell.
rdd1 = sc.parallelize([(1, 1), (1, 1), (3, 4), (2, 1)])
rdd2 = sc.parallelize([1, 2, 2, 3, 3, 3])
rdd1.countByValue().items() # elements are tuples, so whole tuples are counted

dict_items([((1, 1), 2), ((3, 4), 1), ((2, 1), 1)])

In [37]:
rdd2.countByValue().items() # elements are plain values, counted individually

dict_items([(1, 1), (2, 2), (3, 3)])

In [40]:
# cogroup: for every key, pair up the values that key has in x with the values
# it has in y.
x = sc.parallelize([("a", 1), ("b", 2), ("a", 3)])
y = sc.parallelize([("a", 4), ("b", 5), ("b", 6)])
# Each collected value holds one iterable per input RDD; convert both to lists
# so the result displays readably. The loop variables are named so they do not
# shadow the RDDs x and y.
[
    [key, [list(values) for values in per_rdd]]
    for key, per_rdd in x.cogroup(y).collect()
]

[['a', [[1, 3], [4]]], ['b', [[2], [5, 6]]]]

In [41]:
# sortByKey: order the pairs by key (useful e.g. for sorting records by date).
rdd = sc.parallelize([
    ("python", 1),
    ("python", 2),
    ("pandas", 3),
    ("pandas", 4),
])
(
    rdd
    .sortByKey()
    .collect()
)

[('pandas', 3), ('pandas', 4), ('python', 1), ('python', 2)]

In [42]:
# Input for the sampleByKey demo below: the cartesian product pairs every
# fruit with every number, giving 10 pairs under each fruit key.
fruit = sc.parallelize(["apple", "banana"])
number = sc.parallelize(range(10))
rdd = fruit.cartesian(number)
rdd.collect()

[('apple', 0),
 ('apple', 1),
 ('apple', 2),
 ('apple', 3),
 ('apple', 4),
 ('apple', 5),
 ('apple', 6),
 ('apple', 7),
 ('apple', 8),
 ('apple', 9),
 ('banana', 0),
 ('banana', 1),
 ('banana', 2),
 ('banana', 3),
 ('banana', 4),
 ('banana', 5),
 ('banana', 6),
 ('banana', 7),
 ('banana', 8),
 ('banana', 9)]

In [43]:
# sampleByKey: random sampling performed per key, each key with its own rate.
#   withReplacement - whether sampling is done with replacement
#   fractions       - dict mapping every key to its sampling fraction
#   seed            - random seed, fixed here so the draw is repeatable
frac = {"apple": 0.3, "banana": 0.5}
(
    rdd
    .sampleByKey(withReplacement=False, fractions=frac, seed=1)
    .collect()
)

[('apple', 3),
 ('apple', 5),
 ('apple', 7),
 ('banana', 1),
 ('banana', 5),
 ('banana', 6),
 ('banana', 7),
 ('banana', 9)]

In [47]:
# flatMapValues: expand each value into several values and emit one
# (key, item) pair per item. The identity function simply unpacks the lists.
rdd = sc.parallelize([
    ("a", [1, 2, 3]),
    ("b", [4, 5, 6]),
])
(
    rdd
    .flatMapValues(lambda values: values)
    .collect()
)

[('a', 1), ('a', 2), ('a', 3), ('b', 4), ('b', 5), ('b', 6)]

In [49]:
# join: combine two pair RDDs on their keys. `age` and `gender` are reused by
# the outer-join cells that follow.
age = sc.parallelize([
    ("jack", 20),
    ("rose", 18),
    ("tony", 20),
])
gender = sc.parallelize([
    ("jack", "male"),
    ("rose", "female"),
    ("tom", "male"),
])
# Inner join: only keys present in both RDDs survive, so no None appears.
(
    age
    .join(gender)
    .collect()
)

[('jack', (20, 'male')), ('rose', (18, 'female'))]

In [50]:
age.leftOuterJoin(gender).collect() # left outer join: every key of `age` is kept; a key missing from `gender` gets None

[('jack', (20, 'male')), ('tony', (20, None)), ('rose', (18, 'female'))]

In [51]:
# Right outer join: every key of `gender` is kept; a key missing from `age`
# gets None on the left side of the value tuple.
age.rightOuterJoin(gender).collect()

[('tom', (None, 'male')), ('jack', (20, 'male')), ('rose', (18, 'female'))]

In [52]:
# Full outer join: keys from both RDDs are kept; whichever side lacks the key
# is filled with None.
age.fullOuterJoin(gender).collect()

[('tom', (None, 'male')),
 ('jack', (20, 'male')),
 ('tony', (20, None)),
 ('rose', (18, 'female'))]