In [1]:
import findspark

# findspark.init() has to run BEFORE pyspark is imported: its job is to locate
# the Spark installation and put its Python bindings on sys.path. Calling it
# after the import (as before) only works when pyspark is already importable.
findspark.init()

import pyspark
from pyspark import SparkConf, SparkContext

# Application name 'test3', local master with 4 worker threads.
conf = SparkConf().setAppName('test3').setMaster('local[4]')

# getOrCreate() returns the already-running context when this cell is executed
# again in the same kernel, instead of raising because only one SparkContext
# may be active at a time. Note: an existing context is reused as-is, so `conf`
# only takes effect when the context is first created.
sc = SparkContext.getOrCreate(conf=conf)

# Action动作算子

In [2]:
# Distribute the integers 0..9 as an RDD; the action examples below use it.
rdd = sc.parallelize(range(10))

In [3]:
# collect: bring every element of the RDD back to the driver as a Python list.
rdd.collect()

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [5]:
# take: fetch only the first `num` elements instead of the whole RDD.
rdd.take(num=4)

[0, 1, 2, 3]

In [6]:
# first: return the first element of the RDD.
rdd.first()

0

In [7]:
# top: the `num` largest elements, returned in descending order.
rdd.top(num=3)

[9, 8, 7]

In [8]:
# takeOrdered: the first `num` elements after sorting in ascending order.
# rdd_2 is deliberately unsorted so the ordering effect is visible.
rdd_2 = sc.parallelize([10, 7, 6, 9, 4, 3, 5, 2, 1])
rdd_2.takeOrdered(5)

[1, 2, 3, 4, 5]

In [11]:
# Negating each value in the sort key reverses the order, so this yields the
# five largest elements in descending order.
rdd_2.takeOrdered(5, key=lambda value: -value)

[10, 9, 7, 6, 5]

In [12]:
# takeSample arguments: sample with replacement?; sample size; random seed.
# A fixed seed keeps the sample reproducible between runs.
rdd_2.takeSample(withReplacement=False, num=5, seed=0)

[2, 3, 7, 9, 4]

In [13]:
# count: number of elements in the RDD.
rdd_2.count()

9

In [14]:
# stats: one-shot summary (count, mean, stdev, max, min) of a numeric RDD.
rdd_2.stats()

(count: 9, mean: 5.222222222222222, stdev: 2.8974232912011777, max: 10.0, min: 1.0)

In [15]:
# The integers 0..50 inclusive (51 elements), used for the histogram examples.
rdd_3 = sc.parallelize(range(51))

In [16]:
# An integer bucket count splits the value range into equal-width buckets:
# [0, 25) holds 25 elements and [25, 50] holds 26, because the last bucket is
# closed on the right.
rdd_3.histogram(buckets=2)

([0, 25, 50], [25, 26])

In [17]:
# Explicit bucket boundaries: [0, 10), [10, 40) and [40, 50].
rdd_3.histogram(buckets=[0, 10, 40, 50])

([0, 10, 40, 50], [10, 30, 11])

In [18]:
# reduce: pairwise merge. The result of each call becomes the left operand and
# is combined with the next element through the same function.
# NOTE(review): PySpark documents reduce as expecting a commutative and
# associative operator; addition satisfies both.
rdd_4 = sc.parallelize(range(10))
rdd_4.reduce(lambda total, item: total + item)

45

In [19]:
# foreach: run a function on every element purely for its side effect.
# Each element is added to an accumulator that starts at 0, and the driver
# then reads the accumulated total.
acc = sc.accumulator(0)
rdd_4.foreach(lambda item: acc.add(item))
acc.value

45

In [22]:
# collectAsMap: gather an RDD of (key, value) pairs into a Python dict on the
# driver.
rdd_5 = sc.parallelize([("a", 1), ("b", 2), ("c", 3)])
rdd_5.collectAsMap()

{'a': 1, 'b': 2, 'c': 3}

In [27]:
import shutil

# saveAsTextFile writes a DIRECTORY of part files at the given path (despite
# the '.txt' suffix) and refuses to overwrite an existing one, so re-running
# this cell used to fail. Remove any previous output first to keep the cell
# re-runnable.
# Assumes the path resolves to the local filesystem (the context uses a local
# master) -- TODO confirm if a different default filesystem is configured.
output_path = './data/output.txt'
shutil.rmtree(output_path, ignore_errors=True)
rdd_5.saveAsTextFile(output_path)

In [28]:
# Read the text output saved from rdd_5 back in as a new RDD.
rdd_6 = sc.textFile('./data/output.txt')

In [30]:
# Disabled: un-comment to display the contents read back into rdd_6.
#rdd_6.collect()

# Transformation变换算子

In [32]:
# Base RDD of the integers 0..9 for the transformation examples; collect()
# shows its contents.
rdd_7 = sc.parallelize(range(10))
rdd_7.collect()

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [33]:
# map: apply a function to every element, producing exactly one output
# element per input element (here: the square of each integer).
rdd_7.map(lambda n: n * n).collect()

[0, 1, 4, 9, 16, 25, 36, 49, 64, 81]

In [40]:
# Two short sentences, used to contrast map and flatMap in the next cells.
rdd_8 = sc.parallelize(['hello world', 'hello python'])
rdd_8.collect()

['hello world', 'hello python']

In [41]:
# With map, each sentence becomes ONE element: its list of words. The result
# is therefore a list of lists.
rdd_8.map(lambda sentence: sentence.split(" ")).collect()

[['hello', 'world'], ['hello', 'python']]

In [42]:
# With flatMap, the per-sentence word lists are flattened into a single
# sequence of words.
rdd_8.flatMap(lambda sentence: sentence.split(" ")).collect()

['hello', 'world', 'hello', 'python']

In [43]:
# distinct: drop duplicate elements. The displayed result is not sorted, so
# do not rely on the order of the output.
rdd_9 = sc.parallelize([1, 1, 2, 2, 3, 3, 4, 5])
rdd_9.distinct().collect()

[4, 1, 5, 2, 3]

In [44]:
# cartesian: Cartesian product -- every element of `a` paired with every
# element of `b`.
a = sc.parallelize([1, 2])
b = sc.parallelize(["python", "pyspark"])
a.cartesian(other=b).collect()

[(1, 'python'), (1, 'pyspark'), (2, 'python'), (2, 'pyspark')]

In [46]:
# sortBy: order the tuples by a key function -- here their third field, in
# ascending order (the default direction).
rdd_10 = sc.parallelize([(1, 2, 3), (3, 2, 2), (4, 1, 1)])
rdd_10.sortBy(lambda row: row[2]).collect()

[(4, 1, 1), (3, 2, 2), (1, 2, 3)]

In [47]:
# zip: pair up the elements of two RDDs position by position.
# Both RDDs here hold three elements.
rdd1 = sc.parallelize([1, 2, 3])
rdd2 = sc.parallelize(["python", "pandas", "pyspark"])
rdd1.zip(rdd2).collect()

[(1, 'python'), (2, 'pandas'), (3, 'pyspark')]

In [48]:
# zipWithIndex: pair each element with its position, producing
# (element, index) tuples with indices starting at 0.
rdd_11 = sc.parallelize(["python", "pandas", "pyspark"])
rdd_11.zipWithIndex().collect()

[('python', 0), ('pandas', 1), ('pyspark', 2)]