In [1]:
import time

In [2]:
import pyspark
from pyspark import SparkContext

# Reuse the SparkContext that is already active in this process, or start a
# new one if there is none. This makes the cell safe to re-run, and it does
# not depend on whether the name `sc` happens to be bound in the kernel.
sc = SparkContext.getOrCreate()

In [3]:
# Smoke test: hand a small local list to Spark, then fetch the first three
# elements back to the driver.
rdd = sc.parallelize([4, 10, 9, 7])
rdd.take(3)

[4, 10, 9]

In [83]:
# Sum of the squares of 0 .. n-1, with the input split into 10 partitions.
n = 100_000
rdd = sc.parallelize(range(n), 10)
rdd.map(lambda value: value * value).sum()

333328333350000

In [85]:
# Past a certain data size, Python's built-in tools are outperformed by a
# parallel processing framework. Time the same sum-of-squares job both ways
# at increasing input sizes.

# The loop uses its own variable (`size`) so the notebook-level `n` keeps its
# value for the cells that follow.
sizes = (100_000, 1_000_000, 10_000_000)

for size in sizes:
    # perf_counter() is the clock intended for measuring short intervals.
    start = time.perf_counter()
    sc.parallelize(range(size), numSlices=10).map(lambda x: x ** 2).sum()
    print('PySpark', size, time.perf_counter() - start)

    start = time.perf_counter()
    sum(x ** 2 for x in range(size))
    print('Python ', size, time.perf_counter() - start)

    # Separator between size groups, but not after the last one.
    if size != sizes[-1]:
        print('--')

PySpark 100000 0.1241462230682373
Python  100000 0.030167818069458008
--
PySpark 1000000 0.16159510612487793
Python  1000000 0.26383113861083984
--
PySpark 10000000 1.564227819442749
Python  10000000 2.732858657836914


In [84]:
# Another way to find that sum.
# This is about as fast as using parallelize(range(...)).
# `n` is bound here so the cell gives the same answer regardless of what
# value earlier cells left in `n`.
n = 100_000
rdd = sc.range(n, numSlices=10)
rdd.map(lambda x: x ** 2).sum()

333328333350000

In [107]:
# Five elements over five slices: glom() groups the elements of each
# partition into a list, so collect() returns one list per partition.
rdd = sc.parallelize(range(5), 5)
rdd.glom().collect()

[[0], [1], [2], [3], [4]]

In [108]:
# Same data in a single slice: everything lands in one partition, so glom()
# yields just one list.
rdd = sc.parallelize(range(5), 1)
rdd.glom().collect()

[[0, 1, 2, 3, 4]]

Further reading — "Data Wrangling with PySpark for Data Scientists Who Know Pandas" (Andrew Ray): https://www.slideshare.net/databricks/data-wrangling-with-pyspark-for-data-scientists-who-know-pandas-with-andrew-ray