In [3]:
from pyspark import SparkConf, SparkContext

conf = SparkConf().setAppName("Spark RDD Course")
sc = SparkContext(conf=conf)

ValueError: Cannot run multiple SparkContexts at once; existing SparkContext(app=Spark RDD Course, master=local[*]) created by __init__ at <ipython-input-1-f122c41e7dce>:4 

# Introduction to `Spark` RDD

In [4]:
rdd = sc.parallelize([2, 3, 4])

Note that `parallelize` takes an optional argument to choose the number of partitions

In [5]:
rdd.getNumPartitions()

4

In [6]:
rdd = sc.parallelize(range(1000), 10)
rdd.getNumPartitions()

10

## Transformations

### `map`

In [7]:
rdd = sc.parallelize([2, 3, 4])
rdd = rdd.map(lambda x: list(range(1, x)))

In [8]:
rdd

PythonRDD[3] at RDD at PythonRDD.scala:53

`map` is a transformation which is lazy evaluate (hence delayed until an action is met in the DAG).

In [9]:
rdd.collect()

[[1], [1, 2], [1, 2, 3]]

### Exercice: `map` with a method

**Warning.** This example is a bad practice !!! Don't do this at home

In [10]:
class TelephoneDB(object):
    
    def __init__(self):
        self.tel = {'stephane': 1234, 'yasser': 4567}
    
    def add_tel(self, name):
        return name, self.tel[name]

In [11]:
#当类中数据特别大时，这么做会把类中所有数据发给每个worker
#非常坏的做法
tel_db = TelephoneDB()
names = ['stephane', 'yasser']
rdd = sc.parallelize(names).map(tel_db.add_tel).collect()
rdd

[('stephane', 1234), ('yasser', 4567)]

- Replace the `tel` dictionary by a `defaultdict` with default number `999` 
- Use it on a `rdd` containing names as above including an unknown one, and try it

In [None]:
from collections import defaultdict

class TelephoneDefaultDB(object):
    
    def __init__(self):
        self.tel = defaultdict(lambda: 999, {'stephane': 1234, 'yasser': 4567})
    
    def add_tel(self, name):
        return name, self.tel[name]
    
    def add_tel_rdd(self, rdd):
        
        return rdd.map(self.add_tel)

In [None]:

tel_db = TelephoneDefaultDB()
names = ['stephane', 'yasser', 'yiyang']
rdd = sc.parallelize(names).map(tel_db.add_tel).collect()
rdd

**Warning**. Once again, this is a bad idea to pass class methods to spark's `map`.
Since `add_tel` needs `self`, the whole object is serialized so that `spark` can use it.
This breaks if the `tel` is large, or if it is not serializable.

### `flatMap`

In [14]:
rdd = sc.parallelize([2, 3, 4, 5])
rdd.flatMap(lambda x: range(1, x)).collect()

[1, 1, 2, 1, 2, 3, 1, 2, 3, 4]

### `filter`

In [15]:
rdd = sc.parallelize(range(10))
rdd.filter(lambda x: x % 2 == 0).collect()

[0, 2, 4, 6, 8]

### `distinct`

In [16]:
rdd = sc.parallelize([1, 1, 4, 2, 1, 3, 3])
rdd.distinct().collect()

[4, 1, 2, 3]

### "Pseudo-set" operations

In [17]:
rdd1 = sc.parallelize(range(5))
rdd2 = sc.parallelize(range(3, 9))
rdd3 = rdd1.union(rdd2)
rdd3.collect()

[0, 1, 2, 3, 4, 3, 4, 5, 6, 7, 8]

In [18]:
rdd3.distinct().collect()

[0, 8, 1, 2, 3, 4, 5, 6, 7]

In [19]:
rdd1 = sc.parallelize([1, 2])
rdd2 = sc.parallelize(["a", "b"])
rdd1.cartesian(rdd2).collect()

[(1, 'a'), (1, 'b'), (2, 'a'), (2, 'b')]

## Actions

Well, `collect` is obviously an action...

### `count`, `countByValue`

In [20]:
rdd = sc.parallelize([1, 3, 1, 2, 2, 2])
rdd.count()

6

In [21]:
rdd.countByValue()

defaultdict(int, {1: 2, 3: 1, 2: 3})

### `take`, `takeOrdered`

In [22]:
rdd = sc.parallelize([(3, 'a'), (1, 'b'), (2, 'd')])

In [23]:
rdd.takeOrdered(2)

[(1, 'b'), (2, 'd')]

In [24]:
rdd.takeOrdered(2, key=lambda x: x[1])

[(3, 'a'), (1, 'b')]

### `reduce`, `fold`

In [25]:
rdd = sc.range(1, 4)
rdd.reduce(lambda a, b: a + b)

6

In [26]:
rdd.fold(0, lambda a, b: a + b)

6

In [27]:
rdd = sc.parallelize([1, 2, 4], 2)
rdd.fold(2.5, lambda a, b: a + b)

14.5

In [28]:
rdd = sc.parallelize([1, 2, 3], 5)
rdd.fold(2, lambda a, b: a + b)

18

In [29]:
rdd.getNumPartitions()

5

### `aggregate`

In [30]:
seqOp = lambda x, y: (x[0] + y, x[1] + 1)
combOp = lambda x, y: (x[0] + y[0], x[1] + y[1])
sc.parallelize([1, 2, 3, 4]).aggregate((0, 0), seqOp, combOp)

(10, 4)

### Exercice: sum of powers with `aggregate`

- Using `aggregate`, compute the sum, the sum of squares $x^2$ and the sum of $x^3$ for 
$x \in \{1, \ldots, 10 \}$.
- Check your computations using `numpy`

In [31]:
seqOp = lambda x, y: (x[0] + y, x[1] + y ** 2, x[2] + y ** 3)

In [32]:
combOp = lambda x, y: (x[0] + y[0], x[1] + y[1], x[2] + y[2])

In [33]:
sc.range(1, 11).aggregate((0, 0, 0), seqOp, combOp)

(55, 385, 3025)

In [34]:
import numpy as np

x = np.arange(1, 11)
x

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

In [35]:
x.sum(), (x**2).sum(), (x**3).sum()

(55, 385, 3025)

# `PairRDD`

In [36]:
rdd = sc.parallelize([[1, "a", 7], [2, "b", 13], [2, "c", 17]])
rdd = rdd.map(lambda x: (x[0], x[1:]))
rdd.collect()

[(1, ['a', 7]), (2, ['b', 13]), (2, ['c', 17])]

## Transformations

### `keys`, `values`

In [37]:
rdd.keys().collect()

[1, 2, 2]

In [38]:
rdd.values().collect()

[['a', 7], ['b', 13], ['c', 17]]

**Warning**. An element mut be tuples with two elements (the key and the value)

In [39]:
rdd = sc.parallelize([[1, "a", 7], [2, "b", 13], [2, "c", 17]])
rdd.keys().collect()

[1, 2, 2]

In [40]:
rdd.values().collect()

['a', 'b', 'c']

The values are **not** what we expected wrong... so we **must** do

In [41]:
rdd = sc.parallelize([[1, "a", 7], [2, "b", 13], [2, "c", 17]]).map(lambda x: (x[0], x[1:]))
rdd.keys().collect()

[1, 2, 2]

In [42]:
rdd.values().collect()

[['a', 7], ['b', 13], ['c', 17]]

In [43]:
rdd.values().collect()

[['a', 7], ['b', 13], ['c', 17]]

Now the values are correct.

### `mapValues`, `flatMapValues`

In [44]:
rdd = sc.parallelize([("a", "x y z"), ("b", "p r")])
rdd.mapValues(lambda v: v.split(' ')).collect()

[('a', ['x', 'y', 'z']), ('b', ['p', 'r'])]

In [45]:
rdd.flatMapValues(lambda v: v.split(' ')).collect()

[('a', 'x'), ('a', 'y'), ('a', 'z'), ('b', 'p'), ('b', 'r')]

### `groupByKey`

In [46]:
rdd = sc.parallelize([("a", 1), ("b", 1), ("a", 1), ("b", 3), ("c", 42)])
rdd.groupByKey().mapValues(list).collect()

[('b', [1, 3]), ('c', [42]), ('a', [1, 1])]

### `reduceByKey`

In [47]:
rdd = sc.parallelize([("a", 1), ("b", 1), ("a", 1)])
rdd.reduceByKey(lambda a, b: a + b).collect()

[('b', 1), ('a', 2)]

### `combineByKey`

In [48]:
rdd = sc.parallelize([('a', 1), ('b', 2), ('a', 13)])

def add(a, b): 
    return a + str(b)

rdd.combineByKey(str, add, add).collect()

[('b', '2'), ('a', '113')]

### `join`, `rightOuterJoin`, `leftOuterJoin`

In [54]:
employees = sc.parallelize([
    (31, "Rafferty"),
    (33, "Jones"),
    (33, "Heisenberg"),
    (34, "Robinson"),
    (34, "Smith"),
    (None, "Williams")
])

In [55]:
departments = sc.parallelize([
    (31, "Sales"),
    (33, "Engineering"),
    (34, "Clerical"),
    (35, "Marketing")
])

In [56]:
employees.join(departments).sortByKey().collect()

[(31, ('Rafferty', 'Sales')),
 (33, ('Jones', 'Engineering')),
 (33, ('Heisenberg', 'Engineering')),
 (34, ('Robinson', 'Clerical')),
 (34, ('Smith', 'Clerical'))]

In [57]:
employees.rightOuterJoin(departments).sortByKey().collect()

[(31, ('Rafferty', 'Sales')),
 (33, ('Jones', 'Engineering')),
 (33, ('Heisenberg', 'Engineering')),
 (34, ('Robinson', 'Clerical')),
 (34, ('Smith', 'Clerical')),
 (35, (None, 'Marketing'))]

In [53]:
employees.leftOuterJoin(departments).collect()

[(None, ('Williams', None)),
 (33, ('Jones', 'Engineering')),
 (33, ('Heisenberg', 'Engineering')),
 (34, ('Robinson', 'Clerical')),
 (34, ('Smith', 'Clerical')),
 (31, ('Rafferty', 'Sales'))]

## Actions

In [58]:
employees.countByKey()

defaultdict(int, {31: 1, 33: 2, 34: 2, None: 1})

In [62]:
employees.lookup(33)

['Jones', 'Heisenberg']

In [63]:
employees.lookup(None)

['Williams']

In [64]:
employees.collectAsMap()

{31: 'Rafferty', 33: 'Heisenberg', 34: 'Smith', None: 'Williams'}