# Learning PySpark 
### Video series

### Packt Publishing

**Author**: Tomasz Drabas
**Date**:   2017-12-10





# Section 2: Resilient Distributed Datasets - Transformations

In this section we will look at Resilient Distributed Datasets (RDDs) and the transformations available on them.

## Creating RDDs
### Parallelize a collection
#### 1000 random numbers

In [1]:
import numpy as np

# Build 1000 random numbers on the driver and distribute them as an RDD;
# the second argument (4) is the number of partitions requested.
random_numbers = sc.parallelize([np.random.rand() for _ in range(1000)], 4)
# Last expression of the cell: displays the RDD object, not its contents.
random_numbers

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
3,,pyspark,idle,,,✔


SparkSession available as 'spark'.
ParallelCollectionRDD[0] at parallelize at PythonRDD.scala:480

In [2]:
random_numbers.take(2)

[0.5556534656821962, 0.3949968193806832]

#### Tuples

In [3]:
# An RDD element can be any Python object; here each one is a 3-tuple
# (presumably name, age, height -- the values suggest it, nothing enforces it).
rdd_random_tuples = sc.parallelize([
    ("Mark", 43, "6'1\""),
    ("Stella", 23, "5'6\""),
    ("Skye", 6, "3'11\""),
    ("Albert", 1, "2'7\""),
])
# Peek at the first two elements.
rdd_random_tuples.take(2)

[('Mark', 43, '6\'1"'), ('Stella', 23, '5\'6"')]

### Reading from files
The `sample_data.csv` file was created from the data available at http://www.contextures.com/xlSampleData01.html.

In [4]:
# Read the CSV as plain text: every RDD element is one raw line of the file.
# The second argument (4) is the minimum number of partitions requested.
rdd_from_file = sc.textFile('../data/sample_data.csv', 4)
# The first element returned is the header row, not a data record.
rdd_from_file.take(2)

['OrderDate,Region,Rep,Item,Units,UnitCost,Total', '1/6/16,East,Jones,Pencil,95,1.99,189.05']

## Schema of an RDD
### Structured data

In [9]:
# Structured data: every element is a tuple with the same two-field layout.
rdd_valid_structured = sc.parallelize([
    ('Tom', 0),
    ('Spark', 1),
])

### Unstructured data
Sample Linux log data, source: https://www.cyberciti.biz/faq/linux-log-files-location-and-how-do-i-view-logs-files/

In [6]:
# Unstructured data: every element is a single free-form log line (a plain
# string), so no field layout is imposed on the records.
rdd_valid_unstructured = sc.parallelize([
    'Jul 17 22:04:25 router  dnsprobe[276]: dns query failed',
    'Jul 17 22:04:29 router last message repeated 2 times',
    'Jul 17 22:04:29 router  dnsprobe[276]: Primary DNS server Is Down... Switching To Secondary DNS server',
    'Jul 17 22:05:08 router  dnsprobe[276]: Switching Back To Primary DNS server',
    'Jul 17 22:26:11 debian -- MARK --',
    'Jul 17 22:46:11 debian -- MARK --',
    'Jul 17 22:47:36 router  -- MARK --',
    'Jul 17 22:47:36 router  dnsprobe[276]: dns query failed',
])

### Heterogeneous data

In [10]:
# Heterogeneous data: a single RDD may mix element types -- here a tuple,
# a dict and a list sit side by side.
rdd_valid_heterogeneous = sc.parallelize([
    ('Toll invoice number', 'TRE2321'),
    {'Last bill balance': 0.00, 'New bill balance': 23.92},
    ['2017-11-12 12:32:34PM', '2017-11-14 1:32:56PM'],
])

## Understanding lazy execution

In [None]:
# .map(...) is a transformation: this line only records that the first field
# of every tuple should be extracted -- no data is processed yet.
names_only = rdd_valid_structured.map(lambda element: element[0])

In [None]:
# .collect() is an action: it triggers the actual computation and returns
# every element of the RDD to the driver as a Python list.
names_only.collect()

## .map(...) transformation
### Using lambdas

In [18]:
def splitOnComma(inputString):
    """Break a comma-separated line into its individual fields.

    Args:
        inputString: The text to split.

    Returns:
        A list with the substrings found between commas.
    """
    delimiter = ','
    fields = inputString.split(delimiter)
    return fields

# Apply the named function to every line, then print the first two results
# (the header row and the first data row, each as a list of fields).
for el in rdd_from_file.map(splitOnComma).take(2):
    print(el)

['OrderDate', 'Region', 'Rep', 'Item', 'Units', 'UnitCost', 'Total']
['1/6/16', 'East', 'Jones', 'Pencil', '95', '1.99', '189.05']

In [19]:
# Same split as with splitOnComma, written inline as a lambda instead of a
# named function.
for el in rdd_from_file.map(lambda element: element.split(',')).take(2):
    print(el)

['OrderDate', 'Region', 'Rep', 'Item', 'Units', 'UnitCost', 'Total']
['1/6/16', 'East', 'Jones', 'Pencil', '95', '1.99', '189.05']

### Transforming data

In [20]:
def convertToFloat(inputString):
    """Convert a value to ``float``, falling back to a sentinel on failure.

    Args:
        inputString: The value to convert; in this notebook a single CSV
            field (a string).

    Returns:
        The parsed ``float``, or the sentinel ``-666`` when the value cannot
        be converted (for example a header label such as ``'Units'``).
        Callers detect unparsable fields by comparing against ``-666``.
    """
    try:
        return float(inputString)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures map to the sentinel; a bare ``except``
        # would also swallow KeyboardInterrupt and SystemExit.
        return -666

# Split every line into fields, keep the first four fields as text and
# convert the remaining ones to floats (-666 marks a field that failed to
# convert, which is what happens to the header row).
to_filter = (
    rdd_from_file
    .map(lambda line: line.split(','))
    .map(lambda fields: fields[:4] + [convertToFloat(value) for value in fields[4:]])
)

to_filter.take(2)

[['OrderDate', 'Region', 'Rep', 'Item', -666, -666, -666], ['1/6/16', 'East', 'Jones', 'Pencil', 95.0, 1.99, 189.05]]

## .filter(...) transformation

In [22]:
# Keep only rows whose fifth field (index 4) converted successfully;
# -666 is the failure sentinel returned by convertToFloat, so this drops the
# header row.
filtered = to_filter.filter(lambda element: element[4] != -666)
filtered.take(2)

[['1/6/16', 'East', 'Jones', 'Pencil', 95.0, 1.99, 189.05], ['2017-03-02', 'Central', 'Kivell', 'Binder', 50.0, 19.99, 999.5]]

## .flatMap(...) transformation
### .map to flatMap comparison

In [27]:
def mapExample(inputList):
    """Mimic ``.map(...)``: produce exactly one output item per input item.

    Args:
        inputList: An iterable of lists; each needs an element at index 1.

    Returns:
        A new list of copies of the input items, each with its second
        element multiplied by 10. The input items are left untouched.
    """
    def scaleSecond(row):
        # Work on a copy so the caller's data is not modified.
        clone = row.copy()
        clone[1] *= 10
        return clone

    return [scaleSecond(row) for row in inputList]

def flatMapExample(inputList):
    """Mimic ``.flatMap(...)``: transform each item, then flatten the results.

    Args:
        inputList: An iterable of lists; each needs an element at index 1.

    Returns:
        A single flat list holding the elements of every transformed item
        (second element multiplied by 10), in input order. The input items
        are left untouched.
    """
    def scaleSecond(row):
        # Work on a copy so the caller's data is not modified.
        clone = row.copy()
        clone[1] *= 10
        return clone

    return [value for row in inputList for value in scaleSecond(row)]

# Run the same input through both helpers: the map version keeps one nested
# list per input item, the flatMap version returns one flat list.
sampleList = [[1, 3], [2, 4], [3, 5], [4, 6]]

print(mapExample(sampleList))
print(flatMapExample(sampleList))

[[1, 30], [2, 40], [3, 50], [4, 60]]
[1, 30, 2, 40, 3, 50, 4, 60]

### Filtering malformed records

In [28]:
import datetime as dt

def parseCSVRow(inputRow):
    """Parse one CSV line into a typed record, wrapped for use with flatMap.

    The field layout follows the file's header row:
    ``OrderDate,Region,Rep,Item,Units,UnitCost,Total``.

    Args:
        inputRow: A single comma-separated line of text.

    Returns:
        ``[record]`` -- a one-element list holding the parsed row, with
        OrderDate converted to ``datetime`` (format ``%m/%d/%y``), Units to
        ``int`` and UnitCost/Total to ``float`` -- or ``[]`` when the row is
        malformed. Returning a list lets ``.flatMap(...)`` drop malformed
        rows, since an empty list contributes no elements.
    """
    try:
        rowSplit = inputRow.split(',')
        rowSplit[0] = dt.datetime.strptime(rowSplit[0], '%m/%d/%y')
        rowSplit[4] = int(rowSplit[4])

        for i in (5, 6):
            rowSplit[i] = float(rowSplit[i])
    except (AttributeError, IndexError, TypeError, ValueError):
        # Malformed input: not a string, too few fields, or a date/number
        # that does not parse. Anything else (e.g. KeyboardInterrupt) is
        # allowed to propagate instead of being silently swallowed.
        return []

    return [rowSplit]

# With .map(...) every input line yields exactly one output element, so
# malformed rows (including the header) show up as empty lists.
rdd_from_file.map(parseCSVRow).take(3)

[[], [[datetime.datetime(2016, 1, 6, 0, 0), 'East', 'Jones', 'Pencil', 95, 1.99, 189.05]], []]

In [29]:
# Dropping the empty lists needs an extra .filter(...) step, and each
# surviving record is still wrapped in its one-element list.
rdd_from_file.map(parseCSVRow).filter(lambda element: len(element) > 0).take(3)

[[[datetime.datetime(2016, 1, 6, 0, 0), 'East', 'Jones', 'Pencil', 95, 1.99, 189.05]], [[datetime.datetime(2016, 2, 9, 0, 0), 'Central', 'Jardine', 'Pencil', 36, 4.99, 179.64]], [[datetime.datetime(2016, 2, 26, 0, 0), 'Central', 'Gill', 'Pen', 27, 19.99, 539.73]]]

In [30]:
# .flatMap(...) flattens the lists returned by parseCSVRow: empty lists
# contribute nothing and each valid record is unwrapped, so malformed rows
# are removed in a single step.
rdd_from_file_clean = rdd_from_file.flatMap(parseCSVRow)
rdd_from_file_clean.take(3)

[[datetime.datetime(2016, 1, 6, 0, 0), 'East', 'Jones', 'Pencil', 95, 1.99, 189.05], [datetime.datetime(2016, 2, 9, 0, 0), 'Central', 'Jardine', 'Pencil', 36, 4.99, 179.64], [datetime.datetime(2016, 2, 26, 0, 0), 'Central', 'Gill', 'Pen', 27, 19.99, 539.73]]

## .distinct(...) transformation

### Understanding the distinct method

In [31]:
sampleList = [1, 1, 1, 4, 3, 4, 6, 4, 8, 6, 9]

# `distinct` collects each value the first time it is met (so first-seen
# order is preserved); `seen` records the values already encountered.
distinct = []
seen = {}

for elem in sampleList:
    if elem not in seen:
        seen[elem] = 1
        distinct.append(elem)

print(distinct)

[1, 4, 3, 6, 8, 9]

### Finding distinct in RDD

In [32]:
# Extract field 3 of every record (the 'Item' column in the file header)
# and keep each value once.
items = rdd_from_file_clean.map(lambda element: element[3]).distinct()
items.collect()

['Desk', 'Pen', 'Pencil', 'Pen Set', 'Binder']

In [33]:
# Count the distinct values of field 2 (the 'Rep' column in the file header).
rdd_from_file_clean.map(lambda element: element[2]).distinct().count()

11

## .sample(...) transformation

In [34]:
rdd_from_file_clean.count()

42

In [35]:
# Sample without replacement (first argument False) with a fraction of 0.2.
# The fraction is an expected proportion, not an exact size, and no seed is
# passed, so the count can differ from run to run.
rdd_from_file_clean.sample(False, 0.2).count()

9

## .join(...) transformation

### The mechanics of join

In [36]:
# Two small key-value RDDs; the first element of each tuple is the join key.
a = sc.parallelize([('a', 2), ('b', 3)])
b = sc.parallelize([('a', 4), ('b', 4), ('c', 4)])

# .join(...) keeps only keys present in both RDDs, so 'c' is absent from the
# result; each match comes back as (key, (value_from_a, value_from_b)).
a.join(b).collect()

[('b', (3, 4)), ('a', (2, 4))]

### Append location

In [37]:
# Lookup RDD keyed by region name.
cities = sc.parallelize([
    ('East', 'Boston'),
    ('Central', 'Chicago'),
    ('West', 'Seattle'),
])

# Key every record by its region (field 1), join with the lookup, then
# append the matched city to the end of the original record.
(
    rdd_from_file_clean
    .map(lambda row: (row[1], row))
    .join(cities)
    .map(lambda joined: joined[1][0] + [joined[1][1]])
    .take(2)
)

[[datetime.datetime(2016, 2, 9, 0, 0), 'Central', 'Jardine', 'Pencil', 36, 4.99, 179.64, 'Chicago'], [datetime.datetime(2016, 2, 26, 0, 0), 'Central', 'Gill', 'Pen', 27, 19.99, 539.73, 'Chicago']]

## .repartition(...) transformation

In [38]:
rdd_from_file_clean.getNumPartitions()

4

In [39]:
# Redistribute the raw-lines RDD (rdd_from_file, not the cleaned one) into
# 2 partitions, then confirm the new partition count.
rdd_from_file_repartitioned = rdd_from_file.repartition(2)
rdd_from_file_repartitioned.getNumPartitions()

2

In [40]:
# Key each record by the integer part of field 6 (the 'Total' column in the
# file header), repartition into 2 partitions sorted by that key, then drop
# the key and return the record as a tuple. The partition function is the
# identity, so the key itself selects the target partition (presumably
# modulo the partition count -- confirm against the PySpark docs).
rdd_from_file_repartitioned_sorted = (
    rdd_from_file_clean
    .map(lambda row: (int(row[6]), row))
    .repartitionAndSortWithinPartitions(2, lambda key: key)
    .map(lambda pair: tuple(pair[1]))
)

# .glom() groups the elements of each partition into one list, so the
# result shows which records ended up in which partition.
rdd_from_file_repartitioned_sorted.glom().collect()

[[(datetime.datetime(2017, 10, 31, 0, 0), 'Central', 'Andrews', 'Pencil', 14, 1.29, 18.06), (datetime.datetime(2017, 11, 17, 0, 0), 'Central', 'Jardine', 'Binder', 11, 4.99, 54.89), (datetime.datetime(2017, 5, 14, 0, 0), 'Central', 'Gill', 'Pencil', 53, 1.29, 68.37), (datetime.datetime(2016, 12, 12, 0, 0), 'Central', 'Smith', 'Pencil', 67, 1.29, 86.43), (datetime.datetime(2016, 8, 15, 0, 0), 'East', 'Jones', 'Pencil', 35, 4.99, 174.65), (datetime.datetime(2016, 9, 1, 0, 0), 'Central', 'Smith', 'Desk', 2, 125.0, 250.0), (datetime.datetime(2017, 7, 21, 0, 0), 'Central', 'Morgan', 'Pen Set', 55, 12.49, 686.95)], [(datetime.datetime(2017, 9, 10, 0, 0), 'Central', 'Gill', 'Pencil', 7, 1.29, 9.03), (datetime.datetime(2017, 2, 18, 0, 0), 'East', 'Jones', 'Binder', 4, 4.99, 19.96), (datetime.datetime(2016, 7, 12, 0, 0), 'East', 'Howard', 'Binder', 29, 1.99, 57.71), (datetime.datetime(2016, 5, 22, 0, 0), 'West', 'Thompson', 'Pencil', 32, 1.99, 63.68), (datetime.datetime(2017, 4, 10, 0, 0), 'Cen

In [None]:
rdd_from_file_repartitioned_sorted.getNumPartitions()