# First steps in PySpark 

## Work with RDD and Pair RDD abstractions 

In [1]:
sc

In [2]:
import random

random.seed(42)

def plural(string):
    """Return ``string`` with an 's' appended (naive English plural)."""
    suffix = 's'
    return string + suffix

# Pool of hat names; 100 independent draws are taken from it.
HAT_NAMES = ('panama', 'sombrero', 'fedora', 'top hat')
hats = [random.choice(HAT_NAMES) for _draw in range(100)]

In [3]:
my_first_rdd = sc.parallelize(hats) 
my_first_rdd

ParallelCollectionRDD[0] at parallelize at PythonRDD.scala:480

In [4]:
hat_plurals = my_first_rdd.map(plural) 
hat_plurals

PythonRDD[1] at RDD at PythonRDD.scala:48

In [5]:
hat_plurals.take(5) 

['panamas', 'panamas', 'fedoras', 'sombreros', 'sombreros']

### Exercise

Using the RDD generated from `hats`, calculate how many hats have a name that does **not** start with an 's'.

In [6]:
# Count the hats whose name does not begin with 's'.
# str.startswith() is used instead of indexing x[0] so that an empty string
# cannot raise IndexError inside the Spark task.
(
    hat_plurals
    .filter(lambda name: not name.startswith('s'))
    .count()
)
           

71

## Storage Levels can save time

In [10]:
from pyspark import StorageLevel

# Some of the possible storage levels
StorageLevel.DISK_ONLY
StorageLevel.MEMORY_AND_DISK_2
# MEMORY_ONLY is used instead of MEMORY_ONLY_SER: PySpark deprecated the _SER
# constants in 2.0 as plain aliases (Python records are always serialized),
# and newer PySpark releases no longer define them.
StorageLevel.MEMORY_ONLY

StorageLevel(False, True, False, False, 1)

In [11]:
hat_plurals.cache() # equivalent to: hat_plurals.persist(StorageLevel.MEMORY_ONLY)

PythonRDD[1] at RDD at PythonRDD.scala:48

In [12]:
hat_plurals.map(lambda x: 2).sum()

200

In [14]:
hat_plurals.unpersist()
hat_plurals.persist(StorageLevel.MEMORY_AND_DISK_2)

PythonRDD[1] at RDD at PythonRDD.scala:48

In [15]:
hat_plurals.getStorageLevel()

StorageLevel(True, True, False, False, 2)

### Partitioning

In [16]:
hat_plurals.getNumPartitions()

4

In [18]:
repartitioned = sc.parallelize(range(20)).repartition(5)
repartitioned.getNumPartitions()

5

In [19]:
repartitioned.glom().collect() 

[[15, 16, 17, 18, 19],
 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [],
 [],
 [10, 11, 12, 13, 14]]

## Grouping strategies in Spark

In [57]:
my_first_rdd.take(5)

['panama', 'panama', 'fedora', 'sombrero', 'sombrero']

In [26]:
pair_rdd = my_first_rdd.map(lambda word: (word, 1))
pair_rdd.take(5)

[('panama', 1), ('panama', 1), ('fedora', 1), ('sombrero', 1), ('sombrero', 1)]

In [29]:
pair_rdd.groupByKey().mapValues(len).collect()

[('fedora', 24), ('top hat', 20), ('panama', 27), ('sombrero', 29)]

In [30]:
pair_rdd.reduceByKey(lambda x, y: x + y).collect()

[('fedora', 24), ('top hat', 20), ('panama', 27), ('sombrero', 29)]

In [31]:
# doing all steps at once
# All steps at once: drop hats starting with 's', pair each remaining hat
# with 1, and sum the ones per hat name.
# str.startswith() is safe on empty strings, unlike indexing hat[0].
(
    my_first_rdd
    .filter(lambda hat: not hat.startswith('s'))
    .map(lambda hat: (hat, 1))
    .reduceByKey(lambda x, y: x + y)
    .collect()
)

[('fedora', 24), ('top hat', 20), ('panama', 27)]

## Apply word count to a file

We will use the [Complete Works of William Shakespeare](http://www.gutenberg.org/ebooks/100) from [Project Gutenberg](http://www.gutenberg.org/wiki/Main_Page)

In [81]:
!wget -v http://www.gutenberg.org/files/100/100-0.txt -O shakespeare.txt

--2018-09-08 12:38:33--  http://www.gutenberg.org/files/100/100-0.txt
Resolving www.gutenberg.org (www.gutenberg.org)... 152.19.134.47, 2610:28:3090:3000:0:bad:cafe:47
Connecting to www.gutenberg.org (www.gutenberg.org)|152.19.134.47|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5852404 (5,6M) [text/plain]
Saving to: ‘shakespeare.txt’


2018-09-08 12:38:38 (1,41 MB/s) - ‘shakespeare.txt’ saved [5852404/5852404]



### Removing capitalization and punctuation

In [32]:
import re

test_line = 'See at the end of this file: * CONTENT NOTE (added in 2017) *'

def clean(line):
    """Return ``line`` lower-cased, keeping only word characters and spaces.

    Every character that is neither a regex word character (letter, digit or
    underscore) nor a literal space is deleted rather than replaced, so
    tokens separated only by punctuation are joined together
    (e.g. ``www.gutenberg.org`` becomes ``wwwgutenbergorg``).

    Args:
        line: the text to normalise.

    Returns:
        The normalised string.
    """
    # Raw string: in a plain literal the backslash-w sequence is an invalid
    # escape that Python warns about.
    return re.sub(r'[^\w ]', '', line).lower()
    
clean(test_line)

'see at the end of this file  content note added in 2017 '

In [33]:
# Read the text file as an RDD of lines, then normalise every line with clean().
raw_lines = sc.textFile('shakespeare.txt')
clean_lines = raw_lines.map(clean)
clean_lines.take(10)

['',
 'project gutenbergs the complete works of william shakespeare by william',
 'shakespeare',
 '',
 'this ebook is for the use of anyone anywhere in the united states and',
 'most other parts of the world at no cost and with almost no restrictions',
 'whatsoever  you may copy it give it away or reuse it under the terms',
 'of the project gutenberg license included with this ebook or online at',
 'wwwgutenbergorg  if you are not located in the united states youll',
 'have to check the laws of the country where you are located before using']

In [34]:
words = clean_lines.flatMap(lambda line: line.split()) 
words.take(10)

['project',
 'gutenbergs',
 'the',
 'complete',
 'works',
 'of',
 'william',
 'shakespeare',
 'by',
 'william']

In [35]:
# Defensive guard: drop empty tokens. str.split() without a separator (used to
# build `words`) does not emit empty strings, so this is expected to be a no-op.
words = words.filter(lambda word: word != '')

### Count the words and show the top 10

In [41]:
from operator import add

# Word count: pair every word with 1, then add up the ones per word.
word_ones = words.map(lambda word: (word, 1))
counts = word_ones.reduceByKey(add)

In [43]:
counts.takeOrdered(10, lambda tup: -tup[1])

[('the', 30002),
 ('and', 28358),
 ('i', 21867),
 ('to', 20816),
 ('of', 18815),
 ('a', 15992),
 ('you', 14437),
 ('my', 13191),
 ('in', 12032),
 ('that', 11781)]

## Practical - ETL with airline coupon data

In [1]:
lines = sc.textFile('coupon150720.csv')
lines.take(5)

['79062005698500,1,MAA,AUH,9W,9W,56.79,USD,1,H,H,0526,150904,OK,IAF0',
 '79062005698500,2,AUH,CDG,9W,9W,84.34,USD,1,H,H,6120,150905,OK,IAF0',
 '79062005924069,1,CJB,MAA,9W,9W,60.0,USD,1,H,H,2768,150721,OK,IAA0',
 '79065668570385,1,DEL,DXB,9W,9W,160.63,USD,2,S,S,0546,150804,OK,INA0',
 '79065668737021,1,AUH,IXE,9W,9W,152.46,USD,1,V,V,0501,150803,OK,INA0']

### Exercise

1. Take fields 0, 2, 3, 4, and 6 from each line of coupons
2. Keep only the amount, then compute its average, maximum, minimum and standard deviation

In [2]:
test_line = '79062005698500,1,MAA,AUH,9W,9W,56.79,USD,1,H,H,0526,150904,OK,IAF0'

def coupon_from_line(line):
    """Parse one comma-separated coupon line into a 5-tuple.

    Fields 0, 2, 3 and 4 are kept as strings and field 6 is converted to
    float. Later cells in this notebook treat them as identifier, origin,
    destination, airline and amount respectively.

    Raises:
        IndexError: if the line has fewer than 7 comma-separated fields.
        ValueError: if field 6 is not a valid float.
    """
    parts = line.split(',')
    identifier, origin, destination, airline = parts[0], parts[2], parts[3], parts[4]
    amount = float(parts[6])
    return identifier, origin, destination, airline, amount

coupon_from_line(test_line)

('79062005698500', 'MAA', 'AUH', '9W', 56.79)

In [3]:
coupons = lines.map(coupon_from_line)
coupons.take(5)

[('79062005698500', 'MAA', 'AUH', '9W', 56.79),
 ('79062005698500', 'AUH', 'CDG', '9W', 84.34),
 ('79062005924069', 'CJB', 'MAA', '9W', 60.0),
 ('79065668570385', 'DEL', 'DXB', '9W', 160.63),
 ('79065668737021', 'AUH', 'IXE', '9W', 152.46)]

In [47]:
amounts = coupons.map(lambda coupon: coupon[4])
amounts.take(4)

[56.79, 84.34, 60.0, 160.63]

In [48]:
# max
amounts.reduce(lambda x, y: x if x > y else y)

6355194.0

In [49]:
# min
amounts.reduce(lambda x, y: x if x < y else y)

0.0

In [21]:
# save this point into cache
amounts.cache()

PythonRDD[15] at RDD at PythonRDD.scala:48

In [50]:
# calculate std by using formula
def _as_moments(amount):
    """Map one amount to its (count, sum, sum of squares) contribution."""
    return (1, amount, amount ** 2)


def _merge_moments(left, right):
    """Add two (count, sum, sum of squares) triples component-wise."""
    return (left[0] + right[0], left[1] + right[1], left[2] + right[2])


# One pass over the data gathers everything the std formula needs.
count, total, sum_squares = amounts.map(_as_moments).reduce(_merge_moments)

count, total, sum_squares

(1232662, 184831898.49994844, 122763999131054.75)

In [52]:
from math import sqrt

# Mean and population variance from the aggregated count, sum and sum of squares.
avg = total / count
# Floating-point cancellation can push the difference slightly below zero when
# the spread is tiny compared with the mean; clamp so sqrt() cannot raise.
var = max((sum_squares - count * (avg) ** 2) / count, 0.0)

avg, sqrt(var)

(149.94532037164157, 9978.482086123315)

### Exercise

Get stats on ticket amount for all tickets with destination MAD

You will need to extract ticket amounts with destination MAD, and then calculate:

1. Total ticket amounts per origin
2. Top 10 airlines by average amount

In [4]:
coupons.take(5)

[('79062005698500', 'MAA', 'AUH', '9W', 56.79),
 ('79062005698500', 'AUH', 'CDG', '9W', 84.34),
 ('79062005924069', 'CJB', 'MAA', '9W', 60.0),
 ('79065668570385', 'DEL', 'DXB', '9W', 160.63),
 ('79065668737021', 'AUH', 'IXE', '9W', 152.46)]

#### Part 1

In [5]:
# keep only the coupons whose destination (tuple position 2) is MAD
mad_coupons = coupons.filter(lambda coupon: coupon[2] == 'MAD')
mad_coupons.take(5)

[('79062005639642', 'BRU', 'MAD', 'UX', 21.02),
 ('79065668754871', 'BRU', 'MAD', 'SN', 27.66),
 ('79065668917696', 'CDG', 'MAD', 'AF', 46.97),
 ('79062006133090', 'CDG', 'MAD', 'AF', 3.38),
 ('79062006110497', 'CDG', 'MAD', 'AF', 26.02)]

In [6]:
# reduce each coupon to an (origin, amount) pair, i.e. tuple positions 1 and 4
pair_rdd = mad_coupons.map(lambda coupon: (coupon[1], coupon[4]))
pair_rdd.take(5)

[('BRU', 21.02), ('BRU', 27.66), ('CDG', 46.97), ('CDG', 3.38), ('CDG', 26.02)]

In [7]:
# sum the amounts per origin with reduceByKey, then show the 5 largest totals
from operator import add

result = pair_rdd.reduceByKey(add)
result.takeOrdered(5, key=lambda t: -t[-1])

[('CCS', 94528.68),
 ('GRU', 87192.63999999998),
 ('EZE', 81074.63999999997),
 ('BOG', 74644.45000000001),
 ('LHR', 69609.53000000003)]

#### Part 2

In [8]:
mad_coupons.take(5)

[('79062005639642', 'BRU', 'MAD', 'UX', 21.02),
 ('79065668754871', 'BRU', 'MAD', 'SN', 27.66),
 ('79065668917696', 'CDG', 'MAD', 'AF', 46.97),
 ('79062006133090', 'CDG', 'MAD', 'AF', 3.38),
 ('79062006110497', 'CDG', 'MAD', 'AF', 26.02)]

In [9]:
# extracting columns needed
mad_filtered = mad_coupons.map(lambda coupon: (coupon[3], (coupon[4], 1)))
mad_filtered.take(5)

[('UX', (21.02, 1)),
 ('SN', (27.66, 1)),
 ('AF', (46.97, 1)),
 ('AF', (3.38, 1)),
 ('AF', (26.02, 1))]

In [13]:
# create a combiner that will operate the tuples
tuple_a = (21.02, 1)
tuple_b = (27.66, 1)

def combiner(tuple_a, tuple_b):
    """Add two pairs position by position.

    Only positions 0 and 1 of each argument are read; in this notebook the
    pairs are (amount, count).
    """
    first = tuple_a[0] + tuple_b[0]
    second = tuple_a[1] + tuple_b[1]
    return (first, second)

combiner(tuple_a, tuple_b)

(48.68, 2)

In [17]:
# reduceByKey merges only the values that share a key, using combiner; each
# airline code ends up with one (total amount, number of coupons) pair.
counts_per_airline = mad_filtered.reduceByKey(combiner)
counts_per_airline.take(5)

[('SN', (3092.6099999999997, 61)),
 ('IB', (928396.8400000058, 7526)),
 ('KL', (18007.62999999999, 323)),
 ('AZ', (3715.28, 54)),
 ('', (0.0, 168))]

In [19]:
# mapValues applies the function to each value and leaves the key unchanged:
# here it turns (total amount, number of coupons) into the average amount.
averages = counts_per_airline.mapValues(lambda t: t[0] / t[1])
# negate the average so takeOrdered returns the 10 highest first
averages.takeOrdered(10, key=lambda t: -t[-1])

[('V0', 5418.098666666667),
 ('AC', 740.6200000000001),
 ('KE', 688.5261538461539),
 ('SV', 553.1742553191489),
 ('OB', 535.5044444444444),
 ('AR', 513.5304761904762),
 ('AV', 450.19509554140177),
 ('AM', 440.73421052631585),
 ('C2', 397.87),
 ('LA', 379.9537078651686)]