# What is Spark? <!-- \index{Spark} -->

## The Goal of Apache Spark

# Basic Concepts

In [None]:

from pyspark import SparkConf, SparkContext

conf = SparkConf().setMaster("local[*]").setAppName("Chapter 4: pyspark")
sc = SparkContext(conf=conf)

25/05/22 11:49:49 WARN Utils: Your hostname, Chris-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 172.20.10.8 instead (on interface en0)
25/05/22 11:49:49 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/22 11:49:49 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/05/22 11:49:49 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 50827)
Traceback (most recent call last):
  File "/opt/homebrew/Cellar/python@3.13/3.13.2/Frameworks/Python.framework/Versions/3.13/lib/python3.13/socketserver.py", line 318, in _handle_request_noblock
    self.process_request(request, client_address)
    ~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/homebrew/Cellar/python@3.13/3.13.2/Frameworks/Python.framework/Versions/3.13/lib/python3.13/socketserver.py", line 349, in process_request
    self.finish_request(request, client_address)
    ~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/homebrew/Cellar/python@3.13/3.13.2/Frameworks/Python.framework/Versions/3.13/lib/python3.13/socketserver.py", line 362, in finish_request
    self.RequestHandlerClass(request, client_address, self)
    ~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/homebrew/Cellar/python@3.13/3.13.2/Frameworks/Python.fra

In [2]:
sc

# RDDs: Resilient Distributed Datasets <!-- \index{API!RDD} -->

## Creating RDDs

In [3]:
data = sc.parallelize([1, 2, 3, 4, 5])

25/05/22 11:50:00 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [4]:
print(data)

ParallelCollectionRDD[0] at readRDDFromFile at PythonRDD.scala:289


In [5]:
bookRDD = sc.textFile("data/quixote.txt")

In [7]:
bookRDD_hdfs = sc.textFile("hdfs://localhost:9000/user/pszit/quixote.txt")

In [8]:
type(bookRDD)

pyspark.rdd.RDD

In [6]:
bookRDD.getNumPartitions()

2

In [10]:
bookRDD = sc.textFile("data/quixote.txt", 3)

In [11]:
bookRDD.getNumPartitions()

3

## Operations with RDDs

In [7]:
quixote_lines = bookRDD.filter(lambda line: "Quixote" in line)

In [8]:
quixote_lines.first()

                                                                                

'The Project Gutenberg EBook of The History of Don Quixote by Miguel de Cervantes'

## Transformations

### Basic Transformations

In [9]:
rdd = sc.parallelize([1, 2, 3, 4, 1, 2, 3, 4])

#### `map(func)`

In [10]:
#operazione persa: l'RDD non muta , bisognava salvarlo da qualche parte 
rdd.map(lambda x: x * 2) 

PythonRDD[5] at RDD at PythonRDD.scala:53

In [11]:
rdd.collect()

[1, 2, 3, 4, 1, 2, 3, 4]

In [12]:
rdd_square = rdd.map(lambda x: x * 2)

In [13]:
rdd_square.collect()

[2, 4, 6, 8, 2, 4, 6, 8]

#### `filter(func)`

In [14]:
rdd.filter(lambda x: x != 1).collect()

[2, 3, 4, 2, 3, 4]

In [15]:
rdd.filter(lambda x: x % 2 == 0).collect()

[2, 4, 2, 4]

In [16]:
evens_rdd = rdd.filter(lambda x: x % 2 == 0).collect()

In [17]:
type(evens_rdd)

list

#### `map(func)` vs `flatMap(func)`

In [23]:
rdd.map(lambda x: x * 2).collect()

[2, 4, 6, 8, 2, 4, 6, 8]

In [None]:
# errore perchè assume livello di annidamento
rdd.flatMap(lambda x: x * 2).collect()


In [20]:
# no errore, qui abbiamo una lista di liste
rdd.flatMap(lambda x: [x * 2]).collect()

[2, 4, 6, 8, 2, 4, 6, 8]

In [22]:
lines = sc.parallelize(["Welcome to the Big",
                        "World of Big Big",
                        "Data Welcome World bye",
                        "World Hello MapReduce",
                        "GoodBye MapReduce"
                        "This Book on Big Data is fun"])

In [23]:
lines.collect()

['Welcome to the Big',
 'World of Big Big',
 'Data Welcome World bye',
 'World Hello MapReduce',
 'GoodBye MapReduceThis Book on Big Data is fun']

In [24]:
words_map = lines.map(lambda line: line.split(" "))

In [25]:
words_map.collect()

[['Welcome', 'to', 'the', 'Big'],
 ['World', 'of', 'Big', 'Big'],
 ['Data', 'Welcome', 'World', 'bye'],
 ['World', 'Hello', 'MapReduce'],
 ['GoodBye', 'MapReduceThis', 'Book', 'on', 'Big', 'Data', 'is', 'fun']]

In [26]:
"Welcome to the Big".split(" ")

['Welcome', 'to', 'the', 'Big']

In [27]:
wordsFlatMap = lines.flatMap(lambda line: line.split(" "))

In [30]:
wordsFlatMap.collect()[:5]

['Welcome', 'to', 'the', 'Big', 'World']

### Transformations with Pseudo-Sets <!-- \index{pseudo-sets} -->

In [31]:
rdd1 = sc.parallelize(["water", "wine", "beer", "water", "water", "wine"])
rdd2 = sc.parallelize(["beer", "beer", "water", "water",\
                       "wine", "coca-cola", "lemonade"])

In [32]:
rdd1.distinct().collect()

['wine', 'beer', 'water']

In [33]:
rdd1.union(rdd2).collect()

['water',
 'wine',
 'beer',
 'water',
 'water',
 'wine',
 'beer',
 'beer',
 'water',
 'water',
 'wine',
 'coca-cola',
 'lemonade']

In [34]:
rdd1.intersection(rdd2).collect()

['wine', 'beer', 'water']

In [None]:
rdd1.subtract(rdd2).collect()

[]

In [35]:
rdd1.distinct().cartesian(rdd2.distinct()).collect()

[('wine', 'wine'),
 ('wine', 'beer'),
 ('wine', 'water'),
 ('wine', 'lemonade'),
 ('wine', 'coca-cola'),
 ('beer', 'wine'),
 ('beer', 'beer'),
 ('beer', 'water'),
 ('beer', 'lemonade'),
 ('beer', 'coca-cola'),
 ('water', 'wine'),
 ('water', 'beer'),
 ('water', 'water'),
 ('water', 'lemonade'),
 ('water', 'coca-cola')]

## Actions on RDDs

In [36]:
rdd = sc.parallelize([1, 2, 3, 4, 5, 6, 7, 8])

In [37]:
rdd.count()

8

In [38]:
rdd.take(1)

[1]

In [39]:
rdd.first()

1

In [40]:
rdd.reduce(lambda a, b: a + b)

36

In [41]:
rdd.takeOrdered(3)

[1, 2, 3]

In [42]:
rdd.takeOrdered(3, lambda s: -1 * s)

[8, 7, 6]

25/05/22 13:56:39 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 1263889 ms exceeds timeout 120000 ms
25/05/22 13:56:39 WARN SparkContext: Killing executors is not supported by current scheduler.
25/05/22 13:56:41 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$

## Key-Value Transformations <!-- \index{transformation!key-value} -->

### Basic Key-Value Transformations 

In [None]:
rdd = sc.parallelize([(1, 2), (3, 4), (3, 6)])

In [None]:
rdd.map(lambda k_v: (k_v[0], 1)).collect()

[(1, 1), (3, 1), (3, 1)]

In [None]:
rdd.groupByKey().collect()

[(1, <pyspark.resultiterable.ResultIterable at 0x2e0d9d264f0>),
 (3, <pyspark.resultiterable.ResultIterable at 0x2e0d9d26e50>)]

In [None]:
rdd.groupByKey().map(lambda x : (x[0], list(x[1]))).collect()

[(1, [2]), (3, [4, 6])]

In [None]:
rdd.reduceByKey(lambda a, b: a + b).collect()

[(1, 2), (3, 10)]

In [None]:
rdd2 = sc.parallelize([(3, 'a'), (2, 'c'), (1, 'b')])

In [None]:
rdd2.sortByKey().collect()

[(1, 'b'), (2, 'c'), (3, 'a')]

### Join-like SQL Transformations <!-- \index{transformation!join-like} -->

In [None]:
people = sc.parallelize([("Lam", 35), ("Direnc", 35),\
                         ("Rebecca", 24), ("Edwina", 25)])

hobbies = sc.parallelize([("Lam", ["Triathlon", "Running", "Cycling"]),
                          ("Direnc", ["Lifting", "Running", "Reading"]),
                          ("Rebecca", ["Singing", "Dancing"]),
                          ("Grazziela", ["Running", "Music"])])

#### `join(rdd)`

In [None]:
people.join(hobbies).collect()

                                                                                

[('Direnc', (35, ['Lifting', 'Running', 'Reading'])),
 ('Lam', (35, ['Triathlon', 'Running', 'Cycling'])),
 ('Rebecca', (24, ['Singing', 'Dancing']))]

#### `leftOuterJoin(rdd)`

In [None]:
people.leftOuterJoin(hobbies).collect()

[('Direnc', (35, ['Lifting', 'Running', 'Reading'])),
 ('Edwina', (25, None)),
 ('Lam', (35, ['Triathlon', 'Running', 'Cycling'])),
 ('Rebecca', (24, ['Singing', 'Dancing']))]

#### `rightOuterJoin(rdd)`

In [None]:
people.rightOuterJoin(hobbies).collect()

[('Direnc', (35, ['Lifting', 'Running', 'Reading'])),
 ('Lam', (35, ['Triathlon', 'Running', 'Cycling'])),
 ('Rebecca', (24, ['Singing', 'Dancing'])),
 ('Grazziela', (None, ['Running', 'Music']))]

### `fullOuterJoin(rdd)`

In [None]:
people.fullOuterJoin(hobbies).collect()

[('Direnc', (35, ['Lifting', 'Running', 'Reading'])),
 ('Edwina', (25, None)),
 ('Lam', (35, ['Triathlon', 'Running', 'Cycling'])),
 ('Rebecca', (24, ['Singing', 'Dancing'])),
 ('Grazziela', (None, ['Running', 'Music']))]

## Key-Value Actions  <!-- \index{action!key-value} -->

In [None]:
rdd = sc.parallelize([(1, 2), (3, 4), (3, 6)])

In [None]:
rdd.count()

3

In [None]:
rdd.countByKey()

defaultdict(int, {1: 1, 3: 2})

In [None]:
rdd.collectAsMap()

{1: 2, 3: 6}

In [None]:
rdd.lookup(3)

[4, 6]

## Read/Write from/to Files

### Text Files

In [None]:
lines = sc.textFile("data/")

In [None]:
lines.count()

43476

In [None]:
files = sc.wholeTextFiles("data/")

In [None]:
files.count()

5

In [None]:
bookRDD = sc.textFile("data/quixote.txt")

In [None]:
quixote_lines = bookRDD.filter(lambda line: "Quixote" in line)

In [None]:
quixote_lines.saveAsTextFile("data/quixote_lines")

### Reading Other Formats

#### JSON Files <!-- \index{files!JSON} -->

In [None]:
import json

json_rdd = sc.textFile("data/people.json").map(lambda x: json.loads(x))

In [None]:
print(f"Number of elements in the RDD from people.json: {json_rdd.count()}")

Number of elements in the RDD from data.json: 5


In [None]:
json_rdd.first()

{'employee_id': 8761,
 'personal_info': {'name': 'Lam', 'age': 35},
 'location': 'UK',
 'hobbies': ['Lifting', 'Running', 'Reading'],
 'joined': '2010-05-10'}

In [None]:
json_rdd.map(lambda x: json.dumps(x)).saveAsTextFile("data/output.json")

#### CSV Files <!-- \index{files!CSV} -->

In [None]:
import csv
from io import StringIO
def loadRecord(line):
    """Parse a CSV line"""
    input_data = StringIO(line)
    reader = csv.DictReader(input_data, fieldnames=["employee_id", "name",
                                                    "age", "location", 
                                                    "hobbies", "joined"])
    return next(reader)

In [None]:
input_csv_rdd = sc.textFile("data/people.csv").map(loadRecord)

In [None]:
input_csv_rdd.take(2)

[{'employee_id': '8761',
  'name': 'Lam',
  'age': '35',
  'location': 'Vietnam',
  'hobbies': 'Lifting;Running;Reading',
  'joined': '2010-05-10'},
 {'employee_id': '12441',
  'name': 'Direnc',
  'age': '36',
  'location': 'Turkey',
  'hobbies': 'Triathlon;Running;Cycling',
  'joined': '2009-01-12'}]

In [None]:
def writeRecords(records):
    """Write out CSV lines"""
    output = StringIO()
    writer = csv.DictWriter(output, fieldnames=["employee_id", "name",
                                                "age", "location", 
                                                "hobbies", "joined"])
    for record in records:
        writer.writerow(record)
    return [output.getvalue()]

In [None]:
input_csv_rdd.mapPartitions(writeRecords)\
             .saveAsTextFile("data/output.csv")

## RDD Lineage <!-- \index{lineage} -->

In [None]:
rdd3 = rdd1.distinct().cartesian(rdd2.distinct())

## Cache Your RDDs

In [None]:
quixote_rdd = sc.textFile("data/quixote.txt")

In [None]:
words_quixote = quixote_rdd.flatMap(lambda line: line.split(' '))

In [None]:
words_quixote.take(5)

['', 'The', 'Project', 'Gutenberg', 'EBook']

In [None]:
words_quixote.filter(lambda line: "blockhead" in line).count()

17

In [None]:
words_quixote.filter(lambda line: "spear" in line).count()

13

In [None]:
words_quixote = quixote_rdd.flatMap(lambda line: line.split(" ")).cache()

In [None]:
words_quixote.filter(lambda line: "blockhead" in line).count()

17

In [None]:
words_quixote.filter(lambda line: "spear" in line).count()

13

# Advanced Concepts

## Shared Variables <!-- \index{variables!shared} -->

### Broadcast Variables  <!-- \index{variables!broadcast} -->

In [None]:
look_up_table = {1: "a", 2: "b", 3: "c", 4: "d"}

In [None]:
rdd = sc.parallelize([1, 2, 3, 4])

In [None]:
rdd.map(lambda v: look_up_table[v]).collect()

['a', 'b', 'c', 'd']

In [None]:
look_up_table_bc = sc.broadcast(look_up_table) 

In [None]:
rdd.map(lambda v: look_up_table_bc.value[v]).collect() 

['a', 'b', 'c', 'd']

### Accumulator Variables <!-- \index{variables!accumulator} -->

In [None]:
accum = sc.accumulator(0)

In [None]:
rdd = sc.parallelize([1, 2, 3, 4])

In [None]:
def f(x):
    global accum 
    accum += 1

In [None]:
rdd.foreach(f)

In [None]:
accum.value

10

In [None]:
quixote_rdd = sc.textFile("data/quixote.txt")

In [None]:
blank_lines = sc.accumulator(0)

def extract_words_blanklines(line):
    global blank_lines 
    if line == "":
        blank_lines += 1
    return line.split(" ")

In [None]:
words_quixote = quixote_rdd.flatMap(extract_words_blanklines)

In [None]:
words_quixote.count()

437863

In [None]:
blank_lines.value

6820

## Partitions

### `mapPartitions(func)`

In [None]:
rdd = sc.parallelize([1, 2, 3, 4, 5, 6, 7])

In [None]:
sum_count = rdd.map(lambda num: (num, 1))\
               .reduce(lambda x, y: (x[0] + y[0], x[1] + y[1]))

In [None]:
sum_count

(28, 7)

In [None]:
sum_count[0] / sum_count[1]

4.0

In [None]:
rdd.getNumPartitions()

12

In [None]:
def partition_counter(nums):
    """
    Input: `nums` is an iterator to the Integers 
            of a given partition.
    Output: returns a list (i.e., an iterator) with 
            the Word Count and the total length as a list 
    """
    sum_count = [0, 0]
    for num in nums:
        sum_count[0] += num
        sum_count[1] += 1
    return [sum_count]

In [None]:
rdd.mapPartitions(partition_counter)\
   .reduce(lambda x, y: (x[0] + y[0], x[1] + y[1]))

(28, 7)

### `mapPartitionsWithIndex(func)`

In [None]:
rdd = sc.parallelize([1, 2, 3, 4, 5, 6, 7], 3)

In [None]:
def show(index, iterator):
    return ["index: " + str(index) + " values: " + str(list(iterator))]

rdd.mapPartitionsWithIndex(show).collect()

['index: 0 values: [1, 2]',
 'index: 1 values: [3, 4]',
 'index: 2 values: [5, 6, 7]']

## Operations with Numeric RDDs  <!-- \index{RDD!numeric} -->

###  `stats()`

In [None]:
rdd = sc.parallelize([1, 2, 3, 4, 5, 6, 7])

In [None]:
results = rdd.stats() 

In [None]:
results

(count: 7, mean: 4.0, stdev: 2.0, max: 7.0, min: 1.0)

In [None]:
type(results)

pyspark.statcounter.StatCounter

In [None]:
print(f"(count: {rdd.count()}, mean: {rdd.mean()}\
, stdev: {rdd.stdev()}, max: {rdd.max()}, min: {rdd.min()})")

(count: 7, mean: 4.0, stdev: 2.0, max: 7, min: 1)


# Internal Working

## Anatomy of a Spark Application

In [None]:
rdd = sc.parallelize([1, 2, 3, 4, 5, 6, 7, 8, 9])

In [None]:
rdd.filter(lambda x : x<5)\
    .map(lambda x: (x, x))\
         .groupByKey()\
         .map(lambda k_v : (sum(k_v[1]),k_v[0]))\
         .sortByKey()\
         .count()

4

# Solution to Challenges

In [None]:
lst = [
    "Welcome to the Big World of Big Big Data Welcome World bye",
    "World Hello MapReduce GoodBye MapReduce",
    "This Book on Big Data is fun",
]

In [None]:
rdd = sc.parallelize(lst)

In [None]:
# use collect to visualize the output - we only show the first 5 elements
rdd.flatMap(lambda line: line.split(" ")).collect()[:5] 

['Welcome', 'to', 'the', 'Big', 'World']

In [None]:
rdd.flatMap(lambda line: line.split(" "))\
   .map(lambda word : (word, 1)).collect()[:5]

[('Welcome', 1), ('to', 1), ('the', 1), ('Big', 1), ('World', 1)]

In [None]:
rdd.flatMap(lambda line: line.split(" "))\
    .map(lambda word : (word, 1))\
    .groupByKey().map(lambda x : (x[0], len(list(x[1]))))\
    .collect()[:5]

[('Welcome', 2), ('of', 1), ('bye', 1), ('MapReduce', 2), ('GoodBye', 1)]

In [None]:
rdd.flatMap(lambda line: line.split(" "))\
   .map(lambda word : (word, 1))\
   .reduceByKey(lambda a, b: a + b).collect()[:5]

[('Welcome', 2), ('of', 1), ('bye', 1), ('MapReduce', 2), ('GoodBye', 1)]

In [None]:
from collections import defaultdict 

def partition_count(words):
    kv_dict = defaultdict(lambda: 0) 

    for word in words:
        kv_dict[word] += 1

    # returning a list of key-value pairs
    return list(kv_dict.items())

In [None]:
# we only show the first 5 elements [:5]
rdd.flatMap(lambda line: line.split(" "))\
   .mapPartitions(partition_count)\
   .reduceByKey(lambda a, b: a + b).collect()[:5]

[('Welcome', 2), ('of', 1), ('bye', 1), ('MapReduce', 2), ('GoodBye', 1)]

In [None]:
rdd = sc.parallelize([1, 2, 3, 4, 5, 6, 7])

In [None]:
avg = rdd.mean()

In [None]:
avg

4.0

In [None]:
def std(nums):
    sum_count = [0, 0]
    for num in nums:
        diff = num - avg
        sum_count[0] += diff * diff 
        sum_count[1] += 1
    return [sum_count]    

In [None]:
result = rdd.mapPartitions(std)\
   .reduce(lambda x, y: (x[0] + y[0], x[1] + y[1]))

In [None]:
result

(28.0, 7)

In [None]:
import math
math.sqrt(result[0] / result[1])

2.0

In [None]:
rdd.stdev()

2.0

In [None]:
def fast_std(nums):
    sum_count = [0, 0, 0]
    for num in nums:
        sum_count[0] += num
        sum_count[1] += num * num
        sum_count[2] += 1
    return [sum_count]    

In [None]:
result = rdd.mapPartitions(fast_std)\
   .reduce(lambda x, y: (x[0] + y[0], x[1] + y[1], x[2] + y[2]))

In [None]:
result

(28, 140, 7)

In [None]:
avg = result[0] / result[2]

In [None]:
variance = result[1] / result[2] - avg * avg

In [None]:
variance

4.0

In [None]:
math.sqrt(variance)

2.0