### Lambda Function

In [1]:
# A lambda is an anonymous, single-expression function; binding it to a name
# lets it be called like any function defined with `def`.
add = lambda x, y: x + y

# Call the lambda through its name and show the sum.
result = add(3, 5)
print(result)  # prints 8

8


Quadratic formula using a lambda function: $x = \frac{-b \pm \sqrt{b^2 - 4ac}}{2a}$

In [2]:
import math

# Real roots of a*x**2 + b*x + c = 0 via the quadratic formula
#     x = (-b ± sqrt(b**2 - 4*a*c)) / (2*a)
#
# The denominator must be parenthesised: `... / 2 * a` is evaluated left to
# right as `(... / 2) * a`, which only gives the right answer when a == 1.
#
# Returns a tuple (root_with_plus, root_with_minus).
# Raises ValueError (from math.sqrt) when the discriminant is negative and
# ZeroDivisionError when a == 0 (the equation is then not quadratic).
quadratic_roots = lambda a, b, c: (
    (-b + math.sqrt(b**2 - 4*a*c)) / (2*a),
    (-b - math.sqrt(b**2 - 4*a*c)) / (2*a),
)
a, b, c = 1, -3, 2
roots = quadratic_roots(a, b, c)
print(f"The roots are: {roots}")  # Output as a tuple: (2.0, 1.0)

The roots are: (2.0, 1.0)


### RDDs (Resilient Distributed Datasets)

An RDD is an immutable, partitioned collection of elements that can be operated on in parallel.

RDDs are evaluated lazily: transformations are not executed until an action is performed. An RDD can also be persisted (cached) in memory across the cluster so that its data can be reused.


Check the number of CPU cores on our machine

In [3]:
import os

# Ask the operating system how many CPUs it reports (None if undetermined).
core_count = os.cpu_count()
print("Number of cores:", core_count)

Number of cores: 2


In [4]:
import multiprocessing

# Same check through the multiprocessing module, as a cross-check of the
# value reported by os.cpu_count().
detected_cores = multiprocessing.cpu_count()
print("Number of cores:", detected_cores)

Number of cores: 2


#### map function

In [5]:
from pyspark import SparkConf, SparkContext

# getOrCreate() returns the already-running context if there is one, so this
# cell can be re-executed without failing because a SparkContext already
# exists. Master 'local' runs Spark locally with a single worker thread.
sc = SparkContext.getOrCreate(
    SparkConf().setMaster('local').setAppName('hands on Pyspark')
)
# Input RDD, explicitly split into two partitions
rdd = sc.parallelize([1, 2, 3, 4], numSlices=2)
# map() is a transformation: it applies the lambda to every element
mapped_rdd = rdd.map(lambda x: x * 2)
# collect() is the action that triggers the computation and returns a list
print(mapped_rdd.collect())  # Output: [2, 4, 6, 8]
# The context is intentionally left running: the following cells reuse `sc`
# and `rdd`, and the context is stopped explicitly further down.

[2, 4, 6, 8]


#### check the number of partitions

In [6]:
# Number of partitions of the current `rdd`; when the cells are run in order
# this is the RDD from the map example, created with numSlices=2.
rdd.getNumPartitions()  # expected: 2

2

#### filter function

In [7]:
# Distribute the integers 1..6 across three partitions.
rdd = sc.parallelize([1, 2, 3, 4, 5, 6], numSlices=3)

# filter() keeps only the elements for which the predicate is true;
# here that means the even numbers.
filtered_rdd = rdd.filter(lambda value: value % 2 == 0)

# collect() runs the job and brings the surviving elements to the driver.
print(filtered_rdd.collect())  # Output: [2, 4, 6]

[2, 4, 6]


#### reduce function

In [8]:
# Distribute the integers 1..5 across two partitions.
rdd = sc.parallelize([1, 2, 3, 4, 5], numSlices=2)

# reduce() is an action: it combines the elements pairwise with the given
# function until a single value (here the total) remains.
result = rdd.reduce(lambda left, right: left + right)
print(result)  # Output: 15

15


#### Inspect the contents in each partition


In [9]:
# Six integers split over two partitions.
rdd = sc.parallelize([1, 2, 3, 4, 5, 6], numSlices=2)

# glom() turns every partition into one list, so collect() returns a list of
# lists: one inner list per partition.
partitions = rdd.glom().collect()

# Show which elements ended up in which partition.
for index, contents in enumerate(partitions):
    print(f"Partition {index}: {contents}")

Partition 0: [1, 2, 3]
Partition 1: [4, 5, 6]


#### Stop the SparkContext

In [13]:
# Shut down the active SparkContext; the next example creates a new one.
sc.stop()

#### Partitions of a list of 10M integers

In [14]:
import random
from pyspark import SparkConf, SparkContext

# Initialize the SparkContext. getOrCreate() reuses a running context instead
# of failing, so the cell can safely be re-executed.
sc = SparkContext.getOrCreate(
    SparkConf().setMaster("local").setAppName("Partition Example")
)

# Generate a list of 10,000,000 random integers in the range 1-1000.
# A dedicated, seeded generator makes the data (and the printed output)
# reproducible without touching the global `random` state.
rng = random.Random(42)
data = [rng.randint(1, 1000) for _ in range(10_000_000)]

# Create an RDD without specifying the number of partitions (automatic partitioning)
rdd = sc.parallelize(data)

# Get the number of partitions. With master "local" Spark uses a single
# worker thread, so the default is 1 partition regardless of the machine's
# core count.
num_partitions = rdd.getNumPartitions()
print(f"Number of partitions: {num_partitions}")

# Inspect the data distribution in partitions.
# glom() groups the data of each partition into one list; slicing on the
# executors before collect() means only 5 elements per partition are sent to
# the driver instead of all 10,000,000.
# NOTE: `partitions` therefore holds only the first 5 elements of each partition.
partitions = rdd.glom().map(lambda part: part[:5]).collect()

# Print the first 5 elements of each partition (for illustration purposes)
for i, partition in enumerate(partitions):
    print(f"Partition {i}: {partition}")

Number of partitions: 1
Partition 0: [55, 662, 741, 10, 135]


In [15]:
# Stop the context used for the automatic-partitioning example.
sc.stop()

#### Adjust number of partitions

RDD Size = Task Size × Number of Partitions

Therefore, Number of Partitions = RDD Size / Recommended Task Size

In [16]:
import random
from pyspark import SparkConf, SparkContext

# Initialize the SparkContext. getOrCreate() reuses a running context instead
# of failing, so the cell can safely be re-executed.
sc = SparkContext.getOrCreate(
    SparkConf().setMaster("local").setAppName("Partition Example")
)

# Create an RDD with an explicitly requested number of partitions.
# `data` is the 10,000,000-element list built in the previous example cell,
# so that cell must have been run first.
# NOTE(review): 28 is presumably RDD size / recommended task size for this
# data set - TODO confirm how the value was derived.
rdd = sc.parallelize(data, numSlices=28)

# Get the number of partitions (matches the requested numSlices)
num_partitions = rdd.getNumPartitions()
print(f"Number of partitions: {num_partitions}")

# Inspect the data distribution in partitions.
# glom() groups the data of each partition into one list; slicing on the
# executors before collect() means only 5 elements per partition are sent to
# the driver instead of the whole data set.
# NOTE: `partitions` therefore holds only the first 5 elements of each partition.
partitions = rdd.glom().map(lambda part: part[:5]).collect()

# Print the first 5 elements of each partition (for illustration purposes)
for i, partition in enumerate(partitions):
    print(f"Partition {i}: {partition}")

Number of partitions: 28
Partition 0: [55, 662, 741, 10, 135]
Partition 1: [446, 214, 814, 566, 237]
Partition 2: [169, 845, 564, 577, 551]
Partition 3: [340, 435, 108, 263, 806]
Partition 4: [664, 820, 318, 184, 32]
Partition 5: [948, 544, 997, 835, 688]
Partition 6: [673, 567, 920, 998, 185]
Partition 7: [529, 984, 915, 160, 124]
Partition 8: [512, 938, 720, 75, 828]
Partition 9: [152, 469, 136, 5, 782]
Partition 10: [759, 539, 221, 908, 742]
Partition 11: [730, 698, 728, 881, 249]
Partition 12: [749, 617, 526, 893, 951]
Partition 13: [677, 670, 844, 771, 321]
Partition 14: [25, 327, 116, 323, 674]
Partition 15: [802, 111, 796, 272, 901]
Partition 16: [208, 514, 102, 753, 869]
Partition 17: [83, 49, 820, 482, 573]
Partition 18: [977, 54, 400, 476, 756]
Partition 19: [859, 523, 600, 214, 286]
Partition 20: [828, 206, 303, 766, 635]
Partition 21: [477, 237, 555, 199, 57]
Partition 22: [262, 431, 664, 754, 842]
Partition 23: [106, 304, 141, 274, 543]
Partition 24: [193, 584, 580, 887, 6

In [17]:
# Stop the context used for the explicit-partitioning example and release its resources.
sc.stop()