# Introduction to SparkContext, SparkSession, and the RDD data type

1. Create a SparkContext in Spark versions earlier than 2.X
2. Create a SparkSession (which exposes the SparkContext) in Spark version 2.X and later
3. How to create an RDD
4. RDD Operation: Actions
    - count
    - take
    - first
5. RDD Operation: Transformations
    - reduceByKey
    - groupByKey
    - groupBy
    - combineByKey
    - aggregateByKey
    - map
    - mapValues
    - filter

## Create SparkContext in Apache Spark versions earlier than 2.X

In [2]:
from pyspark import SparkContext

In [19]:
# Legacy entry point: construct the SparkContext directly, pointing it at the
# cluster master and giving the application a display name.
sc = SparkContext(
    master="spark://spark-master:7077",
    appName="SparkContext Application",
)

In [20]:
# Bare expression: the notebook renders the SparkContext as this cell's output.
sc

In [21]:
# Shut down this SparkContext; the following section creates a SparkSession instead.
sc.stop()

## Create SparkSession (and its SparkContext) in Apache Spark 2.X and later

In [8]:
from pyspark.sql import SparkSession

In [32]:
# Build (or reuse, via getOrCreate) a Hive-enabled SparkSession. The session
# exposes the underlying SparkContext as `spark.sparkContext`.
spark = (
    SparkSession.builder
    .appName('Ingest checkin table into bronze')
    .master('spark://spark-master:7077')
    .config("spark.executor.memory", "2g")
    .config("spark.sql.shuffle.partitions", "4")
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .config(
        'spark.hadoop.fs.s3a.aws.credentials.provider',
        'org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider',
    )
    # No interpolation is needed here, so a plain string literal is used.
    .config('spark.sql.warehouse.dir', 's3a://lakehouse/')
    .enableHiveSupport()
    .getOrCreate()
)

In [33]:
# Bare expression: the notebook renders the SparkSession as this cell's output.
spark

In [27]:
# Run a SQL statement through the session and print the resulting table of
# databases visible to it.
(
    spark
    .sql("SHOW DATABASES")
    .show()
)

+---------+
|namespace|
+---------+
|  default|
|  test_db|
+---------+



## How to create RDDs

In [36]:
import random

# Pool of first names to sample from.
names = [
    "Alice", "Bob", "Charlie", "David", "Emma",
    "Frank", "Grace", "Hannah", "Isaac", "Julia",
]

# 100 random records, each with a name from the pool and an age in 20..40
# (random.randint includes both endpoints).
data = [
    dict(Name=random.choice(names), Age=random.randint(20, 40))
    for _ in range(100)
]

# Distribute the local Python list as an RDD via the session's SparkContext.
rdd = spark.sparkContext.parallelize(data)

## RDD Operation: Actions

In [44]:
# Action: take(n) returns the first n elements of the RDD as a list.
first_five = rdd.take(5)
print("Several elements in RDD", first_five)

Several elements in RDD [{'Name': 'Grace', 'Age': 34}, {'Name': 'Bob', 'Age': 31}, {'Name': 'Julia', 'Age': 36}, {'Name': 'Frank', 'Age': 35}, {'Name': 'Alice', 'Age': 33}]


In [55]:
# Action: first() returns only the first element of the RDD.
first_record = rdd.first()
print("First element in RDD", first_record)

First element in RDD {'Name': 'Grace', 'Age': 34}


In [54]:
# Action: count() returns the number of elements in the RDD.
record_count = rdd.count()
print("The number of elements in RDD", record_count)

The number of elements in RDD 100


## RDD Operation: Transformations

In [69]:
# Transformation: map() applies a function to every element of the RDD.
def to_upper_name_and_float_age(record):
    """Turn a {'Name', 'Age'} record into an (UPPERCASE name, float age) pair."""
    return record['Name'].upper(), float(record['Age'])


mapped_rdd = rdd.map(to_upper_name_and_float_age)
print("Mapped RDD", mapped_rdd.take(5))

Mapped RDD [('GRACE', 34.0), ('BOB', 31.0), ('JULIA', 36.0), ('FRANK', 35.0), ('ALICE', 33.0)]


In [79]:
# Transformation: filter() keeps the elements for which the predicate is truthy.
import re


def name_contains_e(record):
    """Return a match object when the record's name contains 'e' or 'E', else None."""
    return re.search(r'E', record['Name'], re.IGNORECASE)


filtered_rdd = rdd.filter(name_contains_e)
print("Filtered RDD", filtered_rdd.take(100))

Filtered RDD [{'Name': 'Grace', 'Age': 34}, {'Name': 'Alice', 'Age': 33}, {'Name': 'Grace', 'Age': 33}, {'Name': 'Alice', 'Age': 33}, {'Name': 'Emma', 'Age': 39}, {'Name': 'Grace', 'Age': 29}, {'Name': 'Grace', 'Age': 38}, {'Name': 'Emma', 'Age': 25}, {'Name': 'Charlie', 'Age': 27}, {'Name': 'Charlie', 'Age': 20}, {'Name': 'Alice', 'Age': 33}, {'Name': 'Alice', 'Age': 24}, {'Name': 'Alice', 'Age': 36}, {'Name': 'Alice', 'Age': 27}, {'Name': 'Grace', 'Age': 21}, {'Name': 'Charlie', 'Age': 31}, {'Name': 'Grace', 'Age': 29}, {'Name': 'Emma', 'Age': 21}, {'Name': 'Emma', 'Age': 21}, {'Name': 'Grace', 'Age': 22}, {'Name': 'Charlie', 'Age': 22}, {'Name': 'Grace', 'Age': 25}, {'Name': 'Charlie', 'Age': 23}, {'Name': 'Alice', 'Age': 33}, {'Name': 'Charlie', 'Age': 34}, {'Name': 'Grace', 'Age': 28}, {'Name': 'Grace', 'Age': 27}, {'Name': 'Alice', 'Age': 31}, {'Name': 'Charlie', 'Age': 37}, {'Name': 'Emma', 'Age': 29}, {'Name': 'Grace', 'Age': 25}, {'Name': 'Emma', 'Age': 39}, {'Name': 'Alice', 

In [84]:
# Transformation: reduceByKey() merges the values of each key with the given
# function; here it sums the ages recorded for every name.
name_age_pairs = rdd.map(lambda record: (record['Name'], record['Age']))
reduced_rdd = name_age_pairs.reduceByKey(lambda total, age: total + age)
print("Reduced RDD", reduced_rdd.take(100))

Reduced RDD [('Grace', 339), ('Isaac', 290), ('Emma', 206), ('Hannah', 215), ('Bob', 296), ('Julia', 358), ('Frank', 210), ('Alice', 365), ('David', 498), ('Charlie', 216)]


In [95]:
# Transformation: groupByKey() gathers all values of a key into one iterable.
# Not recommended when the goal is an aggregate (sum, average, ...): the Spark
# docs advise reduceByKey/aggregateByKey for that case.
name_age_pairs = rdd.map(lambda record: (record['Name'], record['Age']))
grouped_rdd = name_age_pairs.groupByKey()
for name, ages in grouped_rdd.collect():
    print(f"{name}: {list(ages)}")

Grace: [34, 33, 29, 38, 21, 29, 22, 25, 28, 27, 25, 28]
Isaac: [29, 34, 24, 24, 28, 27, 39, 35, 20, 30]
Emma: [39, 25, 21, 21, 29, 39, 32]
Hannah: [26, 37, 27, 40, 20, 27, 38]
Bob: [31, 38, 26, 38, 39, 33, 21, 38, 32]
Julia: [36, 36, 20, 21, 29, 27, 33, 22, 33, 25, 39, 37]
Frank: [35, 39, 25, 20, 38, 24, 29]
Alice: [33, 33, 33, 24, 36, 27, 33, 31, 24, 32, 27, 32]
David: [28, 31, 25, 25, 24, 38, 38, 38, 30, 28, 28, 20, 37, 38, 37, 33]
Charlie: [27, 20, 31, 22, 23, 34, 37, 22]


In [94]:
# Transformation: groupBy() groups elements by a key computed from each element;
# here the key is the parity of the age (0 = even, 1 = odd).
# NOTE: like groupByKey(), this collects every member of a group; prefer
# reduceByKey/aggregateByKey/combineByKey when only an aggregate is needed.
def age_parity(pair):
    """Return 0 for an even age and 1 for an odd age in a (name, age) pair."""
    return pair[1] % 2


grouped_rdd = rdd.map(lambda record: (record['Name'], record['Age'])).groupBy(age_parity)
for parity, pairs in grouped_rdd.collect():
    print(f"{parity}: {list(pairs)}")

0: [('Grace', 34), ('Julia', 36), ('David', 28), ('Isaac', 34), ('Isaac', 24), ('Grace', 38), ('Charlie', 20), ('Julia', 36), ('Alice', 24), ('Alice', 36), ('Isaac', 24), ('Bob', 38), ('Isaac', 28), ('Hannah', 26), ('Julia', 20), ('David', 24), ('Grace', 22), ('Bob', 26), ('Charlie', 22), ('Bob', 38), ('David', 38), ('Frank', 20), ('David', 38), ('David', 38), ('Julia', 22), ('Frank', 38), ('Charlie', 34), ('Frank', 24), ('David', 30), ('David', 28), ('Grace', 28), ('Isaac', 20), ('Hannah', 40), ('Bob', 38), ('Alice', 24), ('Grace', 28), ('David', 28), ('David', 20), ('Hannah', 20), ('David', 38), ('Alice', 32), ('Charlie', 22), ('Hannah', 38), ('Alice', 32), ('Isaac', 30), ('Emma', 32), ('Bob', 32)]
1: [('Bob', 31), ('Frank', 35), ('Alice', 33), ('Grace', 33), ('Isaac', 29), ('Alice', 33), ('Emma', 39), ('Grace', 29), ('Emma', 25), ('Charlie', 27), ('David', 31), ('Frank', 39), ('Alice', 33), ('Alice', 27), ('Frank', 25), ('Grace', 21), ('Charlie', 31), ('Grace', 29), ('Julia', 21), (

In [116]:
## Combine the values of each key by using combineByKey() --> Recommended
## TODO: find the average age for each 'Name'
def create_combiner(value):
    """Start a (sum, count) accumulator from the first value seen for a key."""
    initial_count = 1
    return value, initial_count

def merge_value(combiner, value):
    """Fold one more value into an existing (sum, count) accumulator."""
    running_sum, seen = combiner
    return running_sum + value, seen + 1

def merge_combiner(combiner1, combiner2):
    """Merge two (sum, count) accumulators into a single (sum, count) pair."""
    (left_sum, left_count), (right_sum, right_count) = combiner1, combiner2
    return left_sum + right_sum, left_count + right_count

# Build a (sum_of_ages, count) accumulator for every name.
name_age_pairs = rdd.map(lambda record: (record['Name'], record['Age']))
combined_rdd = name_age_pairs.combineByKey(
    create_combiner,
    merge_value,
    merge_combiner,
)

# Divide each sum by its count to obtain the average age per name.
average_age = combined_rdd.mapValues(lambda sum_and_count: sum_and_count[0] / sum_and_count[1])
for name, mean_age in average_age.collect():
    print(f"{name}: {mean_age}")

Grace: 28.25
Isaac: 29.0
Emma: 29.428571428571427
Hannah: 30.714285714285715
Bob: 32.888888888888886
Julia: 29.833333333333332
Frank: 30.0
Alice: 30.416666666666668
David: 31.125
Charlie: 27.0


In [111]:
# Aggregate the values of each key by using aggregateByKey() --> Recommended
# Goal: find the average age for each 'Name'.
def seqFunc(acc, age):
    """Fold one age into a (sum, count) accumulator."""
    return (acc[0] + age, acc[1] + 1)


def combFunc(acc1, acc2):
    """Merge two (sum, count) accumulators of the same key.

    The result must keep the (sum, count) shape of the zero value: it may be
    passed to combFunc again, and keys that never need merging are emitted
    as raw accumulators. Dividing here would therefore mix floats and tuples.
    """
    return (acc1[0] + acc2[0], acc1[1] + acc2[1])


# (0, 0) is the zero value: an empty (sum, count) accumulator.
# The division happens exactly once per key, after all merging is finished.
aggregated_rdd = (
    rdd.map(lambda x: (x['Name'], x['Age']))
    .aggregateByKey((0, 0), seqFunc, combFunc)
    .mapValues(lambda acc: acc[0] / acc[1])
)

for e in aggregated_rdd.collect():
    print(f"{e[0]}: {e[1]}")

Grace: 28.25
Isaac: 29.0
Emma: 29.428571428571427
Hannah: 30.714285714285715
Bob: 32.888888888888886
Julia: 29.833333333333332
Frank: 30.0
Alice: 30.416666666666668
David: 31.125
Charlie: 27.0


In [117]:
# Stop the SparkSession now that the notebook is finished.
spark.stop()