#### PySpark environment setup

In [0]:
import os
import sys
# Point both the PySpark workers and the driver at the interpreter that is
# running this kernel, so they all use the same Python.
os.environ.update(
    PYSPARK_PYTHON=sys.executable,
    PYSPARK_DRIVER_PYTHON=sys.executable,
)

#### Create spark session

In [0]:
# SparkSession is the entry point used throughout this notebook
from pyspark.sql import SparkSession

# Build a new session (or reuse a running one) with a local master
spark = (
    SparkSession.builder
    .appName("Sparksession")
    .master("local")
    .getOrCreate()
)

In [0]:
# Smoke-test the session with a tiny in-memory DataFrame
data = [
    ("Alice", 23),
    ("Bob", 30),
    ("Mahesh", 21),
]
df = spark.createDataFrame(data, schema=["Name", "Age"])

# show() prints the rows as a text table
df.show()

+------+---+
|  Name|Age|
+------+---+
| Alice| 23|
|   Bob| 30|
|Mahesh| 21|
+------+---+



In [0]:
# Stop the session (this also stops its underlying SparkContext) so that the
# following sections can create their own context/session from scratch.
spark.stop()

#### Difference between SparkContext and SparkSession

In [0]:
#  Spark Context :

# ->Was the entry point for Spark in earlier
#   versions (1.x)
# ->Represents connection to spark cluster
# ->Coordinates task execution across the cluster
# ->Creates RDDs (Resilient Distributed Datasets)
# ->Performs Transformations and defines actions.

# Spark Session :

# ->The entry point for Spark since the version 2.0 that provides simple Interaction.
# ->Combines the Functionalities like HiveContext, SparkContext, SQLContext and StreamingContext.
# ->Supports multiple Programming Languages like : Scala, Java, R and Python
# ->Extends the functionality of Spark Context.
# ->Supports Advanced abstractions like Datasets and DataFrames
# ->Provides Data Source APIs, Machine Learning Algorithms and streaming capabilities

#### Creating pyspark context

In [0]:
from pyspark import SparkContext

# Instantiate a standalone SparkContext. Only one context can be active at a
# time, so this relies on the earlier session having been stopped.
sc = SparkContext(appName="SparkContext-Application")

#### Creating Spark Session with Manual Configuration

In [0]:
from pyspark.sql import SparkSession

# getOrCreate() reuses a SparkContext that is already running; in that case the
# app name and spark.executor.memory set below would be silently ignored.
# Stop the standalone context created in the previous section first so the
# builder settings actually take effect (calling stop() twice is harmless).
sc.stop()

# Build the session with explicit settings:
#   spark.executor.memory        -> memory allotted to each executor
#   spark.sql.shuffle.partitions -> partitions used when SQL/DataFrame ops shuffle
spark = (
    SparkSession.builder
    .appName("Spark-Session-Manual-Config")
    .config("spark.executor.memory", "2g")
    .config("spark.sql.shuffle.partitions", "4")
    .getOrCreate()
)

In [0]:
# A bare expression on the last line is rendered by the notebook; this shows
# the session object that was just created.
spark

In [0]:
# Shut down the Spark session. This also stops the underlying SparkContext, so
# a live session must be obtained again before running any further Spark code.
spark.stop()

#### RDDs (Resilient Distributed Datasets)

In [0]:
# Backbone of data processing in Spark
# Distributed, Fault-tolerant, parallelizable data structure
# Efficiently processes large datasets across the cluster
# RDDs are immutable, distributed, resilient, lazily evaluated, fault-tolerant
# Operations available on RDDs include: map, filter, reduce, collect, count, save, etc.

#### Transformations

In [0]:
# Create new RDDs by applying computation/ Manipulation
# Lazy Evaluation, Lineage Graph
# Examples like map, filter, flatMap, reduceByKey, sortBy and join

#### Actions

In [0]:
# Return Results or perform actions on RDD, triggering execution
# Eager evaluation, data movement/ computation
# Examples like collect, count, first, take, save, foreach.

#### How to Create RDDs

In [0]:
# The session was shut down with spark.stop() in an earlier cell, and a stopped
# session cannot run jobs. Obtain a live session first so this cell works when
# the notebook is run top to bottom.
spark = SparkSession.builder.appName("RDD-Basics").getOrCreate()

# parallelize() distributes a local Python collection to form an RDD
numbers = [1, 2, 3, 4, 5]
rdd = spark.sparkContext.parallelize(numbers)

In [0]:
# collect() is an action: it brings every element of the RDD back to the
# driver as a Python list (only sensible for small RDDs like this one).
rdd.collect()

Out[6]: [1, 2, 3, 4, 5]

In [0]:
# Build an RDD from a list of tuples; the name "Raman" appears twice
employees = [
    ("Ajay", 8),
    ("Raman", 7),
    ("Pratap", 5),
    ("Mohan", 6),
    ("Raman", 3),
]
employees_rdd = spark.sparkContext.parallelize(employees)

In [0]:
print("All the employee tuples:")
# Bring all tuples to the driver with collect() and print one per line
for i in employees_rdd.collect():
    print(i)

All the employee tuples:
('Ajay', 8)
('Raman', 7)
('Pratap', 5)
('Mohan', 6)
('Raman', 3)


In [0]:
# Same data as above, shown as the raw list that collect() returns
employees_rdd.collect()

Out[9]: [('Ajay', 8), ('Raman', 7), ('Pratap', 5), ('Mohan', 6), ('Raman', 3)]

#### RDDs - Actions

In [0]:
# count() is an action that returns how many elements the RDD holds
rdd_count = rdd.count()
print("number of items in the RDD:", rdd_count)

number of items in the RDD: 5


In [0]:
# count() on the employee RDD counts tuples, so the duplicate name "Raman"
# contributes two items
employee_rdd_count = employees_rdd.count()
print("The total number of items in the Employee RDD is : ", employee_rdd_count)

The total number of items in the Employee RDD is :  5


In [0]:
# first() is an action that returns the first element of the RDD.
first_item = employees_rdd.first()
print("The first item in the RDD is : ", first_item)

The first item in the RDD is :  ('Ajay', 8)


In [0]:
# take(n) is an action that returns the first n elements of the RDD as a list
elements_needed = employees_rdd.take(3)
print("The elements from the RDD are : ")
for i in elements_needed:
    print(i)

The elements from the RDD are : 
('Ajay', 8)
('Raman', 7)
('Pratap', 5)


In [0]:
# foreach() is an action that applies a function to every element of the RDD
# and returns nothing. The function runs on the executors, so the printed lines
# go to the workers' stdout and may not appear in the notebook output.
employees_rdd.foreach(lambda x: print(x))

In [0]:
# foreach() runs its function on the executors, so anything it prints goes to
# the workers' stdout and usually does not show up in the notebook.
# To see the elements here, bring them to the driver and print them locally
# (fine for a small RDD like this one).
for employee in employees_rdd.collect():
    print(employee)

#### RDDs-Transformations

In [0]:
# Transformations are lazy: they describe a new RDD, but nothing is computed
# until an action is called on it.

# map transformation: upper-case the name (first field), keep the second field
mapped_rdd = employees_rdd.map(lambda emp: (emp[0].upper(), emp[1]))

In [0]:
# collect() is the action that makes Spark actually evaluate mapped_rdd
map_result = mapped_rdd.collect()
print("RDD in Upper case :", map_result)

RDD in Upper case : [('AJAY', 8), ('RAMAN', 7), ('PRATAP', 5), ('MOHAN', 6), ('RAMAN', 3)]


In [0]:
# filter transformation: keep only the records that satisfy a predicate,
# here those whose second field (experience) equals 7
filtered_rdd = employees_rdd.filter(lambda emp: emp[1] == 7)

filtered_result = filtered_rdd.collect()
print("RDD with Experience of 7 years :", filtered_result)

RDD with Experience of 7 years : [('Raman', 7)]


In [0]:
# reduceByKey transformation: merge the values of records that share a key
# (the name) by summing them, giving the total experience per name
reduced_rdd = employees_rdd.reduceByKey(lambda total, years: total + years)
reduced_rdd.collect()

Out[20]: [('Raman', 10), ('Pratap', 5), ('Ajay', 8), ('Mohan', 6)]

In [0]:
# sortBy transformation: order the elements by a key function, here the
# experience value (second tuple field), ascending or descending
sorted_rdd_asc = employees_rdd.sortBy(lambda emp: emp[1], ascending=True)
print("The RDD in Ascending Order :", sorted_rdd_asc.collect())

sorted_rdd_desc = employees_rdd.sortBy(lambda emp: emp[1], ascending=False)
print("The RDD in Descending Order :", sorted_rdd_desc.collect())

The RDD in Ascending Order : [('Raman', 3), ('Pratap', 5), ('Mohan', 6), ('Raman', 7), ('Ajay', 8)]
The RDD in Descending Order : [('Ajay', 8), ('Raman', 7), ('Mohan', 6), ('Pratap', 5), ('Raman', 3)]
