## PySpark Tutorial in VS Code

In [24]:
# Import necessary libraries
import os
from pyspark.sql import SparkSession
import shutil


In [11]:
# Point PySpark at the local Spark installation before any session is created.
# A SPARK_HOME that is already set in the environment is respected; otherwise
# the path below is used - change it to wherever Spark is installed on your machine.
os.environ.setdefault('SPARK_HOME', "/Users/maxrogowski/PycharmProjects/Spark")
# NOTE(review): PYSPARK_DRIVER_PYTHON normally names the Python executable used
# for the driver; 'code' is the VS Code launcher - confirm this is intended.
os.environ['PYSPARK_DRIVER_PYTHON'] = 'code'
# Interpreter name used for the PySpark Python processes.
os.environ['PYSPARK_PYTHON'] = 'python'

In [12]:
# Start the Spark session for the setup check (getOrCreate reuses a session
# if one is already running)
spark = (
    SparkSession.builder
    .appName("PySpark-Get-Started")
    .getOrCreate()
)


24/03/21 08:10:05 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [13]:
# Sanity check: build a tiny DataFrame on Spark and display it
data = [
    ("Max", 40),
    ("Aga", 14),
    ("Tomas", 18),
]
df = spark.createDataFrame(data, schema=["Name", "Body Age"])
df.show()

+-----+--------+
| Name|Body Age|
+-----+--------+
|  Max|      40|
|  Aga|      14|
|Tomas|      18|
+-----+--------+



In [14]:
# Stop the Spark session used for the setup check
spark.stop()

In [15]:
# Start a session for the RDD demo
spark = (
    SparkSession.builder
    .appName("RDD-Demo")
    .getOrCreate()
)
# Keep the session as the cell's last expression so the notebook displays it
spark

In [16]:
# Turn a plain Python list into an RDD
numbers = list(range(1, 6))
rdd = spark.sparkContext.parallelize(numbers)

# collect() is an action: it brings every element of the RDD back as a list
rdd.collect()

[1, 2, 3, 4, 5]

In [17]:
# Build an RDD of (name, age) tuples
data = [("Max", 25), ("Tomas", 30), ("Aga", 35), ("Max", 40)]
rdd = spark.sparkContext.parallelize(data)

# Run the basic actions first, then report the results together
all_elements = rdd.collect()   # collect: every element of the RDD
count = rdd.count()            # count: number of elements in the RDD
first_element = rdd.first()    # first: the first element of the RDD
taken_elements = rdd.take(2)   # take(n): the first n elements of the RDD

print("All elements of the rdd: ", all_elements)
print("The total number of elements in rdd: ", count)
print("The first element of the rdd: ", first_element)
print("The first two elements of the rdd: ", taken_elements)

All elements of the rdd:  [('Max', 25), ('Tomas', 30), ('Aga', 35), ('Max', 40)]
The total number of elements in rdd:  4
The first element of the rdd:  ('Max', 25)
The first two elements of the rdd:  [('Max', 25), ('Tomas', 30)]


In [21]:
# Print every element with foreach; the printed order may differ from the
# order in which the data was defined
rdd.foreach(print)


('Tomas', 30)
('Max', 40)
('Max', 25)
('Aga', 35)


In [22]:
# map() transformation: upper-case the name and keep the age unchanged
mapped_rdd = rdd.map(lambda record: (record[0].upper(), record[1]))
result = mapped_rdd.collect()
print("RDD with uppercase name: ", result)

# filter() transformation: keep only the records whose age is above 30
filtered_rdd = rdd.filter(lambda record: record[1] > 30)
older_than_30 = filtered_rdd.collect()
print("Filter records where age is greater than 30: ", older_than_30)

RDD with uppercase name:  [('MAX', 25), ('TOMAS', 30), ('AGA', 35), ('MAX', 40)]
Filter records where age is greater than 30:  [('Aga', 35), ('Max', 40)]


In [26]:
# reduceByKey transformation: sum the ages of all records sharing a name
reduced_rdd = rdd.reduceByKey(lambda x, y: x + y)
print("Calculate the total age for each name: ", reduced_rdd.collect())

# sortBy transformation: order the records by age, largest first
sorted_rdd = rdd.sortBy(lambda x: x[1], ascending=False)
print("SortBy transformation: Sort the RDD by age in descending order: ", sorted_rdd.collect())

# Save action: write the RDD out as text.
# saveAsTextFile creates a directory at this path, so a leftover directory from
# a previous run is removed first. The existence check keeps the first run
# (nothing to delete yet) from raising FileNotFoundError, while real deletion
# errors still surface.
output_dir = "output.txt"
if os.path.isdir(output_dir):
    shutil.rmtree(output_dir)

rdd.saveAsTextFile(output_dir)


Calculate the total age for each name:  [('Max', 65), ('Tomas', 30), ('Aga', 35)]
SortBy transformation: Sort the RDD by age in descending order:  [('Max', 40), ('Aga', 35), ('Tomas', 30), ('Max', 25)]


In [27]:
# Shut down the Spark session used for the RDD demo
spark.stop()


## Create-DataFrame

In [None]:
# Start a session for the DataFrame section
spark = (
    SparkSession.builder
    .appName("Create-DataFrame")
    .getOrCreate()
)

ConnectionRefusedError: [Errno 61] Connection refused