In [1]:
from pyspark.sql import SparkSession

# Configure a builder with the application name, then obtain the session from it
session_builder = SparkSession.builder.appName('helloSpark')
spark = session_builder.getOrCreate()

----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 34222)
Traceback (most recent call last):
  File "/opt/conda/lib/python3.11/socketserver.py", line 317, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/opt/conda/lib/python3.11/socketserver.py", line 348, in process_request
    self.finish_request(request, client_address)
  File "/opt/conda/lib/python3.11/socketserver.py", line 361, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/opt/conda/lib/python3.11/socketserver.py", line 755, in __init__
    self.handle()
  File "/usr/local/spark/python/pyspark/accumulators.py", line 295, in handle
    poll(accum_updates)
  File "/usr/local/spark/python/pyspark/accumulators.py", line 267, in poll
    if self.rfile in r and func():
                           ^^^^^^
  File "/usr/local/spark/python/pyspark/accumulators.py", line 271, in accum_updates
    num_updates =

Further info on Spark sessions:  
https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/spark_session.html

In [2]:
# Bare last expression: the notebook displays the session object created above
spark

In [3]:
# Let's generate some data for analysis

import random

SEED = 42          # fixed seed so a fresh "Restart & Run All" regenerates the same rows
NUM_ROWS = 500000  # number of synthetic [name, salary] records to generate

names = ["Alice", "Ben", "Charles", "Daisy"]
start_range = 900
end_range = 5000

# A dedicated generator keeps the data reproducible without touching the
# global `random` state that other cells or libraries may rely on.
rng = random.Random(SEED)

# Each record is [name, salary]: a random name and a salary drawn uniformly
# from start_range..end_range (both ends inclusive, per random.randint).
python_data = [[rng.choice(names), rng.randint(start_range, end_range)] for _ in range(NUM_ROWS)]

In [4]:
# spark.createDataFrame turns a Python object (list, dict) into a Spark DataFrame.
# An explicit schema gives nicer column names and saves Spark from inferring the types.
schema = "name STRING, salary INT"

df = spark.createDataFrame(data=python_data, schema=schema)

In [5]:
# .show() prints the first rows of a DataFrame; 20 is its default row limit
df.show(n=20)

+-------+------+
|   name|salary|
+-------+------+
|Charles|  1292|
|    Ben|  1217|
|  Alice|  4010|
|    Ben|  3203|
|  Daisy|  2405|
|  Alice|  4748|
|    Ben|  3534|
|  Alice|   995|
|  Daisy|  2008|
|  Alice|  2941|
|Charles|  2699|
|    Ben|  4154|
|  Alice|  3443|
|  Alice|  4015|
|  Daisy|  3446|
|    Ben|  3902|
|Charles|  1691|
|  Daisy|  4472|
|    Ben|  3051|
|  Alice|  3604|
+-------+------+
only showing top 20 rows



In [6]:
# Group the rows by name, then average the salary within each group
rows_by_name = df.groupBy("name")
df_new = rows_by_name.avg("salary")

In [8]:
# Spark is lazy, so .show() is the call that actually performs all the work

df_new.show()

+-------+------------------+
|   name|       avg(salary)|
+-------+------------------+
|Charles| 2953.064667149693|
|    Ben|2955.2915893382824|
|  Alice|2944.0039734142465|
|  Daisy| 2950.613626348975|
+-------+------------------+



In [9]:
# Many of the column-level helpers live in the pyspark.sql.functions module
import pyspark.sql.functions as F

# Keep the raw average next to a rounded copy exposed under the name "average"
rounded_average = F.round("avg(salary)").alias("average")

df_new.select("name", "avg(salary)", rounded_average).show()

+-------+------------------+-------+
|   name|       avg(salary)|average|
+-------+------------------+-------+
|Charles| 2953.064667149693| 2953.0|
|    Ben|2955.2915893382824| 2955.0|
|  Alice|2944.0039734142465| 2944.0|
|  Daisy| 2950.613626348975| 2951.0|
+-------+------------------+-------+



The following cells repeat the same aggregation with the popular Python package `pandas`, for comparison.

In [10]:
import pandas as pd

# Build the pandas equivalent of the Spark DataFrame from the same Python rows
column_names = ["name", "salary"]
pd_df = pd.DataFrame(data=python_data, columns=column_names)

In [11]:
# Bare last expression: the notebook renders the pandas DataFrame as a table
pd_df

Unnamed: 0,name,salary
0,Charles,1292
1,Ben,1217
2,Alice,4010
3,Ben,3203
4,Daisy,2405
...,...,...
499995,Charles,2812
499996,Alice,2390
499997,Ben,3063
499998,Ben,3291


In [12]:
# Average salary per name, as a DataFrame indexed by name.
# The column is selected explicitly: the first positional parameter of
# GroupBy.mean() is `numeric_only`, not a column name, so .mean("salary")
# only worked because a non-empty string happens to be truthy.
pd_df.groupby("name")[["salary"]].mean()

Unnamed: 0_level_0,salary
name,Unnamed: 1_level_1
Alice,2944.003973
Ben,2955.291589
Charles,2953.064667
Daisy,2950.613626


Which of these seemed to be faster?  
Why?

Let's have a quick walkthrough of a few more PySpark methods.  
For a longer (full) list of methods, see:  
https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/dataframe.html

In [None]:
# Load both CSV files into DataFrames, with the "header" reader option set to "true"
header_option = ("header", "true")
employees_df = spark.read.option(*header_option).csv("input/employees.csv")
departments_df = spark.read.option(*header_option).csv("input/departments.csv")

# Preview what was loaded
for loaded_df in (employees_df, departments_df):
    loaded_df.show()

In [None]:

# Cast salary from string to integer so that the numeric filter further down behaves properly
salary_as_integer = F.col("salary").cast("integer")
employees_df = employees_df.withColumn("salary", salary_as_integer)

employees_df.show()

In [None]:

# .where() is an alias of .filter(): keep only the rows matching a condition.
salary_threshold = 55000
filtered_df = employees_df.where(F.col("salary") > salary_threshold)

# .select() keeps only the listed columns.
kept_columns = ["emp_id", "name", "department", "salary"]
selected_df = filtered_df.select(*kept_columns)

# .withColumnRenamed() renames a single column.
renamed_df = selected_df.withColumnRenamed("emp_id", "employee_id")

# .selectExpr() projects SQL expressions; the last one derives a new column.
sql_projection = [
    "employee_id",
    "name",
    "department",
    "salary",
    "salary / 12 as monthly_salary",
]
expr_df = renamed_df.selectExpr(*sql_projection)

# .join() combines two DataFrames; here an inner join with departments on 'department'.
joined_df = expr_df.join(
    other=departments_df,
    on="department",
    how="inner",
)


In [None]:

# .write is the entry point for saving a DataFrame: configure the writer, then pick the format.
# There are more writer options that you will see in the next classes.
csv_writer = joined_df.write.mode("overwrite").option("header", "true")
csv_writer.csv("output/joined_employees")

# NB - Spark evaluates lazily: it only executes code when it needs to.
# Transformations (.filter(), .select(), .join(), ...) can be chained freely because they
# are not run when they are declared.
# The work is only executed once an action such as .show() or a write is called.