In [0]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col,sum,avg,max

# Start (or reuse) the Spark session that backs every DataFrame in this notebook.
spark = (
    SparkSession.builder
    .appName('SparkByExamples.com')
    .getOrCreate()
)

# One tuple per employee, in the column order listed in `schema` below.
simpleData = [
    ("James", "Sales", "NY", 90000, 34, 10000),
    ("Michael", "Sales", "NY", 86000, 56, 20000),
    ("Robert", "Sales", "CA", 81000, 30, 23000),
    ("Maria", "Finance", "CA", 90000, 24, 23000),
    ("Raman", "Finance", "CA", 99000, 40, 24000),
    ("Scott", "Finance", "NY", 83000, 36, 19000),
    ("Jen", "Finance", "NY", 79000, 53, 15000),
    ("Jeff", "Marketing", "CA", 80000, 25, 18000),
    ("Kumar", "Marketing", "NY", 91000, 50, 21000),
]

# Only column names are supplied here; no column types are declared.
schema = ["employee_name", "department", "state", "salary", "age", "bonus"]
df = spark.createDataFrame(simpleData, schema)

# Print the resulting schema, then every row without truncating cell values.
df.printSchema()
df.show(truncate=False)

# The department-only grouping is shared by the first two summaries.
by_department = df.groupBy("department")

# Total salary per department.
by_department.sum("salary").show(truncate=False)

# Number of rows per department.
by_department.count().show(truncate=False)

# Total salary and total bonus for every (department, state) pair.
by_department_state = df.groupBy("department", "state")
by_department_state.sum("salary", "bonus").show(truncate=False)

# A department appears in the filtered view only when its total bonus reaches
# this threshold.
MIN_TOTAL_BONUS = 50000

# Per-department salary and bonus statistics, defined once so that the full
# view and the filtered view below are guaranteed to use the same aggregation.
# NOTE: `sum` and `max` here are the column aggregates imported from
# pyspark.sql.functions at the top of this cell, not the Python builtins.
dept_stats = df.groupBy("department").agg(
    sum("salary").alias("sum_salary"),
    avg("salary").alias("avg_salary"),
    sum("bonus").alias("sum_bonus"),
    max("bonus").alias("max_bonus"),
)

# Statistics for every department.
dept_stats.show(truncate=False)

# Filter after aggregating: keep only the departments whose summed bonus is at
# least MIN_TOTAL_BONUS.
dept_stats.where(col("sum_bonus") >= MIN_TOTAL_BONUS).show(truncate=False)

root
 |-- employee_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- state: string (nullable = true)
 |-- salary: long (nullable = true)
 |-- age: long (nullable = true)
 |-- bonus: long (nullable = true)

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|James        |Sales     |NY   |90000 |34 |10000|
|Michael      |Sales     |NY   |86000 |56 |20000|
|Robert       |Sales     |CA   |81000 |30 |23000|
|Maria        |Finance   |CA   |90000 |24 |23000|
|Raman        |Finance   |CA   |99000 |40 |24000|
|Scott        |Finance   |NY   |83000 |36 |19000|
|Jen          |Finance   |NY   |79000 |53 |15000|
|Jeff         |Marketing |CA   |80000 |25 |18000|
|Kumar        |Marketing |NY   |91000 |50 |21000|
+-------------+----------+-----+------+---+-----+

+----------+-----------+
|department|sum(salary)|
+----------+-----------+
|Sales     |257000     |
|Finance   |351

In [0]:
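#Creating the DataFrame: As before, a DataFrame df is created from a list of tuples. The schema argument supplies only the column names, so Spark infers each column's type from the data (the numeric columns are inferred as long, as the printSchema() output shows).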

#Grouping and Summing: This section shows how to group the DataFrame by the 'department' column and calculate the sum of 'salary' for each group using the groupBy and sum functions. The results are displayed using show().

#Grouping and Counting: This section demonstrates how to group the DataFrame by the 'department' column and count the number of rows (here, employees) in each group using the groupBy and count functions. The results are displayed using show().

#Grouping and Summing Multiple Columns: This section shows how to group the DataFrame by multiple columns ('department' and 'state') and calculate the sum of 'salary' and 'bonus' for each group using the groupBy and sum functions. The results are displayed using show().

#Grouping, Aggregating, and Alias: This section demonstrates how to group the DataFrame by the 'department' column and calculate multiple aggregations, such as sum, average, and max, on the 'salary' and 'bonus' columns using the groupBy and agg functions. Column aliases are assigned using alias. The results are displayed using show().

#Filtering After Aggregation: This section shows how to filter after aggregating. It filters the aggregated results from the previous section, keeping only the departments whose summed 'bonus' (the 'sum_bonus' alias) is greater than or equal to 50000, using the where function, and displays the filtered results using show(). This is the DataFrame equivalent of a SQL HAVING clause.

#These examples provide insights into using PySpark's DataFrame API for grouping, aggregating, and filtering data in various ways.