In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col,sum,avg,max

spark = SparkSession.builder \
                    .appName('SparkByExamples.com') \
                    .getOrCreate()

simpleData = [("James","Sales","NY",90000,34,10000),
    ("Michael","Sales","NV",86000,56,20000),
    ("Robert","Sales","CA",81000,30,23000),
    ("Maria","Finance","CA",90000,24,23000),
    ("Raman","Finance","DE",99000,40,24000),
    ("Scott","Finance","NY",83000,36,19000),
    ("Jen","Finance","NY",79000,53,15000),
    ("Jeff","Marketing","NV",80000,25,18000),
    ("Kumar","Marketing","NJ",91000,50,21000)
  ]

schema = ["employee_name","department","state","salary","age","bonus"]
df = spark.createDataFrame(data=simpleData, schema = schema)
df.printSchema()
df.show(truncate=False)

dfSort=df.sort(df.state,df.salary).groupBy(df.state).agg(sum(df.salary))
dfSort.show()

root
 |-- employee_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- state: string (nullable = true)
 |-- salary: long (nullable = true)
 |-- age: long (nullable = true)
 |-- bonus: long (nullable = true)

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|James        |Sales     |NY   |90000 |34 |10000|
|Michael      |Sales     |NV   |86000 |56 |20000|
|Robert       |Sales     |CA   |81000 |30 |23000|
|Maria        |Finance   |CA   |90000 |24 |23000|
|Raman        |Finance   |DE   |99000 |40 |24000|
|Scott        |Finance   |NY   |83000 |36 |19000|
|Jen          |Finance   |NY   |79000 |53 |15000|
|Jeff         |Marketing |NV   |80000 |25 |18000|
|Kumar        |Marketing |NJ   |91000 |50 |21000|
+-------------+----------+-----+------+---+-----+

+-----+-----------+
|state|sum(salary)|
+-----+-----------+
|   NY|     252000|
|   NV|     166000|
|   CA|     171

In [0]:
#Importing Libraries and Creating SparkSession:

#The necessary libraries, pyspark.sql.SparkSession and pyspark.sql.functions, are imported.
#spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate() creates a SparkSession with the application name "SparkByExamples.com".
#Creating DataFrame:

#The code defines a list of tuples, simpleData, representing employee data with attributes such as name, department, state, salary, age, and bonus.
#schema is a list that defines the schema of the DataFrame.
#df = spark.createDataFrame(data=simpleData, schema=schema) creates a DataFrame, df, using the provided data and schema.
#Displaying DataFrame:

#df.printSchema() displays the schema of the DataFrame.
#df.show(truncate=False) displays the data in the DataFrame.
#Sorting, Grouping, and Aggregation:

#df.sort(df.state, df.salary) sorts the DataFrame by the "state" and "salary" columns.
#groupBy(df.state) groups the sorted DataFrame by the "state" column.
#agg(sum(df.salary)) performs aggregation by calculating the sum of the "salary" column within each group.
#dfSort.show() displays the sorted and aggregated DataFrame.
#The code demonstrates how to sort a DataFrame by multiple columns, group the data based on a specific column, and perform aggregation operations using the sort(), groupBy(), and agg() functions in PySpark. The resulting DataFrame (dfSort) displays the sum of salaries for each unique state.