In [19]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql import functions as funct

In [20]:
sparksession = SparkSession.builder.appName("Group_By_Age").getOrCreate()

In [21]:
friendsData = sparksession.read.option("header","True").option("inferSchema","True").csv("fakefriends-header.csv")

In [22]:
friendsData.printSchema()

root
 |-- userID: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- friends: integer (nullable = true)



In [23]:
# selecting the rows which are needed for the program
friendByAge = friendsData.select("age","friends")

In [24]:
# grouping the friends by their age and finding average value of number of frineds
# inshort, finding average number of friends as per age group
friendByAge.groupBy("age").avg("friends").show()

+---+------------------+
|age|      avg(friends)|
+---+------------------+
| 31|            267.25|
| 65|             298.2|
| 53|222.85714285714286|
| 34|             245.5|
| 28|             209.1|
| 26|242.05882352941177|
| 27|           228.125|
| 44| 282.1666666666667|
| 22|206.42857142857142|
| 47|233.22222222222223|
| 52| 340.6363636363636|
| 40| 250.8235294117647|
| 20|             165.0|
| 57| 258.8333333333333|
| 54| 278.0769230769231|
| 48|             281.4|
| 19|213.27272727272728|
| 64| 281.3333333333333|
| 41|268.55555555555554|
| 43|230.57142857142858|
+---+------------------+
only showing top 20 rows



In [25]:
# sorting data as per age 
friendByAge.groupBy("age").avg("friends").sort("age").show()

+---+------------------+
|age|      avg(friends)|
+---+------------------+
| 18|           343.375|
| 19|213.27272727272728|
| 20|             165.0|
| 21|           350.875|
| 22|206.42857142857142|
| 23|             246.3|
| 24|             233.8|
| 25|197.45454545454547|
| 26|242.05882352941177|
| 27|           228.125|
| 28|             209.1|
| 29|215.91666666666666|
| 30| 235.8181818181818|
| 31|            267.25|
| 32| 207.9090909090909|
| 33| 325.3333333333333|
| 34|             245.5|
| 35|           211.625|
| 36|             246.6|
| 37|249.33333333333334|
+---+------------------+
only showing top 20 rows



this is a way of sort of clumping together mulitple commands on that aggregated grouped result.

When you want to compute multiple aggregate statistics [sum, average, count, maximum, or minimum value for each group] in a single operation. 
Instead of computing each statistic separately, you can use agg() to compute them all at once.

In [26]:
# rounding average age value
friendByAge.groupBy("age").agg(funct.round(funct.avg("friends"),2)).sort("age").show()

+---+----------------------+
|age|round(avg(friends), 2)|
+---+----------------------+
| 18|                343.38|
| 19|                213.27|
| 20|                 165.0|
| 21|                350.88|
| 22|                206.43|
| 23|                 246.3|
| 24|                 233.8|
| 25|                197.45|
| 26|                242.06|
| 27|                228.13|
| 28|                 209.1|
| 29|                215.92|
| 30|                235.82|
| 31|                267.25|
| 32|                207.91|
| 33|                325.33|
| 34|                 245.5|
| 35|                211.63|
| 36|                 246.6|
| 37|                249.33|
+---+----------------------+
only showing top 20 rows



In [27]:
# alias for column
friendByAge.groupBy("age").agg(funct.round(funct.avg("friends"),2).alias("AVG_Friends")).sort("age").show()

+---+-----------+
|age|AVG_Friends|
+---+-----------+
| 18|     343.38|
| 19|     213.27|
| 20|      165.0|
| 21|     350.88|
| 22|     206.43|
| 23|      246.3|
| 24|      233.8|
| 25|     197.45|
| 26|     242.06|
| 27|     228.13|
| 28|      209.1|
| 29|     215.92|
| 30|     235.82|
| 31|     267.25|
| 32|     207.91|
| 33|     325.33|
| 34|      245.5|
| 35|     211.63|
| 36|      246.6|
| 37|     249.33|
+---+-----------+
only showing top 20 rows



In [28]:
sparksession.stop

<bound method SparkSession.stop of <pyspark.sql.session.SparkSession object at 0x107c3d6d0>>

24/04/25 16:59:38 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 1905587 ms exceeds timeout 120000 ms
24/04/25 16:59:38 WARN SparkContext: Killing executors is not supported by current scheduler.
24/04/25 16:59:40 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$