In [0]:
import os
import sys
# Set both PySpark interpreter environment variables to the Python
# executable that is running this kernel.
os.environ.update(
    PYSPARK_PYTHON=sys.executable,
    PYSPARK_DRIVER_PYTHON=sys.executable,
)

In [0]:
from pyspark.sql import SparkSession
# Build (or reuse) the session for this notebook under the application
# name "Aggregate Functions".
spark = (
    SparkSession.builder
    .appName("Aggregate Functions")
    .getOrCreate()
)

In [0]:
# Sample employee records as (employee_name, department, salary) tuples.
# ("James", "Sales", 3000) is listed twice, so the data contains one
# fully duplicated row.
simpleData = [
    ("James", "Sales", 3000),
    ("Michael", "Sales", 4600),
    ("Robert", "Sales", 4100),
    ("Maria", "Finance", 3000),
    ("James", "Sales", 3000),
    ("Scott", "Finance", 3300),
    ("Jen", "Finance", 3900),
    ("Jeff", "Marketing", 3000),
    ("Kumar", "Marketing", 2000),
    ("Saif", "Sales", 4100),
]
schema = ["employee_name", "department", "salary"]

# Only column names are supplied; the column types shown by printSchema()
# are derived from the values themselves.
df = spark.createDataFrame(simpleData, schema)
df.printSchema()

root
 |-- employee_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- salary: long (nullable = true)



#### Aggregate Functions

##### approx_count_distinct

In [0]:
# approx_count_distinct() returns an estimate of the number of distinct
# values in the column, not an exact count.
from pyspark.sql.functions import approx_count_distinct
approx_distinct_salaries = df.select(approx_count_distinct("salary")).collect()[0][0]
print(f"approx_count_distinct: {approx_distinct_salaries}")

approx_count_distinct: 6


##### avg

In [0]:
# avg() computes the average of the values in the given column.
# collect() returns the single result row; [0][0] unwraps its only value.
from pyspark.sql.functions import avg
avg_salary = df.select(avg("salary")).collect()[0][0]
print(f"avg of column:{avg_salary}")

avg of column:3400.0


##### collect_list

In [0]:
# collect_list() gathers every value of the column into a single list,
# keeping duplicates.
from pyspark.sql.functions import collect_list
employee_names = df.select(collect_list("employee_name")).collect()[0][0]
print(f"collected list of values : {employee_names}")

collected list of values : ['James', 'Michael', 'Robert', 'Maria', 'James', 'Scott', 'Jen', 'Jeff', 'Kumar', 'Saif']


In [0]:
# Same aggregation displayed as a one-row DataFrame; alias() names the
# result column "names" and truncate=False prints the whole array.
names_df = df.select(collect_list("employee_name").alias("names"))
names_df.show(truncate=False)

+---------------------------------------------------------------------+
|names                                                                |
+---------------------------------------------------------------------+
|[James, Michael, Robert, Maria, James, Scott, Jen, Jeff, Kumar, Saif]|
+---------------------------------------------------------------------+



##### collect_set

In [0]:
# collect_set() gathers the column's values into one collection with
# duplicates removed.
# NOTE: the order of the collected elements is not guaranteed.
from pyspark.sql.functions import collect_set
distinct_salaries = df.select(collect_set("salary")).collect()[0][0]
print(f"collect set : {distinct_salaries}")

collect set : [4600, 3000, 3900, 4100, 3300, 2000]


In [0]:
# Show the de-duplicated salaries as a one-row DataFrame; truncate=False
# prints the full array instead of cutting it off.
s = df.agg(collect_set("salary"))
s.show(truncate=False)

+------------------------------------+
|collect_set(salary)                 |
+------------------------------------+
|[4600, 3000, 3900, 4100, 3300, 2000]|
+------------------------------------+



##### countDistinct

In [0]:
# countDistinct() counts the distinct values of the given column.
from pyspark.sql.functions import countDistinct
df2 = df.agg(countDistinct("salary"))
df2.show()

+----------------------+
|count(DISTINCT salary)|
+----------------------+
|                     6|
+----------------------+



In [0]:
# With several columns, countDistinct() counts the distinct
# (department, salary) combinations.
df2 = df.agg(countDistinct("department", "salary"))
df2.show()

+----------------------------------+
|count(DISTINCT department, salary)|
+----------------------------------+
|                                 8|
+----------------------------------+



##### count

In [0]:
# count() returns the number of items in the column.
from pyspark.sql.functions import count
df2 = df.agg(count("salary"))
df2.show()

+-------------+
|count(salary)|
+-------------+
|           10|
+-------------+



##### first()

In [0]:
# first() returns the first value it sees in the column.
# NOTE: the result depends on row order, so it is only stable when the
# DataFrame has a well-defined ordering.
from pyspark.sql.functions import first
df.agg(first("salary")).show()

+-------------+
|first(salary)|
+-------------+
|         3000|
+-------------+



##### last()

In [0]:
# last() returns the last value it sees in the column.
# NOTE: the result depends on row order, so it is only stable when the
# DataFrame has a well-defined ordering.
from pyspark.sql.functions import last
df.agg(last("salary")).show()

+------------+
|last(salary)|
+------------+
|        4100|
+------------+



##### max

In [0]:
# max() returns the largest value in the column.
# Spark's max() is reached through the `F` namespace on purpose:
# `from pyspark.sql.functions import max` would replace Python's built-in
# max() for every cell executed afterwards.
from pyspark.sql import functions as F
df.select(F.max("salary")).show()

+-----------+
|max(salary)|
+-----------+
|       4600|
+-----------+



##### min

In [0]:
# min() returns the smallest value in the column.
# Spark's min() is reached through the `F` namespace on purpose:
# `from pyspark.sql.functions import min` would replace Python's built-in
# min() for every cell executed afterwards.
from pyspark.sql import functions as F
df.select(F.min("salary")).show()

+-----------+
|min(salary)|
+-----------+
|       2000|
+-----------+



##### mean

In [0]:
# mean() computes the average of the column; Spark labels the result
# column avg(salary).
from pyspark.sql.functions import mean
df.agg(mean("salary")).show()

+-----------+
|avg(salary)|
+-----------+
|     3400.0|
+-----------+



##### kurtosis

In [0]:
# kurtosis() returns the kurtosis of the values in the column.
from pyspark.sql.functions import kurtosis
df.agg(kurtosis("salary")).show()

+-------------------+
|   kurtosis(salary)|
+-------------------+
|-0.6467803030303032|
+-------------------+



##### skewness

In [0]:
# skewness() returns the skewness of the values in the column.
from pyspark.sql.functions import skewness
df.agg(skewness("salary")).show()

+--------------------+
|    skewness(salary)|
+--------------------+
|-0.12041791181069571|
+--------------------+



##### stddev

In [0]:
# stddev() computes the standard deviation of the column; Spark labels
# the result column stddev_samp(salary).
from pyspark.sql.functions import stddev
df.agg(stddev("salary")).show()

+-------------------+
|stddev_samp(salary)|
+-------------------+
|  765.9416862050705|
+-------------------+



##### stddev_samp

In [0]:
# stddev_samp() computes the sample standard deviation of the column.
from pyspark.sql.functions import stddev_samp
df.agg(stddev_samp("salary")).show()

+-------------------+
|stddev_samp(salary)|
+-------------------+
|  765.9416862050705|
+-------------------+



##### stddev_pop

In [0]:
# stddev_pop() computes the population standard deviation of the column.
from pyspark.sql.functions import stddev_pop
df.agg(stddev_pop("salary")).show()

+------------------+
|stddev_pop(salary)|
+------------------+
|  726.636084983398|
+------------------+



##### sum

In [0]:
# sum() adds up all values in the column.
# Spark's sum() is reached through the `F` namespace on purpose:
# `from pyspark.sql.functions import sum` would replace Python's built-in
# sum() for every cell executed afterwards.
from pyspark.sql import functions as F
df.select(F.sum("salary")).show()

+-----------+
|sum(salary)|
+-----------+
|      34000|
+-----------+



##### sum_distinct

In [0]:
# sum_distinct() adds up the distinct values of the column, so each
# repeated salary contributes only once.
from pyspark.sql.functions import sum_distinct
df.agg(sum_distinct("salary")).show()

+--------------------+
|sum(DISTINCT salary)|
+--------------------+
|               20900|
+--------------------+



##### variance

In [0]:
# variance() computes the variance of the column; Spark labels the
# result column var_samp(salary).
from pyspark.sql.functions import variance
df.agg(variance("salary")).show()

+-----------------+
| var_samp(salary)|
+-----------------+
|586666.6666666666|
+-----------------+

