In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [2]:
# Obtain the Spark session used by every cell in this notebook.
spark = (
    SparkSession.builder
    .appName('pyspark-by-examples')
    .getOrCreate()
)

# Sample rows in (employee_name, department, salary) order.
# The ("James", "Sales", 3000) row is listed twice, so the data contains an
# exact duplicate for the distinct-style aggregates below to collapse.
simpleData = [
    ("James", "Sales", 3000),
    ("Michael", "Sales", 4600),
    ("Robert", "Sales", 4100),
    ("Maria", "Finance", 3000),
    ("James", "Sales", 3000),
    ("Scott", "Finance", 3300),
    ("Jen", "Finance", 3900),
    ("Jeff", "Marketing", 3000),
    ("Kumar", "Marketing", 2000),
    ("Saif", "Sales", 4100),
]
schema = ["employee_name", "department", "salary"]

# Only column names are supplied; the column types are inferred from the rows.
df = spark.createDataFrame(simpleData, schema)
df.printSchema()
df.show(truncate=False)

root
 |-- employee_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- salary: long (nullable = true)

+-------------+----------+------+
|employee_name|department|salary|
+-------------+----------+------+
|James        |Sales     |3000  |
|Michael      |Sales     |4600  |
|Robert       |Sales     |4100  |
|Maria        |Finance   |3000  |
|James        |Sales     |3000  |
|Scott        |Finance   |3300  |
|Jen          |Finance   |3900  |
|Jeff         |Marketing |3000  |
|Kumar        |Marketing |2000  |
|Saif         |Sales     |4100  |
+-------------+----------+------+



# approx_count_distinct()
Returns the approximate count of distinct items in a group.

In [3]:
# The select yields one row with one field; [0][0] reads that single value.
print(
    "approx_count_distinct: "
    + str(df.select(approx_count_distinct(col("salary"))).collect()[0][0])
)

approx_count_distinct: 6


# avg()
Returns the average of values in the input column.

In [4]:
# Average salary over all rows, pulled out of the one-row result and printed.
print(
    "avg: "
    + str(df.select(avg(col("salary"))).collect()[0][0])
)

avg: 3400.0


# collect_list()
Returns all values from an input column with duplicates.

In [5]:
# Gather every salary, duplicates included, into one array column and
# print it without truncating the cell contents.
(
    df
    .select(collect_list(col("salary")))
    .show(truncate=False)
)

+------------------------------------------------------------+
|collect_list(salary)                                        |
+------------------------------------------------------------+
|[3000, 4600, 4100, 3000, 3000, 3300, 3900, 3000, 2000, 4100]|
+------------------------------------------------------------+



# collect_set()
Returns all values from an input column with duplicate values eliminated.

In [6]:
# Same as collect_list, but each distinct salary appears only once.
(
    df
    .select(collect_set(col("salary")))
    .show(truncate=False)
)

+------------------------------------+
|collect_set(salary)                 |
+------------------------------------+
|[4600, 3000, 3900, 4100, 3300, 2000]|
+------------------------------------+



# count()
Returns the number of non-null elements in a column.

In [7]:
# Count the salary values and print the single resulting number.
print(
    "count: "
    + str(df.select(count(col("salary"))).collect()[0][0])
)

count: 10


# countDistinct()
Returns the number of distinct elements in a column, or of distinct combinations of values when several columns are given.

In [8]:
# count_distinct() is the current name of this aggregate; countDistinct() is
# retained by PySpark only as an alias for it. Using the snake_case name keeps
# this cell consistent with sum_distinct() used elsewhere in the notebook.
# With two columns, it counts the distinct (department, salary) pairs.
df2 = df.select(count_distinct("department", "salary"))
df2.show(truncate=False)

# The result is a single row with a single field; [0][0] extracts that value.
print("Distinct Count of Department & Salary: "+str(df2.collect()[0][0]))

+----------------------------------+
|count(DISTINCT department, salary)|
+----------------------------------+
|8                                 |
+----------------------------------+

Distinct Count of Department & Salary: 8


# first()
Returns the first element in a column. When `ignorenulls` is set to `True`, it returns the first non-null element.

In [9]:
# Show the first salary value encountered in the column.
(
    df
    .select(first(col("salary")))
    .show(truncate=False)
)

+-------------+
|first(salary)|
+-------------+
|3000         |
+-------------+



# grouping()
Indicates whether a given input column is aggregated or not. Returns 1 for aggregated or 0 for not aggregated in the result. If you call `grouping()` directly on the salary column, without `cube()`, `rollup()`, or grouping sets, you will get the error below.

<span style="color:red">Exception in thread "main" org.apache.spark.sql.AnalysisException:
// grouping() can only be used with GroupingSets/Cube/Rollup</span>

In [10]:
# cube("department") produces one row per department plus a grand-total row
# whose department is null; grouping("department") is 1 on that total row and
# 0 on the per-department rows.
# NOTE: `sum` here is pyspark.sql.functions.sum, brought in by the star
# import, not Python's builtin sum.
(
    df
    .cube("department")
    .agg(grouping(col("department")), sum(col("salary")))
    .orderBy("department")
    .show()
)

+----------+--------------------+-----------+
|department|grouping(department)|sum(salary)|
+----------+--------------------+-----------+
|      null|                   1|      34000|
|   Finance|                   0|      10200|
| Marketing|                   0|       5000|
|     Sales|                   0|      18800|
+----------+--------------------+-----------+



# kurtosis()
Returns the kurtosis of the values in a group.
Kurtosis is a measure of whether the data are heavy-tailed or 
light-tailed relative to a normal distribution. That is, data 
sets with high kurtosis tend to have heavy tails, or outliers. 
Data sets with low kurtosis tend to have light tails, or lack of outliers. 
A uniform distribution would be the extreme case.

In [11]:
# Kurtosis of the salary column, shown as a one-row result.
(
    df
    .select(kurtosis(col("salary")))
    .show(truncate=False)
)

+-------------------+
|kurtosis(salary)   |
+-------------------+
|-0.6467803030303028|
+-------------------+



# last()
Returns the last element in a column.
When `ignorenulls` is set to `True`, it returns the last non-null element.

In [12]:
# Show the last salary value encountered in the column.
(
    df
    .select(last(col("salary")))
    .show(truncate=False)
)

+------------+
|last(salary)|
+------------+
|4100        |
+------------+



# max()
Returns the maximum value in a column.

In [13]:
# Largest salary in the column.
# NOTE: `max` here is pyspark.sql.functions.max, brought in by the star
# import, not Python's builtin max.
(
    df
    .select(max(col("salary")))
    .show(truncate=False)
)

+-----------+
|max(salary)|
+-----------+
|4600       |
+-----------+



# mean()
Returns the average of the values in a column (alias for `avg()`).

In [14]:
# mean() is reported under the avg(salary) column name in the output.
(
    df
    .select(mean(col("salary")))
    .show(truncate=False)
)

+-----------+
|avg(salary)|
+-----------+
|3400.0     |
+-----------+



# min()
Returns the minimum value in a column.

In [15]:
# Smallest salary in the column.
# NOTE: `min` here is pyspark.sql.functions.min, brought in by the star
# import, not Python's builtin min.
(
    df
    .select(min(col("salary")))
    .show(truncate=False)
)

+-----------+
|min(salary)|
+-----------+
|2000       |
+-----------+



# skewness()
Returns the skewness of the values in a group.
A skewness value greater than 1 or less than -1 indicates a highly 
skewed distribution. A value between 0.5 and 1 or -0.5 and -1 is 
moderately skewed. A value between -0.5 and 0.5 indicates that the 
distribution is fairly symmetrical.

In [16]:
# Skewness of the salary column, shown as a one-row result.
(
    df
    .select(skewness(col("salary")))
    .show(truncate=False)
)

+--------------------+
|skewness(salary)    |
+--------------------+
|-0.12041791181069564|
+--------------------+



# stddev()
Alias for `stddev_samp`.

`stddev_samp()` function returns the sample standard deviation of values in a column.

`stddev_pop()` function returns the population standard deviation of the values in a column.

In [17]:
# Put the three standard-deviation functions side by side. stddev() and
# stddev_samp() both appear under the stddev_samp(salary) column name.
(
    df
    .select(
        stddev(col("salary")),
        stddev_samp(col("salary")),
        stddev_pop(col("salary")),
    )
    .show(truncate=False)
)

+-------------------+-------------------+------------------+
|stddev_samp(salary)|stddev_samp(salary)|stddev_pop(salary)|
+-------------------+-------------------+------------------+
|765.9416862050705  |765.9416862050705  |726.636084983398  |
+-------------------+-------------------+------------------+



# sum()
Returns the sum of all values in a column.

In [18]:
# Total of all salaries, duplicates included.
# NOTE: `sum` here is pyspark.sql.functions.sum, brought in by the star
# import, not Python's builtin sum.
(
    df
    .select(sum(col("salary")))
    .show(truncate=False)
)

+-----------+
|sum(salary)|
+-----------+
|34000      |
+-----------+



# sum_distinct()
Returns the sum of all distinct values in a column.

In [19]:
# Total of the salaries after collapsing repeated values to a single occurrence.
(
    df
    .select(sum_distinct(col("salary")))
    .show(truncate=False)
)

+--------------------+
|sum(DISTINCT salary)|
+--------------------+
|20900               |
+--------------------+



# variance()
Alias for `var_samp`.

`var_samp()` function returns the unbiased variance of the values in a column.

`var_pop()` function returns the population variance of the values in a column.

In [20]:
# Put the three variance functions side by side. variance() and var_samp()
# both appear under the var_samp(salary) column name.
(
    df
    .select(
        variance(col("salary")),
        var_samp(col("salary")),
        var_pop(col("salary")),
    )
    .show(truncate=False)
)

+-----------------+-----------------+---------------+
|var_samp(salary) |var_samp(salary) |var_pop(salary)|
+-----------------+-----------------+---------------+
|586666.6666666666|586666.6666666666|528000.0       |
+-----------------+-----------------+---------------+

