In [1]:
#pyspark initialization
import findspark
findspark.init()

In [2]:
#building spark session
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()

In [3]:
#reading CSV
df = spark.read.format("csv").option("header", "true").load("test_data.csv")
df.show()

+-------------+------------+
|Employee_Name|CGI_Location|
+-------------+------------+
|            A|   Bengaluru|
|            B| Bhubaneswar|
|            C|     Chennai|
|            D|  Coimbatore|
|            E|Ghandhinagar|
|            F|   Bengaluru|
|            G| Bhubaneswar|
|            H|     Chennai|
|            I|  Coimbatore|
|            J|Ghandhinagar|
|            K|   Bengaluru|
|            L| Bhubaneswar|
|            M|     Chennai|
|            N|  Coimbatore|
|            O|Ghandhinagar|
+-------------+------------+



In [4]:
#groupby 'CGI_Location' to get concatenated list of employees
import pyspark.sql.functions as sqlfunc
employee_location = df.groupBy('CGI_Location').agg(sqlfunc.concat_ws(', ', sqlfunc.collect_set(df.Employee_Name)).alias('Employees'))
employee_location.show()                                                   

+------------+---------+
|CGI_Location|Employees|
+------------+---------+
| Bhubaneswar|  G, B, L|
|     Chennai|  M, C, H|
|Ghandhinagar|  J, E, O|
|  Coimbatore|  I, N, D|
|   Bengaluru|  F, K, A|
+------------+---------+



In [5]:
#sorting 'CGI_Location'
from pyspark.sql.functions import *
employee_location = employee_location.sort(col('CGI_Location').asc())
employee_location.show()

+------------+---------+
|CGI_Location|Employees|
+------------+---------+
|   Bengaluru|  F, K, A|
| Bhubaneswar|  G, B, L|
|     Chennai|  M, C, H|
|  Coimbatore|  I, N, D|
|Ghandhinagar|  J, E, O|
+------------+---------+



In [6]:
#DataFrame creation to demonstate 'getItem' method
data = [
    ("Amit", [1, 2, 3]),
    ("Prashant", [4, 5, 6])]
columns = ["name", "numbers"]
df = spark.createDataFrame(data, columns)
df.show()

+--------+---------+
|    name|  numbers|
+--------+---------+
|    Amit|[1, 2, 3]|
|Prashant|[4, 5, 6]|
+--------+---------+



In [7]:
#importing functions
import pyspark.sql.functions as F
from pyspark.sql.functions import *

In [8]:
#demonstrating 'getItem'
df = df.withColumn("first_number", col("numbers").getItem(0))
df = df.withColumn("second_number", col("numbers").getItem(1))
df = df.withColumn("third_number", col("numbers").getItem(2))
df.show()

+--------+---------+------------+-------------+------------+
|    name|  numbers|first_number|second_number|third_number|
+--------+---------+------------+-------------+------------+
|    Amit|[1, 2, 3]|           1|            2|           3|
|Prashant|[4, 5, 6]|           4|            5|           6|
+--------+---------+------------+-------------+------------+



In [9]:
#DataFrame creation to demonstrate 'collect_list' and 'collect_set'
customized_data = [
    ("cust_1", "acc_1"),
    ("cust_2", "acc_2"),
    ("cust_3", "acc_3"),
    ("cust_4", "acc_4"),
    ("cust_1", "acc_1"),
    ("cust_2", "acc_2"),
    ("cust_3", "acc_3"),
    ("cust_4", "acc_4"),
    ("cust_1", "acc_1"),
    ("cust_2", "acc_2")]
columns = ["customer_id", "account_type"]
df = spark.createDataFrame(customized_data, columns)
df.show()

+-----------+------------+
|customer_id|account_type|
+-----------+------------+
|     cust_1|       acc_1|
|     cust_2|       acc_2|
|     cust_3|       acc_3|
|     cust_4|       acc_4|
|     cust_1|       acc_1|
|     cust_2|       acc_2|
|     cust_3|       acc_3|
|     cust_4|       acc_4|
|     cust_1|       acc_1|
|     cust_2|       acc_2|
+-----------+------------+



In [10]:
#demonstrating 'collect_list'
df_grouped_collect_list = df.groupby("customer_id").agg(sqlfunc.concat_ws(", ", collect_list(df.account_type)).alias("concat_account_type"))
df_grouped_collect_list.show()

+-----------+-------------------+
|customer_id|concat_account_type|
+-----------+-------------------+
|     cust_1|acc_1, acc_1, acc_1|
|     cust_2|acc_2, acc_2, acc_2|
|     cust_3|       acc_3, acc_3|
|     cust_4|       acc_4, acc_4|
+-----------+-------------------+



In [11]:
#demonstrating 'collect_set'
df_grouped_collect_set = df.groupby("customer_id").agg(sqlfunc.concat_ws(", ", collect_set(df.account_type)).alias("concat_account_type"))
df_grouped_collect_set.show()

+-----------+-------------------+
|customer_id|concat_account_type|
+-----------+-------------------+
|     cust_1|              acc_1|
|     cust_2|              acc_2|
|     cust_3|              acc_3|
|     cust_4|              acc_4|
+-----------+-------------------+

