In [2]:
from pyspark.sql import SparkSession
import getpass
username=getpass.getuser()
spark=SparkSession.\
    builder.\
    config('spark.ui.port','0').\
    config("spark.sql.warehouse.dir",f"/user/{username}/warehouse").\
    enableHiveSupport().\
    master('yarn').\
    getOrCreate()

In [4]:
orders_df = spark.read \
.format("csv") \
.option("header","true") \
.option("inferSchema","true") \
.load("source_file_path*")

In [3]:
orders_df.show(4)

+--------+--------------------+-----------+---------------+
|order_id|          order_date|customer_id|   order_status|
+--------+--------------------+-----------+---------------+
|       1|2013-07-25 00:00:...|      11599|         CLOSED|
|       2|2013-07-25 00:00:...|        256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:...|      12111|       COMPLETE|
|       4|2013-07-25 00:00:...|       8827|         CLOSED|
+--------+--------------------+-----------+---------------+
only showing top 4 rows



In [4]:
orders_df.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- order_date: string (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- order_status: string (nullable = true)



## Higher level APIs demo

In [5]:
orders_df.createOrReplaceTempView("orders")

#### 1 top 15 customers who placed the most number of orders

In [12]:
df00 = orders_df.groupby("customer_id")\
.count()\
.sort("count", ascending = False).limit(15)

In [13]:
df00.show()

+-----------+-----+
|customer_id|count|
+-----------+-----+
|       5897|   16|
|      12431|   16|
|        569|   16|
|       6316|   16|
|      12284|   15|
|       4320|   15|
|       5624|   15|
|       5283|   15|
|        221|   15|
|       5654|   15|
|       6248|   14|
|       3708|   14|
|       1011|   14|
|       8652|   14|
|       4517|   14|
+-----------+-----+



In [14]:
# using spark sql
df01 = spark.sql("select customer_id,count(order_id) as count from orders group by customer_id order by count desc limit 15")

In [15]:
df01.show()

+-----------+-----+
|customer_id|count|
+-----------+-----+
|       5897|   16|
|      12431|   16|
|        569|   16|
|       6316|   16|
|      12284|   15|
|       4320|   15|
|       5624|   15|
|       5283|   15|
|        221|   15|
|       5654|   15|
|       6248|   14|
|       3708|   14|
|       1011|   14|
|       8652|   14|
|       4517|   14|
+-----------+-----+



#### 2 Find the number of orders under each order status

In [16]:
df02 = orders_df.groupby("order_status").count()

In [17]:
df02.show()

+---------------+-----+
|   order_status|count|
+---------------+-----+
|PENDING_PAYMENT|15030|
|       COMPLETE|22899|
|        ON_HOLD| 3798|
| PAYMENT_REVIEW|  729|
|     PROCESSING| 8275|
|         CLOSED| 7556|
|SUSPECTED_FRAUD| 1558|
|        PENDING| 7610|
|       CANCELED| 1428|
+---------------+-----+



In [18]:
#spark sql
df02 = spark.sql("select order_status, count(order_id) as count from orders group by order_status")

In [19]:
df02.show()

+---------------+-----+
|   order_status|count|
+---------------+-----+
|PENDING_PAYMENT|15030|
|       COMPLETE|22899|
|        ON_HOLD| 3798|
| PAYMENT_REVIEW|  729|
|     PROCESSING| 8275|
|         CLOSED| 7556|
|SUSPECTED_FRAUD| 1558|
|        PENDING| 7610|
|       CANCELED| 1428|
+---------------+-----+



#### 3 number of active user(who placed atleast one order)

In [20]:
df03 = orders_df.select("customer_id").distinct().count()

In [22]:
print(df03)

12405


In [23]:
#spark sql
spark.sql("select count(distinct(customer_id)) as active_user from orders")

active_user
12405


#### 4 customer with most number of closed orders

In [24]:
df04 = orders_df.filter("order_status = 'CLOSED'")\
.groupby("customer_id")\
.count()\
.sort("count", ascending = False)

In [25]:
df04.show()

+-----------+-----+
|customer_id|count|
+-----------+-----+
|       1833|    6|
|       1687|    5|
|       1363|    5|
|       5493|    5|
|       2403|    4|
|      12431|    4|
|      10263|    4|
|       2236|    4|
|      10111|    4|
|       7948|    4|
|       4573|    4|
|       7879|    4|
|       2774|    4|
|       4588|    4|
|       1521|    4|
|        437|    4|
|      10018|    4|
|       5319|    4|
|       7850|    4|
|       3631|    4|
+-----------+-----+
only showing top 20 rows



In [6]:
df04 = spark.sql("select customer_id , count(order_id) as count from orders where order_status = 'CLOSED' group by customer_id order by count desc")

In [7]:
df04.show()

+-----------+-----+
|customer_id|count|
+-----------+-----+
|       1833|    6|
|       1363|    5|
|       1687|    5|
|       5493|    5|
|       7948|    4|
|       2768|    4|
|      10263|    4|
|       2236|    4|
|       2403|    4|
|       7879|    4|
|       4573|    4|
|       7850|    4|
|      12431|    4|
|       1521|    4|
|      10111|    4|
|        437|    4|
|      10018|    4|
|       5319|    4|
|       2774|    4|
|       3631|    4|
+-----------+-----+
only showing top 20 rows



In [None]:
spark.stop()