In [1]:
import pyspark
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder \
    .appName("Spark exercise 2") \
    .master("local[*]") \
    .getOrCreate()

print(spark)

<pyspark.sql.session.SparkSession object at 0x000001B0F0046B30>


In [3]:
rdd = spark.sparkContext.parallelize([
    (1,'2013-07-25 00:00:00',11599, 'CLOSED'),
    (2,'2013-07-25 00:00:00',256, 'PENDING_PAYMENT'),
    (3,'2013-07-25 00:00:00',12111, 'COMPLETE'),
    (4,'2013-07-25 00:00:00',8827, 'CLOSED'),
    (5,'2013-07-25 00:00:00',11318, 'COMPLETE'),
    (6,'2013-07-25 00:00:00',7130, 'COMPLETE'),
    (7,'2013-07-25 00:00:00',4530, 'COMPLETE'),
    (8,'2013-07-25 00:00:00',2911, 'PROCESSING'),
    (9,'2013-07-25 00:00:00',5657, 'PENDING_PAYMENT'),
    (10,'2013-07-25 00:00:00',5648, 'PENDING_PAYMENT'),
    (11,'2013-07-25 00:00:00',918, 'PAYMENT_REVIEW'),
    (12,'2013-07-25 00:00:00',1837, 'CLOSED'),
    (13,'2013-07-25 00:00:00',9149, 'PENDING_PAYMENT'),
    (14,'2013-07-25 00:00:00',9842, 'PROCESSING'),
    (15,'2013-07-25 00:00:00',2568, 'COMPLETE'),
    (16,'2013-07-25 00:00:00',7276, 'PENDING_PAYMENT'),
    (17,'2013-07-25 00:00:00',2667, 'COMPLETE'),
    (18,'2013-07-25 00:00:00',1205, 'CLOSED'),
    (19,'2013-07-25 00:00:00',9488, 'PENDING_PAYMENT'),
    (20,'2013-07-25 00:00:00',9198, 'PROCESSING'),
    # Please add more orders for existing customers
    (21,'2013-07-25 00:00:00',11599, 'CLOSED'),
    (22,'2013-07-25 00:00:00',256, 'PENDING_PAYMENT'),
    (23,'2013-07-25 00:00:00',12111, 'COMPLETE'),
    (24,'2013-07-25 00:00:00',8827, 'CLOSED'),
    (25,'2013-07-25 00:00:00',11318, 'COMPLETE'),
    (26,'2013-07-25 00:00:00',7130, 'COMPLETE'),
    (27,'2013-07-25 00:00:00',4530, 'COMPLETE'),
    (28,'2013-07-25 00:00:00',2911, 'PROCESSING'),
    (29,'2013-07-25 00:00:00',5657, 'PENDING_PAYMENT'),
    (30,'2013-07-25 00:00:00',5648, 'PENDING_PAYMENT'),
    (31,'2013-07-25 00:00:00',918, 'PAYMENT_REVIEW'),
    (32,'2013-07-25 00:00:00',1837, 'CLOSED'),
    (33,'2013-07-25 00:00:00',9149, 'PENDING_PAYMENT'),
    (34,'2013-07-25 00:00:00',9842, 'PROCESSING'),
    (35,'2013-07-25 00:00:00',2568, 'COMPLETE'),
    (36,'2013-07-25 00:00:00',7276, 'PENDING_PAYMENT'),
    (37,'2013-07-25 00:00:00',2667, 'COMPLETE'),
    (38,'2013-07-25 00:00:00',1205, 'CLOSED'),
    (39,'2013-07-25 00:00:00',9488, 'PENDING_PAYMENT'),
    (40,'2013-07-25 00:00:00',9198, 'PROCESSING'),
])

In [4]:
# Map the RDD to a DF
from pyspark.sql import Row
df = rdd.map(lambda x: Row(order_id=x[0], order_date=x[1], customer_id=x[2], status=x[3]))

In [5]:
from pyspark.sql import types
schema = types.StructType([
    types.StructField('order_id', types.IntegerType(), True),
    types.StructField('order_date', types.StringType(), True),
    types.StructField('customer_id', types.IntegerType(), True),
    types.StructField('status', types.StringType(), True)
])

In [6]:
# Create DF with Schema
df = spark.createDataFrame(df, schema)

In [7]:
from pyspark.sql import functions as F
df = df.withColumn('order_date', F.to_timestamp(df['order_date']))

In [8]:
# check Schema  
df.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- order_date: timestamp (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- status: string (nullable = true)



In [9]:
df.show(5)

+--------+-------------------+-----------+---------------+
|order_id|         order_date|customer_id|         status|
+--------+-------------------+-----------+---------------+
|       1|2013-07-25 00:00:00|      11599|         CLOSED|
|       2|2013-07-25 00:00:00|        256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|      12111|       COMPLETE|
|       4|2013-07-25 00:00:00|       8827|         CLOSED|
|       5|2013-07-25 00:00:00|      11318|       COMPLETE|
+--------+-------------------+-----------+---------------+
only showing top 5 rows



In [10]:
# Update order_date from timestamp to date
df = df.withColumn('order_date', F.to_date(df['order_date']))

In [11]:
df.show(5)

+--------+----------+-----------+---------------+
|order_id|order_date|customer_id|         status|
+--------+----------+-----------+---------------+
|       1|2013-07-25|      11599|         CLOSED|
|       2|2013-07-25|        256|PENDING_PAYMENT|
|       3|2013-07-25|      12111|       COMPLETE|
|       4|2013-07-25|       8827|         CLOSED|
|       5|2013-07-25|      11318|       COMPLETE|
+--------+----------+-----------+---------------+
only showing top 5 rows



In [12]:
#1: The number of orders for each status
df_1 = df.groupBy('status').count().show()

+---------------+-----+
|         status|count|
+---------------+-----+
|PENDING_PAYMENT|   12|
|       COMPLETE|   12|
|         CLOSED|    8|
|     PROCESSING|    6|
| PAYMENT_REVIEW|    2|
+---------------+-----+



In [13]:
#2: Top 10 customers who have placed the most orders
df_2 = df.groupBy('customer_id').count().orderBy('count',ascending=False).show()

+-----------+-----+
|customer_id|count|
+-----------+-----+
|       8827|    2|
|      12111|    2|
|      11318|    2|
|        256|    2|
|      11599|    2|
|       4530|    2|
|       2911|    2|
|       5657|    2|
|       7130|    2|
|       5648|    2|
|        918|    2|
|       9842|    2|
|       1837|    2|
|       2568|    2|
|       9149|    2|
|       9488|    2|
|       2667|    2|
|       1205|    2|
|       9198|    2|
|       7276|    2|
+-----------+-----+



In [14]:
#3: Total number of customers who have placed orders
df_3 = df.select('customer_id').distinct().count()

In [15]:
print(df_3)

20


In [None]:
#3: Total number of customers who have placed orders using SQL
orders = df.createOrReplaceTempView('orders')
df_33 = spark.sql('select count(distinct customer_id) as total_customers from orders')
df_33.show()