 1. Ingest the CSV files into two PySpark DataFrames

In [0]:
from pyspark.sql import SparkSession

# Reuse the active Spark session if there is one, otherwise create one
# registered under the application name "CustomerOrders".
spark = (
    SparkSession.builder
    .appName("CustomerOrders")
    .getOrCreate()
)

# Both CSV files are read with the same settings: the first row supplies the
# column names and Spark infers each column's type from the data.
csv_read_options = {"header": "true", "inferSchema": "true"}

customers_df = spark.read.options(**csv_read_options).csv("file:/Workspace/Shared/customers.csv")
orders_df = spark.read.options(**csv_read_options).csv("file:/Workspace/Shared/orders.csv")


 2. Print the inferred schema of both DataFrames

In [0]:
# Print the column names, types and nullability of each DataFrame,
# customers first and orders second.
for frame in (customers_df, orders_df):
    frame.printSchema()


root
 |-- CustomerID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Age: integer (nullable = true)

root
 |-- OrderID: integer (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Price: integer (nullable = true)
 |-- OrderDate: date (nullable = true)



3. Add a column TotalAmount = Quantity * Price to orders

In [0]:
from pyspark.sql.functions import col

# TotalAmount is the product of the Quantity and Price columns of each order row.
total_amount = col("Quantity") * col("Price")

# Rebind orders_df to the frame that carries the extra column; later cells
# read TotalAmount through this name.
orders_df = orders_df.withColumn("TotalAmount", total_amount)
orders_df.show()


+-------+----------+-------+--------+-----+----------+-----------+
|OrderID|CustomerID|Product|Quantity|Price| OrderDate|TotalAmount|
+-------+----------+-------+--------+-----+----------+-----------+
|   1001|       101| Laptop|       1|70000|2024-01-05|      70000|
|   1002|       102| Mobile|       2|25000|2024-02-10|      50000|
|   1003|       103|   Desk|       1|10000|2024-03-15|      10000|
|   1004|       101|  Mouse|       3| 1000|2024-04-01|       3000|
|   1005|       104|Monitor|       1|12000|2024-04-25|      12000|
+-------+----------+-------+--------+-----+----------+-----------+



4. Join both DataFrames on CustomerID

In [0]:
# Inner join on the shared CustomerID column: only orders with a matching
# customer row (and vice versa) are kept, and CustomerID appears once in the
# result.
joined_df = orders_df.join(
    other=customers_df,
    on=["CustomerID"],
    how="inner",
)
joined_df.show()

+----------+-------+-------+--------+-----+----------+-----------+-----+---------+---+
|CustomerID|OrderID|Product|Quantity|Price| OrderDate|TotalAmount| Name|     City|Age|
+----------+-------+-------+--------+-----+----------+-----------+-----+---------+---+
|       101|   1001| Laptop|       1|70000|2024-01-05|      70000|Aditi|   Mumbai| 28|
|       102|   1002| Mobile|       2|25000|2024-02-10|      50000|Rohan|    Delhi| 35|
|       103|   1003|   Desk|       1|10000|2024-03-15|      10000|Meena|Bangalore| 41|
|       101|   1004|  Mouse|       3| 1000|2024-04-01|       3000|Aditi|   Mumbai| 28|
|       104|   1005|Monitor|       1|12000|2024-04-25|      12000|Kabir|Hyderabad| 30|
+----------+-------+-------+--------+-----+----------+-----------+-----+---------+---+



5. Filter orders where TotalAmount > 20000

In [0]:
# Keep only the joined rows whose TotalAmount is strictly greater than 20000.
high_value_orders = joined_df.where(joined_df["TotalAmount"] > 20000)
high_value_orders.show()

+----------+-------+-------+--------+-----+----------+-----------+-----+------+---+
|CustomerID|OrderID|Product|Quantity|Price| OrderDate|TotalAmount| Name|  City|Age|
+----------+-------+-------+--------+-----+----------+-----------+-----+------+---+
|       101|   1001| Laptop|       1|70000|2024-01-05|      70000|Aditi|Mumbai| 28|
|       102|   1002| Mobile|       2|25000|2024-02-10|      50000|Rohan| Delhi| 35|
+----------+-------+-------+--------+-----+----------+-----------+-----+------+---+



6. Show customers who placed more than 1 order

In [0]:
from pyspark.sql.functions import count

# Step 1: number of joined order rows per customer.
orders_per_customer = joined_df.groupBy("CustomerID").agg(
    count("*").alias("OrderCount")
)

# Step 2: keep the customers that have more than one order row.
multiple_orders = orders_per_customer.where(orders_per_customer["OrderCount"] > 1)
multiple_orders.show()

+----------+----------+
|CustomerID|OrderCount|
+----------+----------+
|       101|         2|
+----------+----------+



7. Group orders by City and get average order value

In [0]:
from pyspark.sql.functions import avg

# Mean of TotalAmount over the joined rows of each city, exposed as AvgOrderValue.
mean_order_value = avg("TotalAmount").alias("AvgOrderValue")

avg_order_by_city = joined_df.groupBy("City").agg(mean_order_value)
avg_order_by_city.show()

+---------+-------------+
|     City|AvgOrderValue|
+---------+-------------+
|Bangalore|      10000.0|
|   Mumbai|      36500.0|
|    Delhi|      50000.0|
|Hyderabad|      12000.0|
+---------+-------------+



8. Sort orders by OrderDate in descending order

In [0]:
# Most recent OrderDate first.
sorted_orders = joined_df.sort("OrderDate", ascending=False)
sorted_orders.show()

+----------+-------+-------+--------+-----+----------+-----------+-----+---------+---+
|CustomerID|OrderID|Product|Quantity|Price| OrderDate|TotalAmount| Name|     City|Age|
+----------+-------+-------+--------+-----+----------+-----------+-----+---------+---+
|       104|   1005|Monitor|       1|12000|2024-04-25|      12000|Kabir|Hyderabad| 30|
|       101|   1004|  Mouse|       3| 1000|2024-04-01|       3000|Aditi|   Mumbai| 28|
|       103|   1003|   Desk|       1|10000|2024-03-15|      10000|Meena|Bangalore| 41|
|       102|   1002| Mobile|       2|25000|2024-02-10|      50000|Rohan|    Delhi| 35|
|       101|   1001| Laptop|       1|70000|2024-01-05|      70000|Aditi|   Mumbai| 28|
+----------+-------+-------+--------+-----+----------+-----------+-----+---------+---+



9. Write the final result as a Parquet file partitioned by City

In [0]:
# Configure the writer first: replace whatever already exists at the target
# location and partition the output by the City column.
partitioned_writer = sorted_orders.write.mode("overwrite").partitionBy("City")

# Then emit the data as Parquet.
partitioned_writer.parquet("file:/Workspace/Shared/output/orders_partitioned")


10. Create a temporary view

Total sales by customer

In [0]:
# Register the joined orders/customers rows under the name "orders_view" so
# they can be queried with Spark SQL (replaces any existing view of that name).
joined_df.createOrReplaceTempView("orders_view")

# Total sales per customer.
# The grouping key includes CustomerID, not just Name: grouping on Name alone
# would add together the sales of two different customers who happen to share
# a name. CustomerID is used only for grouping, so the output columns remain
# (Name, TotalSales).
spark.sql("""
    SELECT Name, SUM(TotalAmount) AS TotalSales
    FROM orders_view
    GROUP BY CustomerID, Name
""").show()

+-----+----------+
| Name|TotalSales|
+-----+----------+
|Kabir|     12000|
|Rohan|     50000|
|Aditi|     73000|
|Meena|     10000|
+-----+----------+



Count of products per city

In [0]:
# Reads the "orders_view" temp view. COUNT(*) tallies the rows of the view
# for each city (one row per joined order), not the number of distinct
# Product values.
product_count_sql = """
    SELECT City, COUNT(*) AS ProductCount
    FROM orders_view
    GROUP BY City
"""

product_count_by_city = spark.sql(product_count_sql)
product_count_by_city.show()

+---------+------------+
|     City|ProductCount|
+---------+------------+
|Bangalore|           1|
|   Mumbai|           2|
|    Delhi|           1|
|Hyderabad|           1|
+---------+------------+



Top 2 cities by revenue

In [0]:
# Two cities with the highest summed TotalAmount, read from the "orders_view"
# temp view.
# City is a secondary sort key so that the result is deterministic when cities
# tie on Revenue; without it, which tied city survives LIMIT 2 is unspecified
# and can change between runs.
spark.sql("""
    SELECT City, SUM(TotalAmount) AS Revenue
    FROM orders_view
    GROUP BY City
    ORDER BY Revenue DESC, City ASC
    LIMIT 2
""").show()


+------+-------+
|  City|Revenue|
+------+-------+
|Mumbai|  73000|
| Delhi|  50000|
+------+-------+

