In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, avg, desc, sum

# Build the Spark session used by every cell below (getOrCreate returns the
# existing session when one is already running in this kernel).
spark = (
    SparkSession.builder
    .appName("CustomerOrdersAnalysis")
    .getOrCreate()
)

In [3]:
# Both CSV files are read with the same options: the first row supplies the
# column names, and Spark infers each column's type from the data.
csv_read_options = {"header": True, "inferSchema": True}

customers_df = spark.read.csv("/content/customers.csv", **csv_read_options)
orders_df = spark.read.csv("/content/orders.csv", **csv_read_options)

# Preview each DataFrame to confirm it loaded as expected.
for heading, frame in (
    ("Customers Data:", customers_df),
    ("\nOrders Data:", orders_df),
):
    print(heading)
    frame.show()

Customers Data:
+----------+-----+---------+---+
|CustomerID| Name|     City|Age|
+----------+-----+---------+---+
|       101|Aditi|   Mumbai| 28|
|       102|Rohan|    Delhi| 35|
|       103|Meena|Bangalore| 41|
|       104|Kabir|Hyderabad| 30|
|       105| Zoya|  Chennai| 25|
+----------+-----+---------+---+


Orders Data:
+-------+----------+-------+--------+-----+----------+
|OrderID|CustomerID|Product|Quantity|Price| OrderDate|
+-------+----------+-------+--------+-----+----------+
|   1001|       101| Laptop|       1|70000|2024-01-05|
|   1002|       102| Mobile|       2|25000|2024-02-10|
|   1003|       103|   Desk|       1|10000|2024-03-15|
|   1004|       101|  Mouse|       3| 1000|2024-04-01|
|   1005|       104|Monitor|       1|12000|2024-04-25|
+-------+----------+-------+--------+-----+----------+



In [4]:
# Task 1: add a TotalAmount column computed as Quantity * Price.
# withColumn replaces a same-named column, so re-running this cell is safe.
total_amount = col("Quantity") * col("Price")
orders_df = orders_df.withColumn("TotalAmount", total_amount)

# Task 2: left-join the orders onto the customers by CustomerID. Every
# customer row is kept; customers with no matching order get NULL in the
# order columns.
joined_df = customers_df.join(orders_df, on="CustomerID", how="left")

In [5]:
# Task 3: keep only the rows whose TotalAmount exceeds 20000. Rows with a NULL
# TotalAmount (customers without orders) do not satisfy the comparison.
high_value_orders = joined_df.where(col("TotalAmount") > 20000)
print("\nHigh value orders (>20000):")
high_value_orders.show()

# Task 4: customers that placed more than one order. count("OrderID") skips
# NULLs, so a customer without orders gets an OrderCount of 0.
orders_per_customer = joined_df.groupBy("CustomerID", "Name").agg(
    count("OrderID").alias("OrderCount")
)
multi_order_customers = orders_per_customer.where(col("OrderCount") > 1)
print("\nCustomers with >1 order:")
multi_order_customers.show()

# Task 5: mean TotalAmount per city, highest average first.
avg_by_city = (
    joined_df
    .groupBy("City")
    .agg(avg("TotalAmount").alias("AvgOrderValue"))
    .orderBy(col("AvgOrderValue").desc())
)
print("\nAverage order value by city:")
avg_by_city.show()

# Task 6: all joined rows ordered by OrderDate, most recent first.
sorted_orders = joined_df.orderBy(col("OrderDate").desc())
print("\nOrders sorted by date:")
sorted_orders.show()



High value orders (>20000):
+----------+-----+------+---+-------+-------+--------+-----+----------+-----------+
|CustomerID| Name|  City|Age|OrderID|Product|Quantity|Price| OrderDate|TotalAmount|
+----------+-----+------+---+-------+-------+--------+-----+----------+-----------+
|       101|Aditi|Mumbai| 28|   1001| Laptop|       1|70000|2024-01-05|      70000|
|       102|Rohan| Delhi| 35|   1002| Mobile|       2|25000|2024-02-10|      50000|
+----------+-----+------+---+-------+-------+--------+-----+----------+-----------+


Customers with >1 order:
+----------+-----+----------+
|CustomerID| Name|OrderCount|
+----------+-----+----------+
|       101|Aditi|         2|
+----------+-----+----------+


Average order value by city:
+---------+-------------+
|     City|AvgOrderValue|
+---------+-------------+
|    Delhi|      50000.0|
|   Mumbai|      36500.0|
|Hyderabad|      12000.0|
|Bangalore|      10000.0|
|  Chennai|         NULL|
+---------+-------------+


Orders sorted by date:


In [6]:
# Tasks 8-10: SQL text for the three reports. Each query runs further down
# against the "customer_orders" temp view.
total_sales_query = """
    SELECT CustomerID, Name, SUM(TotalAmount) as TotalSpent
    FROM customer_orders
    GROUP BY CustomerID, Name
    ORDER BY TotalSpent DESC
"""

products_per_city_query = """
    SELECT City, COUNT(DISTINCT Product) as UniqueProducts
    FROM customer_orders
    GROUP BY City
"""

top_cities_query = """
    SELECT City, SUM(TotalAmount) as TotalRevenue
    FROM customer_orders
    GROUP BY City
    ORDER BY TotalRevenue DESC
    LIMIT 2
"""

# Task 7: write the joined data as Parquet, partitioned by City, replacing
# any previous output at the same path.
joined_df.write.parquet(
    "output/customer_orders",
    mode="overwrite",
    partitionBy="City",
)

# Expose the joined data to Spark SQL under the name the queries use.
joined_df.createOrReplaceTempView("customer_orders")

# Run the reports in order, printing each heading before its result table.
sql_reports = (
    ("\nTotal sales by customer:", total_sales_query),
    ("\nProducts per city:", products_per_city_query),
    ("\nTop 2 cities by revenue:", top_cities_query),
)
for heading, query in sql_reports:
    print(heading)
    spark.sql(query).show()


Total sales by customer:
+----------+-----+----------+
|CustomerID| Name|TotalSpent|
+----------+-----+----------+
|       101|Aditi|     73000|
|       102|Rohan|     50000|
|       104|Kabir|     12000|
|       103|Meena|     10000|
|       105| Zoya|      NULL|
+----------+-----+----------+


Products per city:
+---------+--------------+
|     City|UniqueProducts|
+---------+--------------+
|Bangalore|             1|
|  Chennai|             0|
|   Mumbai|             2|
|    Delhi|             1|
|Hyderabad|             1|
+---------+--------------+


Top 2 cities by revenue:
+------+------------+
|  City|TotalRevenue|
+------+------------+
|Mumbai|       73000|
| Delhi|       50000|
+------+------------+



In [7]:
# Final cell: stop the Spark session created at the top of the notebook,
# then print a completion message.
spark.stop()
print("Analysis complete!")


Analysis complete!
