In [0]:
# 1. Configure access to the Azure Blob Storage account.
# The account key is fetched from a Databricks secret scope at run time so it
# never appears in the notebook source, its revision history, or exports.
# NOTE(review): the scope and key names below are placeholders - point them at
# an existing secret scope (or create one) before running. The key that was
# previously hardcoded in this cell must be treated as leaked and rotated.
storage_account_key = dbutils.secrets.get(scope="azure-storage", key="hexa1-account-key")
spark.conf.set(
    "fs.azure.account.key.hexa1.blob.core.windows.net",
    storage_account_key,
)

# 2-3. Load customers.csv and orders.csv from the blob container.
# Both files are space-separated with a header row; column types are inferred.
csv_reader_options = {"header": True, "inferSchema": True, "delimiter": " "}

customers_df = spark.read.options(**csv_reader_options).csv(
    "wasbs://images@hexa1.blob.core.windows.net/customers.csv"
)

orders_df = spark.read.options(**csv_reader_options).csv(
    "wasbs://images@hexa1.blob.core.windows.net/orders.csv"
)

# 4. Print a heading, the inferred schema and the first rows of each input,
# customers first and orders second.
for heading, frame in (
    (" Customers Schema:", customers_df),
    (" Orders Schema:", orders_df),
):
    print(heading)
    frame.printSchema()
    frame.show()



 Customers Schema:
root
 |-- CustomerID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Age: integer (nullable = true)

+----------+-----+---------+---+
|CustomerID| Name|     City|Age|
+----------+-----+---------+---+
|       101|Aditi|   Mumbai| 28|
|       102|Rohan|    Delhi| 35|
|       103|Meena|Bangalore| 41|
|       104|Kabir|Hyderabad| 30|
|       105| Zoya|  Chennai| 25|
+----------+-----+---------+---+

 Orders Schema:
root
 |-- OrderID: integer (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Price: integer (nullable = true)
 |-- OrderDate: date (nullable = true)

+-------+----------+-------+--------+-----+----------+
|OrderID|CustomerID|Product|Quantity|Price| OrderDate|
+-------+----------+-------+--------+-----+----------+
|   1001|       101| Laptop|       1|70000|2024-01-05|
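|   1002|       102| Mobile|       2|25000|2024-02-10|
|   1003|       103|   Desk|       1|10000|2024-03-15|
|   1004|       101|  Mouse|       3| 1000|2024-04-01|
|   1005|       104|Monitor|       1|12000|2024-04-25|
+-------+----------+-------+--------+-----+----------+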

In [0]:
from pyspark.sql.functions import col

# Line total for each order: units ordered multiplied by the unit price.
line_total = col("Quantity") * col("Price")
orders_df = orders_df.withColumn("TotalAmount", line_total)
orders_df.show()


+-------+----------+-------+--------+-----+----------+-----------+
|OrderID|CustomerID|Product|Quantity|Price| OrderDate|TotalAmount|
+-------+----------+-------+--------+-----+----------+-----------+
|   1001|       101| Laptop|       1|70000|2024-01-05|      70000|
|   1002|       102| Mobile|       2|25000|2024-02-10|      50000|
|   1003|       103|   Desk|       1|10000|2024-03-15|      10000|
|   1004|       101|  Mouse|       3| 1000|2024-04-01|       3000|
|   1005|       104|Monitor|       1|12000|2024-04-25|      12000|
+-------+----------+-------+--------+-----+----------+-----------+



In [0]:
# Attach customer attributes to every order. The inner join keeps only orders
# whose CustomerID exists in customers_df (and drops customers with no orders).
join_keys = ["CustomerID"]
joined_df = orders_df.join(customers_df, join_keys, "inner")
joined_df.show()


+----------+-------+-------+--------+-----+----------+-----------+-----+---------+---+
|CustomerID|OrderID|Product|Quantity|Price| OrderDate|TotalAmount| Name|     City|Age|
+----------+-------+-------+--------+-----+----------+-----------+-----+---------+---+
|       101|   1001| Laptop|       1|70000|2024-01-05|      70000|Aditi|   Mumbai| 28|
|       102|   1002| Mobile|       2|25000|2024-02-10|      50000|Rohan|    Delhi| 35|
|       103|   1003|   Desk|       1|10000|2024-03-15|      10000|Meena|Bangalore| 41|
|       101|   1004|  Mouse|       3| 1000|2024-04-01|       3000|Aditi|   Mumbai| 28|
|       104|   1005|Monitor|       1|12000|2024-04-25|      12000|Kabir|Hyderabad| 30|
+----------+-------+-------+--------+-----+----------+-----------+-----+---------+---+



In [0]:
# Orders whose total amount is strictly greater than 20000.
is_high_value = col("TotalAmount") > 20000
high_value_orders = joined_df.where(is_high_value)
high_value_orders.show()


+----------+-------+-------+--------+-----+----------+-----------+-----+------+---+
|CustomerID|OrderID|Product|Quantity|Price| OrderDate|TotalAmount| Name|  City|Age|
+----------+-------+-------+--------+-----+----------+-----------+-----+------+---+
|       101|   1001| Laptop|       1|70000|2024-01-05|      70000|Aditi|Mumbai| 28|
|       102|   1002| Mobile|       2|25000|2024-02-10|      50000|Rohan| Delhi| 35|
+----------+-------+-------+--------+-----+----------+-----------+-----+------+---+



In [0]:
from pyspark.sql.functions import count

# Count orders per customer, then keep only customers with more than one order.
orders_per_customer = joined_df.groupBy("CustomerID", "Name").agg(
    count("OrderID").alias("OrderCount")
)
order_counts = orders_per_customer.where(col("OrderCount") > 1)

order_counts.show()


+----------+-----+----------+
|CustomerID| Name|OrderCount|
+----------+-----+----------+
|       101|Aditi|         2|
+----------+-----+----------+



In [0]:
from pyspark.sql.functions import avg

# Mean order total for each customer city.
avg_order_value = avg("TotalAmount").alias("AvgOrderValue")
avg_order_by_city = joined_df.groupBy("City").agg(avg_order_value)

avg_order_by_city.show()


+---------+-------------+
|     City|AvgOrderValue|
+---------+-------------+
|Bangalore|      10000.0|
|   Mumbai|      36500.0|
|    Delhi|      50000.0|
|Hyderabad|      12000.0|
+---------+-------------+



In [0]:
from pyspark.sql.functions import to_date

# Normalise OrderDate with an explicit yyyy-MM-dd pattern, then list the most
# recent orders first.
# NOTE(review): the schema printed when orders.csv was loaded already shows
# OrderDate as `date`, so this conversion looks redundant here - confirm
# before removing it.
parsed_order_date = to_date("OrderDate", "yyyy-MM-dd")
joined_df = joined_df.withColumn("OrderDate", parsed_order_date)
sorted_orders = joined_df.sort(col("OrderDate").desc())

sorted_orders.show()


+----------+-------+-------+--------+-----+----------+-----------+-----+---------+---+
|CustomerID|OrderID|Product|Quantity|Price| OrderDate|TotalAmount| Name|     City|Age|
+----------+-------+-------+--------+-----+----------+-----------+-----+---------+---+
|       104|   1005|Monitor|       1|12000|2024-04-25|      12000|Kabir|Hyderabad| 30|
|       101|   1004|  Mouse|       3| 1000|2024-04-01|       3000|Aditi|   Mumbai| 28|
|       103|   1003|   Desk|       1|10000|2024-03-15|      10000|Meena|Bangalore| 41|
|       102|   1002| Mobile|       2|25000|2024-02-10|      50000|Rohan|    Delhi| 35|
|       101|   1001| Laptop|       1|70000|2024-01-05|      70000|Aditi|   Mumbai| 28|
+----------+-------+-------+--------+-----+----------+-----------+-----+---------+---+



In [0]:
# Persist the joined orders as Parquet, partitioned by City. "overwrite"
# replaces whatever already exists at the target path.
(
    sorted_orders.write
    .partitionBy("City")
    .mode("overwrite")
    .parquet("dbfs:/FileStore/output/customer_orders_parquet")
)


In [0]:
# Register the joined orders under a name the %sql cells can query; an
# existing view with the same name is replaced.
view_name = "customer_orders_view"
sorted_orders.createOrReplaceTempView(view_name)


In [0]:
%sql
-- Total sales per customer, highest first.
-- CustomerID is part of the grouping key so that two different customers who
-- happen to share a Name are not merged into a single total (the PySpark
-- per-customer order count in this notebook groups by CustomerID and Name too).
SELECT Name, SUM(TotalAmount) AS TotalSales
FROM customer_orders_view
GROUP BY CustomerID, Name
ORDER BY TotalSales DESC


Name,TotalSales
Aditi,73000
Rohan,50000
Kabir,12000
Meena,10000


In [0]:
%sql
-- Number of distinct products ordered from each city.
SELECT
  City,
  COUNT(DISTINCT Product) AS ProductCount
FROM
  customer_orders_view
GROUP BY
  City


City,ProductCount
Bangalore,1
Mumbai,2
Delhi,1
Hyderabad,1


In [0]:
%sql
-- Top two cities by total revenue.
-- City is a secondary sort key so that, if two cities tie on Revenue at the
-- LIMIT boundary, the same rows are returned on every run.
SELECT City, SUM(TotalAmount) AS Revenue
FROM customer_orders_view
GROUP BY City
ORDER BY Revenue DESC, City ASC
LIMIT 2


City,Revenue
Mumbai,73000
Delhi,50000
