In [7]:
from pyspark.sql import SparkSession

spark = SparkSession.builder\
.appName("Data sets two")\
.getOrCreate()

spark

In [8]:
from google.colab import drive
drive.mount('/content/drive')

# Ingest the CSV files into two PySpark DataFrames
customer_df = spark.read.csv('/content/drive/MyDrive/customers (1).csv',header= True,inferSchema=True)
orders_df = spark.read.csv('/content/drive/MyDrive/orders (1).csv',header= True,inferSchema=True)
customer_df.show(5)
orders_df.show(5)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
+----------+-----+---------+---+
|CustomerID| Name|     City|Age|
+----------+-----+---------+---+
|       101|Aditi|   Mumbai| 28|
|       102|Rohan|    Delhi| 35|
|       103|Meena|Bangalore| 41|
|       104|Kabir|Hyderabad| 30|
|       105| Zoya|  Chennai| 25|
+----------+-----+---------+---+

+-------+----------+-------+--------+-----+----------+
|OrderID|CustomerID|Product|Quantity|Price| OrderDate|
+-------+----------+-------+--------+-----+----------+
|   1001|       101| Laptop|       1|70000|2024-01-05|
|   1002|       102| Mobile|       2|25000|2024-02-10|
|   1003|       103|   Desk|       1|10000|2024-03-15|
|   1004|       101|  Mouse|       3| 1000|2024-04-01|
|   1005|       104|Monitor|       1|12000|2024-04-25|
+-------+----------+-------+--------+-----+----------+



In [9]:
# 2. Infer schema and print the schema for both

customer_df.printSchema()
orders_df.printSchema()

root
 |-- CustomerID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Age: integer (nullable = true)

root
 |-- OrderID: integer (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Price: integer (nullable = true)
 |-- OrderDate: date (nullable = true)



In [10]:
# 3. Add a column TotalAmount = Quantity * Price to orders

orders_df = orders_df.withColumn("TotalAmount",orders_df["Quantity"] * orders_df["Price"])
orders_df.show()

+-------+----------+-------+--------+-----+----------+-----------+
|OrderID|CustomerID|Product|Quantity|Price| OrderDate|TotalAmount|
+-------+----------+-------+--------+-----+----------+-----------+
|   1001|       101| Laptop|       1|70000|2024-01-05|      70000|
|   1002|       102| Mobile|       2|25000|2024-02-10|      50000|
|   1003|       103|   Desk|       1|10000|2024-03-15|      10000|
|   1004|       101|  Mouse|       3| 1000|2024-04-01|       3000|
|   1005|       104|Monitor|       1|12000|2024-04-25|      12000|
+-------+----------+-------+--------+-----+----------+-----------+



In [12]:
# 4. Join both DataFrames on CustomerID

joined_df = orders_df.join(customer_df,on = "CustomerId")
joined_df.show()

+----------+-------+-------+--------+-----+----------+-----------+-----+---------+---+
|CustomerID|OrderID|Product|Quantity|Price| OrderDate|TotalAmount| Name|     City|Age|
+----------+-------+-------+--------+-----+----------+-----------+-----+---------+---+
|       101|   1001| Laptop|       1|70000|2024-01-05|      70000|Aditi|   Mumbai| 28|
|       102|   1002| Mobile|       2|25000|2024-02-10|      50000|Rohan|    Delhi| 35|
|       103|   1003|   Desk|       1|10000|2024-03-15|      10000|Meena|Bangalore| 41|
|       101|   1004|  Mouse|       3| 1000|2024-04-01|       3000|Aditi|   Mumbai| 28|
|       104|   1005|Monitor|       1|12000|2024-04-25|      12000|Kabir|Hyderabad| 30|
+----------+-------+-------+--------+-----+----------+-----------+-----+---------+---+



In [13]:
# 5. Filter orders where TotalAmount > 20000

orders_df.filter(orders_df['TotalAmount'] > 20000).show()

+-------+----------+-------+--------+-----+----------+-----------+
|OrderID|CustomerID|Product|Quantity|Price| OrderDate|TotalAmount|
+-------+----------+-------+--------+-----+----------+-----------+
|   1001|       101| Laptop|       1|70000|2024-01-05|      70000|
|   1002|       102| Mobile|       2|25000|2024-02-10|      50000|
+-------+----------+-------+--------+-----+----------+-----------+



In [14]:
from ast import alias
# 6. Show customers who placed more than 1 order
from pyspark.sql.functions import count,col

order_amt = joined_df.groupBy("CustomerID").agg(count("*").alias("Count"))
order_amt.filter(col("Count") > 1).show()

+----------+-----+
|CustomerID|Count|
+----------+-----+
|       101|    2|
+----------+-----+



In [15]:
# 7. Group orders by City and get average order value
from pyspark.sql.functions import avg,col

joined_df.groupBy('City').agg(avg(col('TotalAmount')).alias("Average")).show()

+---------+-------+
|     City|Average|
+---------+-------+
|Bangalore|10000.0|
|   Mumbai|36500.0|
|    Delhi|50000.0|
|Hyderabad|12000.0|
+---------+-------+



In [16]:
# 8. Sort orders by OrderDate in descending order

orders_df = orders_df.orderBy(orders_df["OrderDate"].desc())
orders_df.show()

+-------+----------+-------+--------+-----+----------+-----------+
|OrderID|CustomerID|Product|Quantity|Price| OrderDate|TotalAmount|
+-------+----------+-------+--------+-----+----------+-----------+
|   1005|       104|Monitor|       1|12000|2024-04-25|      12000|
|   1004|       101|  Mouse|       3| 1000|2024-04-01|       3000|
|   1003|       103|   Desk|       1|10000|2024-03-15|      10000|
|   1002|       102| Mobile|       2|25000|2024-02-10|      50000|
|   1001|       101| Laptop|       1|70000|2024-01-05|      70000|
+-------+----------+-------+--------+-----+----------+-----------+



In [18]:
# 9. Write the final result as a Parquet file partitioned by City

orders_df.write.mode("overwrite").parquet("/content/drive/MyDrive/order_df_parquet")

In [19]:
# 10. Create a temporary view and run Spark SQL:
joined_df.createOrReplaceTempView("ord_cus_views")

# Total sales by customer
spark.sql("""select name, sum(totalamount) as totalsales from ord_cus_views group by name""").show()

# Count of products per city
spark.sql("""select city, count(product) as productpercity from ord_cus_views group by city""").show()

# Top 2 cities by revenue
spark.sql("""select city, sum(totalamount) as revenue from ord_cus_views group by city order by revenue desc limit 2""").show()

+-----+----------+
| name|totalsales|
+-----+----------+
|Kabir|     12000|
|Rohan|     50000|
|Aditi|     73000|
|Meena|     10000|
+-----+----------+

+---------+--------------+
|     city|productpercity|
+---------+--------------+
|Bangalore|             1|
|   Mumbai|             2|
|    Delhi|             1|
|Hyderabad|             1|
+---------+--------------+

+------+-------+
|  city|revenue|
+------+-------+
|Mumbai|  73000|
| Delhi|  50000|
+------+-------+

