# **Customers & Orders**

**Create DataFrames**

In [1]:
!pip install pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
# Create the Spark session used by every cell below, or reuse one that is
# already running under the same builder.
spark = (
    SparkSession.builder
    .appName("DataFrame-Exercises")
    .getOrCreate()
)



In [2]:
# --- Raw rows and column names -------------------------------------------
# Customers: (customer_id, name, city, age)
customers_cols = ["customer_id", "name", "city", "age"]
customers_data = [
    (1, "Rahul Sharma", "Bangalore", 28),
    (2, "Priya Singh", "Delhi", 32),
    (3, "Aman Kumar", "Hyderabad", 25),
    (4, "Sneha Reddy", "Chennai", 35),
    (5, "Arjun Mehta", "Mumbai", 30),
    (6, "Divya Nair", "Delhi", 29),
]

# Orders: (order_id, customer_id, product, amount)
orders_cols = ["order_id", "customer_id", "product", "amount"]
orders_data = [
    (101, 1, "Laptop", 55000),
    (102, 2, "Mobile", 25000),
    (103, 1, "Headphones", 3000),
    (104, 3, "Chair", 5000),
    (105, 5, "Book", 700),
    (106, 2, "Tablet", 20000),
    (107, 6, "Shoes", 2500),
    (108, 7, "Camera", 30000),  # Order with non-existent customer
]

# --- Build the DataFrames (column types are inferred from the tuples) ------
customers_df = spark.createDataFrame(customers_data, schema=customers_cols)
orders_df = spark.createDataFrame(orders_data, schema=orders_cols)

# --- Preview both tables ---------------------------------------------------
customers_df.show()
orders_df.show()

+-----------+------------+---------+---+
|customer_id|        name|     city|age|
+-----------+------------+---------+---+
|          1|Rahul Sharma|Bangalore| 28|
|          2| Priya Singh|    Delhi| 32|
|          3|  Aman Kumar|Hyderabad| 25|
|          4| Sneha Reddy|  Chennai| 35|
|          5| Arjun Mehta|   Mumbai| 30|
|          6|  Divya Nair|    Delhi| 29|
+-----------+------------+---------+---+

+--------+-----------+----------+------+
|order_id|customer_id|   product|amount|
+--------+-----------+----------+------+
|     101|          1|    Laptop| 55000|
|     102|          2|    Mobile| 25000|
|     103|          1|Headphones|  3000|
|     104|          3|     Chair|  5000|
|     105|          5|      Book|   700|
|     106|          2|    Tablet| 20000|
|     107|          6|     Shoes|  2500|
|     108|          7|    Camera| 30000|
+--------+-----------+----------+------+



**Exercises (Operations on DataFrames)**

**Basic Operations**

1. Select only name and city from customers.

In [3]:
# Project the customer table down to the two requested columns.
name_and_city = customers_df.select("name", "city")
name_and_city.show()

+------------+---------+
|        name|     city|
+------------+---------+
|Rahul Sharma|Bangalore|
| Priya Singh|    Delhi|
|  Aman Kumar|Hyderabad|
| Sneha Reddy|  Chennai|
| Arjun Mehta|   Mumbai|
|  Divya Nair|    Delhi|
+------------+---------+



2. Filter customers older than 30.

In [4]:
# Keep only customers whose age is strictly greater than 30.
customers_df.where("age > 30").show()

+-----------+-----------+-------+---+
|customer_id|       name|   city|age|
+-----------+-----------+-------+---+
|          2|Priya Singh|  Delhi| 32|
|          4|Sneha Reddy|Chennai| 35|
+-----------+-----------+-------+---+



3. Count how many customers are from "Delhi".

In [5]:
# Number of customers whose city is exactly "Delhi".
customers_df.where(customers_df["city"] == "Delhi").count()

2

4. Find distinct cities in the customer list.

In [6]:
# One row per city that appears in the customer table.
distinct_cities = customers_df.select("city").distinct()
distinct_cities.show()

+---------+
|     city|
+---------+
|Bangalore|
|    Delhi|
|Hyderabad|
|  Chennai|
|   Mumbai|
+---------+



**Aggregations**

5. Find the average age of customers.

In [7]:
from pyspark.sql.functions import avg

# Aggregate over the whole table (an empty groupBy): a single row holding the
# mean of the `age` column.
customers_df.groupBy().agg(avg("age")).show()

+------------------+
|          avg(age)|
+------------------+
|29.833333333333332|
+------------------+



6. Find the maximum and minimum order amount.

In [8]:
# Largest and smallest order amount across all orders.
# The aggregates are taken from the `F` namespace instead of importing bare
# `max` / `min`, which would replace Python's built-ins of the same name in
# the notebook namespace for every cell that runs afterwards.
orders_df.agg(F.max("amount"), F.min("amount")).show()


+-----------+-----------+
|max(amount)|min(amount)|
+-----------+-----------+
|      55000|        700|
+-----------+-----------+



7. Count number of orders placed by each customer.

In [9]:
# Count the orders of each customer_id and expose the count as `order_count`.
orders_per_customer = (
    orders_df
    .groupBy("customer_id")
    .count()
    .withColumnRenamed("count", "order_count")
)
orders_per_customer.show()

+-----------+-----------+
|customer_id|order_count|
+-----------+-----------+
|          1|          2|
|          3|          1|
|          2|          2|
|          7|          1|
|          6|          1|
|          5|          1|
+-----------+-----------+



8. Calculate total spending of each customer.

In [10]:
# NOTE: this import rebinds the name `sum` to Spark's column aggregate, which
# hides Python's built-in `sum` in the notebook namespace from here on.
from pyspark.sql.functions import sum

# Sum of `amount` per customer_id, reported as `total_spent`.
total_spent_col = sum("amount").alias("total_spent")
orders_df.groupBy("customer_id").agg(total_spent_col).show()

+-----------+-----------+
|customer_id|total_spent|
+-----------+-----------+
|          1|      58000|
|          3|       5000|
|          2|      45000|
|          7|      30000|
|          6|       2500|
|          5|        700|
+-----------+-----------+



**Joins**

9. Perform an inner join between customers and orders.

In [11]:
# Inner join on customer_id: only customers with at least one order, and only
# orders whose customer_id exists in the customer table.
customers_df.join(orders_df, on="customer_id", how="inner").show()

+-----------+------------+---------+---+--------+----------+------+
|customer_id|        name|     city|age|order_id|   product|amount|
+-----------+------------+---------+---+--------+----------+------+
|          1|Rahul Sharma|Bangalore| 28|     101|    Laptop| 55000|
|          1|Rahul Sharma|Bangalore| 28|     103|Headphones|  3000|
|          2| Priya Singh|    Delhi| 32|     102|    Mobile| 25000|
|          2| Priya Singh|    Delhi| 32|     106|    Tablet| 20000|
|          3|  Aman Kumar|Hyderabad| 25|     104|     Chair|  5000|
|          5| Arjun Mehta|   Mumbai| 30|     105|      Book|   700|
|          6|  Divya Nair|    Delhi| 29|     107|     Shoes|  2500|
+-----------+------------+---------+---+--------+----------+------+



10. Perform a left join to show all customers (even without orders).

In [12]:
# Left join on customer_id: every customer is kept; the order columns are NULL
# for customers without a matching order.
customers_with_orders = customers_df.join(orders_df, on="customer_id", how="left")
customers_with_orders.show()

+-----------+------------+---------+---+--------+----------+------+
|customer_id|        name|     city|age|order_id|   product|amount|
+-----------+------------+---------+---+--------+----------+------+
|          1|Rahul Sharma|Bangalore| 28|     103|Headphones|  3000|
|          1|Rahul Sharma|Bangalore| 28|     101|    Laptop| 55000|
|          3|  Aman Kumar|Hyderabad| 25|     104|     Chair|  5000|
|          2| Priya Singh|    Delhi| 32|     106|    Tablet| 20000|
|          2| Priya Singh|    Delhi| 32|     102|    Mobile| 25000|
|          6|  Divya Nair|    Delhi| 29|     107|     Shoes|  2500|
|          5| Arjun Mehta|   Mumbai| 30|     105|      Book|   700|
|          4| Sneha Reddy|  Chennai| 35|    NULL|      NULL|  NULL|
+-----------+------------+---------+---+--------+----------+------+



11. Find customers who have never placed an order.

In [13]:
# Customers that never placed an order.
# A left anti join returns exactly the left-side rows with no match on
# customer_id in `orders_df`. Unlike "left join + order_id IS NULL", it does
# not depend on any order column being non-null, so an order stored with a
# missing order_id cannot make its customer look like they never ordered.
customers_without_orders = customers_df.join(
    orders_df, on="customer_id", how="left_anti"
)
customers_without_orders.select("customer_id", "name").show()

+-----------+-----------+
|customer_id|       name|
+-----------+-----------+
|          4|Sneha Reddy|
+-----------+-----------+



12. Find orders that belong to non-existent customers.

In [14]:
# Orders whose customer_id does not exist in the customer table.
# A left anti join returns exactly the orders with no match on customer_id in
# `customers_df`. Unlike "left join + name IS NULL", an existing customer whose
# name happens to be NULL is not misreported as non-existent.
orders_without_customer = orders_df.join(
    customers_df, on="customer_id", how="left_anti"
)
# The explicit select pins the column order of the result.
orders_without_customer.select("order_id", "customer_id", "product", "amount").show()

+--------+-----------+-------+------+
|order_id|customer_id|product|amount|
+--------+-----------+-------+------+
|     108|          7| Camera| 30000|
+--------+-----------+-------+------+



**Sorting & Grouping**

13. List customers ordered by age (descending)

In [15]:
# Oldest customers first.
customers_df.orderBy("age", ascending=False).show()

+-----------+------------+---------+---+
|customer_id|        name|     city|age|
+-----------+------------+---------+---+
|          4| Sneha Reddy|  Chennai| 35|
|          2| Priya Singh|    Delhi| 32|
|          5| Arjun Mehta|   Mumbai| 30|
|          6|  Divya Nair|    Delhi| 29|
|          1|Rahul Sharma|Bangalore| 28|
|          3|  Aman Kumar|Hyderabad| 25|
+-----------+------------+---------+---+



14. Show top 3 highest order amounts

In [16]:
# Sort orders by amount, largest first, and display only the first three rows.
orders_by_amount_desc = orders_df.sort(orders_df["amount"].desc())
orders_by_amount_desc.show(3)

+--------+-----------+-------+------+
|order_id|customer_id|product|amount|
+--------+-----------+-------+------+
|     101|          1| Laptop| 55000|
|     108|          7| Camera| 30000|
|     102|          2| Mobile| 25000|
+--------+-----------+-------+------+
only showing top 3 rows



15. Group customers by city and find average age

In [17]:
# Mean customer age per city, reported as `avg_age`.
customers_by_city = customers_df.groupBy("city")
customers_by_city.agg(avg("age").alias("avg_age")).show()

+---------+-------+
|     city|avg_age|
+---------+-------+
|Bangalore|   28.0|
|    Delhi|   30.5|
|Hyderabad|   25.0|
|  Chennai|   35.0|
|   Mumbai|   30.0|
+---------+-------+



16. Group orders by product and find total sales amount

In [18]:
# Total order amount per product, reported as `total_sales`.
# `F.sum` is used explicitly so this cell does not rely on an earlier cell
# having rebound the bare name `sum` to Spark's aggregate; with Python's
# built-in `sum`, the call `sum("amount")` would raise a TypeError.
orders_df.groupBy("product").agg(F.sum("amount").alias("total_sales")).show()

+----------+-----------+
|   product|total_sales|
+----------+-----------+
|     Chair|       5000|
|    Laptop|      55000|
|    Mobile|      25000|
|Headphones|       3000|
|      Book|        700|
|    Camera|      30000|
|     Shoes|       2500|
|    Tablet|      20000|
+----------+-----------+



**SQL Operations**

17. Register both DataFrames as temp views

In [20]:
# Expose both DataFrames to Spark SQL under the table names used by the
# queries below.
for view_name, frame in (("customers", customers_df), ("orders", orders_df)):
    frame.createOrReplaceTempView(view_name)

18. Write a SQL query to find total revenue by city

In [21]:
# Revenue per city: orders are matched to customers with an inner join, so
# cities without any order and orders without a known customer do not appear.
revenue_by_city_sql = """
    SELECT c.city, SUM(o.amount) AS total_revenue
    FROM customers c
    JOIN orders o ON c.customer_id = o.customer_id
    GROUP BY c.city
"""
spark.sql(revenue_by_city_sql).show()


+---------+-------------+
|     city|total_revenue|
+---------+-------------+
|Bangalore|        58000|
|   Mumbai|          700|
|    Delhi|        47500|
|Hyderabad|         5000|
+---------+-------------+



19. Write a SQL query to list top 2 customers by total spend

In [22]:
# Top 2 customers by total spend.
# Grouping is done on customer_id (together with name) so that two different
# customers who share the same name are not merged into one total. The
# secondary sort key makes the result deterministic when totals tie at the
# LIMIT boundary.
spark.sql("""
    SELECT c.name, SUM(o.amount) AS total_spent
    FROM customers c
    JOIN orders o ON c.customer_id = o.customer_id
    GROUP BY c.customer_id, c.name
    ORDER BY total_spent DESC, c.customer_id
    LIMIT 2
""").show()


+------------+-----------+
|        name|total_spent|
+------------+-----------+
|Rahul Sharma|      58000|
| Priya Singh|      45000|
+------------+-----------+



20. Write a SQL query to find all customers who bought a product worth more than 20,000 (i.e. a single order with amount > 20,000)

In [23]:
# Customers with at least one individual order whose amount is strictly above
# 20000, listed together with that order's product and amount.
high_value_orders_sql = """
    SELECT DISTINCT c.customer_id, c.name, o.product, o.amount
    FROM customers c
    JOIN orders o ON c.customer_id = o.customer_id
    WHERE o.amount > 20000
"""
spark.sql(high_value_orders_sql).show()


+-----------+------------+-------+------+
|customer_id|        name|product|amount|
+-----------+------------+-------+------+
|          1|Rahul Sharma| Laptop| 55000|
|          2| Priya Singh| Mobile| 25000|
+-----------+------------+-------+------+

