### **Task -  Retail Store Analysis**


In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum, avg, max

# Obtain the Spark session for this notebook (getOrCreate reuses a running one).
spark = (
    SparkSession.builder
    .appName("RetailStoreAnalysis")
    .getOrCreate()
)

# Customers: one tuple per customer, fields named by customers_cols.
customers_cols = ["customer_id", "name", "city", "age"]
customers_data = [
    (1, "Rahul", "Bangalore", 25),
    (2, "Priya", "Delhi", 32),
    (3, "Aman", "Hyderabad", 29),
    (4, "Sneha", "Chennai", 35),
]

# Orders: one tuple per order; customer_id is the key later cells join on.
orders_cols = ["order_id", "customer_id", "product", "amount"]
orders_data = [
    (101, 1, "Laptop", 55000),
    (102, 2, "Mobile", 25000),
    (103, 1, "Headphones", 3000),
    (104, 3, "Book", 700),
    (105, 4, "Chair", 5000),
    (106, 2, "Shoes", 2000),
]

# Only column names are supplied, so Spark infers the column types from the data.
customers_df = spark.createDataFrame(data=customers_data, schema=customers_cols)
orders_df = spark.createDataFrame(data=orders_data, schema=orders_cols)

# Print both tables so the inputs to the analysis are visible.
print("Customers DataFrame:")
customers_df.show()
print("Orders DataFrame:")
orders_df.show()

Customers DataFrame:
+-----------+-----+---------+---+
|customer_id| name|     city|age|
+-----------+-----+---------+---+
|          1|Rahul|Bangalore| 25|
|          2|Priya|    Delhi| 32|
|          3| Aman|Hyderabad| 29|
|          4|Sneha|  Chennai| 35|
+-----------+-----+---------+---+

Orders DataFrame:
+--------+-----------+----------+------+
|order_id|customer_id|   product|amount|
+--------+-----------+----------+------+
|     101|          1|    Laptop| 55000|
|     102|          2|    Mobile| 25000|
|     103|          1|Headphones|  3000|
|     104|          3|      Book|   700|
|     105|          4|     Chair|  5000|
|     106|          2|     Shoes|  2000|
+--------+-----------+----------+------+



In [0]:
# Customers whose age is strictly greater than 30.
print("Customers older than 30:")
customers_df.where(customers_df["age"] > 30).show()

Customers older than 30:
+-----------+-----+-------+---+
|customer_id| name|   city|age|
+-----------+-----+-------+---+
|          2|Priya|  Delhi| 32|
|          4|Sneha|Chennai| 35|
+-----------+-----+-------+---+



In [0]:
# Unique values of the city column across all customers.
print("Distinct Cities:")
customers_df.select(col("city")).distinct().show()

Distinct Cities:
+---------+
|     city|
+---------+
|Bangalore|
|Hyderabad|
|    Delhi|
|  Chennai|
+---------+



In [0]:
# Total order amount per customer_id.
# `sum` is the Spark aggregate imported at the top of the notebook,
# not the Python builtin.
print("Total amount spent by each customer:")
(
    orders_df
    .groupBy("customer_id")
    .agg(sum(col("amount")).alias("total_spent"))
    .show()
)

Total amount spent by each customer:
+-----------+-----------+
|customer_id|total_spent|
+-----------+-----------+
|          2|      27000|
|          3|        700|
|          1|      58000|
|          4|       5000|
+-----------+-----------+



In [0]:
# Mean of the amount column over every order (a single global group).
print("Average order amount:")
orders_df.groupBy().agg(avg(col("amount")).alias("avg_order_amount")).show()

Average order amount:
+------------------+
|  avg_order_amount|
+------------------+
|15116.666666666666|
+------------------+



In [0]:
# Sort orders by amount, largest first, and keep only the top row.
print("Most expensive order:")
orders_df.sort(orders_df["amount"].desc()).limit(1).show()

Most expensive order:
+--------+-----------+-------+------+
|order_id|customer_id|product|amount|
+--------+-----------+-------+------+
|     101|          1| Laptop| 55000|
+--------+-----------+-------+------+



In [0]:
# Inner-join customers to their orders on customer_id, then show
# name, city, product and amount. joined_df is reused by a later cell.
print("Join customers with orders:")
joined_df = customers_df.join(orders_df, on="customer_id", how="inner")
joined_df.select(col("name"), col("city"), col("product"), col("amount")).show()

Join customers with orders:
+-----+---------+----------+------+
| name|     city|   product|amount|
+-----+---------+----------+------+
|Rahul|Bangalore|    Laptop| 55000|
|Priya|    Delhi|    Mobile| 25000|
|Rahul|Bangalore|Headphones|  3000|
| Aman|Hyderabad|      Book|   700|
|Sneha|  Chennai|     Chair|  5000|
|Priya|    Delhi|     Shoes|  2000|
+-----+---------+----------+------+



In [0]:
# Sum order amounts per city from the joined data, then keep the city
# with the largest total.
print("City with highest total spending:")
city_spending = (
    joined_df
    .groupBy("city")
    .agg(sum(col("amount")).alias("total_spent"))
)
city_spending.sort("total_spent", ascending=False).limit(1).show()

City with highest total spending:
+---------+-----------+
|     city|total_spent|
+---------+-----------+
|Bangalore|      58000|
+---------+-----------+



In [0]:
# Using SQL, list the top 2 customers by total spend.
print("Top 2 customers by total spend (SQL):")

# Register both DataFrames as temp views so the query can refer to them by name.
customers_df.createOrReplaceTempView("customers")
orders_df.createOrReplaceTempView("orders")

# Group by customer_id as well as name: names are not guaranteed to be unique,
# and grouping by name alone would merge the spend of two different customers
# who share a name. customer_id also acts as a tie-breaker in ORDER BY so that
# customers with equal totals are ranked deterministically.
spark.sql("""
SELECT c.name, SUM(o.amount) AS total_spent
FROM customers c
JOIN orders o ON c.customer_id = o.customer_id
GROUP BY c.customer_id, c.name
ORDER BY total_spent DESC, c.customer_id
LIMIT 2
""").show()

Top 2 customers by total spend (SQL):
+-----+-----------+
| name|total_spent|
+-----+-----------+
|Rahul|      58000|
|Priya|      27000|
+-----+-----------+



In [0]:
# Save orders_df as CSV and load it back.
import os

# Spark writes a directory of part files at this path, not a single CSV file.
csv_path = os.path.abspath("orders_csv")
orders_df.write.mode("overwrite").csv(csv_path, header=True)

# Read back with the schema of orders_df instead of inferSchema=True.
# Inference needs an extra pass over the files and types the integer columns
# more narrowly than the original DataFrame, so the round trip would not
# preserve the schema.
new_orders_df = spark.read.csv(csv_path, header=True, schema=orders_df.schema)
new_orders_df.show()


+--------+-----------+----------+------+
|order_id|customer_id|   product|amount|
+--------+-----------+----------+------+
|     103|          1|Headphones|  3000|
|     101|          1|    Laptop| 55000|
|     102|          2|    Mobile| 25000|
|     105|          4|     Chair|  5000|
|     106|          2|     Shoes|  2000|
|     104|          3|      Book|   700|
+--------+-----------+----------+------+

