In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder\
.appName("sales_db_sql")\
.getOrCreate()

spark

In [3]:
# Database & Table Tasks
# 1. Create a new database named sales_db .
spark.sql("create database sales_db")

DataFrame[]

In [4]:
# 2. Set the current database to sales_db .
spark.sql("use sales_db")

DataFrame[]

In [6]:
# Create a table product_sales with columns:
# ProductID (INT)
# ProductName (STRING)
# Category (STRING)
# Price (DOUBLE)
# Quantity (INT)
# SaleDate (DATE)

spark.sql("""create table product_sales(
  productid int,
  productname string,
  category string,
  price double,
  quantity int,
  salesdate date
)
using parquet""")

DataFrame[]

In [7]:
# 4. Insert at least 5 rows into product_sales .
spark.sql("""insert into product_sales values
(101, 'Laptop', 'Electronics', 75000.00, 3, DATE('2024-12-01')),
(102, 'Mobile', 'Electronics', 25000.00, 5, DATE('2024-12-02')),
(103, 'Chair', 'Furniture', 3500.00, 2, DATE('2024-12-03')),
(104, 'Desk', 'Furniture', 5000.00, 1, DATE('2024-12-04')),
(105, 'Headphones', 'Electronics', 1500.00, 4, DATE('2024-12-05'))""")

DataFrame[]

In [8]:
# Query Tasks
# 5. Select all records from product_sales .
spark.sql("select * from product_sales").show()

+---------+-----------+-----------+-------+--------+----------+
|productid|productname|   category|  price|quantity| salesdate|
+---------+-----------+-----------+-------+--------+----------+
|      103|      Chair|  Furniture| 3500.0|       2|2024-12-03|
|      104|       Desk|  Furniture| 5000.0|       1|2024-12-04|
|      105| Headphones|Electronics| 1500.0|       4|2024-12-05|
|      101|     Laptop|Electronics|75000.0|       3|2024-12-01|
|      102|     Mobile|Electronics|25000.0|       5|2024-12-02|
+---------+-----------+-----------+-------+--------+----------+



In [9]:
# 6. Retrieve products where price is above 500.
spark.sql("select * from product_sales where price > 500").show()

+---------+-----------+-----------+-------+--------+----------+
|productid|productname|   category|  price|quantity| salesdate|
+---------+-----------+-----------+-------+--------+----------+
|      103|      Chair|  Furniture| 3500.0|       2|2024-12-03|
|      104|       Desk|  Furniture| 5000.0|       1|2024-12-04|
|      105| Headphones|Electronics| 1500.0|       4|2024-12-05|
|      101|     Laptop|Electronics|75000.0|       3|2024-12-01|
|      102|     Mobile|Electronics|25000.0|       5|2024-12-02|
+---------+-----------+-----------+-------+--------+----------+



In [11]:
# 7. Calculate total sale amount ( Price * Quantity ) for each product.
spark.sql("select productname,(price*quantity) as totalsales from product_sales").show()

+-----------+----------+
|productname|totalsales|
+-----------+----------+
|      Chair|    7000.0|
|       Desk|    5000.0|
| Headphones|    6000.0|
|     Laptop|  225000.0|
|     Mobile|  125000.0|
+-----------+----------+



In [12]:
# 8. Find the number of products sold in each Category .
spark.sql("select sum(quantity) as numberofproducts from product_sales").show()

+----------------+
|numberofproducts|
+----------------+
|              15|
+----------------+



In [13]:
# 9. Sort products by total sales in descending order.
spark.sql("select productname,(price*quantity) as totalsales from product_sales order by totalsales").show()

+-----------+----------+
|productname|totalsales|
+-----------+----------+
|       Desk|    5000.0|
| Headphones|    6000.0|
|      Chair|    7000.0|
|     Mobile|  125000.0|
|     Laptop|  225000.0|
+-----------+----------+



In [15]:
# Temporary View Tasks
# 10. Create a PySpark DataFrame with dummy product data.
from datetime import date
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType, DateType

data = [
    (201,"Keyboard","Electronics",1200.00,2,date(2024, 12, 1)),
    (202, "Mouse", "Electronics", 800.0, 1, date(2024, 12, 2)),
    (203, "Notebook", "Stationery", 50.0, 5, date(2024, 12, 3)),
    (204, "Pen", "Stationery", 20.0, 10, date(2024, 12, 4)),
    (205, "Monitor", "Electronics", 15000.0, 1, date(2024, 12, 5))
]

schema = StructType([
    StructField("ProductID", IntegerType(), True),
    StructField("ProductName", StringType(), True),
    StructField("Category", StringType(), True),
    StructField("Price", DoubleType(), True),
    StructField("Quantity", IntegerType(), True),
    StructField("SaleDate", DateType(), True)
])

df = spark.createDataFrame(data, schema)
df.show()

+---------+-----------+-----------+-------+--------+----------+
|ProductID|ProductName|   Category|  Price|Quantity|  SaleDate|
+---------+-----------+-----------+-------+--------+----------+
|      201|   Keyboard|Electronics| 1200.0|       2|2024-12-01|
|      202|      Mouse|Electronics|  800.0|       1|2024-12-02|
|      203|   Notebook| Stationery|   50.0|       5|2024-12-03|
|      204|        Pen| Stationery|   20.0|      10|2024-12-04|
|      205|    Monitor|Electronics|15000.0|       1|2024-12-05|
+---------+-----------+-----------+-------+--------+----------+



In [17]:
# 11. Register it as a temporary view called temp_orders .
df.createOrReplaceTempView("temp_orders")

In [18]:
# 12. Run a SQL query to filter temp_orders where quantity > 1.
spark.sql("select * from temp_orders where Quantity >1").show()

+---------+-----------+-----------+------+--------+----------+
|ProductID|ProductName|   Category| Price|Quantity|  SaleDate|
+---------+-----------+-----------+------+--------+----------+
|      201|   Keyboard|Electronics|1200.0|       2|2024-12-01|
|      203|   Notebook| Stationery|  50.0|       5|2024-12-03|
|      204|        Pen| Stationery|  20.0|      10|2024-12-04|
+---------+-----------+-----------+------+--------+----------+



In [23]:
# Global View Tasks
# 13. Create a global temp view from a PySpark DataFrame named global_orders .
# 14. Run a SQL query on the global view from another notebook cell/session.

from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType, DateType
from datetime import date

data = [
    (301, "Tablet", "Electronics", 22000.0, 2, date(2024,12,6)),
    (302, "Book", "Stationery", 500.0, 3, date(2024,12,7)),
    (303, "Lamp", "Furniture", 1500.0, 1, date(2024,12,8)),
    (304, "Router", "Electronics", 3200.0, 2, date(2024,12,9)),
    (305, "Marker", "Stationery", 80.0, 6, date(2024,12,10))
]

schema = StructType([
    StructField("ProductID", IntegerType(), True),
    StructField("ProductName", StringType(), True),
    StructField("Category", StringType(), True),
    StructField("Price", DoubleType(), True),
    StructField("Quantity", IntegerType(), True),
    StructField("SaleDate", DateType(), True)
])

df_global = spark.createDataFrame(data, schema)
df_global.show()

+---------+-----------+-----------+-------+--------+----------+
|ProductID|ProductName|   Category|  Price|Quantity|  SaleDate|
+---------+-----------+-----------+-------+--------+----------+
|      301|     Tablet|Electronics|22000.0|       2|2024-12-06|
|      302|       Book| Stationery|  500.0|       3|2024-12-07|
|      303|       Lamp|  Furniture| 1500.0|       1|2024-12-08|
|      304|     Router|Electronics| 3200.0|       2|2024-12-09|
|      305|     Marker| Stationery|   80.0|       6|2024-12-10|
+---------+-----------+-----------+-------+--------+----------+



In [24]:
df_global.createOrReplaceGlobalTempView("global_orders")

In [25]:
result = spark.sql("SELECT * FROM global_temp.global_orders WHERE Quantity > 1")
result.show()

+---------+-----------+-----------+-------+--------+----------+
|ProductID|ProductName|   Category|  Price|Quantity|  SaleDate|
+---------+-----------+-----------+-------+--------+----------+
|      301|     Tablet|Electronics|22000.0|       2|2024-12-06|
|      302|       Book| Stationery|  500.0|       3|2024-12-07|
|      304|     Router|Electronics| 3200.0|       2|2024-12-09|
|      305|     Marker| Stationery|   80.0|       6|2024-12-10|
+---------+-----------+-----------+-------+--------+----------+



In [26]:
# Join Tasks
# 15. Create a second table customer_details with:
# CustomerID , Name , Gender , City , SignupDate
spark.sql("""create table customer_details(
  customerid int,
  name string,
  gender string,
  city string,
  signupdate date
)
using parquet
""")

DataFrame[]

In [27]:
# 16. Insert at least 3 records into customer_details .
spark.sql("""
INSERT INTO customer_details VALUES
(101, 'Asha', 'Female', 'Chennai', DATE('2024-11-15')),
(102, 'Ravi', 'Male', 'Mumbai', DATE('2024-11-20')),
(105, 'Meena', 'Female', 'Delhi', DATE('2024-11-22'));
""")

DataFrame[]

In [28]:
# 17. Write a SQL join between product_sales and customer_details based on ProductID = CustomerID (simulate a match).
spark.sql("select p.productid,p.productname,c.name,c.city from product_sales p join customer_details c on productid = customerid").show()

+---------+-----------+-----+-------+
|productid|productname| name|   city|
+---------+-----------+-----+-------+
|      101|     Laptop| Asha|Chennai|
|      102|     Mobile| Ravi| Mumbai|
|      105| Headphones|Meena|  Delhi|
+---------+-----------+-----+-------+



In [34]:
# 18. List customers who bought more than 2 products.
spark.sql("""
    SELECT c.customerid, c.name, SUM(p.quantity) AS numberofproduct
    FROM product_sales p
    JOIN customer_details c
    ON p.productid = c.customerid
    GROUP BY c.customerid, c.name
    HAVING SUM(p.quantity) > 2
""").show()

+----------+-----+---------------+
|customerid| name|numberofproduct|
+----------+-----+---------------+
|       101| Asha|              3|
|       105|Meena|              4|
|       102| Ravi|              5|
+----------+-----+---------------+



In [36]:
# View & Summary Tasks
# 19. Create a SQL view sales_summary that includes: ProductName , Price , Quantity , Total = Price * Quantity

spark.sql("""
    CREATE OR REPLACE VIEW sales_summary AS
    SELECT
        ProductName,
        Price,
        Quantity,
        (Price * Quantity) AS Total
    FROM
        product_sales
""")

DataFrame[]

In [37]:
# 20. Query the view for records with Total > 1000 .
spark.sql("""
    SELECT * FROM sales_summary
    WHERE Total > 1000
""").show()

+-----------+-------+--------+--------+
|ProductName|  Price|Quantity|   Total|
+-----------+-------+--------+--------+
|      Chair| 3500.0|       2|  7000.0|
|       Desk| 5000.0|       1|  5000.0|
| Headphones| 1500.0|       4|  6000.0|
|     Laptop|75000.0|       3|225000.0|
|     Mobile|25000.0|       5|125000.0|
+-----------+-------+--------+--------+



In [38]:
# Cleanup Tasks
# 21. Drop the view sales_summary .
spark.sql("drop view sales_summary")
# 22. Drop the tables product_sales and customer_details .
spark.sql("drop table product_sales")
spark.sql("drop table customer_details")
# 23. Drop the database sales_db .
spark.sql("drop database sales_db")

DataFrame[]