In [5]:
from pyspark.sql import SparkSession

# Build (or reuse) the Hive-enabled Spark session used by the whole notebook
spark = (
    SparkSession.builder
    .appName("Sales SQL Example")
    .enableHiveSupport()
    .getOrCreate()
)

# Make sure sales_db exists, then make it the session's current database
for setup_statement in ("CREATE DATABASE IF NOT EXISTS sales_db", "USE sales_db"):
    spark.sql(setup_statement)

# Define product_sales; IF NOT EXISTS makes this a no-op when the table is already there
create_product_sales_sql = """
    CREATE TABLE IF NOT EXISTS product_sales (
        ProductID INT,
        ProductName STRING,
        Category STRING,
        Price DOUBLE,
        Quantity INT,
        SaleDate DATE
    )
"""
spark.sql(create_product_sales_sql)

# Insert rows
# INSERT OVERWRITE (instead of INSERT INTO) replaces the table contents, so
# re-running this cell leaves exactly these five rows rather than appending
# another copy to the table kept by CREATE TABLE IF NOT EXISTS.
spark.sql("""
    INSERT OVERWRITE TABLE product_sales VALUES
    (101, 'Laptop', 'Electronics', 60000, 2, DATE('2024-06-01')),
    (102, 'Mobile', 'Electronics', 20000, 3, DATE('2024-06-02')),
    (103, 'Shoes', 'Footwear', 1500, 4, DATE('2024-06-03')),
    (104, 'Jeans', 'Apparel', 1200, 1, DATE('2024-06-04')),
    (105, 'Watch', 'Accessories', 3000, 2, DATE('2024-06-05'))
""")

# Show inserted data
spark.sql("SELECT * FROM product_sales").show()




+---------+-----------+-----------+-------+--------+----------+
|ProductID|ProductName|   Category|  Price|Quantity|  SaleDate|
+---------+-----------+-----------+-------+--------+----------+
|      101|     Laptop|Electronics|60000.0|       2|2024-06-01|
|      102|     Mobile|Electronics|20000.0|       3|2024-06-02|
|      103|      Shoes|   Footwear| 1500.0|       4|2024-06-03|
|      104|      Jeans|    Apparel| 1200.0|       1|2024-06-04|
|      105|      Watch|Accessories| 3000.0|       2|2024-06-05|
+---------+-----------+-----------+-------+--------+----------+



In [6]:
# 5. Every record in product_sales
all_records_sql = "SELECT * FROM product_sales"

# 6. Rows whose Price is above 500
pricey_products_sql = "SELECT * FROM product_sales WHERE Price > 500"

# 7. Price * Quantity for each row, exposed as TotalAmount
total_amount_sql = """
    SELECT ProductName, Price, Quantity, (Price * Quantity) AS TotalAmount
    FROM product_sales
"""

# 8. Row count per Category
category_counts_sql = """
    SELECT Category, COUNT(*) AS NumProducts
    FROM product_sales
    GROUP BY Category
"""

# 9. Price * Quantity per row, largest first
ranked_sales_sql = """
    SELECT ProductName, (Price * Quantity) AS TotalSales
    FROM product_sales
    ORDER BY TotalSales DESC
"""

# Run the statements in the order defined above and print each result
report_queries = (
    all_records_sql,
    pricey_products_sql,
    total_amount_sql,
    category_counts_sql,
    ranked_sales_sql,
)
for report_sql in report_queries:
    spark.sql(report_sql).show()


+---------+-----------+-----------+-------+--------+----------+
|ProductID|ProductName|   Category|  Price|Quantity|  SaleDate|
+---------+-----------+-----------+-------+--------+----------+
|      101|     Laptop|Electronics|60000.0|       2|2024-06-01|
|      102|     Mobile|Electronics|20000.0|       3|2024-06-02|
|      103|      Shoes|   Footwear| 1500.0|       4|2024-06-03|
|      104|      Jeans|    Apparel| 1200.0|       1|2024-06-04|
|      105|      Watch|Accessories| 3000.0|       2|2024-06-05|
+---------+-----------+-----------+-------+--------+----------+

+---------+-----------+-----------+-------+--------+----------+
|ProductID|ProductName|   Category|  Price|Quantity|  SaleDate|
+---------+-----------+-----------+-------+--------+----------+
|      101|     Laptop|Electronics|60000.0|       2|2024-06-01|
|      102|     Mobile|Electronics|20000.0|       3|2024-06-02|
|      103|      Shoes|   Footwear| 1500.0|       4|2024-06-03|
|      104|      Jeans|    Apparel| 120

In [7]:
# 10. Build a small DataFrame from in-memory tuples; column names come from `columns`
columns = ["ProductID", "ProductName", "Category", "Price", "Quantity", "SaleDate"]
dummy_data = [
    (201, "Tablet", "Electronics", 15000.0, 2, "2024-06-01"),
    (202, "Socks", "Apparel", 300.0, 5, "2024-06-02"),
    (203, "Keyboard", "Electronics", 800.0, 1, "2024-06-03"),
]
df_temp = spark.createDataFrame(dummy_data, schema=columns)

# 11. Expose the DataFrame to SQL under the name temp_orders
df_temp.createOrReplaceTempView("temp_orders")

# 12. Keep only the rows with Quantity above 1
multi_unit_orders = spark.sql("SELECT * FROM temp_orders WHERE Quantity > 1")
multi_unit_orders.show()


+---------+-----------+-----------+-------+--------+----------+
|ProductID|ProductName|   Category|  Price|Quantity|  SaleDate|
+---------+-----------+-----------+-------+--------+----------+
|      201|     Tablet|Electronics|15000.0|       2|2024-06-01|
|      202|      Socks|    Apparel|  300.0|       5|2024-06-02|
+---------+-----------+-----------+-------+--------+----------+



In [8]:
# 13. Publish df_temp as the global temporary view global_orders
df_temp.createOrReplaceGlobalTempView("global_orders")

# 14. Global temp views are addressed through the global_temp database,
#     which is what lets other sessions query them
global_orders_sql = "SELECT * FROM global_temp.global_orders WHERE Quantity > 1"
spark.sql(global_orders_sql).show()


+---------+-----------+-----------+-------+--------+----------+
|ProductID|ProductName|   Category|  Price|Quantity|  SaleDate|
+---------+-----------+-----------+-------+--------+----------+
|      201|     Tablet|Electronics|15000.0|       2|2024-06-01|
|      202|      Socks|    Apparel|  300.0|       5|2024-06-02|
+---------+-----------+-----------+-------+--------+----------+



In [9]:
# 15. Create customer_details table (no-op if it already exists)
spark.sql("""
    CREATE TABLE IF NOT EXISTS customer_details (
        CustomerID INT,
        Name STRING,
        Gender STRING,
        City STRING,
        SignupDate DATE
    )
""")

# 16. Insert customer data
# INSERT OVERWRITE (instead of INSERT INTO) replaces the table contents, so
# re-running this cell does not duplicate the customers; duplicated customers
# would multiply the rows returned by any join against this table.
spark.sql("""
    INSERT OVERWRITE TABLE customer_details VALUES
    (101, 'Ali', 'Male', 'Hyderabad', DATE('2024-01-01')),
    (102, 'Neha', 'Female', 'Mumbai', DATE('2024-02-15')),
    (105, 'Raj', 'Male', 'Delhi', DATE('2024-03-20'))
""")

# 17. Inner join of product_sales with customer_details, matching
#     ProductID against CustomerID
sales_with_customers_sql = """
    SELECT ps.ProductID, ps.ProductName, cd.Name, cd.City
    FROM product_sales ps
    JOIN customer_details cd
    ON ps.ProductID = cd.CustomerID
"""
spark.sql(sales_with_customers_sql).show()

# 18. Same join, restricted to rows whose Quantity is above 2
high_quantity_sql = """
    SELECT cd.Name, ps.ProductName, ps.Quantity
    FROM product_sales ps
    JOIN customer_details cd
    ON ps.ProductID = cd.CustomerID
    WHERE ps.Quantity > 2
"""
spark.sql(high_quantity_sql).show()


+---------+-----------+----+---------+
|ProductID|ProductName|Name|     City|
+---------+-----------+----+---------+
|      101|     Laptop| Ali|Hyderabad|
|      102|     Mobile|Neha|   Mumbai|
|      105|      Watch| Raj|    Delhi|
+---------+-----------+----+---------+

+----+-----------+--------+
|Name|ProductName|Quantity|
+----+-----------+--------+
|Neha|     Mobile|       3|
+----+-----------+--------+



In [10]:
# 19. Define the sales_summary view over product_sales
#     (IF NOT EXISTS: an existing view of that name is left untouched)
create_summary_view_sql = """
    CREATE VIEW IF NOT EXISTS sales_summary AS
    SELECT ProductName, Price, Quantity, (Price * Quantity) AS Total
    FROM product_sales
"""
spark.sql(create_summary_view_sql)

# 20. Read from the view, keeping rows whose Total exceeds 1000
large_sales = spark.sql("SELECT * FROM sales_summary WHERE Total > 1000")
large_sales.show()


+-----------+-------+--------+--------+
|ProductName|  Price|Quantity|   Total|
+-----------+-------+--------+--------+
|     Laptop|60000.0|       2|120000.0|
|     Mobile|20000.0|       3| 60000.0|
|      Shoes| 1500.0|       4|  6000.0|
|      Jeans| 1200.0|       1|  1200.0|
|      Watch| 3000.0|       2|  6000.0|
+-----------+-------+--------+--------+



In [11]:
# Cleanup: remove everything this notebook created inside sales_db.
# Object names are qualified with sales_db so the statements hit the same
# targets no matter which database the session currently has selected.

# 21. Drop view
spark.sql("DROP VIEW IF EXISTS sales_db.sales_summary")

# 22. Drop tables
spark.sql("DROP TABLE IF EXISTS sales_db.product_sales")
spark.sql("DROP TABLE IF EXISTS sales_db.customer_details")

# The session selected sales_db earlier with USE. Switch back to the built-in
# default database before dropping it, so the session is not left pointing at
# a database that no longer exists (later unqualified queries would then fail).
spark.sql("USE default")

# 23. Drop database (CASCADE also removes any objects still inside it)
spark.sql("DROP DATABASE IF EXISTS sales_db CASCADE")


DataFrame[]