In [3]:
from pyspark.sql import SparkSession
# Build the notebook's SparkSession (getOrCreate hands back an existing one if present).
# The chain is wrapped in parentheses instead of using backslash continuations.
spark = (
    SparkSession.builder
    .appName("SparkSQLTasks")
    .getOrCreate()
)


Database & Table Tasks

1. Create the `sales_db` database if it does not already exist.


In [4]:
# IF NOT EXISTS makes this statement safe to run when sales_db is already present.
create_db_sql = "CREATE DATABASE IF NOT EXISTS sales_db"
spark.sql(create_db_sql)


DataFrame[]

2. Make `sales_db` the current database for the session.

In [5]:
# Unqualified table and view names in the following cells resolve against sales_db.
use_db_sql = "USE sales_db"
spark.sql(use_db_sql)


DataFrame[]

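3. Create the Parquet-backed `product_sales` table (ProductID, ProductName, Category, Price, Quantity, SaleDate).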

In [6]:
# DDL for the product sales table, stored as Parquet.
# IF NOT EXISTS means an already-existing table (and its rows) is left untouched.
create_product_sales_sql = """
    CREATE TABLE IF NOT EXISTS product_sales (
        ProductID INT,
        ProductName STRING,
        Category STRING,
        Price DOUBLE,
        Quantity INT,
        SaleDate DATE
    )
    USING PARQUET
"""
spark.sql(create_product_sales_sql)


DataFrame[]

4. Insert five sample rows into `product_sales`.

In [8]:
# Load the five sample product rows.
# INSERT OVERWRITE (rather than INSERT INTO) replaces the table contents, so
# re-running this cell -- or running it against a table that already held rows,
# which CREATE TABLE IF NOT EXISTS leaves in place -- never duplicates data.
spark.sql("""
    INSERT OVERWRITE TABLE product_sales VALUES
    (101, 'Laptop', 'Electronics', 75000, 2, DATE('2023-06-01')),
    (102, 'Phone', 'Electronics', 30000, 1, DATE('2023-06-02')),
    (103, 'Shoes', 'Fashion', 2500, 3, DATE('2023-06-03')),
    (104, 'Book', 'Books', 500, 5, DATE('2023-06-04')),
    (105, 'Toy', 'Toys', 1200, 4, DATE('2023-06-05'))
""")


DataFrame[]

Query Tasks

5. Show every row of `product_sales`.

In [9]:
# Display the full contents of the table.
all_sales = spark.sql("SELECT * FROM product_sales")
all_sales.show()


+---------+-----------+-----------+-------+--------+----------+
|ProductID|ProductName|   Category|  Price|Quantity|  SaleDate|
+---------+-----------+-----------+-------+--------+----------+
|      101|     Laptop|Electronics|75000.0|       2|2023-06-01|
|      102|      Phone|Electronics|30000.0|       1|2023-06-02|
|      103|      Shoes|    Fashion| 2500.0|       3|2023-06-03|
|      104|       Book|      Books|  500.0|       5|2023-06-04|
|      105|        Toy|       Toys| 1200.0|       4|2023-06-05|
+---------+-----------+-----------+-------+--------+----------+



6. Show the products whose `Price` is greater than 500.

In [10]:
# Strict comparison: a product priced at exactly 500 is excluded.
pricey_sales = spark.sql("SELECT * FROM product_sales WHERE Price > 500")
pricey_sales.show()


+---------+-----------+-----------+-------+--------+----------+
|ProductID|ProductName|   Category|  Price|Quantity|  SaleDate|
+---------+-----------+-----------+-------+--------+----------+
|      101|     Laptop|Electronics|75000.0|       2|2023-06-01|
|      102|      Phone|Electronics|30000.0|       1|2023-06-02|
|      103|      Shoes|    Fashion| 2500.0|       3|2023-06-03|
|      105|        Toy|       Toys| 1200.0|       4|2023-06-05|
+---------+-----------+-----------+-------+--------+----------+



7. Compute `TotalSale` (`Price * Quantity`) for each product.

In [11]:
# Per-row revenue: TotalSale is derived as Price * Quantity.
total_sale_sql = """
    SELECT ProductName, Price, Quantity, (Price * Quantity) AS TotalSale
    FROM product_sales
"""
total_sale_df = spark.sql(total_sale_sql)
total_sale_df.show()


+-----------+-------+--------+---------+
|ProductName|  Price|Quantity|TotalSale|
+-----------+-------+--------+---------+
|     Laptop|75000.0|       2| 150000.0|
|      Phone|30000.0|       1|  30000.0|
|      Shoes| 2500.0|       3|   7500.0|
|       Book|  500.0|       5|   2500.0|
|        Toy| 1200.0|       4|   4800.0|
+-----------+-------+--------+---------+



8. Total the quantity sold in each category.

In [12]:
# Sum Quantity within each Category.
# The query has no ORDER BY, so the order of the displayed rows is not pinned by the SQL.
quantity_by_category_sql = """
    SELECT Category, SUM(Quantity) AS TotalSold
    FROM product_sales
    GROUP BY Category
"""
quantity_by_category_df = spark.sql(quantity_by_category_sql)
quantity_by_category_df.show()


+-----------+---------+
|   Category|TotalSold|
+-----------+---------+
|Electronics|        3|
|    Fashion|        3|
|      Books|        5|
|       Toys|        4|
+-----------+---------+



9. List products by `TotalSale`, highest first.

In [13]:
# Rank products by revenue (Price * Quantity), largest value at the top.
ranked_sales_sql = """
    SELECT ProductName, (Price * Quantity) AS TotalSale
    FROM product_sales
    ORDER BY TotalSale DESC
"""
ranked_sales_df = spark.sql(ranked_sales_sql)
ranked_sales_df.show()


+-----------+---------+
|ProductName|TotalSale|
+-----------+---------+
|     Laptop| 150000.0|
|      Phone|  30000.0|
|      Shoes|   7500.0|
|        Toy|   4800.0|
|       Book|   2500.0|
+-----------+---------+



Temporary View Tasks

10. Build a small in-memory DataFrame of sample orders (`temp_df`).

In [14]:
from pyspark.sql import Row

# A Row "factory" fixes the field names once; each call then supplies the values
# positionally in the same order (ProductID, ProductName, Quantity).
OrderRow = Row("ProductID", "ProductName", "Quantity")

temp_data = [
    OrderRow(201, 'Tablet', 2),
    OrderRow(202, 'Monitor', 1),
    OrderRow(203, 'Keyboard', 3),
]

# No explicit schema is passed, so column types are inferred from the Row values.
temp_df = spark.createDataFrame(temp_data)


11. Register `temp_df` as the temporary view `temp_orders`.

In [15]:
# Expose temp_df to SQL under a view name; the "OrReplace" variant keeps the cell re-runnable.
temp_view_name = "temp_orders"
temp_df.createOrReplaceTempView(temp_view_name)


12. Query `temp_orders` for orders with `Quantity` greater than 1.

In [16]:
# Keep only orders of more than one unit from the temporary view.
multi_unit_orders = spark.sql("SELECT * FROM temp_orders WHERE Quantity > 1")
multi_unit_orders.show()


+---------+-----------+--------+
|ProductID|ProductName|Quantity|
+---------+-----------+--------+
|      201|     Tablet|       2|
|      203|   Keyboard|       3|
+---------+-----------+--------+



Global View Tasks

13. Register `temp_df` as the global temporary view `global_orders`.

In [17]:
# Publish temp_df as a global temporary view (queried through the global_temp database).
# createOrReplaceGlobalTempView is used instead of createGlobalTempView so that
# re-running this cell does not fail because the view already exists; this matches
# the createOrReplace* call used for the session-scoped temp_orders view.
temp_df.createOrReplaceGlobalTempView("global_orders")


14. Query `global_temp.global_orders` for orders with `Quantity` greater than 1.


In [18]:
# The global view is addressed with the global_temp. qualifier.
global_multi_unit_orders = spark.sql("SELECT * FROM global_temp.global_orders WHERE Quantity > 1")
global_multi_unit_orders.show()


+---------+-----------+--------+
|ProductID|ProductName|Quantity|
+---------+-----------+--------+
|      201|     Tablet|       2|
|      203|   Keyboard|       3|
+---------+-----------+--------+



Join Tasks

15. Create the Parquet-backed `customer_details` table (CustomerID, Name, Gender, City, SignupDate).

In [19]:
# DDL for the customer table, stored as Parquet.
# IF NOT EXISTS means an already-existing table (and its rows) is left untouched.
create_customer_details_sql = """
    CREATE TABLE IF NOT EXISTS customer_details (
        CustomerID INT,
        Name STRING,
        Gender STRING,
        City STRING,
        SignupDate DATE
    )
    USING PARQUET
"""
spark.sql(create_customer_details_sql)


DataFrame[]

16. Insert three sample customers into `customer_details`.

In [21]:
# Load the three sample customers.
# INSERT OVERWRITE (rather than INSERT INTO) replaces the table contents, so
# re-running this cell -- or running it against a table that already held rows,
# which CREATE TABLE IF NOT EXISTS leaves in place -- never duplicates data.
spark.sql("""
    INSERT OVERWRITE TABLE customer_details VALUES
    (101, 'Alice', 'F', 'Mumbai', DATE('2020-01-15')),
    (102, 'Bob', 'M', 'Delhi', DATE('2019-03-22')),
    (104, 'Carol', 'F', 'Chennai', DATE('2021-07-30'))
""")


DataFrame[]

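17. Inner-join `product_sales` with `customer_details` on `ProductID = CustomerID` and show each sale with the matching customer's name and city.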

In [22]:
# Inner join: only sales whose ProductID has an equal CustomerID are returned.
# NOTE(review): the join key pairs a product id with a customer id; rows match only
# because the sample ids coincide -- confirm this is the intended key.
sales_with_customers_sql = """
    SELECT ps.*, cd.Name, cd.City
    FROM product_sales ps
    JOIN customer_details cd
    ON ps.ProductID = cd.CustomerID
"""
sales_with_customers_df = spark.sql(sales_with_customers_sql)
sales_with_customers_df.show()


+---------+-----------+-----------+-------+--------+----------+-----+-------+
|ProductID|ProductName|   Category|  Price|Quantity|  SaleDate| Name|   City|
+---------+-----------+-----------+-------+--------+----------+-----+-------+
|      101|     Laptop|Electronics|75000.0|       2|2023-06-01|Alice| Mumbai|
|      102|      Phone|Electronics|30000.0|       1|2023-06-02|  Bob|  Delhi|
|      104|       Book|      Books|  500.0|       5|2023-06-04|Carol|Chennai|
+---------+-----------+-----------+-------+--------+----------+-----+-------+



18. From the same join, list customer names where the joined sale has `Quantity` greater than 2.

In [23]:
# Same ProductID = CustomerID join as the previous query, narrowed to sales of
# more than two units and projected down to the customer name and quantity.
bulk_buyers_sql = """
    SELECT cd.Name, ps.Quantity
    FROM product_sales ps
    JOIN customer_details cd
    ON ps.ProductID = cd.CustomerID
    WHERE ps.Quantity > 2
"""
bulk_buyers_df = spark.sql(bulk_buyers_sql)
bulk_buyers_df.show()


+-----+--------+
| Name|Quantity|
+-----+--------+
|Carol|       5|
+-----+--------+



View & Summary Tasks

19. Create (or replace) the view `sales_summary` with a computed `Total` (`Price * Quantity`) column.

In [24]:
# Define the view over product_sales; OR REPLACE lets the cell be re-run without error.
create_sales_summary_sql = """
    CREATE OR REPLACE VIEW sales_summary AS
    SELECT ProductName, Price, Quantity, (Price * Quantity) AS Total
    FROM product_sales
"""
spark.sql(create_sales_summary_sql)


DataFrame[]

20. Query `sales_summary` for rows whose `Total` is greater than 1000.

In [25]:
# Filter the view on its computed Total column.
large_totals = spark.sql("SELECT * FROM sales_summary WHERE Total > 1000")
large_totals.show()


+-----------+-------+--------+--------+
|ProductName|  Price|Quantity|   Total|
+-----------+-------+--------+--------+
|     Laptop|75000.0|       2|150000.0|
|      Phone|30000.0|       1| 30000.0|
|      Shoes| 2500.0|       3|  7500.0|
|       Book|  500.0|       5|  2500.0|
|        Toy| 1200.0|       4|  4800.0|
+-----------+-------+--------+--------+



Cleanup Tasks

21. Drop the `sales_summary` view.

In [26]:
# IF EXISTS keeps the cleanup from failing when the view is already gone.
drop_view_sql = "DROP VIEW IF EXISTS sales_summary"
spark.sql(drop_view_sql)


DataFrame[]

22. Drop the `product_sales` and `customer_details` tables.

In [27]:
# Remove both tables; IF EXISTS keeps the cleanup from failing when a table is already gone.
# Only the result of the final statement is echoed as the cell's output.
drop_product_sales_sql = "DROP TABLE IF EXISTS product_sales"
drop_customer_details_sql = "DROP TABLE IF EXISTS customer_details"
spark.sql(drop_product_sales_sql)
spark.sql(drop_customer_details_sql)


DataFrame[]

23. Drop the `sales_db` database, using `CASCADE` to remove anything still inside it.

In [28]:
# CASCADE drops the database together with any objects it still contains.
# NOTE(review): an earlier cell ran USE sales_db, so the session may still point at
# the dropped database afterwards -- consider a USE default if more cells follow.
drop_db_sql = "DROP DATABASE IF EXISTS sales_db CASCADE"
spark.sql(drop_db_sql)


DataFrame[]