In [None]:
from pyspark.sql import SparkSession
import pandas as pd
from io import StringIO

spark=SparkSession.builder.appName("Colab PySpark Setup").getOrCreate()

spark

In [18]:
#1. Create a new database named sales_db.
spark.sql("""create database if not exists sales_db""")

#2. Set the current database to sales_db.
spark.sql("""use sales_db""")

#3. Create a table product_sales with columns:
# """ProductID (INT)
# ProductName (STRING)
# Category (STRING)
# Price (DOUBLE)
# Quantity (INT)
# SaleDate (DATE)
#"""
spark.sql("""create table if not exists product_sales(productid int,
productname string, category string,
price double, quantity int, saledate date) using parquet""")
#4. Insert at least 5 rows into product_sales.
spark.sql("""insert into product_sales values
  (301, 'Bluetooth Speaker', 'Electronics', 2999.00, 5, date '2024-06-06'),
  (302, 'Desk Lamp', 'Home Goods', 899.00, 8, date '2024-06-07'),
  (303, 'Sticky Notes', 'Office Supplies', 35.00, 20, date '2024-06-08'),
  (304, 'Wireless Mouse', 'Electronics', 599.00, 12, date '2024-06-09'),
  (305, 'Coffee Mug', 'Kitchenware', 199.00, 15, date '2024-06-10');
""")


DataFrame[]

In [19]:
#5. Select all records from product_sales.
spark.sql("""select * from product_sales""").show()
#6. Retrieve products where price is above 500.
spark.sql("""select * from product_sales where Price > 500""").show()
#7. Calculate total sale amount (Price * Quantity) for each product.
spark.sql("""select ProductName, Price * Quantity as TotalSaleAmount from product_sales""").show()
#8. Find the number of products sold in each Category.
spark.sql("""select Category, count(ProductID) as NumberOfProductsSold from product_sales group by Category""").show()
#9. Sort products by total sales in descending order.
spark.sql("""select ProductName, Price * Quantity as TotalSaleAmount from product_sales order by TotalSaleAmount desc""").show()


+---------+-----------------+---------------+------+--------+----------+
|productid|      productname|       category| price|quantity|  saledate|
+---------+-----------------+---------------+------+--------+----------+
|      303|     Sticky Notes|Office Supplies|  35.0|      20|2024-06-08|
|      304|   Wireless Mouse|    Electronics| 599.0|      12|2024-06-09|
|      305|       Coffee Mug|    Kitchenware| 199.0|      15|2024-06-10|
|      301|Bluetooth Speaker|    Electronics|2999.0|       5|2024-06-06|
|      302|        Desk Lamp|     Home Goods| 899.0|       8|2024-06-07|
+---------+-----------------+---------------+------+--------+----------+

+---------+-----------------+-----------+------+--------+----------+
|productid|      productname|   category| price|quantity|  saledate|
+---------+-----------------+-----------+------+--------+----------+
|      304|   Wireless Mouse|Electronics| 599.0|      12|2024-06-09|
|      301|Bluetooth Speaker|Electronics|2999.0|       5|2024-06-0

In [20]:

#10. Create a PySpark DataFrame with dummy product data.
from pyspark.sql import Row
data = [
    Row(306, 'Gaming Console', 'Electronics', 39999.0, 3, '2024-06-21'),
    Row(307, 'Study Table', 'Furniture', 6500.0, 2, '2024-06-22'),
    Row(308, 'Marker Pens', 'Office Supplies', 180.0, 20, '2024-06-23'),
    Row(309, 'Wireless Earbuds', 'Electronics', 1999.0, 5, '2024-06-24'),
    Row(310, 'Lunch Box', 'Accessories', 499.0, 8, '2024-06-25'),
]
# Create DataFrame
df = spark.createDataFrame(data, ["ProductID", "ProductName", "Category", "Price", "Quantity", "SaleDate"])

#11. Register it as a temporary view called temp_orders.
# Register temporary view
df.createOrReplaceTempView("temp_orders")

#12. Run a SQL query to filter temp_orders where quantity > 1.
spark.sql("select * from product_sales where quantity>1").show()

+---------+-----------------+---------------+------+--------+----------+
|productid|      productname|       category| price|quantity|  saledate|
+---------+-----------------+---------------+------+--------+----------+
|      303|     Sticky Notes|Office Supplies|  35.0|      20|2024-06-08|
|      304|   Wireless Mouse|    Electronics| 599.0|      12|2024-06-09|
|      305|       Coffee Mug|    Kitchenware| 199.0|      15|2024-06-10|
|      301|Bluetooth Speaker|    Electronics|2999.0|       5|2024-06-06|
|      302|        Desk Lamp|     Home Goods| 899.0|       8|2024-06-07|
+---------+-----------------+---------------+------+--------+----------+



In [21]:

#13. Create a global temp view from a PySpark DataFrame named global_orders.
df.createOrReplaceGlobalTempView("global_orders")
#14. Run a SQL query on the global view from another notebook cell/session.
spark.sql("select * from global_temp.global_orders").show()

+---------+----------------+---------------+-------+--------+----------+
|ProductID|     ProductName|       Category|  Price|Quantity|  SaleDate|
+---------+----------------+---------------+-------+--------+----------+
|      306|  Gaming Console|    Electronics|39999.0|       3|2024-06-21|
|      307|     Study Table|      Furniture| 6500.0|       2|2024-06-22|
|      308|     Marker Pens|Office Supplies|  180.0|      20|2024-06-23|
|      309|Wireless Earbuds|    Electronics| 1999.0|       5|2024-06-24|
|      310|       Lunch Box|    Accessories|  499.0|       8|2024-06-25|
+---------+----------------+---------------+-------+--------+----------+



In [22]:
#15. Create a second table customer_details with:
#CustomerID, Name, Gender, City, SignupDate
spark.sql("""create table if not exists customer_details(CustomerID INT,
Name STRING, Gender STRING, City STRING, SignupDate DATE) using parquet""")
#16. Insert at least 3 records into customer_details.
spark.sql("""
insert into customer_details values
  (301, 'Lewis', 'Male', 'Mumbai', DATE '2024-05-12'),
  (302, 'Max', 'Male', 'Delhi', DATE '2024-05-15'),
  (303, 'Landoni', 'Female', 'Bangalore', DATE '2024-05-20')
""")
#17. Write a SQL join between product_sales and customer_details based on
#ProductID = CustomerID (simulate a match).
spark.sql("""
select p.*, c.Name, c.Gender, c.City, c.SignupDate
from product_sales p
join customer_details c ON p.ProductID = c.CustomerID
""").show()

#18. List customers who bought more than 2 products.
spark.sql("""
select c.Name, SUM(p.Quantity) AS TotalQuantity
from customer_details c
join product_sales p ON c.CustomerID = p.ProductID
group by c.Name
having SUM(p.Quantity) > 2
""").show()


+---------+-----------------+---------------+------+--------+----------+-------+------+---------+----------+
|productid|      productname|       category| price|quantity|  saledate|   Name|Gender|     City|SignupDate|
+---------+-----------------+---------------+------+--------+----------+-------+------+---------+----------+
|      303|     Sticky Notes|Office Supplies|  35.0|      20|2024-06-08|Landoni|Female|Bangalore|2024-05-20|
|      301|Bluetooth Speaker|    Electronics|2999.0|       5|2024-06-06|  Lewis|  Male|   Mumbai|2024-05-12|
|      302|        Desk Lamp|     Home Goods| 899.0|       8|2024-06-07|    Max|  Male|    Delhi|2024-05-15|
+---------+-----------------+---------------+------+--------+----------+-------+------+---------+----------+

+-------+-------------+
|   Name|TotalQuantity|
+-------+-------------+
|Landoni|           20|
|    Max|            8|
|  Lewis|            5|
+-------+-------------+



In [23]:

#19. Create a SQL view sales_summary that includes:
#ProductName, Price, Quantity, Total = Price * Quantity

spark.sql("""create or replace view sales_summary as
select productname, price, quantity, price * quantity as total
from product_sales
""")

#20. Query the view for records with
#Total > 1000 .
spark.sql("""
select *
from sales_summary
where total > 1000
""").show()

+-----------------+------+--------+-------+
|      productname| price|quantity|  total|
+-----------------+------+--------+-------+
|   Wireless Mouse| 599.0|      12| 7188.0|
|       Coffee Mug| 199.0|      15| 2985.0|
|Bluetooth Speaker|2999.0|       5|14995.0|
|        Desk Lamp| 899.0|       8| 7192.0|
+-----------------+------+--------+-------+



In [24]:

#21. Drop the view sales_summary .
spark.sql("""drop view if exists sales_summary""")

#22. Drop the tables product_sales and customer_details
spark.sql("drop table product_sales")
spark.sql("drop table customer_details")
#23. Drop the database sales_db .
spark.sql("drop database sales_db")

DataFrame[]