In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr

# Build (or reuse) a Hive-enabled Spark session for this notebook
spark = (
    SparkSession.builder
    .appName("PracticeProject")
    .enableHiveSupport()
    .getOrCreate()
)

# Make sure the target database exists before any table is written into it
spark.sql("CREATE DATABASE IF NOT EXISTS sales")

# Customer rows: (CustomerID, Name, Email, City, SignupDate as "YYYY-MM-DD" text)
customers_data = [
    (101, "Ali", "ali@gmail.com", "Mumbai", "2022-05-10"),
    (102, "Neha", "neha@yahoo.com", "Delhi", "2023-01-15"),
    (103, "Ravi", "ravi@hotmail.com", "Bangalore", "2021-11-01"),
    (104, "Sneha", "sneha@outlook.com", "Hyderabad", "2020-07-22"),
    (105, "Amit", "amit@gmail.com", "Chennai", "2023-03-10"),
]

# Order rows: (OrderID, CustomerID, Product, Category, Quantity, Price, OrderDate as "YYYY-MM-DD" text)
orders_data = [
    (1, 101, "Laptop", "Electronics", 2, 50000.0, "2024-01-10"),
    (2, 101, "Mouse", "Electronics", 1, 1200.0, "2024-01-15"),
    (3, 102, "Tablet", "Electronics", 1, 20000.0, "2024-02-01"),
    (4, 103, "Bookshelf", "Furniture", 1, 3500.0, "2024-02-10"),
    (5, 104, "Mixer", "Appliances", 1, 5000.0, "2024-02-15"),
    (6, 105, "Notebook", "Stationery", 5, 500.0, "2024-03-01"),
    (7, 102, "Phone", "Electronics", 1, 30000.0, "2024-03-02"),
]

# Turn the raw tuples into DataFrames, naming the columns explicitly
customers_df = spark.createDataFrame(
    data=customers_data,
    schema=["CustomerID", "Name", "Email", "City", "SignupDate"],
)
orders_df = spark.createDataFrame(
    data=orders_data,
    schema=["OrderID", "CustomerID", "Product", "Category", "Quantity", "Price", "OrderDate"],
)

# Persist both DataFrames as tables in the sales database, replacing any existing copy
customers_df.write.saveAsTable("sales.customers", mode="overwrite")
orders_df.write.saveAsTable("sales.orders", mode="overwrite")


In [3]:
from pyspark.sql.functions import col, expr, when, lower, year, lit, coalesce

# 1. Derive TotalAmount as the line total: unit Price multiplied by Quantity
orders_df = orders_df.withColumn(
    "TotalAmount",
    orders_df["Price"] * orders_df["Quantity"],
)
orders_df.show()

# 2. Show only the orders whose TotalAmount exceeds 10000
orders_df.where(orders_df["TotalAmount"] > 10000).show()

# 3. Normalise City to lower case so equal cities compare equal regardless of casing
customers_df = customers_df.withColumn(
    "City",
    lower(customers_df["City"]),
)
customers_df.show()

# 4. Add OrderYear, the calendar year taken from OrderDate
orders_df = orders_df.withColumn("OrderYear", year(orders_df["OrderDate"]))
orders_df.select(["OrderID", "OrderDate", "OrderYear"]).show()

# 5. Replace missing values with defaults: Price -> 0.0, Email -> placeholder address.
# TotalAmount was derived from Price before this step, so a row whose Price was
# null still carries a null TotalAmount; recompute it from the filled Price so
# the two columns stay consistent (withColumn replaces the column in place).
orders_df = (
    orders_df
    .fillna({"Price": 0.0})
    .withColumn("TotalAmount", col("Price") * col("Quantity"))
)
customers_df = customers_df.fillna({"Email": "not_provided@example.com"})

# 6. Bucket orders by TotalAmount: Low (< 5000), Medium (5000..20000 inclusive),
# High (> 20000).
# The High bucket is an explicit comparison rather than a catch-all otherwise(),
# so a null TotalAmount produces a null OrderCategory instead of being reported
# as "High". Non-null amounts are classified exactly as before.
orders_df = orders_df.withColumn(
    "OrderCategory",
    when(col("TotalAmount") < 5000, "Low")
    .when(col("TotalAmount").between(5000, 20000), "Medium")
    .when(col("TotalAmount") > 20000, "High"),
)
orders_df.select("OrderID", "TotalAmount", "OrderCategory").show()


+-------+----------+---------+-----------+--------+-------+----------+-----------+
|OrderID|CustomerID|  Product|   Category|Quantity|  Price| OrderDate|TotalAmount|
+-------+----------+---------+-----------+--------+-------+----------+-----------+
|      1|       101|   Laptop|Electronics|       2|50000.0|2024-01-10|   100000.0|
|      2|       101|    Mouse|Electronics|       1| 1200.0|2024-01-15|     1200.0|
|      3|       102|   Tablet|Electronics|       1|20000.0|2024-02-01|    20000.0|
|      4|       103|Bookshelf|  Furniture|       1| 3500.0|2024-02-10|     3500.0|
|      5|       104|    Mixer| Appliances|       1| 5000.0|2024-02-15|     5000.0|
|      6|       105| Notebook| Stationery|       5|  500.0|2024-03-01|     2500.0|
|      7|       102|    Phone|Electronics|       1|30000.0|2024-03-02|    30000.0|
+-------+----------+---------+-----------+--------+-------+----------+-----------+

+-------+----------+-------+-----------+--------+-------+----------+-----------+
|Orde

In [5]:
# Expose the current DataFrames to Spark SQL as temp views named "customers" and
# "orders"; the queries below read derived columns such as TotalAmount from them
customers_df.createOrReplaceTempView("customers")
orders_df.createOrReplaceTempView("orders")

# 7. Every order placed by the customer named Ali
ali_orders = spark.sql("""
SELECT o.*
FROM orders o
JOIN customers c ON o.CustomerID = c.CustomerID
WHERE c.Name = 'Ali'
""")
ali_orders.show()

# 8. Total spending by each customer.
# The grouping key includes CustomerID so that two different customers who
# happen to share a Name are not merged into one total; the output columns
# (Name, TotalSpent) are unchanged.
spark.sql("""
SELECT c.Name, SUM(o.TotalAmount) as TotalSpent
FROM orders o
JOIN customers c ON o.CustomerID = c.CustomerID
GROUP BY c.CustomerID, c.Name
""").show()

# 9. The single category with the largest summed TotalAmount
top_category = spark.sql("""
SELECT Category, SUM(TotalAmount) as Revenue
FROM orders
GROUP BY Category
ORDER BY Revenue DESC
LIMIT 1
""")
top_category.show()

# 10. Create a view for customer_orders: one row per order with the customer's name.
# OrderDate is carried in the view so date filters can be applied to each row's
# own order without joining back to orders.
spark.sql("""
CREATE OR REPLACE TEMP VIEW customer_orders AS
SELECT c.Name AS CustomerName, o.Product, o.TotalAmount, o.OrderDate
FROM orders o
JOIN customers c ON o.CustomerID = c.CustomerID
""")

# 11. Query products ordered after 2024-02-01.
# Filtering on the view's own OrderDate replaces the earlier re-join to orders
# on Product, which duplicated rows whenever a product was ordered more than
# once and matched a row if ANY order of that product fell after the cut-off.
# OrderDate is "YYYY-MM-DD" text, so the string comparison follows date order.
spark.sql("""
SELECT CustomerName, Product, TotalAmount
FROM customer_orders
WHERE OrderDate > '2024-02-01'
""").show()



+-------+----------+-------+-----------+--------+-------+----------+-----------+---------+-------------+
|OrderID|CustomerID|Product|   Category|Quantity|  Price| OrderDate|TotalAmount|OrderYear|OrderCategory|
+-------+----------+-------+-----------+--------+-------+----------+-----------+---------+-------------+
|      1|       101| Laptop|Electronics|       2|50000.0|2024-01-10|   100000.0|     2024|         High|
|      2|       101|  Mouse|Electronics|       1| 1200.0|2024-01-15|     1200.0|     2024|          Low|
+-------+----------+-------+-----------+--------+-------+----------+-----------+---------+-------------+

+-----+----------+
| Name|TotalSpent|
+-----+----------+
| Ravi|    3500.0|
|Sneha|    5000.0|
| Amit|    2500.0|
| Neha|   50000.0|
|  Ali|  101200.0|
+-----+----------+

+-----------+--------+
|   Category| Revenue|
+-----------+--------+
|Electronics|151200.0|
+-----------+--------+

+------------+---------+-----------+
|CustomerName|  Product|TotalAmount|
+------

In [6]:
# 12. Publish customers as a global temp view, then query it through the
# global_temp database for customers whose (lower-cased) City is mumbai
customers_df.createOrReplaceGlobalTempView("customers")

mumbai_customers = spark.sql("SELECT * FROM global_temp.customers WHERE City = 'mumbai'")
mumbai_customers.show()

# 13. Write orders_df (TotalAmount included) to Parquet, replacing any earlier output
orders_df.write.parquet("/tmp/orders_parquet", mode="overwrite")

# 14. Load the Parquet output again and report how many orders it holds
orders_parquet = spark.read.format("parquet").load("/tmp/orders_parquet")
orders_parquet.count()


+----------+----+-------------+------+----------+
|CustomerID|Name|        Email|  City|SignupDate|
+----------+----+-------------+------+----------+
|       101| Ali|ali@gmail.com|mumbai|2022-05-10|
+----------+----+-------------+------+----------+



7

In [7]:
from pyspark.sql.functions import udf, concat_ws, regexp_replace, to_date, datediff, current_date
from pyspark.sql.types import StringType

# 15. Email masking UDF
def mask_email(email):
    """Mask the local part of an email address, keeping only its first character.

    "ali@gmail.com" becomes "a***@gmail.com". The address is split at the LAST
    "@", so the full domain is always preserved.

    Args:
        email: The address to mask. May be None or an empty string.

    Returns:
        The masked address. Falsy input (None, "") is returned unchanged.
        A value without "@" is masked as "<first char>***" with no domain,
        and an empty local part yields "***@<domain>", so malformed values
        never raise inside the UDF.
    """
    if not email:
        return email

    local, at_sign, domain = email.rpartition("@")
    if not at_sign:
        # rpartition puts the whole string in the last slot when "@" is absent
        local, domain = domain, ""

    # Slicing (rather than indexing) tolerates an empty local part
    masked = local[:1] + "***"
    if at_sign:
        return masked + "@" + domain
    return masked

# Wrap mask_email as a string-returning Spark UDF and use it to add MaskedEmail
mask_email_udf = udf(f=mask_email, returnType=StringType())
customers_df = customers_df.withColumn(
    "MaskedEmail",
    mask_email_udf(customers_df["Email"]),
)
customers_df.select(["Email", "MaskedEmail"]).show()

# 16. Build a "<Name> from <City>" label by joining the parts with a single space
customers_df = customers_df.withColumn(
    "Label",
    concat_ws(" ", customers_df["Name"], lit("from"), customers_df["City"]),
)
customers_df.select(["Label"]).show()

# 17. Strip every character that is not an ASCII letter, digit or space from Product
orders_df = orders_df.withColumn(
    "CleanProduct",
    regexp_replace(orders_df["Product"], "[^a-zA-Z0-9 ]", ""),
)
orders_df.select(["Product", "CleanProduct"]).show()

# 18. Customer age in days: parse SignupDate into a date, then count the days
# from that date up to the current date
customers_df = (
    customers_df
    .withColumn("SignupDate", to_date(col("SignupDate")))
    .withColumn("CustomerAgeDays", datediff(current_date(), col("SignupDate")))
)
customers_df.select(["Name", "CustomerAgeDays"]).show()


+-----------------+----------------+
|            Email|     MaskedEmail|
+-----------------+----------------+
|    ali@gmail.com|  a***@gmail.com|
|   neha@yahoo.com|  n***@yahoo.com|
| ravi@hotmail.com|r***@hotmail.com|
|sneha@outlook.com|s***@outlook.com|
|   amit@gmail.com|  a***@gmail.com|
+-----------------+----------------+

+--------------------+
|               Label|
+--------------------+
|     Ali from mumbai|
|     Neha from delhi|
| Ravi from bangalore|
|Sneha from hyderabad|
|   Amit from chennai|
+--------------------+

+---------+------------+
|  Product|CleanProduct|
+---------+------------+
|   Laptop|      Laptop|
|    Mouse|       Mouse|
|   Tablet|      Tablet|
|Bookshelf|   Bookshelf|
|    Mixer|       Mixer|
| Notebook|    Notebook|
|    Phone|       Phone|
+---------+------------+

+-----+---------------+
| Name|CustomerAgeDays|
+-----+---------------+
|  Ali|           1121|
| Neha|            871|
| Ravi|           1311|
|Sneha|           1778|
| Amit|       