In [None]:
from pyspark.sql import SparkSession
from delta import configure_spark_with_delta_pip

# Describe the Spark session: application name plus the two settings that
# register Delta Lake's SQL extension and catalog implementation.
builder = (
    SparkSession.builder
    .appName("SalesTransactionsETL")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

# The builder is passed through delta's pip helper before the session is
# created (or an already-running one is reused).
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [None]:

# Inline sample data set: a header row followed by ten sales transactions.
csv_data = '''transaction_id,customer_name,region,product,category,quantity,unit_price,date
1,Rajesh,North,Laptop,Electronics,1,55000,2024-01-12
2,Sneha,West,Refrigerator,Electronics,1,32000,2024-02-05
3,Anil,South,Shampoo,Personal Care,5,150,2024-01-17
4,Divya,North,Mobile,Electronics,2,20000,2024-03-22
5,Vikram,East,Washing Machine,Electronics,1,28000,2024-02-28
6,Preeti,West,Sneakers,Fashion,2,4000,2024-01-31
7,Aman,South,TV,Electronics,1,45000,2024-02-15
8,Isha,North,Notebook,Stationery,10,60,2024-01-10
9,Kunal,East,Pencil,Stationery,20,10,2024-03-05
10,Tanvi,West,Face Cream,Personal Care,3,200,2024-03-19
'''

# Materialise the sample on local disk so Spark can read it as a CSV file.
csv_path = "/tmp/sales_transactions.csv"
with open(csv_path, "w") as handle:
    handle.write(csv_data)

# The first line supplies the column names; column types are inferred by Spark.
df = spark.read.csv(csv_path, header=True, inferSchema=True)
df.show()


In [None]:
# Persist the freshly loaded frame in two formats. Both writes use overwrite
# mode, so anything already stored at the target path is replaced.
df.write.parquet("/tmp/output/sales_parquet", mode="overwrite")
df.write.save("/tmp/delta/sales_transactions", format="delta", mode="overwrite")

In [None]:
# Remove any previous catalog entry first so this cell can be re-run safely.
spark.sql("DROP TABLE IF EXISTS sales_transactions")

# Register the table over the Delta files that already exist at this location.
spark.sql(
    "CREATE TABLE sales_transactions "
    "USING DELTA "
    "LOCATION '/tmp/delta/sales_transactions'"
)


In [None]:
from pyspark.sql.functions import col, expr, month, date_format

# Enrich each transaction with derived columns:
#   total_amount   - quantity multiplied by unit_price
#   month          - month number extracted from `date`
#   formatted_date - `date` rendered with the pattern dd-MMM-yyyy
#   is_high_value  - True when total_amount is greater than 30000
# NOTE: this rebinds `df`, so every later cell sees the enriched frame.
df = (
    df
    .withColumn("total_amount", col("quantity") * col("unit_price"))
    .withColumn("month", month(col("date")))
    .withColumn("formatted_date", date_format(col("date"), "dd-MMM-yyyy"))
    .withColumn("is_high_value", col("total_amount") > 30000)
)

df.show()


In [None]:
# Number of transactions recorded in each region.
(
    df
    .groupBy("region")
    .count()
    .show()
)

In [None]:
# Sum total_amount per category and display the three largest totals,
# highest first.
(
    df
    .groupBy("category")
    .agg(expr("sum(total_amount) as total_sales"))
    .orderBy(col("total_sales").desc())
    .show(3)
)

In [None]:
# Revenue (summed total_amount) for each month number, in ascending month order.
(
    df
    .groupBy("month")
    .agg(expr("sum(total_amount) as revenue"))
    .orderBy("month")
    .show()
)

In [None]:
# Aggregating without a groupBy yields a single row; its only field is the
# largest total_amount in the frame.
max_val = df.agg(expr("max(total_amount)")).first()[0]

# Show every customer whose transaction total equals that maximum
# (more than one row is printed if several transactions tie).
(
    df
    .where(col("total_amount") == max_val)
    .select("customer_name", "total_amount")
    .show()
)


In [None]:

# Total sales for transactions dated January through March.
# The predicate checks only the month number; it does not restrict the year.
(
    df
    .where(month(col("date")).between(1, 3))
    .agg(expr("sum(total_amount) as q1_sales"))
    .show()
)

In [None]:
from delta.tables import DeltaTable

# Handle on the Delta table stored at this path; the update below (and the
# delete in the following cell) modify the stored table through it.
deltaTable = DeltaTable.forPath(spark, "/tmp/delta/sales_transactions")

# Rows to touch, and the new value expression for each updated column.
stationery_only = "category = 'Stationery'"
price_bump = {"unit_price": "unit_price * 1.10"}

# Multiply unit_price by 1.10 for every Stationery row in the stored table.
# The in-memory `df` is not affected by this call.
deltaTable.update(condition=stationery_only, set=price_bump)

In [None]:
# Remove every stored row whose quantity is below 3. This relies on the
# `deltaTable` handle created in the previous cell.
deltaTable.delete(condition="quantity < 3")

In [None]:

from datetime import date
# Location of the Delta table that receives the new row.
delta_path = "/tmp/delta/sales_transactions"

# One new transaction, in the same eight-column order as the stored table:
# transaction_id, customer_name, region, product, category, quantity,
# unit_price, date.
new_data = [(11, "Arjun", "Central", "Tablet", "Electronics", 1, 30000, date.today())]

# Take the schema from the stored table, not from `df.schema`. `df` was rebound
# earlier with four derived columns (total_amount, month, formatted_date,
# is_high_value), so its schema has twelve fields and cannot describe the
# eight-field tuple above; createDataFrame would fail on the length mismatch.
target_schema = spark.read.format("delta").load(delta_path).schema

# Build the frame by column name only, then cast each column to the type the
# table actually stores. This keeps the append compatible regardless of how
# CSV schema inference typed the columns (e.g. int vs. long, date vs. timestamp).
staged_df = spark.createDataFrame(new_data, target_schema.fieldNames())
new_df = staged_df.select(
    [
        staged_df[field.name].cast(field.dataType).alias(field.name)
        for field in target_schema.fields
    ]
)

new_df.write.format("delta").mode("append").save(delta_path)

In [None]:
# Write the enriched in-memory frame as a separate Delta table, partitioned by
# region. Overwrite mode replaces any previous contents at this path.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .partitionBy("region")
    .save("/tmp/delta/sales_by_region")
)


In [None]:
# Write the enriched in-memory frame as a separate Delta table, partitioned by
# the derived month column. Overwrite mode replaces any previous contents.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .partitionBy("month")
    .save("/tmp/delta/sales_by_month")
)



In [None]:
# Display the in-memory frame (first 20 rows, long values truncated).
# `df` is the CSV-derived frame with the added columns; it was never reloaded
# from the Delta table, so the update/delete/append above are not reflected here.
df.show(n=20, truncate=True)