In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window

In [0]:
# Load the raw products dataset (Parquet) from the bronze container.
bronze_products_path = "abfss://bronze@project1ds.dfs.core.windows.net/products"
df = (
    spark.read
    .format("parquet")
    .load(bronze_products_path)
)

In [0]:
# Render the loaded DataFrame for a quick visual check.
df.display()

In [0]:
# Remove the `_rescued_data` column (presumably the ingestion rescue column — TODO confirm).
# DataFrame.drop is a no-op when the column is not in the schema.
rescued_col = "_rescued_data"
df = df.drop(rescued_col)

# Show the result so the column removal can be verified visually.
df.display()

In [0]:
# Print the column names and data types of the DataFrame.
df.printSchema()

In [0]:
# Data-quality checks on the key column: NULL keys and duplicated keys.
key_col = "product_id"

# Single aggregation: number of rows with a NULL key, plus the total row count.
# count() ignores NULLs, so only rows where the `when` condition holds are counted.
null_key_flag = when(col(key_col).isNull(), 1)
dq_report = df.select(
    count(null_key_flag).alias("null_product_id"),
    count("*").alias("total_rows"),
)

# Number of distinct key values that occur on more than one row.
# NOTE: groupBy places all NULL keys in one group, so two or more NULL rows
# are reported here as one duplicated key in addition to the NULL count above.
rows_per_key = df.groupBy(key_col).agg(count("*").alias("cnt"))
dup_report = rows_per_key.filter(col("cnt") > 1).count()

dq_report.show()
print(f"Duplicate product_id count: {dup_report}")

### **Functions**

In [0]:
# Expose the DataFrame to SQL cells as the temporary view `products`
# (replaces any existing temp view with the same name).
df.createOrReplaceTempView("products")

In [0]:
%sql
-- Scalar SQL UDF registered in the catalog.
--   p_price : input price (DOUBLE)
--   returns : p_price multiplied by 0.90, i.e. the price after a 10% reduction
-- CREATE OR REPLACE makes this cell safe to re-run.
CREATE OR REPLACE FUNCTION project1_catalog.bronze.discount_func(p_price DOUBLE)
RETURNS DOUBLE
RETURN p_price * 0.90;

In [0]:
%sql
-- Preview the discount UDF applied to every row of the `products` temp view.
SELECT
  product_id,
  price,
  project1_catalog.bronze.discount_func(price) AS discounted_price
FROM products;

In [0]:
# Add `discounted_price` by invoking the catalog SQL UDF through a SQL expression.
# withColumn replaces a same-named column, so re-running this cell is safe.
discount_expr = expr("project1_catalog.bronze.discount_func(price)")
df = df.withColumn("discounted_price", discount_expr)

df.display()

In [0]:
# Persist the DataFrame in Delta format to the silver container,
# replacing whatever data is already stored at that path.
silver_products_path = "abfss://silver@project1ds.dfs.core.windows.net/products"
(
    df.write
    .format("delta")
    .mode("overwrite")
    .save(silver_products_path)
)

In [0]:
%sql
-- Register the Delta files at the silver path as a catalog table.
-- IF NOT EXISTS keeps the statement a no-op when the table is already registered.
CREATE TABLE IF NOT EXISTS project1_catalog.silver.products_silver
USING DELTA
LOCATION "abfss://silver@project1ds.dfs.core.windows.net/products";

In [0]:
%sql
-- Read back the registered silver table to confirm it is queryable.
SELECT *
FROM project1_catalog.silver.products_silver;