### **Data Reading**

In [0]:
from typing import Optional

from pyspark.sql import DataFrame
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window

In [0]:
# Load the orders data (Parquet) from the bronze container.
df = (
    spark.read
    .format("parquet")
    .load("abfss://bronze@project1ds.dfs.core.windows.net/orders")
)

In [0]:
# Render the loaded DataFrame in the notebook for a quick visual check.
df.display()

In [0]:
# Print the DataFrame's schema as a tree so column names and types can be verified.
df.printSchema()

### **Checking for nulls and duplicates**

In [0]:
# Data-quality checks on the key column: null keys, total rows, duplicated keys.
null_order_id_expr = count(when(col("order_id").isNull(), 1)).alias("null_order_id")
total_rows_expr = count("*").alias("total_rows")
dq_report = df.select(null_order_id_expr, total_rows_expr)

# Number of distinct order_id values that appear on more than one row.
order_id_counts = df.groupBy("order_id").agg(count("*").alias("cnt"))
dup_report = order_id_counts.filter(col("cnt") > 1).count()

dq_report.show()
print(f"Duplicate order_id count: {dup_report}")

### **Classes**

In [0]:
class DataProcessing:
    """Reusable DataFrame transformations used to prepare the orders data.

    Every method takes a DataFrame and returns a new one built with
    ``withColumn`` / ``drop``; none of them reassigns or writes the input.
    """

    def add_year(self, df: DataFrame, date_col: str = "order_date", out_col: str = "year") -> DataFrame:
        """Return ``df`` with ``out_col`` set to ``year(date_col)``.

        Args:
            df: Input DataFrame.
            date_col: Column the year is extracted from.
            out_col: Name of the column that receives the year.
        """
        # assumes date_col is a date/timestamp column, so year() applies directly — TODO confirm against the schema
        return df.withColumn(out_col, year(col(date_col)))

    def drop_rescued(self, df: DataFrame, col_name: str = "_rescued_data") -> DataFrame:
        """Return ``df`` without the column ``col_name``."""
        return df.drop(col_name)

    # ————— window specs & ranks —————
    def _w_by_year_amount(self,
                          part_col: str = "year",
                          order_col: str = "total_amount",
                          tie_breaker: Optional[str] = "order_id"):
        """Build the window shared by the ranking methods.

        Rows are partitioned by ``part_col`` and ordered by ``order_col``
        descending with NULLs last, then by ``tie_breaker`` ascending.

        Args:
            part_col: Partition column.
            order_col: Column ranked from highest to lowest.
            tie_breaker: Secondary ascending sort column, or ``None`` to order
                by ``order_col`` alone.
        """
        # desc_nulls_last() sorts on the column's own type; substituting a float
        # sentinel for NULL would coerce the column to double and only make
        # sense for numeric columns.
        ordering = [col(order_col).desc_nulls_last()]
        if tie_breaker is not None:
            ordering.append(col(tie_breaker))
        return Window.partitionBy(part_col).orderBy(*ordering)

    def dense_rank(self, df: DataFrame,
                   part_col: str = "year",
                   order_col: str = "total_amount",
                   out_col: str = "dense_rank_flag",
                   tie_breaker: Optional[str] = "order_id") -> DataFrame:
        """Return ``df`` with ``out_col`` = ``dense_rank()`` over the shared window.

        Note: the tie-breaker is part of the window ordering, so when it is
        unique within a partition no two rows are peers and the result equals
        ``row_number``. Pass ``tie_breaker=None`` to let rows with equal
        ``order_col`` values share a rank.
        """
        w = self._w_by_year_amount(part_col, order_col, tie_breaker)
        return df.withColumn(out_col, dense_rank().over(w))

    def row_number(self, df: DataFrame,
                   part_col: str = "year",
                   order_col: str = "total_amount",
                   out_col: str = "row_number_flag",
                   tie_breaker: Optional[str] = "order_id") -> DataFrame:
        """Return ``df`` with ``out_col`` = ``row_number()`` over the shared window.

        The tie-breaker decides the order of rows with equal ``order_col``
        values; with ``tie_breaker=None`` their relative order is left to Spark.
        """
        w = self._w_by_year_amount(part_col, order_col, tie_breaker)
        return df.withColumn(out_col, row_number().over(w))

    def rank(self, df: DataFrame,
             part_col: str = "year",
             order_col: str = "total_amount",
             out_col: str = "rank_flag",
             tie_breaker: Optional[str] = "order_id") -> DataFrame:
        """Return ``df`` with ``out_col`` = ``rank()`` over the shared window.

        Note: the tie-breaker is part of the window ordering, so when it is
        unique within a partition no two rows are peers and the result equals
        ``row_number``. Pass ``tie_breaker=None`` to let rows with equal
        ``order_col`` values share a rank (leaving gaps after ties).
        """
        w = self._w_by_year_amount(part_col, order_col, tie_breaker)
        return df.withColumn(out_col, rank().over(w))



In [0]:
# Run the transformation pipeline: add year -> drop rescued column -> dense rank.
obj = DataProcessing()
df_results = (
    df.transform(obj.add_year)
    .transform(obj.drop_rescued)
    .transform(obj.dense_rank)
)
df_results.display()

### **Data Writing**

In [0]:
# Persist the orders data to the silver container in Delta format, replacing any existing data.
# NOTE(review): this writes the raw `df`, not the transformed `df_results` built above —
# confirm which DataFrame the silver layer is meant to hold.
(
    df.write
    .format("Delta")
    .mode("overwrite")
    .save("abfss://silver@project1ds.dfs.core.windows.net/orders")
)

In [0]:
%sql
-- Register the Delta data at the silver orders path as a catalog table (skipped if the table already exists).
CREATE TABLE IF NOT EXISTS project1_catalog.silver.orders_silver
USING DELTA
LOCATION "abfss://silver@project1ds.dfs.core.windows.net/orders"

In [0]:
%sql
-- Query the registered silver table to verify its contents.
SELECT * FROM project1_catalog.silver.orders_silver;