%md
# FNB Bronze – Raw Ingestion Layer

This notebook ingests Food & Beverages (FNB) source data and writes it into the Bronze layer.  
The Bronze table contains raw, unvalidated, schema-on-read data used for auditing and traceability.

Data Source:

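| Files |
| ----- |
| /Volumes/workspace/default/datasets/Food_Sales_Fact.csv |
| /Volumes/workspace/default/datasets/Food_Sales_Market.csv |
| /Volumes/workspace/default/datasets/Food_Sales_Product.csv |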

Target Tables:

| Tables |
| ------ |
| default.bronze_fnb_sales |


### Imports

In [0]:
from pyspark.sql.functions import current_timestamp, col, count, when, max, min

### Reading Files from Volumes

In [0]:
# Load the three FNB source CSV extracts (fact, market, product).
# Each file has a header row, and column types are inferred by Spark at read time.
df_fact = spark.read.csv(
    path="/Volumes/workspace/default/datasets/Food_Sales_Fact.csv",
    header=True,
    inferSchema=True,
)
df_market = spark.read.csv(
    path="/Volumes/workspace/default/datasets/Food_Sales_Market.csv",
    header=True,
    inferSchema=True,
)
df_product = spark.read.csv(
    path="/Volumes/workspace/default/datasets/Food_Sales_Product.csv",
    header=True,
    inferSchema=True,
)

### Transformations

#### Fact

In [0]:
# Fact extract: keep the keys and sales measures, rename them to the notebook's
# standard column names in a single projection, then drop exact duplicate rows.
# AVERAGE_UNIT_PRICE is not selected (it is disabled in this notebook).
df_fact_filtered = df_fact.select(
    col("MARKET_TAG").alias("RETAILER_ID"),
    "COUNTRY",
    col("PRODUCT_TAG").alias("PRODUCT_ID"),
    col("TOTAL_VALUE_SALES").alias("SALES"),
    col("TOTAL_UNIT_SALES").alias("UNITS"),
    col("TOTAL_VOLUME_SALES").alias("VOLUME"),
    col("ACV_WEIGHTED_DISTRIBUTION").alias("DISTRIBUTION"),
    col("PROMO_SALES_ANY_PROMO").alias("PROMO_SALES"),
    col("PROMO_UNITS_ANY_PROMO").alias("PROMO_UNITS"),
    col("currency_code").alias("CURRENCY_CODE"),
    "DATE",
    "YEAR_MONTH",
).distinct()

#### Market

In [0]:
# Market lookup: one row per distinct (RETAILER_ID, RETAILER) pair,
# taken from MARKET_TAG and MARKET_NAME_LONG.
df_market_filtered = df_market.select(
    col("MARKET_TAG").alias("RETAILER_ID"),
    col("MARKET_NAME_LONG").alias("RETAILER"),
).distinct()

#### Product

In [0]:
# Product lookup: the product key (PRODUCT_TAG exposed as PRODUCT_ID) together
# with its descriptive attributes, de-duplicated on the full row.
df_product_filtered = df_product.select(
    col("PRODUCT_TAG").alias("PRODUCT_ID"),
    "ITEM_CODE",
    "CATEGORY",
    "SEGMENT",
    "BRAND",
    "MANUFACTURER",
).distinct()

#### Fact X Market X Product

In [0]:
# Enrich every fact row with its market and product attributes.
# Both joins are left joins, so fact rows without a matching lookup row are
# kept and carry nulls in the lookup columns.
df_final = (
    df_fact_filtered
    .join(df_market_filtered, on="RETAILER_ID", how="left")
    .join(df_product_filtered, on="PRODUCT_ID", how="left")
)

In [0]:
%skip
# For the main version: drop every row that has a null in any column
# (how="any"). This includes fact rows that found no match in the left joins,
# since their lookup columns are null.
# NOTE(review): the %skip line presumably keeps this cell from running — confirm
# before relying on it. The cell also rebinds df_final to its filtered self.
df_final = df_final.na.drop(how="any")

In [0]:
# Final column layout for the output frame. YEAR_MONTH is not carried forward
# and AVERAGE_UNIT_PRICE is not selected (both are disabled in this notebook,
# as is the former year_month <= "202509" filter).
# PRODUCT_ID, ITEM_CODE, VOLUME and DATE are converted to strings with try_cast,
# and LAST_UPDATED is appended as the load timestamp.
df_final_filtered = df_final.select(
    "COUNTRY",
    "RETAILER_ID",
    "RETAILER",
    "PRODUCT_ID",
    "ITEM_CODE",
    "CATEGORY",
    "SEGMENT",
    "BRAND",
    "MANUFACTURER",
    "SALES",
    "UNITS",
    "VOLUME",
    "DISTRIBUTION",
    "PROMO_SALES",
    "PROMO_UNITS",
    "CURRENCY_CODE",
    "DATE",
).withColumns(
    {
        # New column: appended after the selected ones.
        "LAST_UPDATED": current_timestamp(),
        # Existing columns: replaced in place, keeping their position.
        **{
            name: col(name).try_cast("string")
            for name in ("PRODUCT_ID", "ITEM_CODE", "VOLUME", "DATE")
        },
    }
)

In [0]:
# Row count of the joined frame before the per-retailer date cutoff is applied
# (triggers a Spark action).
df_final_filtered.count()

In [0]:
# Earliest and latest DATE per RETAILER, before the cutoff filter is applied.
# min/max here are the pyspark.sql.functions aggregates imported at the top,
# not the Python builtins.
(
    df_final_filtered
    .select("RETAILER", "DATE")
    .groupBy("RETAILER")
    .agg(min("DATE"), max("DATE"))
    .display()
)

In [0]:
# Keep rows up to a per-retailer cutoff DATE (inclusive). Rows with no RETAILER
# are kept regardless of DATE; retailers not listed here are filtered out.
# DATE was converted to a string earlier, so the comparison is on the text values.
retailer_cutoffs = {
    "COSTCO": "2025-10-11",
    "CVS": "2025-10-04",
    "WALGREENS": "2025-10-18",
    "WALMART": "2025-10-25",
}

# Start from the "RETAILER is null" branch and OR in one branch per retailer.
keep_row = col("RETAILER").isNull()
for retailer_name, last_date in retailer_cutoffs.items():
    keep_row = keep_row | (
        (col("RETAILER") == retailer_name) & (col("DATE") <= last_date)
    )

df_final_filtered = df_final_filtered.filter(keep_row)

In [0]:
# Earliest and latest DATE per RETAILER, after the cutoff filter, to confirm
# each retailer's maximum DATE respects its cutoff.
# min/max here are the pyspark.sql.functions aggregates imported at the top,
# not the Python builtins.
(
    df_final_filtered
    .select("RETAILER", "DATE")
    .groupBy("RETAILER")
    .agg(min("DATE"), max("DATE"))
    .display()
)

In [0]:
# Row count after the per-retailer date cutoff filter (triggers a Spark action).
df_final_filtered.count()

### Save as Table

In [0]:
# NOTE(review): the write below is commented out, so running this notebook does
# NOT persist df_final_filtered to default.bronze_fnb_sales. When enabled, it
# overwrites the table in Delta format (mergeSchema on), partitioned by
# CATEGORY and RETAILER. Confirm whether it should be re-enabled.
# df_final_filtered.write.format('delta').mode('overwrite').option("mergeSchema", "true").partitionBy('CATEGORY','RETAILER').saveAsTable('default.bronze_fnb_sales')

In [0]:
%sql
-- DATE range and row count per CATEGORY / RETAILER in default.silver_fnb_sales.
SELECT
  CATEGORY,
  RETAILER,
  max(DATE),
  min(DATE),
  count(*)
FROM default.silver_fnb_sales
GROUP BY ALL

In [0]:
%sql
-- DATE range and row count per CATEGORY / RETAILER in default.gold_fnb_sales.
SELECT
  CATEGORY,
  RETAILER,
  max(DATE),
  min(DATE),
  count(*)
FROM default.gold_fnb_sales
GROUP BY ALL

In [0]:
%sql
-- Same summary as for the current gold table, but read from table version 0
-- (time travel via VERSION AS OF) for comparison.
SELECT
  CATEGORY,
  RETAILER,
  max(DATE),
  min(DATE),
  count(*)
FROM default.gold_fnb_sales VERSION AS OF 0
GROUP BY ALL