## Bronze to Silver Layer

### 01 Product Information

##### Initialization

In [0]:
from pyspark.sql import functions as F
from pyspark.sql import DataFrame
from pyspark.sql.types import StringType, DateType
from pyspark.sql.functions import col, trim
from pyspark.sql.window import Window

##### Primary Key Check (Unique `prd_id`)

In [0]:
# Load the raw product-info table from the bronze layer.
df = spark.read.table("dlh.bronze_db.bronze_prod_info")

In [0]:
# Primary-key probe: keep the prd_id values that occur exactly once.
# NOTE(review): a later cell reassigns `df1` from the untouched `df`, so this
# result does not feed the rest of the pipeline — confirm whether it is meant
# to be an ad-hoc check only.
df1 = (
    df.groupBy("prd_id")
    .count()
    .withColumnRenamed("count", "id_count")
    .where(F.col("id_count") == 1)
)

In [0]:
# df_null_check = df1.filter(col('prd_cost').isNull()).display()

##### Trim: Function

In [0]:
from pyspark.sql.functions import when, col, row_number
from pyspark.sql import functions as F
from pyspark.sql.types import StringType
from pyspark.sql import DataFrame

def trimmed(df: DataFrame) -> DataFrame:
    """Return ``df`` with every string column stripped of surrounding spaces.

    Columns whose type is not ``StringType`` are passed through untouched,
    and the column order of ``df`` is preserved.

    Args:
        df: Input DataFrame; it is not modified.

    Returns:
        A DataFrame with the same columns in which each string column has
        been replaced by ``F.trim`` of itself.
    """
    string_columns = [
        field.name
        for field in df.schema.fields
        if isinstance(field.dataType, StringType)
    ]
    if not string_columns:
        # Nothing to trim; hand back the input unchanged.
        return df
    # Replace all string columns in ONE projection. Calling withColumn once
    # per column in a loop adds a projection per call and can produce very
    # large query plans on wide tables.
    return df.withColumns(
        {name: F.trim(F.col(name)) for name in string_columns}
    )

In [0]:
# Trim every string column of the bronze data before further cleaning.
df1 = df.transform(trimmed)

##### Handle Null in Product Cost

In [0]:
# Replace NULL product costs with 0.
cost_or_zero = F.coalesce(F.col("prd_cost"), F.lit(0))
df2 = df1.withColumn("prd_cost", cost_or_zero)

##### Extract Category ID from Product Key

In [0]:
# cat_id is the first five characters of prd_key with "-" replaced by "_".
prd_key_prefix = F.col("prd_key").substr(1, 5)
df3 = df2.withColumn("cat_id", F.regexp_replace(prd_key_prefix, "-", "_"))

##### Parsing Product Key

In [0]:
# Keep prd_key from its 7th character onward (the first five characters were
# already copied into cat_id above).
# Column.substr accepts Column arguments for both start and length, whereas
# F.substring only accepts a Column length from PySpark 4.0 onwards; this
# form therefore also runs on earlier runtimes.
prd_key_col = F.col("prd_key")
df4 = df3.withColumn(
    "prd_key", prd_key_col.substr(F.lit(7), F.length(prd_key_col))
)

##### Standardise the Product Line

In [0]:
# Expand the single-letter product-line code (compared case-insensitively)
# into its full label; anything unrecognised, including NULL, becomes "n/a".
line_code = F.upper(F.col("prd_line"))
df5 = df4.withColumn(
    "prd_line",
    F.when(line_code == "M", "Mountain")
    .when(line_code == "S", "Other Sales")
    .when(line_code == "T", "Touring")
    .when(line_code == "R", "Road")
    .otherwise("n/a"),
)

##### Parsing Date:
`prd_start_dt`  `prd_end_dt`

In [0]:
# Within each prd_key, ordered by start date, a row's end date is the day
# before the next row's start date; the last row of each key gets NULL.
ws = Window.partitionBy("prd_key").orderBy("prd_start_dt")
next_start_dt = F.lead("prd_start_dt", 1).over(ws)

df6 = df5.withColumn("prd_end_dt", F.date_sub(next_start_dt, 1))

df6.display()


##### Rename: Function

In [0]:
# Working column name -> final column name used in the silver table.
RENAME_MAP = {
    "prd_id": "product_id",
    "cat_id": "category_id",
    "prd_key": "product_number",
    "prd_nm": "product_name",
    "prd_cost": "product_cost",
    "prd_line": "product_line",
    "prd_start_dt": "start_date",
    "prd_end_dt": "end_date",
}


def renamed(df: DataFrame) -> DataFrame:
    """Apply every rename in ``RENAME_MAP`` to ``df`` and return the result.

    The renames are applied one after another in the mapping's insertion
    order via ``withColumnRenamed``; columns not mentioned in the mapping
    keep their names.

    Args:
        df: DataFrame whose columns should be renamed.

    Returns:
        The DataFrame produced after all renames have been applied.
    """
    result = df
    for source_name in RENAME_MAP:
        result = result.withColumnRenamed(source_name, RENAME_MAP[source_name])
    return result

In [0]:
# Switch from the working column names to the final silver-layer names.
df7 = df6.transform(renamed)


In [0]:
# Fix the column set and column order of the output.
df8 = df7.select(
    "product_id",
    "category_id",
    "product_number",
    "product_name",
    "product_cost",
    "product_line",
    "start_date",
    "end_date",
)

In [0]:
# Stamp each row with the current timestamp, then run a row count
# (a Spark action) whose value is shown as the cell output.
load_ts = F.current_timestamp()
df9 = df8.withColumn("ingest_ts", load_ts)
df9.count()

##### Write Dataframe to Delta Table

In [0]:
# Replace the silver table in a single Delta commit.
# Overwriting with overwriteSchema swaps data and schema together, so the
# table is never missing if the write fails part-way and its Delta history
# is kept. A separate DROP TABLE before the write gives neither guarantee.
(
    df9.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("dlh.silver_db.silver_prod_info")
)