In [0]:
# import all necessary modules and functions
import logging
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [0]:
# Logging setup: INFO-level messages are written to the console (notebook cell
# output) as "<timestamp> [<LEVEL>] <message>".
log_format = "%(asctime)s [%(levelname)s] %(message)s"
console_handler = logging.StreamHandler()  # logs display in console

logging.basicConfig(level=logging.INFO, format=log_format, handlers=[console_handler])

# Named logger shared by every stage of the pipeline.
logger = logging.getLogger("ETL_PIPELINE")

# Data ingestion

In [0]:
# Locate the source file: list the landing folder and take the first entry.
source_dir = "abfss://my-container@storagedbrics.dfs.core.windows.net/project-sales/source"
file = dbutils.fs.ls(source_dir)

# Fail with a clear message instead of an IndexError when the folder is empty.
if not file:
    raise FileNotFoundError(f"No source files found in {source_dir}")

# Path of the first listed entry; this is what the bronze read loads.
path = file[0].path

# Bronze Layer

In [0]:
# Read the raw CSV source into `df`: the first row is used as the header and
# column types are inferred by Spark.
logger.info("Beginig of Bronze layer file")
try:
  df=spark.read.format("csv")\
              .option("header", True)\
              .option("inferSchema", True)\
              .load(path)
  logger.info("File readed succesfully")
except Exception as e:
  logger.error(f"Error reading file: {e}")
  # Re-raise so the run stops here; otherwise `df` stays undefined and the
  # following cells fail with an unrelated NameError.
  raise

2025-12-23 13:00:09,037 [INFO] Beginig of Bronze layer file
2025-12-23 13:00:09,038 [INFO] File readed succesfully


In [0]:
# write data to Bronze layer
# The raw DataFrame is stored as a Delta table; "overwrite" replaces the
# previous bronze snapshot on every run.

logger.info("Saving data to bronze layer")
try:
    df.write.format("delta")\
        .mode("overwrite")\
        .save("abfss://my-container@storagedbrics.dfs.core.windows.net/project-sales/datalake/bronze")
    logger.info("Data saved succesfully to bronze layer")
except Exception as e:
    # Log the actual cause and stop the run: continuing would let the silver
    # stage read a stale (or missing) bronze table.
    logger.error(f"Error during saving data to bronze layer: {e}")
    raise

2025-12-23 13:00:16,548 [INFO] Saving data to bronze layer
2025-12-23 13:00:28,455 [INFO] Data saved succesfully to bronze layer


# Silver layer

In [0]:
# reading bronze layer data to silver
# Delta tables carry their own schema, so the CSV-only reader options
# ("header", "inferSchema") are not needed here.

logger.info("Reading data from Bronze layer to Silver layer")
try:
  df_silver=spark.read.format("delta")\
                    .load("abfss://my-container@storagedbrics.dfs.core.windows.net/project-sales/datalake/bronze")
  logger.info("Data succesfully readed from Bronze layer to Silver layer")
except Exception as e:
  logger.error(f"Error reading file from Bronze layer to Silver layer: {e}")
  # Re-raise so the run stops here; otherwise `df_silver` stays undefined and
  # the following cells fail with an unrelated NameError.
  raise


2025-12-23 13:00:28,573 [INFO] Reading data from Bronze layer to Silver layer
2025-12-23 13:00:28,574 [INFO] Data succesfully readed from Bronze layer to Silver layer


In [0]:
# counting null values in each column
columns=df_silver.columns

# One aggregation computes every per-column null count in a single pass over
# the data, instead of launching a separate Spark job for each column.
# count() only counts non-null values, so count(when(is_null, 1)) is the
# number of rows where the column is null.
null_counts=df_silver.select(
    [count(when(col(column).isNull(), 1)).alias(column) for column in columns]
).first()

for column in columns:
    print(f" Number of nulls in {column}: {null_counts[column]}")



 Number of nulls in LK_SO_NUMBER_TXT: 0
 Number of nulls in LK_SO_ITEM_TXT: 0
 Number of nulls in GEOHI_ID_LVL1_TXT: 0
 Number of nulls in GEOHI_ID_LVL2_TXT: 0
 Number of nulls in GEOHI_ID_LVL3_TXT: 0
 Number of nulls in GEOHI_ID_LVL4_TXT: 0
 Number of nulls in LK_CUSTOMER_ID_TXT: 0
 Number of nulls in LK_MATERIAL_NUMBER_TXT: 0
 Number of nulls in SAHI_ID_LVL4_TXT: 0
 Number of nulls in SAHI_ID_LVL5_TXT: 0
 Number of nulls in SAHI_ID_LVL6_TXT: 0
 Number of nulls in SAHI_ID_LVL7_TXT: 0
 Number of nulls in OTDR_EXT_FLG: 0
 Number of nulls in SI_CPO_CREATION_DAT: 0
 Number of nulls in SI_SO_CREATION_DAT: 0
 Number of nulls in READY_TO_SHIP_DAT: 0
 Number of nulls in CUST_REQ_DELIVERY_DATE_DAT: 0
 Number of nulls in SI_VENDOR_ID_TXT: 0
 Number of nulls in VAL_PLANT_ID_TXT: 0


The check above finds no real null values in the DataFrame. However, when we display the DataFrame we can clearly see that some columns contain the string "NULL" instead of a true null value.

In [0]:
# counting nulls in string format
# Values are compared as strings against the exact markers below
# (case-sensitive, no trimming).
null_markers=("NULL", "Null", "Nan", "nan")

# One aggregation computes every per-column count in a single pass over the
# data, instead of launching a separate Spark job for each column.
null_str_counts=df_silver.select(
    [
        count(when(col(column).cast(StringType()).isin(*null_markers), 1)).alias(column)
        for column in df_silver.columns
    ]
).first()

for column in df_silver.columns:
    print(f" Number of 'null' string types in {column}: {null_str_counts[column]} ")

 Number of 'null' string types in LK_SO_NUMBER_TXT: 0 
 Number of 'null' string types in LK_SO_ITEM_TXT: 0 
 Number of 'null' string types in GEOHI_ID_LVL1_TXT: 2049 
 Number of 'null' string types in GEOHI_ID_LVL2_TXT: 2049 
 Number of 'null' string types in GEOHI_ID_LVL3_TXT: 2049 
 Number of 'null' string types in GEOHI_ID_LVL4_TXT: 2049 
 Number of 'null' string types in LK_CUSTOMER_ID_TXT: 0 
 Number of 'null' string types in LK_MATERIAL_NUMBER_TXT: 0 
 Number of 'null' string types in SAHI_ID_LVL4_TXT: 2049 
 Number of 'null' string types in SAHI_ID_LVL5_TXT: 2049 
 Number of 'null' string types in SAHI_ID_LVL6_TXT: 2049 
 Number of 'null' string types in SAHI_ID_LVL7_TXT: 2049 
 Number of 'null' string types in OTDR_EXT_FLG: 0 
 Number of 'null' string types in SI_CPO_CREATION_DAT: 13803 
 Number of 'null' string types in SI_SO_CREATION_DAT: 0 
 Number of 'null' string types in READY_TO_SHIP_DAT: 6753 
 Number of 'null' string types in CUST_REQ_DELIVERY_DATE_DAT: 2855 
 Number o

## 1. Replacing the string "Null" with real null values in every column

In [0]:
# Replace the textual marker "null" (any letter case, surrounding whitespace
# ignored) with a real null in every column of df_silver.
# The column list comes from df_silver itself, so this cell does not depend on
# the bronze DataFrame `df` still being defined with identical columns.
# A single select builds all replacement expressions at once instead of
# stacking one withColumn projection per column.
# NOTE(review): only "null" is replaced; the "Nan"/"nan" markers counted in the
# previous cell are left unchanged. Confirm this is intended.
df_silver=df_silver.select(
    [
        when(trim(lower(col(column))) == "null", None).otherwise(col(column)).alias(column)
        for column in df_silver.columns
    ]
)

# total number of rows in df_silver after the replacement
df_silver.count()

138075

## 2. Dropping null values, dropping duplicates, casting date and timestamp columns

In [0]:
# Columns that hold "dd/MM/yyyy HH:mm" text and are converted to timestamps.
timestamp_columns = [
    'SI_CPO_CREATION_DAT',
    'SI_SO_CREATION_DAT',
    'READY_TO_SHIP_DAT',
    'CUST_REQ_DELIVERY_DATE_DAT',
]

# Drop rows in which every column is null, then remove exact duplicate rows.
df_silver = df_silver.dropna(how="all").dropDuplicates()

# Columns casting to timestamp
for ts_column in timestamp_columns:
    df_silver = df_silver.withColumn(
        ts_column,
        to_timestamp(col(ts_column), "dd/MM/yyyy HH:mm"),
    )

In [0]:
# Derive reporting columns on the silver DataFrame.
df_silver=(
    df_silver
    # Number of days from SI_CPO_CREATION_DAT (start) to READY_TO_SHIP_DAT (end).
    .withColumn("DIFF_DATE_CPO_SHIP_DAYS", date_diff(end=col("READY_TO_SHIP_DAT"), start=col("SI_CPO_CREATION_DAT")))
    # Year-month bucket of SI_CPO_CREATION_DAT. date_format already returns a
    # string column, so no additional cast to StringType is required.
    .withColumn("PERIOD", date_format(col("SI_CPO_CREATION_DAT"), "yyyy-MM"))
    # rename columns
    .withColumnRenamed("GEOHI_ID_LVL4_TXT", "COUNTRY")
)


In [0]:
# writing data to silver Layer
# "overwrite" keeps the cell idempotent: the bronze table is fully replaced on
# every run, so appending here would add the same rows again on each re-run
# and undo the dropDuplicates() applied to df_silver.

logger.info("Saving data to silver layer")
try:
    df_silver.write.format("delta")\
        .mode("overwrite")\
        .save("abfss://my-container@storagedbrics.dfs.core.windows.net/project-sales/datalake/silver")
    logger.info("Data succesfully saved to silver layer")
except Exception as e:
    logger.error(f"Error during saving data to silver layer: {e}")
    # Re-raise so the gold stage does not run on a stale silver table.
    raise

2025-12-23 13:00:49,622 [INFO] Saving data to silver layer
2025-12-23 13:00:54,005 [INFO] Data succesfully saved to silver layer


# GOLD LAYER

In [0]:
# reading data from silver to gold layer

logger.info("Reading data from silver to gold layer")
try:
    df_gold=spark.read.format("delta")\
        .load("abfss://my-container@storagedbrics.dfs.core.windows.net/project-sales/datalake/silver")
    logger.info("Data succesfully readed from silver to gold layer")
except Exception as e:
    logger.error(f"Error reading data from silver to gold layer: {e}")
    # Re-raise so the run stops here; otherwise `df_gold` stays undefined and
    # the aggregation cell fails with an unrelated NameError.
    raise

2025-12-23 13:00:54,142 [INFO] Reading data from silver to gold layer
2025-12-23 13:00:54,143 [INFO] Data succesfully readed from silver to gold layer


## Transforming and aggregating data

In [0]:
# Per COUNTRY and PERIOD: sum of OTDR_EXT_FLG divided by the number of
# non-null OTDR_EXT_FLG values, expressed as a percentage rounded to 2 decimals.
# `sum`, `count` and `round` are the Spark column functions brought in by the
# star import from pyspark.sql.functions, not the Python builtins.
otdr_flag = col("OTDR_EXT_FLG")
otdr_ratio = sum(otdr_flag) / count(otdr_flag)

df_gold_grouped = (
    df_gold
    .groupBy("COUNTRY", "PERIOD")
    .agg(round(otdr_ratio * 100, 2).alias("PTC_OTDR"))
)

In [0]:
# writing data to gold layer
# The aggregated table is fully replaced on every run.

logger.info("Saving data to gold layer")
try:
    df_gold_grouped.write.format("delta")\
        .mode("overwrite")\
        .save("abfss://my-container@storagedbrics.dfs.core.windows.net/project-sales/datalake/gold")
    logger.info("Data succesfully saved to gold layer")
except Exception as e:
    logger.error(f"Error during saving data to gold layer: {e}")
    # Re-raise so a failed write is reported as a failed run instead of
    # finishing silently.
    raise

2025-12-23 13:00:54,929 [INFO] Saving data to gold layer
2025-12-23 13:00:57,659 [INFO] Data succesfully saved to gold layer
