In [0]:
# List the DBFS mount points (mount point, source, encryption type) so the
# existing mounts can be inspected before configuring storage access.
display(dbutils.fs.mounts())

mountPoint,source,encryptionType
/databricks-datasets,databricks-datasets,
/Volumes,UnityCatalogVolumes,
/databricks/mlflow-tracking,databricks/mlflow-tracking,
/databricks-results,databricks-results,
/databricks/mlflow-registry,databricks/mlflow-registry,
/Volume,DbfsReserved,
/volumes,DbfsReserved,
/,DatabricksRoot,
/volume,DbfsReserved,


In [0]:
# OAuth (service principal) settings for Azure storage access.
# The client secret and tenant ID are read from a Databricks secret scope
# instead of being pasted into the notebook.
# TODO(review): the scope and key names below are placeholders; point them at
# the scope/keys that actually exist in this workspace.
# NOTE(review): no later cell in this notebook references `configs`; the reads
# and writes further down authenticate with the storage account key instead.
sp_client_secret = dbutils.secrets.get(scope="retail-datalake", key="sp-client-secret")
sp_tenant_id = dbutils.secrets.get(scope="retail-datalake", key="sp-tenant-id")

configs = {
  "fs.azure.account.auth.type": "OAuth",
  "fs.azure.account.oauth.provider.type": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
  "fs.azure.account.oauth2.client.id": "6d985239-6211-4eff-8fc2-8076bf156f12",
  "fs.azure.account.oauth2.client.secret": sp_client_secret,
  # Token endpoint of the Microsoft Entra ID tenant that owns the service principal.
  "fs.azure.account.oauth2.client.endpoint": f"https://login.microsoftonline.com/{sp_tenant_id}/oauth2/token",
}

In [0]:
# Register the storage account access key with Spark so the storage container
# holding the bronze layer data can be read.
# The key is fetched from a Databricks secret scope so it never appears in the
# notebook source or its revision history.
# TODO(review): the scope and key names are placeholders; replace them with the
# ones configured in this workspace.
spark.conf.set(
  "fs.azure.account.key.retaildatalakejp.blob.core.windows.net",
  dbutils.secrets.get(scope="retail-datalake", key="storage-account-key"),
)



In [0]:
# Load the raw bronze-layer Parquet data from the `bronze` container.
bronze_path = "wasbs://bronze@retaildatalakejp.blob.core.windows.net/OnlineRetail.parquet"
df_bronze = spark.read.parquet(bronze_path)


In [0]:
# Preview the first five bronze rows as a quick check on the load.
df_bronze.show(n=5)

In [0]:
# Transformations that clean the bronze layer data into the silver layer.

# Import only the Spark SQL functions this notebook uses. A wildcard import
# would also replace Python builtins such as sum, max, min and abs with their
# Spark column versions. `round` is imported on purpose: the SalesAmount step
# needs the Spark column function, not the builtin.
from pyspark.sql.functions import (
    col,
    dayofmonth,
    hour,
    month,
    round,
    to_timestamp,
    trim,
    year,
)

# Row count of the raw bronze data, as a baseline for the filters that follow.
df_bronze.count()




In [0]:
# Keep only rows with a usable Description: it must be non-null and, once
# surrounding whitespace is trimmed, neither empty nor the "?" placeholder.
filtered_descriptions = (
    df_bronze
    .where(col("Description").isNotNull())
    .where(~trim(col("Description")).isin("", "?"))
)
filtered_descriptions.count()


In [0]:
# Keep only rows where both Quantity and UnitPrice are strictly positive
# (this drops zero values as well as negative ones).
positive_quantity = col("Quantity") > 0
positive_unit_price = col("UnitPrice") > 0

filtered_data = filtered_descriptions.filter(positive_quantity & positive_unit_price)
filtered_data.count()

In [0]:
# Step 5: Drop rows that have no CustomerID.
has_customer_id = col("CustomerID").isNotNull()
filter_customerid_data = filtered_data.where(has_customer_id)
filter_customerid_data.count()

In [0]:
# Step 6: Add the derived SalesAmount column (Quantity x UnitPrice, rounded to
# two decimals). `round` here is the Spark SQL function brought in by the
# pyspark.sql.functions import, not Python's builtin.
line_total = col("Quantity") * col("UnitPrice")
salesamount_df = filter_customerid_data.withColumn(
    "SalesAmount",
    round(line_total, 2),
)
salesamount_df.show(n=10)


In [0]:
# Step 7: Parse InvoiceDate into a timestamp column (InvoiceDateTS), then
# derive year, month, day-of-month and hour columns from it.
# NOTE(review): assumes InvoiceDate is stored as text in "M/d/yyyy H:mm" form;
# confirm against the bronze schema.
invoice_ts = col("InvoiceDateTS")

timestamp_df = (
    salesamount_df
    .withColumn("InvoiceDateTS", to_timestamp(col("InvoiceDate"), "M/d/yyyy H:mm"))
    .withColumn("InvoiceYear", year(invoice_ts))
    .withColumn("InvoiceMonth", month(invoice_ts))
    .withColumn("InvoiceDay", dayofmonth(invoice_ts))
    .withColumn("InvoiceHour", hour(invoice_ts))
)
timestamp_df.show(10)

In [0]:
# Strip leading/trailing whitespace from the free-text columns.
final_df = timestamp_df
for text_column in ("Description", "Country"):
    final_df = final_df.withColumn(text_column, trim(col(text_column)))

final_df.show(2)

In [0]:
# Print the column names and types of the cleaned DataFrame for inspection.
final_df.printSchema()

In [0]:
# Remove the original InvoiceDate column (the parsed InvoiceDateTS column and
# its derived parts remain), then print the schema again to confirm the drop.
final_df = final_df.drop("InvoiceDate")
final_df.printSchema()

In [0]:
# Write the cleaned data to the silver layer.
# The storage account key is fetched from a Databricks secret scope so it never
# appears in the notebook source or its revision history.
# TODO(review): the scope and key names are placeholders; replace them with the
# ones configured in this workspace.
spark.conf.set(
  "fs.azure.account.key.retaildatalakejp.blob.core.windows.net",
  dbutils.secrets.get(scope="retail-datalake", key="storage-account-key"),
)

# mode("overwrite") replaces any existing output at this path on every run.
silver_path = "wasbs://silver@retaildatalakejp.blob.core.windows.net/OnlineRetailCleaned.parquet"
final_df.write.mode("overwrite").parquet(silver_path)