In [12]:
from datetime import datetime
from pyspark.sql import Row

# Define metadata row for Bronze Lakehouse
metadata_bronze = [Row(
    lakehouse_name="bronze_pro",
    layer="Bronze",
    created_on=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    created_by="your.email@company.com"
)]

# Convert to DataFrame
df_meta_bronze = spark.createDataFrame(metadata_bronze)

# Save metadata as a Delta Table inside the Bronze Lakehouse
df_meta_bronze.write.format("delta") \
    .mode("overwrite") \
    .save("abfss://FabricTrainingWorkspace@onelake.dfs.fabric.microsoft.com/bronze_pro.Lakehouse/Tables/lakehouse_metadata")

# Confirm by displaying metadata
df_meta_bronze.show()


StatementMeta(, b73bfc2b-a712-4f72-9e36-67617589c9d1, 14, Finished, Available, Finished)

+--------------+------+-------------------+--------------------+
|lakehouse_name| layer|         created_on|          created_by|
+--------------+------+-------------------+--------------------+
|    bronze_pro|Bronze|2025-04-04 18:12:27|your.email@compan...|
+--------------+------+-------------------+--------------------+



In [13]:
file_path = "abfss://FabricTrainingWorkspace@onelake.dfs.fabric.microsoft.com/bronze_pro.Lakehouse/Files/raw/Product.csv"
# read tsv
df_product = spark.read.option("header", "true") \
                      .option("sep", "\t") \
                      .option("quote", "\"") \
                      .csv(file_path)


df_product.show(5)

StatementMeta(, b73bfc2b-a712-4f72-9e36-67617589c9d1, 15, Finished, Available, Finished)

+----------+--------------------+-------------+-----+-----------+-----------+-----------------------+-----------------+
|ProductKey|             Product|Standard Cost|Color|Subcategory|   Category|Background Color Format|Font Color Format|
+----------+--------------------+-------------+-----+-----------+-----------+-----------------------+-----------------+
|       210|HL Road Frame - B...|      $868.63|Black|Road Frames| Components|                #000000|          #FFFFFF|
|       215|Sport-100 Helmet,...|       $12.03|Black|    Helmets|Accessories|                #000000|          #FFFFFF|
|       216|Sport-100 Helmet,...|       $13.88|Black|    Helmets|Accessories|                #000000|          #FFFFFF|
|       217|Sport-100 Helmet,...|       $13.09|Black|    Helmets|Accessories|                #000000|          #FFFFFF|
|       253|LL Road Frame - B...|       $176.2|Black|Road Frames| Components|                #000000|          #FFFFFF|
+----------+--------------------+-------

In [18]:
df_product = df_product.toDF(*[col.strip().replace(" ", "_").replace("|", "_") for col in df_product.columns])
df_product

StatementMeta(, b73bfc2b-a712-4f72-9e36-67617589c9d1, 20, Finished, Available, Finished)

DataFrame[ProductKey: string, Product: string, Standard_Cost: string, Color: string, Subcategory: string, Category: string, Background_Color_Format: string, Font_Color_Format: string]

In [14]:

# Read the TSV file using PySpark
df_region = spark.read.option("header", "true") \
                      .option("sep", "\t") \
                      .option("quote", "\"") \
                      .csv( "abfss://FabricTrainingWorkspace@onelake.dfs.fabric.microsoft.com/bronze_pro.Lakehouse/Files/raw/Region.csv")
df_region.show(5)

StatementMeta(, b73bfc2b-a712-4f72-9e36-67617589c9d1, 16, Finished, Available, Finished)

+-----------------+---------+-------------+-------------+
|SalesTerritoryKey|   Region|      Country|        Group|
+-----------------+---------+-------------+-------------+
|                1|Northwest|United States|North America|
|                2|Northeast|United States|North America|
|                3|  Central|United States|North America|
|                4|Southwest|United States|North America|
|                5|Southeast|United States|North America|
+-----------------+---------+-------------+-------------+
only showing top 5 rows



In [15]:

df_sale= spark.read.option("header", "true") \
                     .option("sep", ",") \
                     .option("inferSchema", "true") \
                     .csv("abfss://FabricTrainingWorkspace@onelake.dfs.fabric.microsoft.com/bronze_pro.Lakehouse/Files/raw/Sales.csv")

df_sale.show(5, truncate=False)  # Show full values
df_sale.printSchema()  # Check column structure


StatementMeta(, b73bfc2b-a712-4f72-9e36-67617589c9d1, 17, Finished, Available, Finished)

+----------------+-------------------+----------+-----------------+-------------+---------+-----------+----------------+
|SalesOrderNumber|OrderDate          |ProductKey|SalesTerritoryKey|OrderQuantity|UnitPrice|SalesAmount|TotalProductCost|
+----------------+-------------------+----------+-----------------+-------------+---------+-----------+----------------+
|SO43697         |2010-12-29 00:00:00|310       |6                |1            |3578.27  |3578.27    |2171.2942       |
|SO43698         |2010-12-29 00:00:00|346       |7                |1            |3399.99  |3399.99    |1912.1544       |
|SO43699         |2010-12-29 00:00:00|346       |1                |1            |3399.99  |3399.99    |1912.1544       |
|SO43700         |2010-12-29 00:00:00|336       |4                |1            |699.0982 |699.0982   |413.1463        |
|SO43701         |2010-12-29 00:00:00|346       |9                |1            |3399.99  |3399.99    |1912.1544       |
+----------------+--------------

In [19]:
# Write Sales Data to Bronze
df_sale.write.format("delta") \
        .mode("overwrite") \
        .save("abfss://FabricTrainingWorkspace@onelake.dfs.fabric.microsoft.com/bronze_pro.Lakehouse/Tables/bronze_sal")

# Write Product Data to Bronze
df_product.write.format("delta") \
        .mode("overwrite") \
        .save("abfss://FabricTrainingWorkspace@onelake.dfs.fabric.microsoft.com/bronze_pro.Lakehouse/Tables/bronze_prod")

# Write Region Data to Bronze
df_region.write.format("delta") \
        .mode("overwrite") \
        .save("abfss://FabricTrainingWorkspace@onelake.dfs.fabric.microsoft.com/bronze_pro.Lakehouse/Tables/bronze_reg")

StatementMeta(, b73bfc2b-a712-4f72-9e36-67617589c9d1, 21, Finished, Available, Finished)

In [24]:
from datetime import datetime
from pyspark.sql import Row

# Function to log ingestion metadata
def log_ingestion(file_name, df, error_flag=False):
    metadata = [Row(
        file_name=file_name,
        ingestion_time=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),  # Convert to string format
        row_count=df.count() if not error_flag else 0,
        error_flag=error_flag
    )]
    
    df_meta = spark.createDataFrame(metadata)

    # Save metadata as Delta Table
    df_meta.write.format("delta") \
         .mode("append") \
         .save("abfss://FabricTrainingWorkspace@onelake.dfs.fabric.microsoft.com/bronze_pro.Lakehouse/Tables/ingestion_metadata")

# Log ingestion metadata
log_ingestion("Sales.csv", df_sale)
log_ingestion("Product.csv", df_product)
log_ingestion("Region.csv", df_region)


StatementMeta(, b73bfc2b-a712-4f72-9e36-67617589c9d1, 26, Finished, Available, Finished)

In [26]:
df1 = spark.read.parquet("Tables/ingestion_metadata/part-00000-180d5230-8009-4a75-abc6-1ad259007951-c000.snappy.parquet")
# df now is a Spark DataFrame containing parquet data from "Tables/ingestion_metadata/part-00000-180d5230-8009-4a75-abc6-1ad259007951-c000.snappy.parquet".
display(df1)

df2 = spark.read.parquet("Tables/ingestion_metadata/part-00000-3dbb7b97-34b5-48e0-a645-2dc7b3426ea3-c000.snappy.parquet")
# df now is a Spark DataFrame containing parquet data from "Tables/ingestion_metadata/part-00000-3dbb7b97-34b5-48e0-a645-2dc7b3426ea3-c000.snappy.parquet".
display(df2)

df3 = spark.read.parquet("Tables/ingestion_metadata/part-00000-420c4d5f-0808-4b98-935f-cf097c11f4aa-c000.snappy.parquet")
# df now is a Spark DataFrame containing parquet data from "Tables/ingestion_metadata/part-00000-420c4d5f-0808-4b98-935f-cf097c11f4aa-c000.snappy.parquet".
display(df3)

StatementMeta(, b73bfc2b-a712-4f72-9e36-67617589c9d1, 28, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 83e94e4b-a196-4fb1-89d3-91f9d96e5a90)

SynapseWidget(Synapse.DataFrame, b82caf33-9bc8-4cc4-8aa7-cce3fa7e1305)

SynapseWidget(Synapse.DataFrame, befde80b-a716-4adb-ac2d-4ab07d0f184d)

In [27]:
print(df_product.columns)
print(df_sale.columns)
print(df_region.columns)

StatementMeta(, b73bfc2b-a712-4f72-9e36-67617589c9d1, 29, Finished, Available, Finished)

['ProductKey', 'Product', 'Standard_Cost', 'Color', 'Subcategory', 'Category', 'Background_Color_Format', 'Font_Color_Format']
['SalesOrderNumber', 'OrderDate', 'ProductKey', 'SalesTerritoryKey', 'OrderQuantity', 'UnitPrice', 'SalesAmount', 'TotalProductCost']
['SalesTerritoryKey', 'Region', 'Country', 'Group']
