In [0]:
# Azure storage account details.
#
# Real credentials must not be committed in the notebook body. The values are
# read from the environment first (AZURE_STORAGE_ACCOUNT / AZURE_STORAGE_KEY);
# the masked placeholders below are only a fallback so the cell keeps its
# previous behaviour when the variables are not set.
# NOTE(review): on Databricks a secret scope (dbutils.secrets.get) is the
# usual home for the account key - confirm which scope/key to use.
import os

account_name = os.environ.get("AZURE_STORAGE_ACCOUNT", "*************")
account_key = os.environ.get("AZURE_STORAGE_KEY", "*********")

In [0]:
from pyspark.sql import SparkSession, functions as F
from datetime import datetime
import os
# Obtain the Spark session for this ETL run (getOrCreate reuses an existing one).
session_builder = SparkSession.builder.appName("Rebrickable ETL")
spark = session_builder.getOrCreate()

# Register the storage-account key under the per-account
# fs.azure.account.key.<account>.dfs.core.windows.net setting.
storage_key_setting = f"fs.azure.account.key.{account_name}.dfs.core.windows.net"
spark.conf.set(storage_key_setting, account_key)

In [0]:
def rename_save(table, df):
    """
    Write ``df`` to a temporary Delta directory and move its Parquet data file
    to ``<target_dir>/<table>.parquet``.

    The frame is coalesced to one partition and written in Delta format to
    ``<target_dir>_temp``. The first ``.parquet`` file listed in that directory
    is then moved to ``<target_dir>/<table>.parquet`` and the temporary
    directory is removed, whether or not the move succeeded, so a failed run
    cannot leave stale data files for a later run to pick up.

    Args:
        table: Name of the output table; used as both the folder name under
            ``Input/Lego`` and the base name of the final ``.parquet`` file.
        df: Spark DataFrame to persist.

    Returns:
        The final output path on success, or ``None`` when the listing/move
        step failed (the error is printed rather than raised).

    Raises:
        Any exception raised by the Delta write itself propagates to the
        caller, as before.
    """
    target_dir = f"abfss://conformed@{account_name}.dfs.core.windows.net/Input/Lego/{table}"
    temp_dir = f"{target_dir}_temp"
    final_output_path = f"{target_dir}/{table}.parquet"

    try:
        # Write failures are intentionally not caught here.
        df.coalesce(1).write.format("delta").option("mergeSchema", "true").mode("overwrite").save(temp_dir)

        try:
            parquet_files = [
                entry.path
                for entry in dbutils.fs.ls(temp_dir)
                if entry.path.endswith(".parquet")
            ]
            # Fail with a meaningful message instead of an opaque IndexError.
            if not parquet_files:
                raise FileNotFoundError(f"no .parquet data file found in {temp_dir}")

            dbutils.fs.mv(parquet_files[0], final_output_path)
        except Exception as e:
            # Best-effort step: report the problem and let the notebook continue.
            print(f"Error in file renaming/moving: {e}")
            return None

        print(f"Successfully saved Delta file as {final_output_path}")
        return final_output_path
    finally:
        # Always remove the temporary directory, including after a failure.
        try:
            dbutils.fs.rm(temp_dir, True)
        except Exception as cleanup_error:
            print(f"Warning: could not remove temporary directory {temp_dir}: {cleanup_error}")


In [0]:
from datetime import datetime

# Take the current local date once so all three components describe the same day.
today = datetime.today()

# Zero-padded string components (e.g. '2024', '03', '07') used to build the
# Year=/Month=/Day= segments of the raw-data paths.
year, month, day = today.strftime('%Y-%m-%d').split('-')

In [0]:


# Load the four inventory CSV extracts (header row = column names) from the
# raw-data partition for the current run date.
inventory_tables = ("inventories", "inventory_sets", "inventory_parts", "inventory_minifigs")
inventory_frames = {
    name: spark.read.format("csv").option("header", "true").load(
        f"abfss://raw-data@{account_name}.dfs.core.windows.net/Unzipped Rebrickable/Lego/{name}/Year={year}/Month={month}/Day={day}/{name}.csv"
    )
    for name in inventory_tables
}
inventories_df = inventory_frames["inventories"]
inventory_sets_df = inventory_frames["inventory_sets"]
inventory_parts_df = inventory_frames["inventory_parts"]
inventory_minifigs_df = inventory_frames["inventory_minifigs"]

# Expose every raw frame to Spark SQL under a view named after its table.
for view_name, frame in inventory_frames.items():
    frame.createOrReplaceTempView(view_name)

# One row per inventory joined to its sets, parts and minifigs, plus the
# is_active / start_date / end_date tracking columns.
# NOTE(review): the three LEFT JOINs are all keyed on i.id only, so an inventory
# with rows in more than one child table yields their cross product - confirm
# this is intended.
# NOTE(review): end_date is the string literal 'NULL', not a SQL NULL - confirm.
fact_inventory_query = "SELECT i.id as inventory_id, i.version, is.set_num as set_num, ip.part_num as part_num, ip.color_id as color_id, ip.is_spare as is_spare, im.fig_num as fig_num, is.quantity as sets_quantity, im.quantity as minifigs_quantity , ip.quantity as parts_quantity, True AS is_active, current_timestamp() AS start_date, 'NULL' AS end_date FROM inventories i LEFT JOIN inventory_sets is ON i.id = is.inventory_id LEFT JOIN inventory_parts ip ON i.id =ip.inventory_id LEFT JOIN inventory_minifigs im ON i.id = im.inventory_id ORDER BY i.id"
fact_inventory_df = spark.sql(fact_inventory_query)

# Optional inspection while developing:
#   fact_inventory_df.show()
#   print("Number of rows:", fact_inventory_df.count())

# Write the fact table in Delta format.
# This step can be commented out, depending on the stakeholder's requirements.
fact_inventory_delta_path = f"abfss://conformed@{account_name}.dfs.core.windows.net/Input/Lego/Fact_Inventory/"
(
    fact_inventory_df.coalesce(1)
    .write.format("delta")
    .option("mergeSchema", "true")
    .mode("overwrite")
    .save(fact_inventory_delta_path)
)

# Additionally publish a single, predictably named Parquet file.
rename_save(table="Fact_Inventory", df=fact_inventory_df)


Successfully saved Delta file as abfss://conformed@atomicatraining.dfs.core.windows.net/Input/Lego/Fact_Inventory/Fact_Inventory.parquet


In [0]:
# Read the sets CSV extract (header row = column names) for the current run date.
sets_csv_path = f"abfss://raw-data@{account_name}.dfs.core.windows.net/Unzipped Rebrickable/Lego/sets/Year={year}/Month={month}/Day={day}/sets.csv"
sets_df = spark.read.format("csv").option("header", "true").load(sets_csv_path)

# Make the raw frame queryable from Spark SQL.
sets_df.createOrReplaceTempView("sets")

# All source columns plus the is_active / start_date / end_date tracking columns.
# NOTE(review): end_date is the string literal 'NULL', not a SQL NULL - confirm.
dim_sets_query = "SELECT *, True AS is_active, current_timestamp() AS start_date, 'NULL' AS end_date FROM sets"
dim_sets_df = spark.sql(dim_sets_query)

# Optional inspection while developing: dim_sets_df.show()

# Write the dimension in Delta format.
dim_sets_delta_path = f"abfss://conformed@{account_name}.dfs.core.windows.net/Input/Lego/Dim_sets/"
(
    dim_sets_df.coalesce(1)
    .write.format("delta")
    .option("mergeSchema", "true")
    .mode("overwrite")
    .save(dim_sets_delta_path)
)

# Additionally publish a single, predictably named Parquet file.
rename_save("Dim_sets", dim_sets_df)


Successfully saved Delta file as abfss://conformed@atomicatraining.dfs.core.windows.net/Input/Lego/Dim_sets/Dim_sets.parquet


In [0]:
# Read the themes CSV extract (header row = column names) for the current run date.
themes_csv_path = f"abfss://raw-data@{account_name}.dfs.core.windows.net/Unzipped Rebrickable/Lego/themes/Year={year}/Month={month}/Day={day}/themes.csv"
themes_df = spark.read.format("csv").option("header", "true").load(themes_csv_path)

# Make the raw frame queryable from Spark SQL.
themes_df.createOrReplaceTempView("themes")

# Rename id -> theme_id and append the is_active / start_date / end_date columns.
# NOTE(review): end_date is the string literal 'NULL', not a SQL NULL - confirm.
dim_themes_query = "SELECT id as theme_id, name, parent_id, True AS is_active, current_timestamp() AS start_date, 'NULL' AS end_date FROM themes"
dim_themes_df = spark.sql(dim_themes_query)

# Optional inspection while developing: dim_themes_df.show()

# Write the dimension in Delta format.
dim_themes_delta_path = f"abfss://conformed@{account_name}.dfs.core.windows.net/Input/Lego/Dim_themes/"
(
    dim_themes_df.coalesce(1)
    .write.format("delta")
    .option("mergeSchema", "true")
    .mode("overwrite")
    .save(dim_themes_delta_path)
)

# Additionally publish a single, predictably named Parquet file.
rename_save("Dim_themes", dim_themes_df)

Successfully saved Delta file as abfss://conformed@atomicatraining.dfs.core.windows.net/Input/Lego/Dim_themes/Dim_themes.parquet


In [0]:
# Read the part-related CSV extracts (header row = column names) for the
# current run date, in the order parts, inventory_parts, part_categories.
part_tables = ("parts", "inventory_parts", "part_categories")
parts_df, inventory_parts_df, part_categories_df = [
    spark.read.format("csv").option("header", "true").load(
        f"abfss://raw-data@{account_name}.dfs.core.windows.net/Unzipped Rebrickable/Lego/{name}/Year={year}/Month={month}/Day={day}/{name}.csv"
    )
    for name in part_tables
]

# Register each raw frame as a temp view named after its table.
# Note: the query below reads only `parts` and `part_categories`; the
# `inventory_parts` view is (re)registered here but not used by it.
for view_name, frame in zip(part_tables, (parts_df, inventory_parts_df, part_categories_df)):
    frame.createOrReplaceTempView(view_name)

# Parts joined to their category (inner join, so parts without a matching
# category row are dropped), plus the is_active / start_date / end_date columns.
# NOTE(review): end_date is the string literal 'NULL', not a SQL NULL - confirm.
dim_parts_query = "SELECT p.part_num as part_num, p.name as part_name, pc.id as part_cat_id, pc.name as part_cat_name, True AS is_active, current_timestamp() AS start_date, 'NULL' AS end_date FROM parts p JOIN part_categories pc ON p.part_cat_id = pc.id"
dim_parts_df = spark.sql(dim_parts_query)

# Optional inspection while developing: dim_parts_df.show()

# Write the dimension in Delta format.
dim_parts_delta_path = f"abfss://conformed@{account_name}.dfs.core.windows.net/Input/Lego/Dim_parts/"
(
    dim_parts_df.coalesce(1)
    .write.format("delta")
    .option("mergeSchema", "true")
    .mode("overwrite")
    .save(dim_parts_delta_path)
)

# Additionally publish a single, predictably named Parquet file.
rename_save("Dim_parts", dim_parts_df)

Successfully saved Delta file as abfss://conformed@atomicatraining.dfs.core.windows.net/Input/Lego/Dim_parts/Dim_parts.parquet


In [0]:
# Read the colors CSV extract (header row = column names) for the current run date.
colors_csv_path = f"abfss://raw-data@{account_name}.dfs.core.windows.net/Unzipped Rebrickable/Lego/colors/Year={year}/Month={month}/Day={day}/colors.csv"
colors_df = spark.read.format("csv").option("header", "true").load(colors_csv_path)

# Make the raw frame queryable from Spark SQL.
colors_df.createOrReplaceTempView("colors")

# Rename id -> color_id and append the is_active / start_date / end_date columns.
# From here on `colors_df` refers to the dimension frame, not the raw CSV frame.
# NOTE(review): end_date is the string literal 'NULL', not a SQL NULL - confirm.
dim_colors_query = "SELECT id as color_id, name, rgb, is_trans, True AS is_active, current_timestamp() AS start_date, 'NULL' AS end_date FROM colors"
colors_df = spark.sql(dim_colors_query)

# Optional inspection while developing: colors_df.show()

# Write the dimension in Delta format.
dim_colors_delta_path = f"abfss://conformed@{account_name}.dfs.core.windows.net/Input/Lego/Dim_colors/"
(
    colors_df.coalesce(1)
    .write.format("delta")
    .option("mergeSchema", "true")
    .mode("overwrite")
    .save(dim_colors_delta_path)
)

# Additionally publish a single, predictably named Parquet file.
rename_save("Dim_colors", colors_df)

Successfully saved Delta file as abfss://conformed@atomicatraining.dfs.core.windows.net/Input/Lego/Dim_colors/Dim_colors.parquet


In [0]:
# Read the part_relationships CSV extract (header row = column names) for the
# current run date.
part_relationships_csv_path = f"abfss://raw-data@{account_name}.dfs.core.windows.net/Unzipped Rebrickable/Lego/part_relationships/Year={year}/Month={month}/Day={day}/part_relationships.csv"
part_relationships_df = spark.read.format("csv").option("header", "true").load(part_relationships_csv_path)

# Make the raw frame queryable from Spark SQL.
part_relationships_df.createOrReplaceTempView("part_relationships")

# All source columns plus the is_active / start_date / end_date tracking columns.
# From here on `part_relationships_df` refers to the dimension frame.
# NOTE(review): end_date is the string literal 'NULL', not a SQL NULL - confirm.
dim_part_relationships_query = "SELECT *, True AS is_active, current_timestamp() AS start_date, 'NULL' AS end_date FROM part_relationships"
part_relationships_df = spark.sql(dim_part_relationships_query)

# Optional inspection while developing: part_relationships_df.show()

# Write the dimension in Delta format.
dim_part_relationships_delta_path = f"abfss://conformed@{account_name}.dfs.core.windows.net/Input/Lego/Dim_part_relationships/"
(
    part_relationships_df.coalesce(1)
    .write.format("delta")
    .option("mergeSchema", "true")
    .mode("overwrite")
    .save(dim_part_relationships_delta_path)
)

# Additionally publish a single, predictably named Parquet file.
rename_save("Dim_part_relationships", part_relationships_df)

Successfully saved Delta file as abfss://conformed@atomicatraining.dfs.core.windows.net/Input/Lego/Dim_part_relationships/Dim_part_relationships.parquet


In [0]:
# Read the minifigs CSV extract (header row = column names) for the current run date.
minifigs_csv_path = f"abfss://raw-data@{account_name}.dfs.core.windows.net/Unzipped Rebrickable/Lego/minifigs/Year={year}/Month={month}/Day={day}/minifigs.csv"
minifigs_df = spark.read.format("csv").option("header", "true").load(minifigs_csv_path)

# Make the raw frame queryable from Spark SQL.
minifigs_df.createOrReplaceTempView("minifigs")

# All source columns plus the is_active / start_date / end_date tracking columns.
# From here on `minifigs_df` refers to the dimension frame, not the raw CSV frame.
# NOTE(review): end_date is the string literal 'NULL', not a SQL NULL - confirm.
dim_minifig_query = "SELECT *, True AS is_active, current_timestamp() AS start_date, 'NULL' AS end_date FROM minifigs"
minifigs_df = spark.sql(dim_minifig_query)

# Optional inspection while developing: minifigs_df.show()

# Write the dimension in Delta format.
dim_minifig_delta_path = f"abfss://conformed@{account_name}.dfs.core.windows.net/Input/Lego/Dim_minifig/"
(
    minifigs_df.coalesce(1)
    .write.format("delta")
    .option("mergeSchema", "true")
    .mode("overwrite")
    .save(dim_minifig_delta_path)
)

# Additionally publish a single, predictably named Parquet file.
rename_save("Dim_minifig", minifigs_df)

Successfully saved Delta file as abfss://conformed@atomicatraining.dfs.core.windows.net/Input/Lego/Dim_minifig/Dim_minifig.parquet
