In [1]:
from pyspark.sql.functions import (
    col, year, month, dayofmonth, weekofyear,
    date_format, quarter, dayofweek, when, lit
)

# Load the calendar table from the silver layer.
df_calendar = spark.table("silver.calendar")

# Every attribute below is derived from the single "Date" column, so the
# column reference is built once and reused in one projection.
date_col = col("Date")

# Spark's dayofweek() numbers days from 1 (Sunday) through 7 (Saturday).
is_weekend = dayofweek(date_col).isin(1, 7)

df_dim_date = (
    df_calendar
    .select(date_col)
    .distinct()  # one row per calendar date
    .select(
        date_col,
        # Integer surrogate key in yyyyMMdd form.
        date_format(date_col, "yyyyMMdd").cast("int").alias("DateKey"),
        dayofmonth(date_col).alias("Day"),
        date_format(date_col, "EEEE").alias("DayName"),
        dayofweek(date_col).alias("DayOfWeek"),
        weekofyear(date_col).alias("WeekOfYear"),
        month(date_col).alias("Month"),
        date_format(date_col, "MMMM").alias("MonthName"),
        quarter(date_col).alias("Quarter"),
        year(date_col).alias("Year"),
        # when/otherwise turns a NULL test result into False instead of NULL.
        when(is_weekend, lit(True)).otherwise(lit(False)).alias("IsWeekend"),
    )
)

# Final column order of the date dimension (surrogate key first).
dim_date_columns = [
    "DateKey",
    "Date",
    "Day",
    "DayName",
    "DayOfWeek",
    "WeekOfYear",
    "Month",
    "MonthName",
    "Quarter",
    "Year",
    "IsWeekend",
]

df_dim_date = df_dim_date.select(*dim_date_columns)

# Persist the date dimension as the gold.dim_date Delta table, replacing the
# existing rows and schema.
df_dim_date.write.saveAsTable(
    "gold.dim_date",
    format="delta",
    mode="overwrite",
    overwriteSchema="true",
)

StatementMeta(, 23709a00-0143-4b2e-88af-9f3359c142d9, 3, Finished, Available, Finished)

In [2]:
# Import only the functions this cell uses. A wildcard import would pull every
# Spark SQL function into the notebook namespace and silently shadow Python
# builtins; `round` below is intentionally the Spark column function.
from pyspark.sql.functions import col, initcap, round, trim, when

# Pull data from the silver.products Delta table into a DataFrame.
df_products_silver = spark.table("silver.products")

# Clean the text attributes: strip surrounding whitespace from all of them and
# title-case the descriptive names. The SKU is trimmed only, so its casing is
# left as stored.
df_dim_product = (
    df_products_silver
    .select(
        col("ProductKey"),
        trim(col("ProductSKU")).alias("ProductSKU"),
        initcap(trim(col("ProductName"))).alias("ProductName"),
        initcap(trim(col("ModelName"))).alias("ModelName"),
        initcap(trim(col("CategoryName"))).alias("CategoryName"),
        initcap(trim(col("SubcategoryName"))).alias("SubcategoryName")
    )
)

# IMPORTANT! The AdventureWorks dataset does not provide product prices.
# Synthetic prices are generated from each product's category and subcategory
# so that the model can be used for analysis.
# ListPrice and StandardCost are the columns kept downstream; they are built
# from the helper columns CategoryBasePrice and SubcategoryMultiplier.

# Base price per category; unknown categories fall back to 50.
category_base_price = (
    when(col("CategoryName") == "Bikes", 800)
    .when(col("CategoryName") == "Components", 150)
    .when(col("CategoryName") == "Clothing", 40)
    .when(col("CategoryName") == "Accessories", 25)
    .otherwise(50)
)

# Price factor per subcategory keyword; the first matching pattern wins and
# anything unmatched keeps the base price (factor 1.0).
subcategory_multiplier = (
    when(col("SubcategoryName").like("%Road%"), 1.3)
    .when(col("SubcategoryName").like("%Mountain%"), 1.2)
    .when(col("SubcategoryName").like("%Helmet%"), 1.1)
    .when(col("SubcategoryName").like("%Sock%"), 0.8)
    .otherwise(1.0)
)

# Deterministic per-product spread derived from the key's remainder modulo 10.
product_key_offset = (col("ProductKey") % 10) * 3

list_price = round(
    col("CategoryBasePrice") * col("SubcategoryMultiplier") + product_key_offset,
    2,
)

# Standard cost is 60% of the list price.
standard_cost = round(col("ListPrice") * 0.6, 2)

df_dim_product = (
    df_dim_product
    .withColumn("CategoryBasePrice", category_base_price)
    .withColumn("SubcategoryMultiplier", subcategory_multiplier)
    .withColumn("ListPrice", list_price)
    .withColumn("StandardCost", standard_cost)
)


# Final column order of the product dimension. The helper pricing columns
# (CategoryBasePrice, SubcategoryMultiplier) are left out on purpose.
dim_product_columns = [
    "ProductKey",
    "ProductSKU",
    "ProductName",
    "ModelName",
    "CategoryName",
    "SubcategoryName",
    "ListPrice",
    "StandardCost",
]

df_dim_product = df_dim_product.select(*dim_product_columns)


# Configure a Delta writer that replaces both the rows and the schema of the
# target table, then save the product dimension as gold.dim_product.
dim_product_writer = (
    df_dim_product.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
)

dim_product_writer.saveAsTable("gold.dim_product")



StatementMeta(, f35d49f5-5edc-41ff-b672-f98e30abb992, 4, Finished, Available, Finished)

In [2]:
from pyspark.sql.functions import (
    col, concat_ws, current_date, floor, lower, months_between, split, year
)

# Pull data from the silver.customers Delta table into a DataFrame.

df_customers_silver = spark.table("silver.customers")

df_dim_customer = (
    df_customers_silver
    .select(
        col("CustomerKey"),
        col("FirstName"),
        col("LastName"),
        col("Gender"),
        col("MaritalStatus"),
        col("BirthDate"),
        col("AnnualIncome"),
        col("EducationLevel"),
        col("Occupation"),
        col("HomeOwner"),
        col("Email")
    )
    # Add new columns with full name, age and email domain.
    .withColumn(
        "FullName",
        # concat_ws skips NULL parts, so a missing first or last name does not
        # make the whole FullName NULL.
        concat_ws(" ", col("FirstName"), col("LastName"))
    )
    .withColumn(
        "Age",
        # Completed years as of the run date. Subtracting calendar years alone
        # would overstate the age by one for everyone whose birthday has not
        # occurred yet this year. The int cast keeps the column an integer.
        # NOTE(review): assumes BirthDate is a date (not a timestamp) - confirm.
        floor(months_between(current_date(), col("BirthDate")) / 12).cast("int")
    )
    .withColumn(
        "EmailDomain",
        # Lower-cased text between the first and second "@" of the address.
        lower(split(col("Email"), "@").getItem(1))
    )
)

# Final column order of the customer dimension. BirthDate is not listed, so it
# is dropped here; only the derived Age is kept.
dim_customer_columns = [
    "CustomerKey",
    "FullName",
    "FirstName",
    "LastName",
    "Gender",
    "MaritalStatus",
    "Age",
    "AnnualIncome",
    "EducationLevel",
    "Occupation",
    "HomeOwner",
    "Email",
    "EmailDomain",
]

df_dim_customer = df_dim_customer.select(*dim_customer_columns)

# Save the customer dimension as the gold.dim_customer Delta table, replacing
# the existing rows and schema.
df_dim_customer.write.saveAsTable(
    "gold.dim_customer",
    format="delta",
    mode="overwrite",
    overwriteSchema="true",
)


StatementMeta(, 8b4f5cf9-ab4b-4353-bb1a-5abdd204f8c0, 4, Finished, Available, Finished)

In [1]:
from pyspark.sql.functions import col, initcap, trim

# Pull data from the silver.territories Delta table into a DataFrame.

df_territories_silver = spark.table("silver.territories")

# Select and transform columns.
# Each cleaned column is aliased back to its source name; without the alias
# Spark names the column after the expression text, e.g. "initcap(trim(Region))".

df_dim_territory = (
    df_territories_silver
    .select(
        col("TerritoryKey"),
        initcap(trim(col("Region"))).alias("Region"),
        initcap(trim(col("Country"))).alias("Country"),
        initcap(trim(col("Continent"))).alias("Continent")
    )
)
# Configure a Delta writer that replaces both the rows and the schema of the
# target table, then save the territory dimension as gold.dim_territory.
dim_territory_writer = (
    df_dim_territory.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
)

dim_territory_writer.saveAsTable("gold.dim_territory")


StatementMeta(, ebddee9f-a9a0-41b3-aea9-7578e8863794, 3, Finished, Available, Finished)

In [4]:
from pyspark.sql.functions import col, round
from pyspark.sql.types import DecimalType

# Load the silver sales rows and the four gold dimensions they reference.
df_sales = spark.table("silver.sales")
df_dim_date = spark.table("gold.dim_date")
df_dim_customer = spark.table("gold.dim_customer")
df_dim_product = spark.table("gold.dim_product")
df_dim_territory = spark.table("gold.dim_territory")

# Look up each dimension with a left join, one step at a time. Left joins keep
# every sales row; a sale without a matching dimension row gets NULLs in that
# dimension's columns instead of being dropped.
sales_with_date = df_sales.join(
    df_dim_date,
    on=df_sales["OrderDate"] == df_dim_date["Date"],
    how="left",
)

sales_with_customer = sales_with_date.join(
    df_dim_customer,
    on=df_sales["CustomerID"] == df_dim_customer["CustomerKey"],
    how="left",
)

sales_with_product = sales_with_customer.join(
    df_dim_product,
    on=df_sales["ProductID"] == df_dim_product["ProductKey"],
    how="left",
)

df_sales_enriched = sales_with_product.join(
    df_dim_territory,
    on=df_sales["TerritoryID"] == df_dim_territory["TerritoryKey"],
    how="left",
)


# IMPORTANT! The AdventureWorks dataset does not provide product prices.
# Synthetic prices were already generated for the product dimension; they are
# reused here to build the fact_sales price columns for analysis:
#   UnitPrice   = dim_product.ListPrice as DECIMAL(10, 2)
#   SalesAmount = Quantity multiplied by UnitPrice, as DECIMAL(12, 2)

unit_price = col("ListPrice").cast(DecimalType(10, 2))
df_sales_enriched = df_sales_enriched.withColumn("UnitPrice", unit_price)

# Computed from the UnitPrice column added above, not from the raw ListPrice.
sales_amount = round(col("Quantity") * col("UnitPrice"), 2).cast(DecimalType(12, 2))
df_sales_enriched = df_sales_enriched.withColumn("SalesAmount", sales_amount)

# Reorder, rename and filter columns.
# The customer, product and territory keys are the ID values carried by the
# sales rows themselves; only OrderDateKey comes from a dimension lookup.

df_fact_sales = df_sales_enriched.select(
    col("SalesOrderNumber"),
    col("SalesOrderLineNumber"),
    col("DateKey").alias("OrderDateKey"),
    col("CustomerID").alias("CustomerKey"),
    col("ProductID").alias("ProductKey"),
    col("TerritoryID").alias("TerritoryKey"),
    col("Quantity"),
    col("UnitPrice"),
    col("SalesAmount")
)

# Check for missing dimension keys.
# The test runs on the joined frame and inspects the dimension-side key
# columns: after a left join they are NULL when a sales row found no matching
# dimension row (or its own key was NULL). The keys in df_fact_sales are copies
# of the sales-side IDs, so testing those cannot reveal an unmatched lookup.
missing_dim_key_count = (
    df_sales_enriched
    .filter(
        df_dim_date["DateKey"].isNull()
        | df_dim_customer["CustomerKey"].isNull()
        | df_dim_product["ProductKey"].isNull()
        | df_dim_territory["TerritoryKey"].isNull()
    )
    .count()
)

# Report the result explicitly: a bare expression that is not the last
# statement of a cell is evaluated but never displayed.
print(f"fact_sales rows with a missing dimension key: {missing_dim_key_count}")

# Write the fact rows to the gold.fact_sales Delta table, replacing its
# current contents.
(
    df_fact_sales
    .write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("gold.fact_sales")
)


StatementMeta(, 819ab9f3-3560-4d1d-bf2b-4c73f8cf1766, 6, Finished, Available, Finished)

In [1]:
from pyspark.sql.functions import col
from pyspark.sql.types import DecimalType, IntegerType

# Load the silver returns rows and the gold dimensions they reference.
df_returns = spark.table("silver.returns")
df_dim_date = spark.table("gold.dim_date")
df_dim_product = spark.table("gold.dim_product")
df_dim_territory = spark.table("gold.dim_territory")

# Short aliases so that identically named columns (ProductKey, TerritoryKey)
# can be told apart by their side of the join.
returns_r = df_returns.alias("r")
dates_d = df_dim_date.alias("d")
products_p = df_dim_product.alias("p")
territories_t = df_dim_territory.alias("t")

# Left joins keep every return row; a return without a matching dimension row
# gets NULLs in that dimension's columns.
returns_with_date = returns_r.join(
    dates_d,
    on=col("r.ReturnDate") == col("d.Date"),
    how="left",
)

returns_with_product = returns_with_date.join(
    products_p,
    on=col("r.ProductKey") == col("p.ProductKey"),
    how="left",
)

df_returns_enriched = returns_with_product.join(
    territories_t,
    on=col("r.TerritoryKey") == col("t.TerritoryKey"),
    how="left",
)

# Build the fact_returns columns: dimension keys, the returned quantity and
# the price columns derived from the product's ListPrice.
# Product and territory keys are read from the dimension side of the joins, so
# they are NULL for a return that matched no dimension row.
return_date_key = col("d.DateKey").alias("ReturnDateKey")
product_key = col("p.ProductKey").cast("int").alias("ProductKey")
territory_key = col("t.TerritoryKey").cast("int").alias("TerritoryKey")
return_quantity = col("r.ReturnQuantity").cast("int").alias("ReturnQuantity")
unit_price = col("p.ListPrice").cast("decimal(10,2)").alias("UnitPrice")

# Quantity times the uncast ListPrice, cast to DECIMAL(12, 2) afterwards.
return_amount = (
    (col("r.ReturnQuantity") * col("p.ListPrice"))
    .cast("decimal(12,2)")
    .alias("ReturnAmount")
)

df_fact_returns = df_returns_enriched.select(
    return_date_key,
    product_key,
    territory_key,
    return_quantity,
    unit_price,
    return_amount,
)

# Write data to the gold.fact_returns Delta table.
# df_fact_returns is rebuilt from the whole silver.returns table on every run,
# so the target is overwritten: appending would add a second copy of every
# return each time the notebook is re-executed.

(
    df_fact_returns
    .write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("gold.fact_returns")
)



StatementMeta(, 1d27190f-db0f-4fa9-9b7e-63575f24c24b, 3, Finished, Available, Finished)