In [4]:
# Load the silver-layer sales table "Medallion__LH.sales_silver" into a DataFrame.
# The cells below derive the gold-layer dimension and fact tables from `df`.
df = spark.read.table("Medallion__LH.sales_silver")
     

StatementMeta(, ecdebc9b-af0d-4f4e-9f1a-25c50ad2ed87, 6, Finished, Available)

In [5]:
# This code creates a table named dimdate_gold in the lakehouse "Medallion__LH". 
# The table stores date-related information in various formats, 
# potentially for further analysis or joining with other tables.

from pyspark.sql.types import *
from delta.tables import*
    
# Create the dimdate_gold Delta table if it does not exist yet, with one
# column per date attribute that the following cells populate.
(
    DeltaTable.createIfNotExists(spark)
    .tableName("Medallion__LH.dimdate_gold")
    .addColumn("OrderDate", DateType())
    .addColumn("Day", IntegerType())
    .addColumn("Month", IntegerType())
    .addColumn("Year", IntegerType())
    .addColumn("mmmyyyy", StringType())
    .addColumn("yyyymm", StringType())
    .execute()
)

StatementMeta(, ecdebc9b-af0d-4f4e-9f1a-25c50ad2ed87, 7, Finished, Available)

<delta.tables.DeltaTable at 0x787188fe2bc0>

In [6]:
#This code takes an existing DataFrame, removes duplicates based on dates, 
#extracts and formats date components into new columns, sorts the data by date, 
#and displays a preview of the resulting DataFrame.

from pyspark.sql.functions import col, dayofmonth, month, year, date_format
    
# Build the date dimension: one row per distinct OrderDate, with the day,
# month and year extracted plus two formatted month labels, sorted by date.
dfdimDate_gold = (
    df.dropDuplicates(["OrderDate"])
    .select(
        col("OrderDate"),
        dayofmonth("OrderDate").alias("Day"),
        month("OrderDate").alias("Month"),
        year("OrderDate").alias("Year"),
        date_format(col("OrderDate"), "MMM-yyyy").alias("mmmyyyy"),
        date_format(col("OrderDate"), "yyyyMM").alias("yyyymm"),
    )
    .orderBy("OrderDate")
)

# Preview the first 10 rows of the date dimension.
display(dfdimDate_gold.head(10))

StatementMeta(, ecdebc9b-af0d-4f4e-9f1a-25c50ad2ed87, 8, Finished, Available)

SynapseWidget(Synapse.DataFrame, 23508fe5-0b07-49e7-9cc6-eb43e1044aba)

In [7]:
from delta.tables import *
    
# Load the dimdate_gold Delta table and merge the freshly built date rows into it.
deltaTable = DeltaTable.forPath(spark, 'Tables/dimdate_gold')

dfUpdates = dfdimDate_gold

# Insert-only merge keyed on OrderDate: dates already present in the table are
# left as they are (no column is ever updated), and dates that are missing are
# inserted. Every inserted value is taken explicitly from the `updates` alias.
(
    deltaTable.alias('gold')
    .merge(
        dfUpdates.alias('updates'),
        'gold.OrderDate = updates.OrderDate'
    )
    .whenNotMatchedInsert(
        values={
            "OrderDate": "updates.OrderDate",
            "Day": "updates.Day",
            "Month": "updates.Month",
            "Year": "updates.Year",
            "mmmyyyy": "updates.mmmyyyy",
            "yyyymm": "updates.yyyymm",
        }
    )
    .execute()
)

StatementMeta(, ecdebc9b-af0d-4f4e-9f1a-25c50ad2ed87, 9, Finished, Available)

In [9]:
# This code creates a table named dimcustomer_gold in the lakehouse "Medallion__LH". 
# The table stores customer attributes (full name, email, first and last name) together with a CustomerID, 
# potentially for further analysis or joining with other tables.

from pyspark.sql.types import *
from delta.tables import *
    
 # Create customer_gold dimension delta table
(
    DeltaTable.createIfNotExists(spark)
    .tableName("Medallion__LH.dimcustomer_gold")
    .addColumn("CustomerName", StringType())
    .addColumn("Email", StringType())
    .addColumn("First", StringType())
    .addColumn("Last", StringType())
    .addColumn("CustomerID", LongType())
    .execute()
)

StatementMeta(, ecdebc9b-af0d-4f4e-9f1a-25c50ad2ed87, 11, Finished, Available)

<delta.tables.DeltaTable at 0x78718903f640>

In [10]:
from pyspark.sql.functions import col, split
    
 # Create customer_gold dataframe
    
# CustomerName split on single spaces; element 0 becomes First, element 1 becomes Last.
customer_name_parts = split(col("CustomerName"), " ")

# One row per distinct (CustomerName, Email) pair, plus the derived name parts.
dfdimCustomer_silver = (
    df.dropDuplicates(["CustomerName", "Email"])
    .select(col("CustomerName"), col("Email"))
    .withColumn("First", customer_name_parts.getItem(0))
    .withColumn("Last", customer_name_parts.getItem(1))
)

# Preview the first 10 rows of the customer dataframe.
display(dfdimCustomer_silver.head(10))

StatementMeta(, ecdebc9b-af0d-4f4e-9f1a-25c50ad2ed87, 12, Finished, Available)

SynapseWidget(Synapse.DataFrame, 045002c9-7407-4bed-96f0-447609851798)

In [12]:
from pyspark.sql.functions import monotonically_increasing_id, col, when, coalesce, max, lit
    
# Current contents of the customer dimension table.
dfdimCustomer_temp = spark.read.table("Medallion__LH.dimCustomer_gold")

# Highest CustomerID already stored, or 0 when there is none.
# (`max` is pyspark.sql.functions.max from this cell's import, not the Python builtin.)
max_customer_id_row = dfdimCustomer_temp.select(
    coalesce(max(col("CustomerID")), lit(0)).alias("MAXCustomerID")
).first()
MAXCustomerID = max_customer_id_row[0]

# Keep only the (CustomerName, Email) pairs that are not in the dimension yet.
is_existing_customer = (
    (dfdimCustomer_silver.CustomerName == dfdimCustomer_temp.CustomerName)
    & (dfdimCustomer_silver.Email == dfdimCustomer_temp.Email)
)
new_customers = dfdimCustomer_silver.join(
    dfdimCustomer_temp, is_existing_customer, "left_anti"
)

# Give each new customer an ID above the current maximum. The generated ids
# are unique and increasing, but not necessarily consecutive.
dfdimCustomer_gold = new_customers.withColumn(
    "CustomerID", monotonically_increasing_id() + MAXCustomerID + 1
)

# Preview the first 10 rows of the new customer rows.
display(dfdimCustomer_gold.head(10))

StatementMeta(, ecdebc9b-af0d-4f4e-9f1a-25c50ad2ed87, 14, Finished, Available)

SynapseWidget(Synapse.DataFrame, c59ffaf7-8f7c-4a3e-afbf-9e25fab8288a)

In [13]:
from delta.tables import *

# Load the dimcustomer_gold Delta table and merge the new customer rows into it.
deltaTable = DeltaTable.forPath(spark, 'Tables/dimcustomer_gold')

dfUpdates = dfdimCustomer_gold

# Insert-only merge keyed on (CustomerName, Email). dfdimCustomer_gold was
# already anti-joined against this table on the same keys, so no update branch
# is needed: customers not yet in the table are inserted, nothing is modified.
(
    deltaTable.alias('gold')
    .merge(
        dfUpdates.alias('updates'),
        'gold.CustomerName = updates.CustomerName AND gold.Email = updates.Email'
    )
    .whenNotMatchedInsert(
        values={
            "CustomerName": "updates.CustomerName",
            "Email": "updates.Email",
            "First": "updates.First",
            "Last": "updates.Last",
            "CustomerID": "updates.CustomerID",
        }
    )
    .execute()
)

StatementMeta(, ecdebc9b-af0d-4f4e-9f1a-25c50ad2ed87, 15, Finished, Available)

In [15]:
from pyspark.sql.types import *
from delta.tables import *
    
# Create the dimproduct_gold Delta table if it does not exist yet.
(
    DeltaTable.createIfNotExists(spark)
    .tableName("Medallion__LH.dimproduct_gold")
    .addColumn("ItemName", StringType())
    .addColumn("ItemID", LongType())
    .addColumn("ItemInfo", StringType())
    .execute()
)

StatementMeta(, ecdebc9b-af0d-4f4e-9f1a-25c50ad2ed87, 17, Finished, Available)

<delta.tables.DeltaTable at 0x787188fa7df0>

In [16]:
# `when` is used below, so it is imported here rather than relying on an
# earlier cell having imported it into the kernel namespace.
from pyspark.sql.functions import col, split, lit, when

# Create the product dataframe: one row per distinct Item. The following cells
# compare it with dimproduct_gold to find products that still need an ItemID.
item_parts = split(col("Item"), ", ")
item_info = item_parts.getItem(1)

dfdimProduct_silver = (
    df.dropDuplicates(["Item"])
    .select(col("Item"))
    # Text before the first ", " is the product name.
    .withColumn("ItemName", item_parts.getItem(0))
    # Text after it is the extra info; a missing or empty part becomes "".
    .withColumn(
        "ItemInfo",
        when(item_info.isNull() | (item_info == ""), lit("")).otherwise(item_info),
    )
)

# Display the first 10 rows of the dataframe to preview your data

display(dfdimProduct_silver.head(10))
     

StatementMeta(, ecdebc9b-af0d-4f4e-9f1a-25c50ad2ed87, 18, Finished, Available)

SynapseWidget(Synapse.DataFrame, b0a39017-3a1a-4f4e-a87b-1863b0770dbc)

In [17]:
from pyspark.sql.functions import monotonically_increasing_id, col, lit, max, coalesce
    
# Current contents of the product dimension table.
dfdimProduct_temp = spark.read.table("Medallion__LH.dimProduct_gold")

# Highest ItemID already stored, or 0 when there is none.
# (`max` is pyspark.sql.functions.max from this cell's import, not the Python builtin.)
max_item_id_row = dfdimProduct_temp.select(
    coalesce(max(col("ItemID")), lit(0)).alias("MAXItemID")
).first()
MAXProductID = max_item_id_row[0]

# Keep only the (ItemName, ItemInfo) pairs that are not in the dimension yet.
is_existing_product = (
    (dfdimProduct_silver.ItemName == dfdimProduct_temp.ItemName)
    & (dfdimProduct_silver.ItemInfo == dfdimProduct_temp.ItemInfo)
)
new_products = dfdimProduct_silver.join(
    dfdimProduct_temp, is_existing_product, "left_anti"
)

# Give each new product an ID above the current maximum. The generated ids
# are unique and increasing, but not necessarily consecutive.
dfdimProduct_gold = new_products.withColumn(
    "ItemID", monotonically_increasing_id() + MAXProductID + 1
)

# Preview the first 10 rows of the new product rows.
display(dfdimProduct_gold.head(10))

StatementMeta(, ecdebc9b-af0d-4f4e-9f1a-25c50ad2ed87, 19, Finished, Available)

SynapseWidget(Synapse.DataFrame, 85dc715e-bd10-4905-a68f-ae2c56a840bd)

In [18]:
from delta.tables import *
    
# Load the dimproduct_gold Delta table and merge the new product rows into it.
deltaTable = DeltaTable.forPath(spark, 'Tables/dimproduct_gold')

dfUpdates = dfdimProduct_gold

# Insert-only merge keyed on (ItemName, ItemInfo). dfdimProduct_gold was
# already anti-joined against this table on the same keys, so no update branch
# is needed: products not yet in the table are inserted, nothing is modified.
(
    deltaTable.alias('gold')
    .merge(
        dfUpdates.alias('updates'),
        'gold.ItemName = updates.ItemName AND gold.ItemInfo = updates.ItemInfo'
    )
    .whenNotMatchedInsert(
        values={
            "ItemName": "updates.ItemName",
            "ItemInfo": "updates.ItemInfo",
            "ItemID": "updates.ItemID",
        }
    )
    .execute()
)

StatementMeta(, ecdebc9b-af0d-4f4e-9f1a-25c50ad2ed87, 20, Finished, Available)

In [20]:
from pyspark.sql.types import *
from delta.tables import *
    
# Create the factsales_gold Delta table if it does not exist yet. CustomerID
# and ItemID hold the ids assigned in the customer and product dimensions.
(
    DeltaTable.createIfNotExists(spark)
    .tableName("Medallion__LH.factsales_gold")
    .addColumn("CustomerID", LongType())
    .addColumn("ItemID", LongType())
    .addColumn("OrderDate", DateType())
    .addColumn("Quantity", IntegerType())
    .addColumn("UnitPrice", FloatType())
    .addColumn("Tax", FloatType())
    .execute()
)

StatementMeta(, ecdebc9b-af0d-4f4e-9f1a-25c50ad2ed87, 22, Finished, Available)

<delta.tables.DeltaTable at 0x787188fa7be0>

In [22]:
# Everything this cell uses is imported here, so it does not depend on imports
# executed by earlier cells.
from pyspark.sql.functions import col, lit, split, when

# Re-read both dimensions so the ids written by the preceding merges are visible.
dfdimCustomer_temp = spark.read.table("Medallion__LH.dimCustomer_gold")
dfdimProduct_temp = spark.read.table("Medallion__LH.dimProduct_gold")

# Derive ItemName / ItemInfo from Item with the same rule used for the product
# dimension (name before the first ", ", info after it, "" when missing) so the
# join keys line up. Re-running is safe: withColumn replaces same-named columns.
item_parts = split(col("Item"), ", ")
item_info = item_parts.getItem(1)

df = (
    df.withColumn("ItemName", item_parts.getItem(0))
    .withColumn(
        "ItemInfo",
        when(item_info.isNull() | (item_info == ""), lit("")).otherwise(item_info),
    )
)

# Create Sales_gold dataframe: look up CustomerID and ItemID for every sales
# row. Both joins are left joins, so a sale without a matching dimension row
# is kept with a NULL id.
dffactSales_gold = (
    df.alias("df1")
    .join(
        dfdimCustomer_temp.alias("df2"),
        (df.CustomerName == dfdimCustomer_temp.CustomerName)
        & (df.Email == dfdimCustomer_temp.Email),
        "left",
    )
    .join(
        dfdimProduct_temp.alias("df3"),
        (df.ItemName == dfdimProduct_temp.ItemName)
        & (df.ItemInfo == dfdimProduct_temp.ItemInfo),
        "left",
    )
    .select(
        col("df2.CustomerID"),
        col("df3.ItemID"),
        col("df1.OrderDate"),
        col("df1.Quantity"),
        col("df1.UnitPrice"),
        col("df1.Tax"),
    )
    .orderBy(col("df1.OrderDate"), col("df2.CustomerID"), col("df3.ItemID"))
)

# Display the first 10 rows of the dataframe to preview your data

display(dffactSales_gold.head(10))

StatementMeta(, ecdebc9b-af0d-4f4e-9f1a-25c50ad2ed87, 24, Finished, Available)

SynapseWidget(Synapse.DataFrame, f0b63256-37f7-4f3c-81b6-a35be4acdad3)

In [23]:
from delta.tables import *
    
# Load the factsales_gold Delta table and merge the sales rows into it.
deltaTable = DeltaTable.forPath(spark, 'Tables/factsales_gold')

dfUpdates = dffactSales_gold

# Insert-only merge keyed on (OrderDate, CustomerID, ItemID): rows whose key is
# already in the table are skipped, all other rows are inserted. No column of
# an existing row is ever updated, so there is no whenMatched clause.
# NOTE(review): rows whose CustomerID or ItemID is NULL (no dimension match in
# the left joins) cannot satisfy the equality condition, so they would be
# inserted again on every run - confirm whether that is intended.
(
    deltaTable.alias('gold')
    .merge(
        dfUpdates.alias('updates'),
        'gold.OrderDate = updates.OrderDate AND gold.CustomerID = updates.CustomerID AND gold.ItemID = updates.ItemID'
    )
    .whenNotMatchedInsert(
        values={
            "CustomerID": "updates.CustomerID",
            "ItemID": "updates.ItemID",
            "OrderDate": "updates.OrderDate",
            "Quantity": "updates.Quantity",
            "UnitPrice": "updates.UnitPrice",
            "Tax": "updates.Tax",
        }
    )
    .execute()
)

StatementMeta(, ecdebc9b-af0d-4f4e-9f1a-25c50ad2ed87, 25, Finished, Available)