In [1]:
# Welcome to your new notebook
# Type here in the cell editor to add code!

from pyspark.sql.types import *

# Create the schema for the table
orderSchema = StructType([
     StructField("SalesOrderNumber", StringType()),
     StructField("SalesOrderLineNumber", IntegerType()),
     StructField("OrderDate", DateType()),
     StructField("CustomerName", StringType()),
     StructField("Email", StringType()),
     StructField("Item", StringType()),
     StructField("Quantity", IntegerType()),
     StructField("UnitPrice", FloatType()),
     StructField("Tax", FloatType())
     ])
    
 # Import all files from bronze folder of lakehouse
df = spark.read.format("csv").option("header", "true").schema(orderSchema).load("Files/Bronze/*.csv")
    
 # Display the first 10 rows of the dataframe to preview your data
display(df.head(10))

StatementMeta(, d6098ca2-e35a-4717-b906-ac2ec81426e8, 3, Finished, Available)

SynapseWidget(Synapse.DataFrame, 387025ab-cc5b-4b01-a53c-903e879d864b)

In [9]:
from pyspark.sql.functions import when, lit, col, current_timestamp, input_file_name
    
# Add columns IsFlagged, CreatedTS and ModifiedTS
df = df.withColumn("FileName", input_file_name()) \
     .withColumn("IsFlagged", when(col("OrderDate") < '2019-08-01',True).otherwise(False)) \
     .withColumn("CreatedTS", current_timestamp()).withColumn("ModifiedTS", current_timestamp())
    
# Update CustomerName to "Unknown" if CustomerName null or empty
df = df.withColumn("CustomerName", when((col("CustomerName").isNull() | (col("CustomerName")=="")),lit("Unknown")).otherwise(col("CustomerName")))
     

StatementMeta(, 0f68cde7-edea-4108-a25e-5edfcd19aaad, 11, Finished, Available)

In [2]:
# Define the schema for the sales_silver table
    
from pyspark.sql.types import *
from delta.tables import *
    
DeltaTable.createIfNotExists(spark) \
     .tableName("Medallion__LH.sales_silver") \
     .addColumn("SalesOrderNumber", StringType()) \
     .addColumn("SalesOrderLineNumber", IntegerType()) \
     .addColumn("OrderDate", DateType()) \
     .addColumn("CustomerName", StringType()) \
     .addColumn("Email", StringType()) \
     .addColumn("Item", StringType()) \
     .addColumn("Quantity", IntegerType()) \
     .addColumn("UnitPrice", FloatType()) \
     .addColumn("Tax", FloatType()) \
     .addColumn("FileName", StringType()) \
     .addColumn("IsFlagged", BooleanType()) \
     .addColumn("CreatedTS", DateType()) \
     .addColumn("ModifiedTS", DateType()) \
     .execute()

StatementMeta(, d6098ca2-e35a-4717-b906-ac2ec81426e8, 4, Finished, Available)

<delta.tables.DeltaTable at 0x7b8021e51450>

In [3]:
df1 = spark.sql("SELECT * FROM Medallion__LH.sales_silver LIMIT 1000")

display(df1)

StatementMeta(, d6098ca2-e35a-4717-b906-ac2ec81426e8, 5, Finished, Available)

SynapseWidget(Synapse.DataFrame, 79d72afb-33ed-48d8-ace0-4083e9ea600b)

In [8]:
df1.printSchema()

df1.show(5)

StatementMeta(, d6098ca2-e35a-4717-b906-ac2ec81426e8, 10, Finished, Available)

root
 |-- SalesOrderNumber: string (nullable = true)
 |-- SalesOrderLineNumber: integer (nullable = true)
 |-- OrderDate: date (nullable = true)
 |-- CustomerName: string (nullable = true)
 |-- Email: string (nullable = true)
 |-- Item: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- UnitPrice: float (nullable = true)
 |-- Tax: float (nullable = true)
 |-- FileName: string (nullable = true)
 |-- IsFlagged: boolean (nullable = true)
 |-- CreatedTS: date (nullable = true)
 |-- ModifiedTS: date (nullable = true)

+----------------+--------------------+----------+---------------+--------------------+----------------+--------+---------+--------+--------------------+---------+----------+----------+
|SalesOrderNumber|SalesOrderLineNumber| OrderDate|   CustomerName|               Email|            Item|Quantity|UnitPrice|     Tax|            FileName|IsFlagged| CreatedTS|ModifiedTS|
+----------------+--------------------+----------+---------------+--------------------+--

In [12]:
# Update existing records and insert new ones based on a condition defined by the columns SalesOrderNumber, OrderDate, CustomerName, and Item.

from delta.tables import *
    
deltaTable = DeltaTable.forPath(spark, 'Tables/sales_silver')
    
dfUpdates = df
    
deltaTable.alias('silver') \
  .merge(
    dfUpdates.alias('updates'),
    'silver.SalesOrderNumber = updates.SalesOrderNumber and silver.OrderDate = updates.OrderDate and silver.CustomerName = updates.CustomerName and silver.Item = updates.Item'
  ) \
   .whenMatchedUpdate(set =
    {
          
    }
  ) \
 .whenNotMatchedInsert(values =
    {
      "SalesOrderNumber": "updates.SalesOrderNumber",
      "SalesOrderLineNumber": "updates.SalesOrderLineNumber",
      "OrderDate": "updates.OrderDate",
      "CustomerName": "updates.CustomerName",
      "Email": "updates.Email",
      "Item": "updates.Item",
      "Quantity": "updates.Quantity",
      "UnitPrice": "updates.UnitPrice",
      "Tax": "updates.Tax",
      "FileName": "updates.FileName",
      "IsFlagged": "updates.IsFlagged",
      "CreatedTS": "updates.CreatedTS",
      "ModifiedTS": "updates.ModifiedTS"
    }
  ) \
  .execute()

StatementMeta(, 0f68cde7-edea-4108-a25e-5edfcd19aaad, 14, Finished, Available)