# Load external data

In [1]:
# Steps in this cell:
#   1. declare the Azure Blob Storage account, container and relative path
#   2. assemble the wasbs:// connection path from those parts
#   3. read the parquet data at that path into a DataFrame

# Azure Blob Storage coordinates of the source dataset
blob_account_name = "azureopendatastorage"
blob_container_name = "nyctlc"
blob_relative_path = "yellow"

# Assemble the connection path and echo it so it can be checked in the cell output
wasbs_path = f'wasbs://{blob_container_name}@{blob_account_name}.blob.core.windows.net/{blob_relative_path}'
print(wasbs_path)

# Create a DataFrame over the parquet data stored at that path
blob_df = spark.read.format("parquet").load(wasbs_path)

StatementMeta(, 8ab14e59-d0f8-4e63-827e-1f222faf8f4a, 3, Finished, Available)

wasbs://nyctlc@azureopendatastorage.blob.core.windows.net/yellow


In [2]:
# Name appended to the RawData folder to form the output location
file_name = "yellow_taxi"

# Destination path inside the lakehouse Files/RawData area; echoed for verification
output_parquet_path = f"abfss://MicrosoftLearn@onelake.dfs.fabric.microsoft.com/MicrosoftLearn_LH.Lakehouse/Files/RawData/{file_name}"
print(output_parquet_path)

# Take the first 1000 rows of blob_df and write them as parquet,
# replacing whatever already exists at the destination
blob_df.limit(1000).write.parquet(output_parquet_path, mode="overwrite")

StatementMeta(, 8ab14e59-d0f8-4e63-827e-1f222faf8f4a, 4, Finished, Available)

abfss://MicrosoftLearn@onelake.dfs.fabric.microsoft.com/MicrosoftLearn_LH.Lakehouse/Files/RawData/yellow_taxi


# Transform and load data to a Delta table

In [3]:
# Steps in this cell:
#   1. read the raw parquet data back from output_parquet_path
#   2. add a dataload_datetime column recording when the data was loaded
#   3. drop rows whose storeAndFwdFlag is NULL
#   4. append the result to a Delta table
#   5. display up to 25 rows for validation

from pyspark.sql.functions import col, to_timestamp, current_timestamp, year, month

# Read the parquet data from the specified path
raw_df = spark.read.parquet(output_parquet_path)

# Build the filtered frame in one expression so that the name `filtered_df`
# only ever refers to filtered data. The NULL check uses col(...) so it is
# resolved against the frame being filtered rather than against the parent
# raw_df.
filtered_df = (
    raw_df
    .withColumn("dataload_datetime", current_timestamp())
    .filter(col("storeAndFwdFlag").isNotNull())
)

# Load the filtered data into a Delta table
table_name = "yellow_taxi_filtered"  # Replace with your desired table name

# mode("append") adds to the table if it already exists, so re-running this
# cell loads the same rows again.
filtered_df.write.format("delta").mode("append").saveAsTable(table_name)

# Display up to 25 rows for validation.
# NOTE: filtered_df is evaluated lazily, so current_timestamp() is computed
# again for this query; the displayed dataload_datetime values may differ
# slightly from the ones written to the table.
display(filtered_df.limit(25))

StatementMeta(, 8ab14e59-d0f8-4e63-827e-1f222faf8f4a, 5, Finished, Available)

SynapseWidget(Synapse.DataFrame, 60e02917-7417-4743-a98f-5c9ba30ea045)

# Optimize Delta table writes

In [4]:
 from pyspark.sql.functions import col, to_timestamp, current_timestamp, year, month

 # Read the parquet data back from output_parquet_path
 raw_df = spark.read.parquet(output_parquet_path)

 # Stamp every row with the load time, then discard rows whose
 # storeAndFwdFlag is NULL
 opt_df = (
     raw_df
     .withColumn("dataload_datetime", current_timestamp())
     .where(col("storeAndFwdFlag").isNotNull())
 )

 # Session settings applied before the write: V-Order and automatic
 # Delta optimized write
 spark.conf.set("spark.sql.parquet.vorder.enabled", "true")
 spark.conf.set("spark.microsoft.delta.optimizeWrite.enabled", "true")

 # Append the result to its own Delta table
 table_name = "yellow_taxi_optimized"  # New table name
 opt_df.write.saveAsTable(table_name, format="delta", mode="append")

 # Show up to 25 rows of the result
 display(opt_df.limit(25))

StatementMeta(, 8ab14e59-d0f8-4e63-827e-1f222faf8f4a, 6, Finished, Available)

SynapseWidget(Synapse.DataFrame, c457d8a2-1a9e-47b8-ac0b-990373f7382e)