In [4]:
# Load the raw sales CSV into a Spark DataFrame and preview the first rows.
#
# This cell carries no cell magic (such as %%pyspark), so it runs in the
# notebook's default language. A magic placed at the beginning of a cell
# overrides that default for that cell only.

# header=True: take the column names from the first line of the file.
df = (
    spark.read
    .format('csv')
    .options(header=True)
    .load('Files/sales.csv')
)

# Show a five-row sample rather than the whole DataFrame.
display(df.limit(5))


StatementMeta(, 4ff0e24c-8754-4fa5-95ac-835ac449bd52, 6, Finished, Available)

SynapseWidget(Synapse.DataFrame, 1bc00635-9796-4e29-9b65-5c518b6b2b7f)

In [2]:
# Builder-style equivalent of spark.read.load(..., format='csv', header=True):
# read "Files/sales.csv", taking column names from its first line.
dff = spark.read.format("csv").option("header", "true").load("Files/sales.csv")

# dff is now a Spark DataFrame containing the CSV data from "Files/sales.csv".
# limit(5) keeps the preview a DataFrame; head(5) would instead collect a
# Python list of Row objects onto the driver before display() sees it.
display(dff.limit(5))

StatementMeta(, 4ff0e24c-8754-4fa5-95ac-835ac449bd52, 4, Finished, Available)

SynapseWidget(Synapse.DataFrame, 69e07ba3-c30e-4740-b5d3-fe9810930dbd)

In [8]:
# Specifying an explicit schema
# Read the sales CSV with hand-written column names and types instead of
# letting every column default to string.

from pyspark.sql.types import *
from pyspark.sql.functions import *

salesSchema = StructType([
    StructField("SalesOrderNumber", StringType()),
    StructField("SalesOrderLineNumber", IntegerType()),
    StructField("OrderDate", DateType()),
    StructField("CustomerName", StringType()),
    StructField("EmailAddress", StringType()),
    StructField("Item", StringType()),
    StructField("Quantity", IntegerType()),
    # Monetary columns are read as floats: a value with a fractional part
    # cannot be parsed as IntegerType and would be loaded as null, whereas
    # whole numbers still parse fine as floats.
    # NOTE(review): confirm against the actual contents of Files/sales.csv.
    StructField("UnitPrice", FloatType()),
    StructField("TaxAmount", FloatType())
    ])

# Files/sales.csv has a header row: other cells in this notebook read it with
# header=True and then select columns by name. The header line therefore has
# to be skipped here as well; with header=False it is parsed as a data row.
# When a schema is supplied, the names come from the schema, not the file.
df1 = spark.read.load('Files/sales.csv',
    format='csv',
    schema=salesSchema,
    header=True)
display(df1.limit(5))

StatementMeta(, 4ff0e24c-8754-4fa5-95ac-835ac449bd52, 11, Finished, Available)

SynapseWidget(Synapse.DataFrame, bc91674d-6993-4a63-bef6-854cd2f40a07)

In [10]:
# Project only the customer name and e-mail address columns of the sales data.
Customer_and_mail_df = df.select("CustomerName", "EmailAddress")

# Preview three rows. limit(3) keeps the result a DataFrame, matching the
# other previews in this notebook; head(3) would return a list of Row objects.
display(Customer_and_mail_df.limit(3))

StatementMeta(, 4ff0e24c-8754-4fa5-95ac-835ac449bd52, 13, Finished, Available)

SynapseWidget(Synapse.DataFrame, a4217041-aad0-4db3-bb44-d29aaffa551a)

In [20]:
# Filtering and grouping dataframes
# Count the number of sales rows for each Item, ordered by Item.
# groupBy("Item").count() only needs the Item column, so no preceding
# select() of other columns is required.

counts_df = df.groupBy("Item").count().sort("Item")
display(counts_df)

StatementMeta(, 4ff0e24c-8754-4fa5-95ac-835ac449bd52, 23, Finished, Available)

SynapseWidget(Synapse.DataFrame, 5db218b1-1524-4175-8117-cb9881ae0a9c)

In [23]:
# Persist df as a Delta table named "SalesTable2".
# mode="overwrite" replaces the table's contents if it already exists.

df.write.saveAsTable("SalesTable2", format="delta", mode="overwrite")

StatementMeta(, 4ff0e24c-8754-4fa5-95ac-835ac449bd52, 26, Finished, Available)

In [None]:
# Query the `products` table with Spark SQL, keeping only rows whose Category
# is 'Mountain Bikes' or 'Road Bikes'.
#
# The statement is a triple-quoted string rather than a single-quoted string
# with backslash line continuations: a stray space after a trailing backslash
# breaks that form, and it embeds the source indentation in the SQL text.
#
# NOTE(review): this query selects ListPrice, while the `products` table
# created with DeltaTable.create(...) elsewhere in this notebook defines a
# `Price` column. Confirm which schema the lakehouse table actually has.
bikes_query = """
    SELECT ProductID, ProductName, ListPrice
    FROM products
    WHERE Category IN ('Mountain Bikes', 'Road Bikes')
"""
bikes_df = spark.sql(bikes_query)
display(bikes_df)

# Creating a delta table from a dataframe

In [None]:
# Save a dataframe as a delta table with a specified table name.
#
# Whatever the format of the source file that was loaded into the dataframe,
# the table's data is stored as Parquet files in the Tables storage area of
# the lakehouse, next to a _delta_log folder holding the table's transaction
# logs. The table then appears in the Tables folder for the lakehouse in the
# Data explorer pane.


# Load a file into a dataframe (column names come from the first line).
df = (
    spark.read
    .format('csv')
    .option('header', True)
    .load('Files/mydata.csv')
)

# Save the dataframe as a delta table
df.write.saveAsTable("mytable", format="delta")

In [None]:
# Managed vs. external tables
#
# The previous example saved the dataframe as a managed table: the table
# definition in the metastore and the underlying data files are both managed
# by the Spark runtime for the Fabric lakehouse, so deleting the table also
# deletes its files from the Tables storage location for the lakehouse.
#
# A table can instead be created as an external table, where the relational
# table definition in the metastore is mapped to an alternative file storage
# location. Here the data is stored in a folder in the Files storage location
# for the lakehouse.


# Supplying an explicit "path" is what points the table at that folder.
external_writer = df.write.format("delta").option("path", "Files/myexternaltable")
external_writer.saveAsTable("myexternaltable")

# Use the DeltaTableBuilder API

In [None]:
# The DeltaTableBuilder API lets you describe a table in Spark code and then
# create it. This cell creates a table named "products" with four columns.

from delta.tables import *

# Each addColumn call takes the column name and its SQL data type.
products_builder = (
    DeltaTable.create(spark)
    .tableName("products")
    .addColumn("Productid", "INT")
    .addColumn("ProductName", "STRING")
    .addColumn("Category", "STRING")
    .addColumn("Price", "FLOAT")
)

# Nothing is created until execute() is called on the builder.
products_builder.execute()

In [None]:
%%sql
-- You can also create delta tables by using the Spark SQL CREATE TABLE
-- statement, as shown in this example.
-- The %%sql cell magic has to be the first line of the cell, so this
-- explanation is written below it as SQL comments.

CREATE TABLE salesorders
(
    Orderid INT NOT NULL,
    OrderDate TIMESTAMP NOT NULL,
    CustomerName STRING,
    SalesTotal FLOAT NOT NULL
)
USING DELTA

# Saving data in delta format

In [None]:
# The following PySpark code saves a dataframe to a new folder location in
# delta format. The folder path is kept in delta_path.

delta_path = "Files/mydatatable"
df.write.save(delta_path, format="delta")

In [None]:
# Overwrite mode replaces the contents of an existing folder with the data in
# a dataframe.
# NOTE(review): new_df is not assigned anywhere in the visible notebook, and
# delta_path comes from another cell; both must exist before this cell runs.

overwrite_writer = new_df.write.format("delta").mode("overwrite")
overwrite_writer.save(delta_path)

In [None]:
# Append mode adds the rows of a dataframe to an existing folder instead of
# replacing what is already there.
# NOTE(review): new_rows_df is not assigned anywhere in the visible notebook,
# and delta_path comes from another cell; both must exist before this runs.

new_rows_df.write.save(delta_path, format="delta", mode="append")

# Write to a file

Lakehouses support structured, semi-structured, and unstructured files. Save the data as a Parquet file or a Delta table to take advantage of the Spark engine.

In [None]:
# Write df out twice: once as Parquet files, once as a Delta table.
# Both destination names below are placeholders to be replaced.

# Parquet: overwrite whatever already exists at the output path.
parquet_output_path = "your_folder/your_file_name"
df.write.parquet(parquet_output_path, mode="overwrite")
print(f"DataFrame has been written to Parquet file: {parquet_output_path}")

# Delta: overwrite the table of this name if it already exists.
delta_table_name = "your_delta_table_name"
df.write.saveAsTable(delta_table_name, format="delta", mode="overwrite")
print(f"DataFrame has been written to Delta table: {delta_table_name}")

In [None]:
# Write filtered_df in Delta format to a folder under Tables/.
# NOTE(review): filtered_df is not assigned anywhere in the visible notebook;
# it must exist before this cell runs.

# Saving by path with format "delta" (rather than saveAsTable) writes the data
# to Tables/<table_name>, replacing anything already stored there.
table_name = "nyctaxi_raw"
table_path = f"Tables/{table_name}"
filtered_df.write.save(table_path, format="delta", mode="overwrite")

# Report the table name the data was saved under.
print(f"Spark DataFrame saved to Delta table: {table_name}")


