In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# No master URL or package list is passed here: per the project setup,
# PYSPARK_SUBMIT_ARGS (defined in docker-compose.yml) configures the
# connection to the Spark master and adds the Delta Lake packages.
session_builder = SparkSession.builder.appName("JupyterDeltaTest")
spark = session_builder.getOrCreate()

# Optional: only surface ERROR-level log lines so cell output stays readable.
spark_context = spark.sparkContext
spark_context.setLogLevel("ERROR")

print("SparkSession created successfully in Jupyter!")
print(f"Spark UI available at: {spark_context.uiWebUrl}")

SparkSession created successfully in Jupyter!
Spark UI available at: http://8dd4678ace36:4040


In [4]:
# Target location of the Delta table (the mounted data path).
delta_table_path = "/home/jovyan/data/my_notebook_delta_table"

# --- 1. Build a small in-memory DataFrame -----------------------------------
columns = ["name", "age"]
data = [("John", 10), ("Jane", 20), ("Mike", 30)]
df = spark.createDataFrame(data, columns)

print("\nOriginal DataFrame:")
df.show()

# --- 2. Persist it in Delta format, replacing whatever is at the path --------
print(f"Writing DataFrame to Delta table at: {delta_table_path}")
(
    df.write
    .format("delta")
    .mode("overwrite")
    .save(delta_table_path)
)
print("DataFrame written to Delta table successfully!")

# --- 3. Load the table back to confirm the write -----------------------------
print(f"\nReading data from Delta table at: {delta_table_path}")
delta_df = (
    spark.read
    .format("delta")
    .load(delta_table_path)
)
print("Data read from Delta table:")
delta_df.show()

# --- 4. Append one more row using the same column names ----------------------
print("\nAppending new data:")
new_data = [("Sarah", 40)]
new_df = spark.createDataFrame(new_data, columns)
(
    new_df.write
    .format("delta")
    .mode("append")
    .save(delta_table_path)
)

# --- 5. Re-read the table so the appended row is visible ---------------------
print("\nReading updated Delta table:")
updated_delta_df = (
    spark.read
    .format("delta")
    .load(delta_table_path)
)
updated_delta_df.show()


Original DataFrame:
+----+---+
|name|age|
+----+---+
|John| 10|
|Jane| 20|
|Mike| 30|
+----+---+

Writing DataFrame to Delta table at: /home/jovyan/data/my_notebook_delta_table
DataFrame written to Delta table successfully!

Reading data from Delta table at: /home/jovyan/data/my_notebook_delta_table
Data read from Delta table:
+----+---+
|name|age|
+----+---+
|Jane| 20|
|John| 10|
|Mike| 30|
+----+---+


Appending new data:

Reading updated Delta table:
+-----+---+
| name|age|
+-----+---+
|Sarah| 40|
| Jane| 20|
| John| 10|
| Mike| 30|
+-----+---+



In [2]:
from pyspark.sql import SparkSession

# Build (or reuse) a SparkSession with Delta Lake support enabled:
#   * spark.jars.packages             -> Delta Lake artifact to resolve at launch
#   * spark.sql.extensions            -> registers Delta's SQL extension
#   * spark.sql.catalog.spark_catalog -> routes the default catalog through Delta
#
# The Delta artifact has to match the Spark runtime. This notebook's
# `spark.version` output is 3.3.1, and Delta Lake 2.4.x is built for
# Spark 3.4.x, so the 2.3.0 release (built for Spark 3.3.x) is requested.
#
# NOTE(review): getOrCreate() hands back an already-running session when one
# exists in the kernel (the first cell of this notebook creates one). In that
# case launch-time settings such as spark.jars.packages presumably do not take
# effect -- restart the kernel and run this cell first if they must apply.
spark = SparkSession.builder \
    .appName("DeltaExample") \
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.3.0") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .getOrCreate()

print("Done")


Done


In [3]:
# Display the version string of the Spark runtime behind the active session;
# useful for picking a Delta Lake release that matches it.
spark.version


'3.3.1'