In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, LongType
from dotenv import load_dotenv
import os

In [2]:
# Load variables from a local .env file (if one exists) into os.environ so the
# cluster endpoint can be configured without editing this notebook.
load_dotenv()

# HDFS NameNode address as "host:port". Read from the HDFS_ENDPOINT environment
# variable; falls back to the demo cluster address when the variable is unset.
HDFS_ENDPOINT = os.getenv("HDFS_ENDPOINT", "demo-hadoop-namenode:9000")

# Build (or reuse) a Spark session configured for Delta Lake: the Delta SQL
# extension and catalog are registered, and the delta-core package is requested
# through spark.jars.packages.
spark = (
    SparkSession.builder
    .appName("DeltaLakeHDFS")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.0.0")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .getOrCreate()
)

In [3]:
# Name of the demo table and the HDFS URI that its Delta files are written to.
table_name = "people"
table_location = f"hdfs://{HDFS_ENDPOINT}/delta-lake/{table_name}"

In [6]:
# Display the resolved Delta table URI as the cell output.
table_location

'hdfs://demo-hadoop-namenode:9000/delta-lake/people'

In [4]:
# Explicit schema for the people table; every column is declared nullable.
schema = StructType([
    StructField("Name", StringType(), True),
    StructField("Age", IntegerType(), True),
    StructField("City", StringType(), True),
])

# Seed rows, one tuple per person in (Name, Age, City) order.
data = [
    ("Alice", 30, "New York"),
    ("Bob", 25, "San Francisco"),
    ("Charlie", 35, "Los Angeles"),
]

# Create DataFrame
df = spark.createDataFrame(data, schema)

# Write the seed rows as a Delta table at table_location.
# mode("overwrite") keeps this cell re-runnable: the default save mode raises
# an error when the target path already exists, which breaks Restart & Run All
# after the first successful run.
df.write.format("delta").mode("overwrite").save(table_location)

In [7]:
# Extra rows in the same (Name, Age, City) layout as the existing schema.
new_data = [
    ("David", 28, "Chicago"),
    ("Eve", 22, "Boston"),
]

new_df = spark.createDataFrame(new_data, schema)

# Append the extra rows to the Delta table stored at table_location.
(
    new_df.write
    .format("delta")
    .mode("append")
    .option("path", table_location)
    .save()
)