In [2]:
import os

from pyspark.sql import SparkSession

# Step 1: Create SparkSession with Iceberg + Nessie + MinIO
#
# The MinIO credentials are taken from the environment (MINIO_ACCESS_KEY /
# MINIO_SECRET_KEY) so that real secrets never have to be saved in the
# notebook. The fallbacks are MinIO's stock development credentials, which
# keeps an unmodified local setup working exactly as before.
minio_access_key = os.environ.get("MINIO_ACCESS_KEY", "minioadmin")
minio_secret_key = os.environ.get("MINIO_SECRET_KEY", "minioadmin")

spark = (
    SparkSession.builder
    .appName("Image Metadata to Iceberg via Nessie")
    # Register a Spark catalog named "nessie": an Iceberg SparkCatalog whose
    # implementation is the Nessie catalog, pinned to the "main" reference.
    .config("spark.sql.catalog.nessie", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.nessie.catalog-impl", "org.apache.iceberg.nessie.NessieCatalog")
    .config("spark.sql.catalog.nessie.uri", "http://nessie:19120/api/v1")
    .config("spark.sql.catalog.nessie.ref", "main")
    .config("spark.sql.catalog.nessie.warehouse", "s3a://warehouse/")
    # S3A settings pointing the Hadoop S3 client at the MinIO endpoint:
    # path-style bucket addressing, plain HTTP (SSL disabled).
    .config("spark.hadoop.fs.s3a.access.key", minio_access_key)
    .config("spark.hadoop.fs.s3a.secret.key", minio_secret_key)
    .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000")
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
    # Reuses the session already running in this kernel, if there is one.
    .getOrCreate()
)

# Step 2: Load the PNG from the MinIO bucket through Spark's "image" data source
image_path = "s3a://unstructured-data/airport_table_view_first5cols.png"
image_df = spark.read.format("image").load(image_path)

# Optional: print the schema, then show the metadata fields of the `image`
# struct (the `data` field is not selected).
image_df.printSchema()
metadata_columns = (
    "image.origin",
    "image.height",
    "image.width",
    "image.nChannels",
    "image.mode",
)
image_df.select(*metadata_columns).show(truncate=False)

# Step 3: Make sure the target namespace exists in the "nessie" catalog
spark.sql("CREATE NAMESPACE IF NOT EXISTS nessie.bronze_layer")

# Step 4: Persist the image DataFrame as a table in that namespace; the table
# is created on the first run and replaced on later runs.
target_table = "nessie.bronze_layer.unstructured_data"
image_df.writeTo(target_table).createOrReplace()

# Step 5: Read the table back to verify the write; at most 100 rows are
# converted to pandas and displayed as the cell's output.
stored_images = spark.read.table(target_table)
stored_images.limit(100).toPandas()


root
 |-- image: struct (nullable = true)
 |    |-- origin: string (nullable = true)
 |    |-- height: integer (nullable = true)
 |    |-- width: integer (nullable = true)
 |    |-- nChannels: integer (nullable = true)
 |    |-- mode: integer (nullable = true)
 |    |-- data: binary (nullable = true)



                                                                                

+---------------------------------------------------------+------+-----+---------+----+
|origin                                                   |height|width|nChannels|mode|
+---------------------------------------------------------+------+-----+---------+----+
|s3a://unstructured-data/airport_table_view_first5cols.png|348   |1900 |4        |24  |
+---------------------------------------------------------+------+-----+---------+----+



                                                                                

Unnamed: 0,image
0,(s3a://unstructured-data/airport_table_view_fi...


In [2]:
# List every row of the table's `snapshots` metadata table without truncating
# the column values.
snapshots_df = spark.sql("SELECT * FROM nessie.bronze_layer.unstructured_data.snapshots")
snapshots_df.show(truncate=False)


+-----------------------+-------------------+---------+---------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|committed_at           |snapshot_id        |parent_id|operation|manifest_list                                                                                                                                                    |summary                                                                                                                                                                                                                                                       

In [3]:
# Time travel: read the table as of one specific snapshot.
# NOTE(review): the snapshot id is hard-coded — presumably copied from the
# snapshots listing; it will not exist in a freshly created warehouse.
snapshot_id = "5092265359251046026"
df = (
    spark.read
    .option("snapshot-id", snapshot_id)
    .table("nessie.bronze_layer.unstructured_data")
)
df.select("image.origin", "image.height").show()


+--------------------+------+
|              origin|height|
+--------------------+------+
|s3a://unstructure...|   348|
+--------------------+------+

