In [2]:
import os

from pyspark.sql import SparkSession

# Object-store connection settings.
# Credentials are read from the environment so real secrets never have to be
# saved inside the notebook; the fallbacks are the values this local demo
# stack was previously hard-coding, so the cell still runs unchanged there.
MINIO_ENDPOINT = os.environ.get("MINIO_ENDPOINT", "http://minio:9000")
MINIO_ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY", "minioadmin")
MINIO_SECRET_KEY = os.environ.get("MINIO_SECRET_KEY", "minioadmin")

# Build (or reuse, via getOrCreate) the Spark session with an Iceberg catalog
# named "nessie" and S3A access to the object store.
spark = (
    SparkSession.builder
    .appName("IcebergTest")
    # Iceberg catalog "nessie", implemented by the Nessie catalog, on branch "main".
    .config("spark.sql.catalog.nessie", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.nessie.catalog-impl", "org.apache.iceberg.nessie.NessieCatalog")
    .config("spark.sql.catalog.nessie.uri", "http://nessie:19120/api/v1")
    .config("spark.sql.catalog.nessie.ref", "main")
    .config("spark.sql.catalog.nessie.warehouse", "s3a://warehouse")
    .config("spark.sql.catalog.nessie.s3.endpoint", MINIO_ENDPOINT)
    # Hadoop S3A filesystem settings used for s3a:// paths.
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.access.key", MINIO_ACCESS_KEY)
    .config("spark.hadoop.fs.s3a.secret.key", MINIO_SECRET_KEY)
    .config("spark.hadoop.fs.s3a.endpoint", MINIO_ENDPOINT)
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .getOrCreate()
)


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


25/06/16 20:07:49 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Smoke test: ask Hadoop for the filesystem that serves the warehouse URI and
# report which implementation class was resolved. Any failure along the way is
# caught and printed rather than raised, so the notebook keeps running.
try:
    warehouse_uri = spark._jvm.java.net.URI("s3a://warehouse")
    hadoop_conf = spark._jsc.hadoopConfiguration()
    fs = spark._jvm.org.apache.hadoop.fs.FileSystem.get(warehouse_uri, hadoop_conf)
    print("✅ S3A filesystem is working!")
    fs_class_name = fs.getClass().getName()
    print("FileSystem class:", fs_class_name)
except Exception as e:
    print("❌ S3A filesystem is NOT configured correctly.")
    print("Error:", e)

25/06/16 20:07:52 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
✅ S3A filesystem is working!
FileSystem class: org.apache.hadoop.fs.s3a.S3AFileSystem


In [5]:


# Step 1: Create namespace (like a database)
spark.sql("CREATE NAMESPACE IF NOT EXISTS nessie.db")

# Step 2: Create Iceberg table in Nessie catalog.
# IF NOT EXISTS matches the namespace statement above, so re-running this cell
# no longer fails once the table is already there.
spark.sql("CREATE TABLE IF NOT EXISTS nessie.db.sample (id INT, name STRING)")

# Step 3: Insert the seed rows, but only into an empty table, so that
# re-running the cell does not append duplicate copies of them.
sample_is_empty = spark.sql("SELECT 1 FROM nessie.db.sample LIMIT 1").count() == 0
if sample_is_empty:
    spark.sql("INSERT INTO nessie.db.sample VALUES (1, 'Asad'), (2, 'Kamran')")

# Step 4: Query the table
spark.sql("SELECT * FROM nessie.db.sample").show()


                                                                                

+---+------+
| id|  name|
+---+------+
|  1|  Asad|
|  2|Kamran|
+---+------+



In [6]:
# Insert more data (note: this appends one more row every time the cell runs)
spark.sql("INSERT INTO nessie.db.sample VALUES (3, 'Oman')")

# View updated data
spark.sql("SELECT * FROM nessie.db.sample").show()

# Now time travel back to the earliest snapshot.
# A query without ORDER BY gives no guarantee about row order, so taking
# "row 0" of the snapshots table is not reliably the first snapshot. Sort by
# commit time and fetch a single row instead of collecting every snapshot.
earliest_snapshot = spark.sql(
    "SELECT snapshot_id FROM nessie.db.sample.snapshots "
    "ORDER BY committed_at ASC LIMIT 1"
).first()
snapshot_id = earliest_snapshot["snapshot_id"]
spark.sql(f"SELECT * FROM nessie.db.sample VERSION AS OF {snapshot_id}").show()


+---+------+
| id|  name|
+---+------+
|  1|  Asad|
|  2|Kamran|
|  3|  Oman|
+---+------+

+---+------+
| id|  name|
+---+------+
|  1|  Asad|
|  2|Kamran|
+---+------+



In [8]:
# Show the table's snapshots metadata table (default truncated display),
# then the extended table description with full-width columns.
snapshots_query = "SELECT * FROM nessie.db.sample.snapshots"
describe_query = "DESCRIBE TABLE EXTENDED nessie.db.sample"

spark.sql(snapshots_query).show()
spark.sql(describe_query).show(truncate=False)


+--------------------+-------------------+-------------------+---------+--------------------+--------------------+
|        committed_at|        snapshot_id|          parent_id|operation|       manifest_list|             summary|
+--------------------+-------------------+-------------------+---------+--------------------+--------------------+
|2025-06-16 20:10:...|5180409574472685445|               null|   append|s3a://warehouse/d...|{spark.app.id -> ...|
|2025-06-16 20:11:...|7795097965680246088|5180409574472685445|   append|s3a://warehouse/d...|{spark.app.id -> ...|
+--------------------+-------------------+-------------------+---------+--------------------+--------------------+

+----------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------+
|co

In [9]:
# Show the manifests metadata table; truncation is off so long values such as
# the manifest paths are printed in full.
manifests_df = spark.sql("SELECT * FROM nessie.db.sample.manifests")
manifests_df.show(truncate=False)


+-------+--------------------------------------------------------------------------------------------------------------------+------+-----------------+-------------------+----------------------+-------------------------+------------------------+------------------------+---------------------------+--------------------------+-------------------+
|content|path                                                                                                                |length|partition_spec_id|added_snapshot_id  |added_data_files_count|existing_data_files_count|deleted_data_files_count|added_delete_files_count|existing_delete_files_count|deleted_delete_files_count|partition_summaries|
+-------+--------------------------------------------------------------------------------------------------------------------+------+-----------------+-------------------+----------------------+-------------------------+------------------------+------------------------+---------------------------+----------

In [10]:
# Show the files metadata table with every column printed at full width.
data_files_df = spark.sql("SELECT * FROM nessie.db.sample.files")
data_files_df.show(truncate=False)


+-------+------------------------------------------------------------------------------------------------------------------------------+-----------+-------+------------+------------------+------------------+----------------+-----------------+----------------+------------------------+------------------------+------------+-------------+------------+-------------+----------------------------------------------------------+
|content|file_path                                                                                                                     |file_format|spec_id|record_count|file_size_in_bytes|column_sizes      |value_counts    |null_value_counts|nan_value_counts|lower_bounds            |upper_bounds            |key_metadata|split_offsets|equality_ids|sort_order_id|readable_metrics                                          |
+-------+------------------------------------------------------------------------------------------------------------------------------+-----------+------

In [12]:
# Print the extended description of the sample table without truncating values.
table_description_df = spark.sql("DESCRIBE TABLE EXTENDED nessie.db.sample")
table_description_df.show(truncate=False)


+----------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------+
|col_name                    |data_type                                                                                                                                                                                                                                                                 |comment|
+----------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------+
|id                          |int                                                 