### 01 - Setup and test

In [None]:
# --------------------------------------------------------------------------
# 00_setup_and_test.ipynb
# Spark + Iceberg + MinIO (S3A) + Polaris (alleen voor Iceberg tabellen)
# --------------------------------------------------------------------------

from pyspark.sql import SparkSession
import os

# ======================================================================
# 1Ô∏è‚É£ Polaris endpoints + scope
# ======================================================================

POLARIS_URI = os.getenv("POLARIS_URI", "http://polaris:8181/api/catalog").rstrip("/")
if POLARIS_URI in ("http://polaris:8181", "https://polaris:8181"):
    POLARIS_URI = f"{POLARIS_URI}/api/catalog"

POLARIS_OAUTH2 = os.getenv("POLARIS_OAUTH2_TOKEN_URL", "http://polaris:8181/api/catalog/v1/oauth/tokens")
POLARIS_SCOPE = os.getenv("POLARIS_SCOPE", "PRINCIPAL_ROLE:ALL")

# catalog name (hoe je 'm aanspreekt in Spark): polaris.<ns>.<table>
POLARIS_CATALOG_NAME = os.getenv("POLARIS_CATALOG_NAME", "polaris")

# Iceberg warehouse locatie (waar data/metadata files landen in object storage)
ICEBERG_WAREHOUSE = os.getenv("ICEBERG_WAREHOUSE", "s3a://warehouse/iceberg")

# voorlopig admin/password
POLARIS_CLIENT_ID = os.getenv("POLARIS_CLIENT_ID", "admin")
POLARIS_CLIENT_SECRET = os.getenv("POLARIS_CLIENT_SECRET", "password")

# ======================================================================
# 2Ô∏è‚É£ Spark cluster
# ======================================================================

SPARK_MASTER = os.getenv("SPARK_MASTER", "spark://spark-master:7077")
DRIVER_HOST = os.getenv("SPARK_DRIVER_HOST", "jupyter")

# ======================================================================
# 3Ô∏è‚É£ MinIO/S3A
# ======================================================================

S3_ENDPOINT = os.getenv("S3_ENDPOINT", "http://minio:9000")
S3_ACCESS_KEY = os.getenv("MINIO_ROOT_USER", "minioadmin")
S3_SECRET_KEY = os.getenv("MINIO_ROOT_PASSWORD", "minioadmin")

# Stop oude sessie
if "spark" in locals():
    try:
        spark.stop()
    except Exception:
        pass

print(f"üîó SPARK_MASTER       : {SPARK_MASTER}")
print(f"üß∑ spark.driver.host  : {DRIVER_HOST}")
print(f"üß≠ POLARIS_URI        : {POLARIS_URI}")
print(f"üè∑Ô∏è  POLARIS catalog   : {POLARIS_CATALOG_NAME}")
print(f"üì¶ Iceberg warehouse  : {ICEBERG_WAREHOUSE}")
print(f"ü™£ S3 endpoint        : {S3_ENDPOINT}")

# ======================================================================
# 4Ô∏è‚É£ SparkSession
# ======================================================================

builder = (
    SparkSession.builder
    .appName("Lakehouse-Unplugged")
    .master(SPARK_MASTER)

    # Driver bereikbaar voor executors
    .config("spark.driver.host", DRIVER_HOST)
    .config("spark.driver.bindAddress", "0.0.0.0")

    # (aanrader) serializer
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")

    # Iceberg extensies
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")

    # Polaris NIET als default catalog
    .config("spark.sql.defaultCatalog", "spark_catalog")

    # Polaris catalog (alleen gebruiken wanneer je 'polaris.*' aanspreekt)
    .config(f"spark.sql.catalog.{POLARIS_CATALOG_NAME}", "org.apache.iceberg.spark.SparkCatalog")
    .config(f"spark.sql.catalog.{POLARIS_CATALOG_NAME}.catalog-impl", "org.apache.iceberg.rest.RESTCatalog")
    .config(f"spark.sql.catalog.{POLARIS_CATALOG_NAME}.uri", POLARIS_URI)
    .config(f"spark.sql.catalog.{POLARIS_CATALOG_NAME}.warehouse", ICEBERG_WAREHOUSE)  # <‚Äî echte warehouse locatie
    .config(f"spark.sql.catalog.{POLARIS_CATALOG_NAME}.rest.auth.type", "oauth2")
    .config(f"spark.sql.catalog.{POLARIS_CATALOG_NAME}.credential", f"{POLARIS_CLIENT_ID}:{POLARIS_CLIENT_SECRET}")
    .config(f"spark.sql.catalog.{POLARIS_CATALOG_NAME}.oauth2-server-uri", POLARIS_OAUTH2)
    .config(f"spark.sql.catalog.{POLARIS_CATALOG_NAME}.scope", POLARIS_SCOPE)
    .config(f"spark.sql.catalog.{POLARIS_CATALOG_NAME}.token-refresh-enabled", "true")

    # S3A / MinIO (data files)
    .config("spark.hadoop.fs.s3a.endpoint", S3_ENDPOINT)
    .config("spark.hadoop.fs.s3a.access.key", S3_ACCESS_KEY)
    .config("spark.hadoop.fs.s3a.secret.key", S3_SECRET_KEY)
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")

    # licht houden
    .config("spark.sql.shuffle.partitions", "4")
    .config("spark.sql.adaptive.enabled", "true")
)

# Optie A: liever GEEN jars.packages in de driver
pkgs = (os.getenv("SPARK_JARS_PACKAGES") or "").strip()
if pkgs:
    print(f"‚ö†Ô∏è spark.jars.packages staat aan (driver downloadt deps): {pkgs}")
    builder = builder.config("spark.jars.packages", pkgs)

spark = builder.getOrCreate()

print("‚úÖ Spark up.")
print("üì¶ Default catalog =", spark.conf.get("spark.sql.defaultCatalog"))
print("üß™ Sanity spark.range(10).count() =", spark.range(10).count())

# S3A sanity check (als S3A jars aanwezig zijn)
try:
    jvm = spark._jvm
    hadoop_conf = spark._jsc.hadoopConfiguration()
    fs = jvm.org.apache.hadoop.fs.FileSystem.get(jvm.java.net.URI("s3a://warehouse"), hadoop_conf)
    statuses = fs.listStatus(jvm.org.apache.hadoop.fs.Path("s3a://warehouse/"))
    print("ü™£ S3A sanity list (warehouse):", [s.getPath().toString() for s in statuses])
except Exception as exc:
    print("‚ö†Ô∏è S3A sanity check skipped:", exc)


FileNotFoundError: ‚ùå Geen spark-submit gevonden op /opt/spark/bin/spark-submit en ook niet op /usr/local/lib/python3.11/site-packages/bin/spark-submit

### Driver: Iceberg versie check

In [11]:
# ======================================================================
# üîé Iceberg sanity check (driver)
# ======================================================================

spark = SparkSession.builder.getOrCreate()

try:
    jvm = spark._jvm
    version = jvm.org.apache.iceberg.VersionInfo.VERSION
    print("üß† Driver Iceberg version:", version)
except Exception as e:
    print("‚ùå Driver Iceberg error:", e)


üß† Driver Iceberg version: <py4j.java_gateway.JavaPackage object at 0x74efe412e4d0>


#### 02 - Parkeer bestande in de landingzone

In [12]:
import boto3
from pathlib import Path
import os

# ======================================================================
# 0Ô∏è‚É£ Helper: zoek automatisch lokaal data-bestand
# ======================================================================
def find_data_file(filename: str) -> Path:
    p = Path.cwd()
    for _ in range(4):
        candidate = p / "data" / filename
        if candidate.exists():
            return candidate
        p = p.parent
    raise FileNotFoundError(f"‚ùå Kon '{filename}' niet vinden in een 'data' map.")

# ======================================================================
# 1Ô∏è‚É£ Config
# ======================================================================
local_file = find_data_file("gekentekendevoertuigen_sample.json")
bucket = "warehouse"
prefix = "landing"

object_key = f"{prefix}/{local_file.name}"
s3_uri = f"s3a://{bucket}/{object_key}"

print(f"üìÑ Lokaal bestand: {local_file}")
print(f"‚¨ÜÔ∏è Upload naar:  {s3_uri}")

s3_endpoint = os.getenv("S3_ENDPOINT", "http://minio:9000")
s3_access_key = os.getenv("MINIO_ROOT_USER", "minioadmin")
s3_secret_key = os.getenv("MINIO_ROOT_PASSWORD", "minioadmin")

# ======================================================================
# 2Ô∏è‚É£ MinIO client via boto3
# ======================================================================
s3 = boto3.client(
    "s3",
    endpoint_url=s3_endpoint,
    aws_access_key_id=s3_access_key,
    aws_secret_access_key=s3_secret_key,
    region_name="us-east-1",
)

# Upload bestand
s3.upload_file(str(local_file), bucket, object_key)

print("‚úÖ Upload gelukt.")


# ======================================================================
# 3Ô∏è‚É£ Verify: lijst objecten in prefix
# ======================================================================
response = s3.list_objects_v2(Bucket=bucket, Prefix=prefix)
print("üì¶ Objecten in MinIO:")
for item in response.get("Contents", []):
    print(" -", item["Key"])


# ======================================================================
# 4Ô∏è‚É£ Spark read via S3A (data-files), metadata via Polaris
# ======================================================================
print(f"üì• Inlezen via Spark: {s3_uri}")

df = spark.read.option("multiline", "true").json(s3_uri)

print(f"üìä Aantal records: {df.count():,}")
df.printSchema()


üìÑ Lokaal bestand: /workspace/data/gekentekendevoertuigen_sample.json
‚¨ÜÔ∏è Upload naar:  s3a://warehouse/landing/gekentekendevoertuigen_sample.json
‚úÖ Upload gelukt.
üì¶ Objecten in MinIO:
 - landing/gekentekendevoertuigen_sample.json
üì• Inlezen via Spark: s3a://warehouse/landing/gekentekendevoertuigen_sample.json


26/01/06 12:28:44 WARN FileStreamSink: Assume no metadata directory. Error while looking for metadata directory in the path: s3a://warehouse/landing/gekentekendevoertuigen_sample.json.
java.lang.RuntimeException: java.lang.ClassNotFoundException: Class org.apache.hadoop.fs.s3a.S3AFileSystem not found
	at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2688)
	at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:3431)
	at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:3466)
	at org.apache.hadoop.fs.FileSystem.access$300(FileSystem.java:174)
	at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:3574)
	at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:3521)
	at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:540)
	at org.apache.hadoop.fs.Path.getFileSystem(Path.java:365)
	at org.apache.spark.sql.execution.streaming.FileStreamSink$.hasMetadata(FileStreamSink.scala:53)
	at org.apache.spark.sql.execution.dataso

Py4JJavaError: An error occurred while calling o320.json.
: java.lang.RuntimeException: java.lang.ClassNotFoundException: Class org.apache.hadoop.fs.s3a.S3AFileSystem not found
	at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2688)
	at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:3431)
	at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:3466)
	at org.apache.hadoop.fs.FileSystem.access$300(FileSystem.java:174)
	at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:3574)
	at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:3521)
	at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:540)
	at org.apache.hadoop.fs.Path.getFileSystem(Path.java:365)
	at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$checkAndGlobPathIfNecessary$1(DataSource.scala:724)
	at scala.collection.immutable.List.map(List.scala:293)
	at org.apache.spark.sql.execution.datasources.DataSource$.checkAndGlobPathIfNecessary(DataSource.scala:722)
	at org.apache.spark.sql.execution.datasources.DataSource.checkAndGlobPathIfNecessary(DataSource.scala:551)
	at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:404)
	at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:229)
	at org.apache.spark.sql.DataFrameReader.$anonfun$load$2(DataFrameReader.scala:211)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:211)
	at org.apache.spark.sql.DataFrameReader.json(DataFrameReader.scala:362)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:569)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:840)
Caused by: java.lang.ClassNotFoundException: Class org.apache.hadoop.fs.s3a.S3AFileSystem not found
	at org.apache.hadoop.conf.Configuration.getClassByName(Configuration.java:2592)
	at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2686)
	... 29 more


#### 03 - Ingest into bronze table

In [3]:
# ======================================================================
# üîÑ Ingest van Landingzone ‚Üí Bronze (Iceberg via Polaris)
# ======================================================================

bucket = "warehouse"
prefix = "landing"
local_file = find_data_file("gekentekendevoertuigen_sample.json")
object_key = f"{prefix}/{local_file.name}"
s3_uri = f"s3a://{bucket}/{object_key}"

ns = "polaris.bronze"
bronze_table = f"{ns}.gekentekendevoertuigen"

print(f"üì• Lezen vanuit landingzone: {s3_uri}")

# 1) Data inlezen uit landingzone
df = spark.read.option("multiline", "true").json(s3_uri)

print(f"üì¶ Aantal records geladen: {df.count():,}")
df.printSchema()

# 2) Namespace garanderen
spark.sql(f"CREATE NAMESPACE IF NOT EXISTS {ns}")

# 3) Wegschrijven naar Iceberg Bronze
print(f"üßä Schrijven naar Bronze tabel: {bronze_table}")

(
    df.writeTo(bronze_table)
      .using("iceberg")
      .option("format-version", "2")
      .option("overwrite-mode", "dynamic")
      .createOrReplace()
)

print(f"‚úÖ Bronze tabel bijgewerkt: {bronze_table}")

# 4) Tabellen tonen
print("üìã Tabellen in polaris.bronze:")
spark.sql("SHOW TABLES IN polaris.bronze").show(truncate=False)

# 5) Bronze teruglezen ter controle
bronze_df = spark.read.table(bronze_table)

print(f"üîÅ Records in Bronze: {bronze_df.count():,}")
bronze_df.show(5, truncate=False)


üì• Lezen vanuit landingzone: s3a://warehouse/landing/gekentekendevoertuigen_sample.json
üì¶ Aantal records geladen: 10,000
root
 |-- aanhangwagen_autonoom_geremd: string (nullable = true)
 |-- aanhangwagen_middenas_geremd: string (nullable = true)
 |-- aantal_cilinders: string (nullable = true)
 |-- aantal_deuren: string (nullable = true)
 |-- aantal_rolstoelplaatsen: string (nullable = true)
 |-- aantal_staanplaatsen: string (nullable = true)
 |-- aantal_wielen: string (nullable = true)
 |-- aantal_zitplaatsen: string (nullable = true)
 |-- afstand_hart_koppeling_tot_achterzijde_voertuig: string (nullable = true)
 |-- afstand_voorzijde_voertuig_tot_hart_koppeling: string (nullable = true)
 |-- afwijkende_maximum_snelheid: string (nullable = true)
 |-- api_gekentekende_voertuigen_assen: string (nullable = true)
 |-- api_gekentekende_voertuigen_brandstof: string (nullable = true)
 |-- api_gekentekende_voertuigen_carrosserie: string (nullable = true)
 |-- api_gekentekende_voertuigen_c

26/01/06 09:06:37 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
26/01/06 09:06:37 WARN TaskSetManager: Lost task 0.0 in stage 11.0 (TID 9) (172.18.0.5 executor 0): java.io.InvalidClassException: org.apache.iceberg.Schema; local class incompatible: stream classdesc serialVersionUID = 1630427867957364554, local class serialVersionUID = 6812231194765760118
	at java.base/java.io.ObjectStreamClass.initNonProxy(ObjectStreamClass.java:597)
	at java.base/java.io.ObjectInputStream.readNonProxyDesc(ObjectInputStream.java:2051)
	at java.base/java.io.ObjectInputStream.readClassDesc(ObjectInputStream.java:1898)
	at java.base/java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2224)
	at java.base/java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1733)
	at java.base/java.io.ObjectInputStream$FieldValues.<init>(ObjectInputStream.java:2606)
	at java.bas

Py4JJavaError: An error occurred while calling o89.createOrReplace.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 11.0 failed 4 times, most recent failure: Lost task 0.3 in stage 11.0 (TID 12) (172.18.0.5 executor 0): java.io.InvalidClassException: org.apache.iceberg.Schema; local class incompatible: stream classdesc serialVersionUID = 1630427867957364554, local class serialVersionUID = 6812231194765760118
	at java.base/java.io.ObjectStreamClass.initNonProxy(ObjectStreamClass.java:597)
	at java.base/java.io.ObjectInputStream.readNonProxyDesc(ObjectInputStream.java:2051)
	at java.base/java.io.ObjectInputStream.readClassDesc(ObjectInputStream.java:1898)
	at java.base/java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2224)
	at java.base/java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1733)
	at java.base/java.io.ObjectInputStream$FieldValues.<init>(ObjectInputStream.java:2606)
	at java.base/java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2457)
	at java.base/java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2257)
	at java.base/java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1733)
	at java.base/java.io.ObjectInputStream.readArray(ObjectInputStream.java:2157)
	at java.base/java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1721)
	at java.base/java.io.ObjectInputStream$FieldValues.<init>(ObjectInputStream.java:2606)
	at java.base/java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2457)
	at java.base/java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2257)
	at java.base/java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1733)
	at java.base/java.io.ObjectInputStream$FieldValues.<init>(ObjectInputStream.java:2606)
	at java.base/java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2457)
	at java.base/java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2257)
	at java.base/java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1733)
	at java.base/java.io.ObjectInputStream.readObject(ObjectInputStream.java:509)
	at java.base/java.io.ObjectInputStream.readObject(ObjectInputStream.java:467)
	at org.apache.spark.serializer.JavaDeserializationStream.readObject(JavaSerializer.scala:87)
	at org.apache.spark.serializer.JavaSerializerInstance.deserialize(JavaSerializer.scala:129)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:86)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
	at java.base/java.lang.Thread.run(Thread.java:840)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2792)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2791)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2791)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1247)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2994)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2983)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:989)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2398)
	at org.apache.spark.sql.execution.datasources.v2.V2TableWriteExec.writeWithV2(WriteToDataSourceV2Exec.scala:385)
	at org.apache.spark.sql.execution.datasources.v2.V2TableWriteExec.writeWithV2$(WriteToDataSourceV2Exec.scala:359)
	at org.apache.spark.sql.execution.datasources.v2.AppendDataExec.writeWithV2(WriteToDataSourceV2Exec.scala:225)
	at org.apache.spark.sql.execution.datasources.v2.V2ExistingTableWriteExec.run(WriteToDataSourceV2Exec.scala:337)
	at org.apache.spark.sql.execution.datasources.v2.V2ExistingTableWriteExec.run$(WriteToDataSourceV2Exec.scala:336)
	at org.apache.spark.sql.execution.datasources.v2.AppendDataExec.run(WriteToDataSourceV2Exec.scala:225)
	at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.result$lzycompute(V2CommandExec.scala:43)
	at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.result(V2CommandExec.scala:43)
	at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.executeCollect(V2CommandExec.scala:49)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:107)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:201)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:108)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:66)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:107)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:98)
	at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:461)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(origin.scala:76)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:461)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:437)
	at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:98)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:85)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:83)
	at org.apache.spark.sql.execution.QueryExecution.assertCommandExecuted(QueryExecution.scala:142)
	at org.apache.spark.sql.execution.datasources.v2.V2CreateTableAsSelectBaseExec.$anonfun$writeToTable$1(WriteToDataSourceV2Exec.scala:577)
	at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1397)
	at org.apache.spark.sql.execution.datasources.v2.V2CreateTableAsSelectBaseExec.writeToTable(WriteToDataSourceV2Exec.scala:573)
	at org.apache.spark.sql.execution.datasources.v2.V2CreateTableAsSelectBaseExec.writeToTable$(WriteToDataSourceV2Exec.scala:567)
	at org.apache.spark.sql.execution.datasources.v2.AtomicReplaceTableAsSelectExec.writeToTable(WriteToDataSourceV2Exec.scala:183)
	at org.apache.spark.sql.execution.datasources.v2.AtomicReplaceTableAsSelectExec.run(WriteToDataSourceV2Exec.scala:216)
	at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.result$lzycompute(V2CommandExec.scala:43)
	at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.result(V2CommandExec.scala:43)
	at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.executeCollect(V2CommandExec.scala:49)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:107)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:201)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:108)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:66)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:107)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:98)
	at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:461)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(origin.scala:76)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:461)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:437)
	at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:98)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:85)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:83)
	at org.apache.spark.sql.execution.QueryExecution.assertCommandExecuted(QueryExecution.scala:142)
	at org.apache.spark.sql.DataFrameWriterV2.runCommand(DataFrameWriterV2.scala:196)
	at org.apache.spark.sql.DataFrameWriterV2.internalReplace(DataFrameWriterV2.scala:208)
	at org.apache.spark.sql.DataFrameWriterV2.createOrReplace(DataFrameWriterV2.scala:134)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:569)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:840)
Caused by: java.io.InvalidClassException: org.apache.iceberg.Schema; local class incompatible: stream classdesc serialVersionUID = 1630427867957364554, local class serialVersionUID = 6812231194765760118
	at java.base/java.io.ObjectStreamClass.initNonProxy(ObjectStreamClass.java:597)
	at java.base/java.io.ObjectInputStream.readNonProxyDesc(ObjectInputStream.java:2051)
	at java.base/java.io.ObjectInputStream.readClassDesc(ObjectInputStream.java:1898)
	at java.base/java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2224)
	at java.base/java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1733)
	at java.base/java.io.ObjectInputStream$FieldValues.<init>(ObjectInputStream.java:2606)
	at java.base/java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2457)
	at java.base/java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2257)
	at java.base/java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1733)
	at java.base/java.io.ObjectInputStream.readArray(ObjectInputStream.java:2157)
	at java.base/java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1721)
	at java.base/java.io.ObjectInputStream$FieldValues.<init>(ObjectInputStream.java:2606)
	at java.base/java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2457)
	at java.base/java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2257)
	at java.base/java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1733)
	at java.base/java.io.ObjectInputStream$FieldValues.<init>(ObjectInputStream.java:2606)
	at java.base/java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2457)
	at java.base/java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2257)
	at java.base/java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1733)
	at java.base/java.io.ObjectInputStream.readObject(ObjectInputStream.java:509)
	at java.base/java.io.ObjectInputStream.readObject(ObjectInputStream.java:467)
	at org.apache.spark.serializer.JavaDeserializationStream.readObject(JavaSerializer.scala:87)
	at org.apache.spark.serializer.JavaSerializerInstance.deserialize(JavaSerializer.scala:129)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:86)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
	at java.base/java.lang.Thread.run(Thread.java:840)


#### Query de bronze table

In [22]:
from pyspark.sql.functions import col

df = spark.read.table("polaris.bronze.gekentekendevoertuigen")

print("üöó Top 5 voertuigsoorten:")
(
    df.groupBy("voertuigsoort")
      .count()
      .orderBy(col("count").desc())
      .show(5, truncate=False)
)

print("
üè∑Ô∏è Top 5 merken:")
(
    df.groupBy("merk")
      .count()
      .orderBy(col("count").desc())
      .show(5, truncate=False)
)

print("
üî§ Top 5 handelsbenamingen:")
(
    df.groupBy("handelsbenaming")
      .count()
      .orderBy(col("count").desc())
      .show(5, truncate=False)
)

print("
‚ö° Top 5 voertuigen op vermogen (massarijklaar):")
(
    df.select("merk", "handelsbenaming", "vermogen_massarijklaar")
      .orderBy(col("vermogen_massarijklaar").desc_nulls_last())
      .show(5, truncate=False)
)



üöó Top 5 voertuigsoorten:
+--------------------+-----+
|voertuigsoort       |count|
+--------------------+-----+
|Personenauto        |7078 |
|Bedrijfsauto        |1237 |
|Bromfiets           |782  |
|Motorfiets          |258  |
|Middenasaanhangwagen|136  |
+--------------------+-----+
only showing top 5 rows


üè∑Ô∏è Top 5 merken:
+-------------+-----+
|merk         |count|
+-------------+-----+
|VOLKSWAGEN   |1076 |
|PEUGEOT      |615  |
|RENAULT      |606  |
|MERCEDES-BENZ|565  |
|FORD         |553  |
+-------------+-----+
only showing top 5 rows


üî§ Top 5 handelsbenamingen:
+---------------+-----+
|handelsbenaming|count|
+---------------+-----+
|POLO           |219  |
|GOLF           |202  |
|FOCUS          |138  |
|N/A            |135  |
|CLIO           |125  |
+---------------+-----+
only showing top 5 rows


‚ö° Top 5 voertuigen op vermogen (massarijklaar):
+-------+--------------------+----------------------+
|merk   |handelsbenaming     |vermogen_massarijklaar|
+-------+