### 01 - Setup and test

In [None]:
from pyspark.sql import SparkSession
import os

# Polaris (Iceberg REST catalog)
# Base URI of the Polaris REST API; trailing slashes are removed.
POLARIS_URI = os.getenv("POLARIS_URI", "http://polaris:8181/api/catalog").rstrip("/")
# OAuth2 token endpoint. Unless overridden explicitly it is derived from
# POLARIS_URI, so changing POLARIS_URI alone no longer leaves the token URL
# pointing at the default host. With the default POLARIS_URI the value is
# unchanged: http://polaris:8181/api/catalog/v1/oauth/tokens
POLARIS_OAUTH2 = os.getenv("POLARIS_OAUTH2_TOKEN_URL", f"{POLARIS_URI}/v1/oauth/tokens")
POLARIS_SCOPE = os.getenv("POLARIS_SCOPE", "PRINCIPAL_ROLE:ALL")
# NOTE(review): the fallback credentials in this cell look like local
# development defaults; supply real values through the environment.
POLARIS_CLIENT_ID = os.getenv("POLARIS_CLIENT_ID", "admin")
POLARIS_CLIENT_SECRET = os.getenv("POLARIS_CLIENT_SECRET", "password")

# Spark
SPARK_MASTER = os.getenv("SPARK_MASTER", "spark://spark-master:7077")
DRIVER_HOST = os.getenv("SPARK_DRIVER_HOST", "jupyter")  # docker service name

# MinIO / S3A
S3_ENDPOINT = os.getenv("S3_ENDPOINT", "http://minio:9000")
S3_ACCESS_KEY = os.getenv("MINIO_ROOT_USER", "minioadmin")
S3_SECRET_KEY = os.getenv("MINIO_ROOT_PASSWORD", "minioadmin")

# Iceberg files location. Surrounding whitespace is removed and an s3a://
# scheme is rewritten to s3:// (only the lowercase "s3a://" prefix is handled).
ICEBERG_WAREHOUSE = os.getenv("ICEBERG_WAREHOUSE", "s3://warehouse/polaris").strip()
if ICEBERG_WAREHOUSE.startswith("s3a://"):
    ICEBERG_WAREHOUSE = f"s3://{ICEBERG_WAREHOUSE[len('s3a://') :]}"


# Stop a session left over from an earlier run of this cell. This is best
# effort: a failing stop() must not prevent building the new session, but the
# failure is reported instead of being swallowed silently.
if "spark" in globals():
    try:
        spark.stop()
    except Exception as stop_error:
        print(f"Oude Spark-sessie stoppen mislukt (genegeerd): {stop_error!r}")

print(f"üîó SPARK_MASTER      : {SPARK_MASTER}")
print(f"üß∑ DRIVER_HOST       : {DRIVER_HOST}")
print(f"üß≠ POLARIS_URI       : {POLARIS_URI}")
print(f"üì¶ ICEBERG_WAREHOUSE : {ICEBERG_WAREHOUSE}")
print(f"ü™£ S3 endpoint       : {S3_ENDPOINT}")

# The REST catalog's "warehouse" property is sent to Polaris to look up one of
# its catalogs. The recorded run of this notebook failed with
# "Unable to find warehouse s3://warehouse/iceberg" when the storage URI was
# passed here, so the value can now be set explicitly.
# NOTE(review): the fallback keeps the previous value (the storage URI) for
# backward compatibility; set POLARIS_CATALOG_NAME to the catalog name that is
# registered in Polaris for this stack - confirm against the Polaris setup.
POLARIS_CATALOG_NAME = os.getenv("POLARIS_CATALOG_NAME", "").strip() or ICEBERG_WAREHOUSE

# Same variable and default that the upload cell uses for its boto3 client.
S3_REGION = os.getenv("AWS_REGION", "us-east-1")

# Follow the endpoint scheme instead of always disabling TLS for S3A.
S3_SSL_ENABLED = "true" if S3_ENDPOINT.lower().startswith("https://") else "false"

builder = (
    SparkSession.builder
    .appName("Lakehouse-Unplugged")
    .master(SPARK_MASTER)
    .config("spark.driver.host", DRIVER_HOST)
    .config("spark.driver.bindAddress", "0.0.0.0")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    .config("spark.sql.defaultCatalog", "spark_catalog")

    # Polaris catalog (address tables as polaris.<ns>.<table>)
    .config("spark.sql.catalog.polaris", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.polaris.type", "rest")
    .config("spark.sql.catalog.polaris.uri", POLARIS_URI)
    .config("spark.sql.catalog.polaris.warehouse", POLARIS_CATALOG_NAME)
    .config("spark.sql.catalog.polaris.rest.auth.type", "oauth2")
    .config("spark.sql.catalog.polaris.credential", f"{POLARIS_CLIENT_ID}:{POLARIS_CLIENT_SECRET}")
    .config("spark.sql.catalog.polaris.oauth2-server-uri", POLARIS_OAUTH2)
    .config("spark.sql.catalog.polaris.scope", POLARIS_SCOPE)
    .config("spark.sql.catalog.polaris.token-refresh-enabled", "true")

    # Iceberg FileIO via the AWS bundle (handles s3:// against MinIO)
    .config("spark.sql.catalog.polaris.io-impl", "org.apache.iceberg.aws.s3.S3FileIO")
    .config("spark.sql.catalog.polaris.s3.endpoint", S3_ENDPOINT)
    .config("spark.sql.catalog.polaris.s3.path-style-access", "true")
    .config("spark.sql.catalog.polaris.s3.access-key-id", S3_ACCESS_KEY)
    .config("spark.sql.catalog.polaris.s3.secret-access-key", S3_SECRET_KEY)
    .config("spark.sql.catalog.polaris.s3.region", S3_REGION)

    # S3A / MinIO (plain s3a:// reads such as the landing-zone files)
    .config("spark.hadoop.fs.s3a.endpoint", S3_ENDPOINT)
    .config("spark.hadoop.fs.s3a.access.key", S3_ACCESS_KEY)
    .config("spark.hadoop.fs.s3a.secret.key", S3_SECRET_KEY)
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", S3_SSL_ENABLED)
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")

    .config("spark.sql.shuffle.partitions", "4")
    .config("spark.sql.adaptive.enabled", "true")
)

# Optional extra packages, taken from SPARK_JARS_PACKAGES when it is non-blank.
pkgs = os.getenv("SPARK_JARS_PACKAGES", "").strip()
if len(pkgs) > 0:
    print(f"‚ö†Ô∏è spark.jars.packages staat aan (driver downloadt deps): {pkgs}")
    builder = builder.config("spark.jars.packages", pkgs)

# Build (or reuse) the session from the accumulated configuration.
spark = builder.getOrCreate()
print("‚úÖ Spark up.")

# Run a tiny job as a smoke test and report its result.
sanity_count = spark.range(10).count()
print("üß™ Sanity spark.range(10).count() =", sanity_count)


üîó SPARK_MASTER      : spark://spark-master:7077
üß∑ DRIVER_HOST       : jupyter
üß≠ POLARIS_URI       : http://polaris:8181/api/catalog
üì¶ ICEBERG_WAREHOUSE : s3://warehouse/iceberg
ü™£ S3 endpoint       : http://minio:9000
‚úÖ Spark up.


[Stage 0:>                                                          (0 + 2) / 2]

üß™ Sanity spark.range(10).count() = 10


                                                                                

#### 02 - Parkeer bestanden in de landingzone

In [3]:
import os
import boto3
from pathlib import Path
from botocore.exceptions import ClientError

# ======================================================================
# 0) Helper: locate a local data file automatically
# ======================================================================
def find_data_file(filename: str, max_levels: int = 6) -> Path:
    """Locate ``filename`` in a ``data`` directory at or above the working directory.

    The search starts in the current working directory and moves up one parent
    at a time, checking ``<dir>/data/<filename>`` at each level.

    Args:
        filename: Name of the file to look for inside a ``data`` directory.
        max_levels: Maximum number of directories to inspect, counting the
            working directory itself. Values of zero or less inspect nothing.

    Returns:
        Path of the first match, closest to the working directory.

    Raises:
        FileNotFoundError: If no regular file with that name is found.
    """
    start_dir = Path.cwd()
    # The working directory first, then its ancestors up to the filesystem root.
    search_dirs = [start_dir, *start_dir.parents][: max(max_levels, 0)]
    for directory in search_dirs:
        candidate = directory / "data" / filename
        # is_file() rather than exists(): a directory with this name is not a match.
        if candidate.is_file():
            return candidate
    raise FileNotFoundError(f"‚ùå Kon '{filename}' niet vinden in een 'data' map vanaf {Path.cwd()}.")

# ======================================================================
# 1) Configuration (taken from the environment where possible)
# ======================================================================
local_file = find_data_file("gekentekendevoertuigen_sample.json")

# MinIO connection settings
endpoint = os.getenv("S3_ENDPOINT", "http://minio:9000")
access_key = os.getenv("MINIO_ROOT_USER", "minioadmin")
secret_key = os.getenv("MINIO_ROOT_PASSWORD", "minioadmin")
region = os.getenv("AWS_REGION", "us-east-1")

# Target location of the uploaded object
bucket = os.getenv("MINIO_BUCKET", "warehouse")
prefix = os.getenv("MINIO_PREFIX", "landing")
object_key = f"{prefix}/{local_file.name}"
s3a_uri = f"s3a://{bucket}/{object_key}"

# Show what is uploaded where, and which URI Spark reads afterwards.
for config_line in (
    f"üìÑ Lokaal bestand : {local_file}",
    f"‚¨ÜÔ∏è Upload naar    : s3://{bucket}/{object_key}",
    f"üì• Spark read via : {s3a_uri}",
    f"ü™£ MinIO endpoint : {endpoint}",
):
    print(config_line)

# ======================================================================
# 2) MinIO client via boto3 (S3 API)
# ======================================================================
s3 = boto3.client(
    "s3",
    endpoint_url=endpoint,
    aws_access_key_id=access_key,
    aws_secret_access_key=secret_key,
    region_name=region,
)

# Make sure the bucket exists. Only a "not found" answer leads to creation;
# any other client error (for example 403 from wrong credentials) is re-raised,
# because creating the bucket would not fix it and would hide the real cause.
try:
    s3.head_bucket(Bucket=bucket)
except ClientError as bucket_error:
    error_code = str(bucket_error.response.get("Error", {}).get("Code", ""))
    if error_code not in {"404", "NoSuchBucket", "NotFound"}:
        raise
    print(f"‚ÑπÔ∏è Bucket '{bucket}' bestaat nog niet, maak 'm aan...")
    s3.create_bucket(Bucket=bucket)

# Upload the local file under the landing prefix.
s3.upload_file(str(local_file), bucket, object_key)
print("‚úÖ Upload gelukt.")

# ======================================================================
# 3) Verify: list the objects under the prefix
# ======================================================================
response = s3.list_objects_v2(Bucket=bucket, Prefix=prefix)
print("üì¶ Objecten in MinIO:")
# "Contents" is read with a default, so a response without it prints nothing.
listed_keys = [entry["Key"] for entry in response.get("Contents", [])]
for listed_key in listed_keys:
    print(" -", listed_key)

# ======================================================================
# 4) Read the uploaded file back through Spark (S3A)
# ======================================================================
multiline_reader = spark.read.option("multiline", "true")
df = multiline_reader.json(s3a_uri)

# Row count, schema and a five-row sample.
record_count = df.count()
print(f"üìä Aantal records: {record_count:,}")
df.printSchema()
df.show(5, truncate=False)


üìÑ Lokaal bestand : /workspace/data/gekentekendevoertuigen_sample.json
‚¨ÜÔ∏è Upload naar    : s3://warehouse/landing/gekentekendevoertuigen_sample.json
üì• Spark read via : s3a://warehouse/landing/gekentekendevoertuigen_sample.json
ü™£ MinIO endpoint : http://minio:9000
‚úÖ Upload gelukt.
üì¶ Objecten in MinIO:
 - landing/gekentekendevoertuigen_sample.json


                                                                                

üìä Aantal records: 10,000
root
 |-- aanhangwagen_autonoom_geremd: string (nullable = true)
 |-- aanhangwagen_middenas_geremd: string (nullable = true)
 |-- aantal_cilinders: string (nullable = true)
 |-- aantal_deuren: string (nullable = true)
 |-- aantal_rolstoelplaatsen: string (nullable = true)
 |-- aantal_staanplaatsen: string (nullable = true)
 |-- aantal_wielen: string (nullable = true)
 |-- aantal_zitplaatsen: string (nullable = true)
 |-- afstand_hart_koppeling_tot_achterzijde_voertuig: string (nullable = true)
 |-- afstand_voorzijde_voertuig_tot_hart_koppeling: string (nullable = true)
 |-- afwijkende_maximum_snelheid: string (nullable = true)
 |-- api_gekentekende_voertuigen_assen: string (nullable = true)
 |-- api_gekentekende_voertuigen_brandstof: string (nullable = true)
 |-- api_gekentekende_voertuigen_carrosserie: string (nullable = true)
 |-- api_gekentekende_voertuigen_carrosserie_specifiek: string (nullable = true)
 |-- api_gekentekende_voertuigen_voertuigklasse: st

26/01/06 16:16:52 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
[Stage 7:>                                                          (0 + 1) / 1]

+----------------------------+----------------------------+----------------+-------------+-----------------------+--------------------+-------------+------------------+-----------------------------------------------+---------------------------------------------+---------------------------+-----------------------------------------------+-----------------------------------------------+-----------------------------------------------+-------------------------------------------------+-----------------------------------------------+-------+------------------------+------------------------+---------+--------------+--------------+-----------------------------------+----------------------------------------+-------------------------------------------+----------------------+-------------------------+--------------------+-----------------------+------------+---------------------------------------+--------------------------+-------------------------------------+----------------+---------------+----

                                                                                

#### 03 - Ingest into bronze table

In [4]:
# ======================================================================
# Ingest: landing zone -> Bronze (Iceberg via Polaris)
# ======================================================================

# Resolve bucket and prefix exactly like the upload cell does, so this cell
# reads from the location the file was uploaded to even when MINIO_BUCKET or
# MINIO_PREFIX are overridden in the environment.
bucket = os.getenv("MINIO_BUCKET", "warehouse")
prefix = os.getenv("MINIO_PREFIX", "landing")
local_file = find_data_file("gekentekendevoertuigen_sample.json")
object_key = f"{prefix}/{local_file.name}"
s3_uri = f"s3a://{bucket}/{object_key}"

# Target table: <catalog>.<namespace>.<table>
catalog = "polaris"
namespace = "bronze"
table_name = "gekentekendevoertuigen"

ns_fqn = f"{catalog}.{namespace}"
table_fqn = f"{ns_fqn}.{table_name}"

print(f"üì• Lezen vanuit landingzone: {s3_uri}")

# 0) Quick sanity check: list the catalogs Spark reports.
# NOTE(review): the recorded output lists only spark_catalog, so this does not
# appear to prove that the polaris catalog is reachable - confirm.
spark.sql("SHOW CATALOGS").show(truncate=False)

# 1) Read the data from the landing zone
landing_reader = spark.read.option("multiline", "true")
df = landing_reader.json(s3_uri)

loaded_count = df.count()
print(f"üì¶ Aantal records geladen: {loaded_count:,}")
df.printSchema()

# 2) Make sure the namespace exists
spark.sql(f"CREATE NAMESPACE IF NOT EXISTS {ns_fqn}")

# 3) Write to the Iceberg Bronze table (create or replace)
print(f"üßä Schrijven naar Bronze tabel: {table_fqn}")

(
    df.writeTo(table_fqn)
      .using("iceberg")
      # "format-version" is an Iceberg table property, so it is set with
      # tableProperty(); option() passes write options instead.
      .tableProperty("format-version", "2")
      .createOrReplace()
)

# 4) Refresh cached metadata for the table (useful in interactive notebooks)
spark.catalog.refreshTable(table_fqn)

print(f"‚úÖ Bronze tabel bijgewerkt: {table_fqn}")

# 5) Show the tables in the namespace
print(f"üìã Tabellen in {ns_fqn}:")
tables_in_namespace = spark.sql(f"SHOW TABLES IN {ns_fqn}")
tables_in_namespace.show(truncate=False)

# 6) Read the Bronze table back as a check
bronze_df = spark.read.table(table_fqn)

bronze_count = bronze_df.count()
print(f"üîÅ Records in Bronze: {bronze_count:,}")
bronze_df.show(5, truncate=False)


üì• Lezen vanuit landingzone: s3a://warehouse/landing/gekentekendevoertuigen_sample.json
+-------------+
|catalog      |
+-------------+
|spark_catalog|
+-------------+

üì¶ Aantal records geladen: 10,000
root
 |-- aanhangwagen_autonoom_geremd: string (nullable = true)
 |-- aanhangwagen_middenas_geremd: string (nullable = true)
 |-- aantal_cilinders: string (nullable = true)
 |-- aantal_deuren: string (nullable = true)
 |-- aantal_rolstoelplaatsen: string (nullable = true)
 |-- aantal_staanplaatsen: string (nullable = true)
 |-- aantal_wielen: string (nullable = true)
 |-- aantal_zitplaatsen: string (nullable = true)
 |-- afstand_hart_koppeling_tot_achterzijde_voertuig: string (nullable = true)
 |-- afstand_voorzijde_voertuig_tot_hart_koppeling: string (nullable = true)
 |-- afwijkende_maximum_snelheid: string (nullable = true)
 |-- api_gekentekende_voertuigen_assen: string (nullable = true)
 |-- api_gekentekende_voertuigen_brandstof: string (nullable = true)
 |-- api_gekentekende_vo

Py4JJavaError: An error occurred while calling o181.sql.
: org.apache.iceberg.exceptions.RESTException: Unable to process: Unable to find warehouse s3://warehouse/iceberg
	at org.apache.iceberg.rest.ErrorHandlers$DefaultErrorHandler.accept(ErrorHandlers.java:250)
	at org.apache.iceberg.rest.ErrorHandlers$DefaultErrorHandler.accept(ErrorHandlers.java:214)
	at org.apache.iceberg.rest.HTTPClient.throwFailure(HTTPClient.java:240)
	at org.apache.iceberg.rest.HTTPClient.execute(HTTPClient.java:336)
	at org.apache.iceberg.rest.HTTPClient.execute(HTTPClient.java:297)
	at org.apache.iceberg.rest.BaseHTTPClient.get(BaseHTTPClient.java:77)
	at org.apache.iceberg.rest.RESTSessionCatalog.fetchConfig(RESTSessionCatalog.java:1023)
	at org.apache.iceberg.rest.RESTSessionCatalog.initialize(RESTSessionCatalog.java:205)
	at org.apache.iceberg.rest.RESTCatalog.initialize(RESTCatalog.java:82)
	at org.apache.iceberg.CatalogUtil.loadCatalog(CatalogUtil.java:280)
	at org.apache.iceberg.CatalogUtil.buildIcebergCatalog(CatalogUtil.java:337)
	at org.apache.iceberg.spark.SparkCatalog.buildIcebergCatalog(SparkCatalog.java:154)
	at org.apache.iceberg.spark.SparkCatalog.initialize(SparkCatalog.java:754)
	at org.apache.spark.sql.connector.catalog.Catalogs$.load(Catalogs.scala:65)
	at org.apache.spark.sql.connector.catalog.CatalogManager.$anonfun$catalog$1(CatalogManager.scala:53)
	at scala.collection.mutable.HashMap.getOrElseUpdate(HashMap.scala:86)
	at org.apache.spark.sql.connector.catalog.CatalogManager.catalog(CatalogManager.scala:53)
	at org.apache.spark.sql.connector.catalog.LookupCatalog$CatalogAndNamespace$.unapply(LookupCatalog.scala:86)
	at org.apache.spark.sql.catalyst.analysis.ResolveCatalogs$$anonfun$apply$1.applyOrElse(ResolveCatalogs.scala:51)
	at org.apache.spark.sql.catalyst.analysis.ResolveCatalogs$$anonfun$apply$1.applyOrElse(ResolveCatalogs.scala:30)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.$anonfun$resolveOperatorsDownWithPruning$2(AnalysisHelper.scala:170)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(origin.scala:76)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.$anonfun$resolveOperatorsDownWithPruning$1(AnalysisHelper.scala:170)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper$.allowInvokingTransformsInAnalyzer(AnalysisHelper.scala:323)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.resolveOperatorsDownWithPruning(AnalysisHelper.scala:168)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.resolveOperatorsDownWithPruning$(AnalysisHelper.scala:164)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.resolveOperatorsDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.$anonfun$resolveOperatorsDownWithPruning$4(AnalysisHelper.scala:175)
	at org.apache.spark.sql.catalyst.trees.UnaryLike.mapChildren(TreeNode.scala:1215)
	at org.apache.spark.sql.catalyst.trees.UnaryLike.mapChildren$(TreeNode.scala:1214)
	at org.apache.spark.sql.catalyst.plans.logical.CreateNamespace.mapChildren(v2Commands.scala:548)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.$anonfun$resolveOperatorsDownWithPruning$1(AnalysisHelper.scala:175)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper$.allowInvokingTransformsInAnalyzer(AnalysisHelper.scala:323)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.resolveOperatorsDownWithPruning(AnalysisHelper.scala:168)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.resolveOperatorsDownWithPruning$(AnalysisHelper.scala:164)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.resolveOperatorsDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.resolveOperatorsWithPruning(AnalysisHelper.scala:99)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.resolveOperatorsWithPruning$(AnalysisHelper.scala:96)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.resolveOperatorsWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.resolveOperators(AnalysisHelper.scala:76)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.resolveOperators$(AnalysisHelper.scala:75)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.resolveOperators(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.analysis.ResolveCatalogs.apply(ResolveCatalogs.scala:30)
	at org.apache.spark.sql.catalyst.analysis.ResolveCatalogs.apply(ResolveCatalogs.scala:27)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.$anonfun$execute$2(RuleExecutor.scala:222)
	at scala.collection.LinearSeqOptimized.foldLeft(LinearSeqOptimized.scala:126)
	at scala.collection.LinearSeqOptimized.foldLeft$(LinearSeqOptimized.scala:122)
	at scala.collection.immutable.List.foldLeft(List.scala:91)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.$anonfun$execute$1(RuleExecutor.scala:219)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.$anonfun$execute$1$adapted(RuleExecutor.scala:211)
	at scala.collection.immutable.List.foreach(List.scala:431)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.execute(RuleExecutor.scala:211)
	at org.apache.spark.sql.catalyst.analysis.Analyzer.org$apache$spark$sql$catalyst$analysis$Analyzer$$executeSameContext(Analyzer.scala:226)
	at org.apache.spark.sql.catalyst.analysis.Analyzer.$anonfun$execute$1(Analyzer.scala:222)
	at org.apache.spark.sql.catalyst.analysis.AnalysisContext$.withNewAnalysisContext(Analyzer.scala:173)
	at org.apache.spark.sql.catalyst.analysis.Analyzer.execute(Analyzer.scala:222)
	at org.apache.spark.sql.catalyst.analysis.Analyzer.execute(Analyzer.scala:188)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.$anonfun$executeAndTrack$1(RuleExecutor.scala:182)
	at org.apache.spark.sql.catalyst.QueryPlanningTracker$.withTracker(QueryPlanningTracker.scala:89)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.executeAndTrack(RuleExecutor.scala:182)
	at org.apache.spark.sql.catalyst.analysis.Analyzer.$anonfun$executeAndCheck$1(Analyzer.scala:209)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper$.markInAnalyzer(AnalysisHelper.scala:330)
	at org.apache.spark.sql.catalyst.analysis.Analyzer.executeAndCheck(Analyzer.scala:208)
	at org.apache.spark.sql.execution.QueryExecution.$anonfun$analyzed$1(QueryExecution.scala:77)
	at org.apache.spark.sql.catalyst.QueryPlanningTracker.measurePhase(QueryPlanningTracker.scala:138)
	at org.apache.spark.sql.execution.QueryExecution.$anonfun$executePhase$2(QueryExecution.scala:219)
	at org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:546)
	at org.apache.spark.sql.execution.QueryExecution.$anonfun$executePhase$1(QueryExecution.scala:219)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.execution.QueryExecution.executePhase(QueryExecution.scala:218)
	at org.apache.spark.sql.execution.QueryExecution.analyzed$lzycompute(QueryExecution.scala:77)
	at org.apache.spark.sql.execution.QueryExecution.analyzed(QueryExecution.scala:74)
	at org.apache.spark.sql.execution.QueryExecution.assertAnalyzed(QueryExecution.scala:66)
	at org.apache.spark.sql.Dataset$.$anonfun$ofRows$2(Dataset.scala:99)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:97)
	at org.apache.spark.sql.SparkSession.$anonfun$sql$1(SparkSession.scala:638)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.SparkSession.sql(SparkSession.scala:629)
	at org.apache.spark.sql.SparkSession.sql(SparkSession.scala:659)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:569)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:840)


#### 04 - Query de bronze table

In [22]:
from pyspark.sql.functions import col

def show_top_counts(frame, column, limit=5):
    """Show the most frequent values of ``column`` in ``frame``.

    Groups by ``column``, counts the rows per value and displays the ``limit``
    largest groups, untruncated, ordered by descending count.
    """
    (
        frame.groupBy(column)
             .count()
             .orderBy(col("count").desc())
             .show(limit, truncate=False)
    )


df = spark.read.table("polaris.bronze.gekentekendevoertuigen")

# Heading / column pairs. The leading "\n" separates the sections; it has to
# be written as an escape sequence because a plain double-quoted string cannot
# span multiple lines.
top_count_sections = [
    ("üöó Top 5 voertuigsoorten:", "voertuigsoort"),
    ("\nüè∑Ô∏è Top 5 merken:", "merk"),
    ("\nüî§ Top 5 handelsbenamingen:", "handelsbenaming"),
]
for heading, column_name in top_count_sections:
    print(heading)
    show_top_counts(df, column_name)

print("\n‚ö° Top 5 voertuigen op vermogen (massarijklaar):")
(
    df.select("merk", "handelsbenaming", "vermogen_massarijklaar")
      # The visible part of the schema printed for the landing JSON is all
      # strings, so presumably this column is a string too; cast for ordering
      # so values sort numerically rather than lexicographically. The displayed
      # value is unchanged, and values that do not parse become null and sort last.
      .orderBy(col("vermogen_massarijklaar").cast("double").desc_nulls_last())
      .show(5, truncate=False)
)



üöó Top 5 voertuigsoorten:
+--------------------+-----+
|voertuigsoort       |count|
+--------------------+-----+
|Personenauto        |7078 |
|Bedrijfsauto        |1237 |
|Bromfiets           |782  |
|Motorfiets          |258  |
|Middenasaanhangwagen|136  |
+--------------------+-----+
only showing top 5 rows


üè∑Ô∏è Top 5 merken:
+-------------+-----+
|merk         |count|
+-------------+-----+
|VOLKSWAGEN   |1076 |
|PEUGEOT      |615  |
|RENAULT      |606  |
|MERCEDES-BENZ|565  |
|FORD         |553  |
+-------------+-----+
only showing top 5 rows


üî§ Top 5 handelsbenamingen:
+---------------+-----+
|handelsbenaming|count|
+---------------+-----+
|POLO           |219  |
|GOLF           |202  |
|FOCUS          |138  |
|N/A            |135  |
|CLIO           |125  |
+---------------+-----+
only showing top 5 rows


‚ö° Top 5 voertuigen op vermogen (massarijklaar):
+-------+--------------------+----------------------+
|merk   |handelsbenaming     |vermogen_massarijklaar|
+-------+