In [None]:
# jupyter_vim

In [None]:
### Run `./scripts/run-spark-streaming.sh` from the project root

In [None]:
# List the jars under /opt/spark/jars whose file names mention kafka or avro.
! ls /opt/spark/jars | grep kafka
! ls /opt/spark/jars | grep avro

In [None]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType

# S3 credentials come from the environment; the literals below are the
# fallbacks used when the variables are unset.
AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID", "admin")
AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY", "password")

# Maven coordinates handed to spark.jars.packages.
# Disabled alternative kept for reference:
#   .config("spark.jars", "/opt/spark/jars/kafka-clients-3.4.1.jar")
jars = ["org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.4"]

# Session settings, applied to the builder in the order listed.
spark_conf = {
    "spark.streaming.stopGracefullyOnShutdown": "true",
    "spark.sql.streaming.schemaInference": "true",
    "spark.jars.packages": ",".join(jars),
    # Iceberg SQL extensions and a REST catalog registered under the name `default`.
    "spark.sql.extensions": "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
    "spark.sql.catalog.default": "org.apache.iceberg.spark.SparkCatalog",
    "spark.sql.catalog.default.catalog": "spark",
    "spark.sql.catalog.default.type": "rest",
    "spark.sql.catalog.default.uri": "http://iceberg-rest:8181",
    "spark.sql.catalog.default.io-impl": "org.apache.iceberg.aws.s3.S3FileIO",
    "spark.sql.catalog.default.s3.endpoint": "http://minio:9000",
    "spark.sql.catalog.default.warehouse": "s3://warehouse/wh",
    "spark.sql.catalog.default.s3.access-key": AWS_ACCESS_KEY_ID,
    "spark.sql.catalog.default.s3.secret-key": AWS_SECRET_ACCESS_KEY,
}

# Build (or reuse) the session: single local worker thread, app name "KafkaStreaming".
builder = SparkSession.builder.appName("KafkaStreaming").master("local[1]")
for conf_key, conf_value in spark_conf.items():
    builder = builder.config(conf_key, conf_value)
spark = builder.getOrCreate()

spark.sparkContext.setLogLevel("INFO")

# Kafka connection settings.
kafka_bootstrap_servers = "kafka-broker:9092"
kafka_topic = "brothers-karamazov"

# Last expression: render the session summary as the cell output.
spark

In [None]:
# List the contents of the output directory that the cells below read from.
! ls /home/app/output

In [None]:
# text format messages

In [None]:
# Preview the text-format output, if any has been written.
# The guard and the load glob must agree on the extension (".txt"); a glob for
# "*.text" would not match the files the guard just detected.
# show() is used because a bare expression nested inside an `if` block is not
# rendered as cell output.
if any(file.endswith(".txt") for file in os.listdir("/home/app/output/")):
    spark.read.format("text").load("/home/app/output/*.txt").show(5, truncate=False)

In [None]:
# avro format messages

In [None]:
# Expected layout of the Avro records: a single nullable string column `line`.
schema = StructType(
    [
        StructField("line", StringType(), True),
    ]
)

# Only attempt the read when Avro output is present in the directory.
if any(file.endswith(".avro") for file in os.listdir("/home/app/output/")):
    # DataFrameReader.schema() is what applies a StructType to the read;
    # passing it through .option("schema", ...) has no effect because the
    # value is stringified into an option the reader does not recognise.
    spark.read.format("avro").schema(schema).load(
        "/home/app/output/*.avro"
    ).show(10, truncate=False)

In [None]:
import pathlib

# Report every entry directly under the output directory with its size in MB.
output_dir = pathlib.Path("/home/app/output")
for entry in output_dir.glob("*"):
    size_bytes = entry.stat().st_size
    size_mb = size_bytes / 1024 / 1024
    print(f"File: {entry}, Size: {size_mb} MB")

In [None]:
# Read the Iceberg table through Spark and print its first 20 rows untruncated.
iceberg_reader = spark.read.format("iceberg")
table = iceberg_reader.load("default.spark.text")
table.show(n=20, truncate=False)

In [None]:
# Display the last 10 rows of the Spark DataFrame loaded in the previous cell.
table.tail(num=10)

In [None]:
from pyiceberg.catalog import load_catalog

# Connect PyIceberg to the same REST catalog endpoint the Spark session uses.
catalog_properties = {"uri": "http://iceberg-rest:8181"}
catalog = load_catalog("rest", **catalog_properties)

In [None]:
# Fail early with a clear message if the table is missing: a bare
# table_exists() call that is not the last expression of the cell is neither
# displayed nor checked.
assert catalog.table_exists("spark.text"), "Iceberg table 'spark.text' not found in the catalog"

# NOTE(review): `table` was bound to a Spark DataFrame in an earlier cell; from
# here on it refers to the PyIceberg table object instead.
table = catalog.load_table("spark.text")

# Last expression: show the table schema converted to an Arrow schema.
table.schema().as_arrow()

In [None]:
# Display the history metadata exposed by the table's inspect interface.
table_history = table.inspect.history()
table_history

In [None]:
# Display the file-level metadata exposed by the table's inspect interface.
table_files = table.inspect.files()
table_files

In [None]:
# Scan the table with no filter and materialise the result as a pandas DataFrame.
full_scan = table.scan()
full_scan.to_pandas()