In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

In [None]:
# Create the notebook's Spark session, or pick up one that is already active.
spark = (
    SparkSession.builder
    .appName("Jupyter")
    .getOrCreate()
)

# Bare last expression: the session object becomes this cell's output.
spark

In [None]:
# Explicit schema with six nullable columns, built field by field.
# NOTE(review): no cell visible in this notebook passes `schema` to a reader;
# confirm whether it is still needed.
schema = (
    StructType()
    .add("year", IntegerType(), True)
    .add("age", StringType(), True)
    .add("school", IntegerType(), True)
    .add("group", IntegerType(), True)
    .add("topic", StringType(), True)
    .add("count", StringType(), True)
)

In [None]:
import os

from pyspark.sql import SparkSession

# MinIO credentials are taken from the environment when available so real
# secrets never have to be committed with the notebook. The fallbacks are the
# values this cell previously hard-coded, so behavior is unchanged when the
# variables are not set.
minio_access_key = os.environ.get("MINIO_ACCESS_KEY", "minioadmin")
minio_secret_key = os.environ.get("MINIO_SECRET_KEY", "minioadmin")

# Initialize Spark with MinIO (S3A) configuration.
# NOTE(review): getOrCreate() returns the already-active session if one
# exists (an earlier cell in this notebook creates one). Confirm that the
# fs.s3a.* options below actually take effect in that case; if not, stop the
# earlier session first or move these options into the first builder.
spark = (
    SparkSession.builder
    .appName("MinIO Image Load")
    .config("spark.hadoop.fs.s3a.endpoint", "http://localhost:9000")
    .config("spark.hadoop.fs.s3a.access.key", minio_access_key)
    .config("spark.hadoop.fs.s3a.secret.key", minio_secret_key)
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
    .getOrCreate()
)

# Load image from MinIO
image_df = spark.read.format("image").load("s3a://unstructured-data/airport_table_view_firstScots.png")

# Show the results
image_df.printSchema()

# Show only the metadata fields; the binary 'data' field holds the raw image
# bytes and would flood the cell output if printed untruncated.
image_df.select(
    "image.origin",
    "image.height",
    "image.width",
    "image.nChannels",
    "image.mode"
).show(truncate=False)


In [None]:
# Read a PNG from the local filesystem through Spark's "image" data source.
image_df = (
    spark.read
    .format("image")
    .load("file:///home/iceberg/warehouse/unstructured data/data.png")
)
image_df.printSchema()

# Print only the descriptive fields of the image struct; the large binary
# 'data' field is deliberately left out of the output.
image_df.select(
    [
        "image.origin",
        "image.height",
        "image.width",
        "image.nChannels",
        "image.mode",
    ]
).show(truncate=False)


In [None]:
# Persist the current image_df (including the binary image data) as a table,
# creating it if absent and replacing it if it already exists.
(
    image_df
    .writeTo("db.bronze_layer.unstructured_data")
    .createOrReplace()
)


In [None]:
# Stop the Spark session now that the work above is finished; cells run after
# this one can no longer use `spark` for reads or writes.
spark.stop()

In [None]:
# Ask the JVM-side SparkContext whether it has been stopped; the result is
# shown as this cell's output.
# NOTE(review): `_jsc` is a private PySpark attribute, so this check may break
# between PySpark versions.
spark._jsc.sc().isStopped()