In [1]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import current_timestamp

In [2]:
# Build (or reuse) the Spark session used by the rest of the notebook.
#
# MinIO credentials are read from the environment so real secrets never have
# to be saved inside the notebook. The fallbacks are the values this notebook
# previously hard-coded, so an unconfigured local stack behaves as before.
minio_access_key = os.environ.get("MINIO_ACCESS_KEY", "minioadmin")
minio_secret_key = os.environ.get("MINIO_SECRET_KEY", "minioadmin")

spark = (
    SparkSession.builder
    .appName("TestHuditoMinIO")
    .master("spark://spark-master:7077")
    # S3A filesystem pointed at the MinIO endpoint, using path-style access.
    .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000")
    .config("spark.hadoop.fs.s3a.access.key", minio_access_key)
    .config("spark.hadoop.fs.s3a.secret.key", minio_secret_key)
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    # Use the external Hive metastore as the SQL catalog.
    .config("spark.sql.catalogImplementation", "hive")
    .config("spark.hadoop.hive.metastore.uris", "thrift://hive-metastore:9083")
    # The notebook later writes with format("hudi"); Hudi's Spark guide asks
    # for Kryo serialization. NOTE(review): harmless if the cluster defaults
    # already set this -- confirm against spark-defaults.conf.
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/10/28 16:36:44 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Limit Spark's logging to errors, then keep a handle on the SparkContext.
# The bare name on the last line lets the notebook display the context.
spark.sparkContext.setLogLevel("ERROR")
sc = spark.sparkContext
sc

In [None]:
# Two sample rows (id, name, age) turned into a DataFrame, with an extra
# "curr_timestamp" column filled from current_timestamp().
data = [
    (1, "Alice", 24),
    (2, "Bob", 30),
]
columns = ["id", "name", "age"]

sample_rows = spark.createDataFrame(data, columns)
df = sample_rows.withColumn("curr_timestamp", current_timestamp())
df.show()

In [None]:
# Hudi writer settings, grouped by concern and then merged into the single
# `hudi_options` dict that is handed to the DataFrame writer.

# Table identity: name, record key column, precombine column, table type.
core_write_opts = {
    "hoodie.table.name": "customer_hudi",
    "hoodie.datasource.write.recordkey.field": "id",
    "hoodie.datasource.write.precombine.field": "curr_timestamp",
    "hoodie.datasource.write.table.type": "COPY_ON_WRITE",
}

# Hive sync: enabled, in "hms" mode, targeting default.customer_hudi through
# the metastore thrift endpoint.
hive_sync_opts = {
    "hoodie.datasource.hive_sync.enable": "true",
    "hoodie.datasource.hive_sync.mode": "hms",
    "hoodie.datasource.hive_sync.database": "default",
    "hoodie.datasource.hive_sync.table": "customer_hudi",
    "hoodie.datasource.hive_sync.metastore.uris": "thrift://hive-metastore:9083",
}

# Partition naming style plus the metadata table / column-stats / data-skipping
# switches.
layout_and_index_opts = {
    "hoodie.datasource.write.hive_style_partitioning": "true",
    "hoodie.enable.data.skipping": "true",
    "hoodie.metadata.enable": "true",
    "hoodie.metadata.index.column.stats.enable": "true",
}

hudi_options = {**core_write_opts, **hive_sync_opts, **layout_and_index_opts}

# Write `df` in Hudi format to the warehouse bucket using "overwrite" save mode.
hudi_writer = df.write.format("hudi").options(**hudi_options).mode("overwrite")
hudi_writer.save("s3a://warehouse/customer_hudi/")

In [None]:
# List the tables registered in the "default" database.
default_tables = spark.sql("SHOW TABLES IN default")
default_tables.show()