In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import current_timestamp

In [15]:
spark = SparkSession.builder \
    .appName("TestHuditoMinIO") \
    .master("spark://spark-master:7077") \
    .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000") \
    .config("spark.hadoop.fs.s3a.access.key", "minioadmin") \
    .config("spark.hadoop.fs.s3a.secret.key", "minioadmin") \
    .config("spark.hadoop.fs.s3a.path.style.access", "true") \
    .config("spark.sql.catalogImplementation", "hive") \
    .config("spark.hadoop.hive.metastore.uris", "thrift://hive-metastore:9083") \
    .getOrCreate()

25/10/28 14:46:50 INFO SparkContext: Running Spark version 3.5.6
25/10/28 14:46:50 INFO SparkContext: OS info Linux, 6.6.87.1-microsoft-standard-WSL2, amd64
25/10/28 14:46:50 INFO SparkContext: Java version 11.0.27
25/10/28 14:46:50 INFO ResourceUtils: No custom resources configured for spark.driver.
25/10/28 14:46:50 INFO SparkContext: Submitted application: TestHuditoMinIO
25/10/28 14:46:50 INFO ResourceProfile: Default ResourceProfile created, executor resources: Map(memory -> name: memory, amount: 1024, script: , vendor: , offHeap -> name: offHeap, amount: 0, script: , vendor: ), task resources: Map(cpus -> name: cpus, amount: 1.0)
25/10/28 14:46:50 INFO ResourceProfile: Limiting resource is cpu
25/10/28 14:46:50 INFO ResourceProfileManager: Added ResourceProfile id: 0
25/10/28 14:46:50 INFO SecurityManager: Changing view acls to: spark
25/10/28 14:46:50 INFO SecurityManager: Changing modify acls to: spark
25/10/28 14:46:50 INFO SecurityManager: Changing view acls groups to: 
25/10

In [16]:
# Show Spark Context
sc = spark.sparkContext
sc.setLogLevel("ERROR")
sc

In [17]:
data = [(1, "Alice", 24), (2, "Bob", 30)]
columns = ["id", "name", "age"]
df = spark.createDataFrame(data, columns).withColumn("curr_timestamp", current_timestamp())
df.show()



+---+-----+---+--------------------+
| id| name|age|      curr_timestamp|
+---+-----+---+--------------------+
|  1|Alice| 24|2025-10-28 14:46:...|
|  2|  Bob| 30|2025-10-28 14:46:...|
+---+-----+---+--------------------+



                                                                                

In [11]:
# Define Hudi options
hudi_options = {
    "hoodie.table.name": "customer_hudi",
    "hoodie.datasource.write.recordkey.field": "id",
    "hoodie.datasource.write.precombine.field": "curr_timestamp",
    "hoodie.datasource.write.table.type": "COPY_ON_WRITE",
    "hoodie.datasource.hive_sync.enable": "true",
    "hoodie.datasource.hive_sync.mode": "hms",
    "hoodie.datasource.hive_sync.database": "default",
    "hoodie.datasource.hive_sync.table": "customer_hudi",
    "hoodie.datasource.hive_sync.metastore.uris": "thrift://hive-metastore:9083",
    "hoodie.datasource.write.hive_style_partitioning": "true",
    "hoodie.enable.data.skipping": "true",
    "hoodie.metadata.enable": "true",
    "hoodie.metadata.index.column.stats.enable": "true"
}

df.write.format("hudi") \
    .options(**hudi_options) \
    .mode("overwrite") \
    .save("s3a://warehouse/customer_hudi/")

25/10/27 16:12:07 WARN HoodieSparkSqlWriterInternal: hoodie table at s3a://warehouse/customer_hudi already exists. Deleting existing data & overwriting with new data.
25/10/27 16:12:13 WARN HoodieBackedTableMetadataWriter: Partition stats index cannot be enabled for a non-partitioned table. Removing from initialization list. Please disable hoodie.metadata.index.partition.stats.enable
25/10/27 16:12:21 WARN HoodieBackedTableMetadataWriter: Partition stats index cannot be enabled for a non-partitioned table. Removing from initialization list. Please disable hoodie.metadata.index.partition.stats.enable
                                                                                

In [6]:
spark.sql("SHOW TABLES IN default").show()

+---------+-------------+-----------+
|namespace|    tableName|isTemporary|
+---------+-------------+-----------+
|  default|customer_hudi|      false|
+---------+-------------+-----------+



In [18]:
spark.sql("SELECT * FROM default.customer_hudi").show()



[Stage 2:>                                                          (0 + 1) / 1]

+-------------------+--------------------+------------------+----------------------+--------------------+---+-----+---+--------------------+
|_hoodie_commit_time|_hoodie_commit_seqno|_hoodie_record_key|_hoodie_partition_path|   _hoodie_file_name| id| name|age|      curr_timestamp|
+-------------------+--------------------+------------------+----------------------+--------------------+---+-----+---+--------------------+
|  20251027161210964|20251027161210964...|                 2|                      |2b6da6f0-fba9-4a3...|  2|  Bob| 30|2025-10-27 16:12:...|
|  20251027161210964|20251027161210964...|                 1|                      |2b6da6f0-fba9-4a3...|  1|Alice| 24|2025-10-27 16:12:...|
+-------------------+--------------------+------------------+----------------------+--------------------+---+-----+---+--------------------+



                                                                                

In [9]:
spark.sql("SHOW SCHEMAS").show()

+---------+
|namespace|
+---------+
|  default|
+---------+



In [14]:
spark.stop()

25/10/28 14:46:46 INFO SparkContext: SparkContext is stopping with exitCode 0.
25/10/28 14:46:46 INFO SparkUI: Stopped Spark web UI at http://9361fa1ccb18:4040
25/10/28 14:46:46 INFO StandaloneSchedulerBackend: Shutting down all executors
25/10/28 14:46:46 INFO StandaloneSchedulerBackend$StandaloneDriverEndpoint: Asking each executor to shut down
25/10/28 14:46:46 INFO MapOutputTrackerMasterEndpoint: MapOutputTrackerMasterEndpoint stopped!
25/10/28 14:46:46 INFO MemoryStore: MemoryStore cleared
25/10/28 14:46:46 INFO BlockManager: BlockManager stopped
25/10/28 14:46:46 INFO BlockManagerMaster: BlockManagerMaster stopped
25/10/28 14:46:46 INFO OutputCommitCoordinator$OutputCommitCoordinatorEndpoint: OutputCommitCoordinator stopped!
25/10/28 14:46:46 INFO SparkContext: Successfully stopped SparkContext
