In [None]:
from pyspark.sql import SparkSession
import logging
import os

# Configure root logging once for the notebook: INFO and above, with each record
# rendered as "<timestamp>:<function>:<level>:<message>".
logging.basicConfig(
    format='%(asctime)s:%(funcName)s:%(levelname)s:%(message)s',
    level=logging.INFO,
)

# Named logger used by the helper functions defined in this notebook.
logger = logging.getLogger("spark_structured_streaming")

In [None]:
def initialize_spark_session(app_name, access_key, secret_key, minio_endpoint):
    """
    Initialize the Spark Session with MinIO configurations, Delta Lake support, and additional packages.

    Before the session is built, the process environment is adjusted:

    * ``JAVA_HOME`` is set to the pinned JDK 8 install, but only when that
      directory exists on this machine; otherwise any existing ``JAVA_HOME``
      is left untouched.
    * The Scala ``bin`` directory and ``$JAVA_HOME/bin`` are placed at the
      front of ``PATH``. Existing copies of those entries are removed first,
      so calling this function repeatedly does not keep growing ``PATH``.

    :param app_name: Name of the Spark application.
    :param access_key: Access key for MinIO.
    :param secret_key: Secret key for MinIO.
    :param minio_endpoint: MinIO server endpoint (including port).
    :return: Spark session object or None if there's an error.
    """
    # Only override JAVA_HOME when the pinned JDK is really installed here;
    # pointing it at a missing directory would break an otherwise valid setup.
    pinned_java_home = '/Library/Java/JavaVirtualMachines/jdk1.8.0_321.jdk/Contents/Home'
    if os.path.isdir(pinned_java_home):
        os.environ['JAVA_HOME'] = pinned_java_home

    # Directories that must come first on PATH (Scala first, then the JDK),
    # matching the lookup order the original prepends produced.
    preferred_dirs = ["/opt/homebrew/opt/scala@2.13/bin"]
    java_home = os.environ.get('JAVA_HOME')
    if java_home:
        preferred_dirs.append(f"{java_home}/bin")

    # Rebuild PATH instead of blindly prepending: tolerate an unset PATH and
    # drop earlier copies of the preferred entries so re-runs are idempotent.
    current_path = os.environ.get('PATH', '')
    inherited_dirs = current_path.split(os.pathsep) if current_path else []
    remaining_dirs = [entry for entry in inherited_dirs if entry not in preferred_dirs]
    os.environ['PATH'] = os.pathsep.join(preferred_dirs + remaining_dirs)

    # Maven coordinates resolved by Spark at start-up (Kafka source, S3A
    # connector, PostgreSQL JDBC driver, commons-pool2, Delta Lake).
    # NOTE(review): hadoop-aws / aws-java-sdk versions normally have to match the
    # Hadoop build shipped with the installed Spark - confirm 3.2.0 is intended.
    jar_packages = ",".join([
        "org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.0",
        "org.apache.kafka:kafka-clients:3.3.0",
        "org.apache.hadoop:hadoop-aws:3.2.0",
        "com.amazonaws:aws-java-sdk-s3:1.11.375",
        "org.postgresql:postgresql:42.2.18",
        "org.apache.commons:commons-pool2:2.8.0",
        "io.delta:delta-core_2.12:2.1.0",
    ])

    try:
        # No explicit .master(...) is set here (a previous version pointed at
        # "spark://spark_master:7077"); the master comes from Spark's own
        # configuration and defaults.
        spark = (
            SparkSession.builder
            .appName(app_name)
            # S3A connector settings for a MinIO endpoint: static keys,
            # plain HTTP and path-style bucket addressing.
            .config("spark.hadoop.fs.s3a.access.key", access_key)
            .config("spark.hadoop.fs.s3a.secret.key", secret_key)
            .config("spark.hadoop.fs.s3a.endpoint", minio_endpoint)
            .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
            .config("spark.hadoop.fs.s3a.path.style.access", "true")
            .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
            .config("spark.hadoop.fs.s3a.aws.credentials.provider",
                    "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
            # Delta Lake SQL extension and catalog.
            .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
            .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
            .config("spark.jars.packages", jar_packages)
            .getOrCreate()
        )

        # Keep Spark's own console output down to errors.
        spark.sparkContext.setLogLevel("ERROR")
        logger.info('Spark session initialized successfully with MinIO, Delta Lake, and additional packages')
        return spark

    except Exception as e:
        # Best-effort contract: report the failure (with traceback) and let
        # the caller decide what to do with the None result.
        logger.exception(f"Spark session initialization failed. Error: {e}")
        return None

In [None]:

# Connection settings for this notebook.
#
# The MinIO credentials are read from the environment instead of being written
# into the notebook, so they are not stored in version control or shared copies.
# Export MINIO_ACCESS_KEY and MINIO_SECRET_KEY before starting the kernel.
# NOTE(review): any key pair that was previously committed in this notebook
# should be treated as exposed and rotated.
app_name = "DeltaLakeMinIOExplorer"

# Endpoint and broker list keep their previous values unless overridden.
minio_endpoint = os.environ.get("MINIO_ENDPOINT", "http://minio:9000")
brokers = os.environ.get("KAFKA_BROKERS", "localhost:29092,localhost:29093,localhost:29094")

# Fail early, naming exactly which variables are missing, rather than letting
# Spark or boto3 fail later with a less obvious authentication error.
_missing_settings = [
    name for name in ("MINIO_ACCESS_KEY", "MINIO_SECRET_KEY")
    if not os.environ.get(name)
]
if _missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_settings)
    )

access_key = os.environ["MINIO_ACCESS_KEY"]
secret_key = os.environ["MINIO_SECRET_KEY"]


In [None]:
# Build the session used by the rest of the notebook.
# initialize_spark_session returns None when start-up fails, so stop here with a
# clear message instead of hitting an AttributeError on `spark` in a later cell.
spark = initialize_spark_session(app_name, access_key, secret_key, minio_endpoint)
if spark is None:
    raise RuntimeError("Spark session could not be created; see the logged error above.")

# Last expression of the cell: display the session summary.
spark

In [None]:
# Locations on the s3a:// filesystem: the Delta table to explore and a
# checkpoint path (the latter is not referenced elsewhere in this section).
delta_table_path = "s3a://datalake/processed_data/deltatable/table1/"
checkpoint_location = "s3a://datalake/checkpoints/spark_metadata"

# Read the Delta table into a DataFrame.
delta_df = (
    spark.read
    .format("delta")
    .load(delta_table_path)
)

# To preview the first records, call delta_df.show() in a separate cell.


In [None]:
import boto3
import json
from botocore.exceptions import NoCredentialsError

# S3 client pointed at the MinIO server, authenticated with the notebook's
# access/secret key pair.
# NOTE(review): this endpoint is localhost:9000 while `minio_endpoint` (used for
# Spark) is a different host name - confirm both are intended.
s3 = boto3.client(
    service_name='s3',
    aws_secret_access_key=secret_key,
    aws_access_key_id=access_key,
    endpoint_url="http://localhost:9000",
)

# Connectivity smoke test: list the buckets and print the response metadata,
# or print the error text if anything in the attempt fails.
try:
    response = s3.list_buckets()
    # Only the ResponseMetadata part of the reply is shown.
    print("Connection successful:", json.dumps(response['ResponseMetadata'], indent=2))
except Exception as e:
    print("Error:", e)
