In [1]:
import findspark
findspark.init()

import os
import logging
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType

# Initialize logging
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s:%(funcName)s:%(levelname)s:%(message)s')
logger = logging.getLogger("spark_structured_streaming")


# --- Pipeline configuration ---------------------------------------------------
app_name = "SparkStructuredStreamingToS3"

# S3 credentials are read from the environment so that no secret is stored in
# the notebook. Both are None when the variables are unset;
# initialize_spark_session() treats the keys as optional and only applies the
# S3A settings when both are provided.
access_key = os.environ.get("AWS_ACCESS_KEY_ID")
secret_key = os.environ.get("AWS_SECRET_ACCESS_KEY")

# Kafka source
brokers = "localhost:29092,localhost:29093,localhost:29094"
topic = "names_topic"

# Parquet sink: output directory and streaming checkpoint directory
path = "datalake"
checkpoint_location = "/opt/bitnami/spark/datalake/_spark_metadata"

# TODO: send only validated data to db
# PostgreSQL configurations
postgres_url = "jdbc:postgresql://localhost:5432/postgres"
postgres_table = "user_data"
postgres_user = "postgres"
# Local-development default; set POSTGRES_PASSWORD to override it.
postgres_password = os.environ.get("POSTGRES_PASSWORD", "postgres")


In [2]:
def _prepend_to_path(directory):
    """Put ``directory`` first on PATH, dropping any earlier copy so repeated calls do not grow PATH."""
    current = os.environ.get('PATH', '')
    remaining = [entry for entry in current.split(os.pathsep) if entry != directory] if current else []
    os.environ['PATH'] = os.pathsep.join([directory] + remaining)


def initialize_spark_session(app_name, access_key=None, secret_key=None, java_home=None):
    """
    Initialize the Spark Session with provided configurations.

    Sets JAVA_HOME and PATH for the current process, then builds (or reuses,
    via ``getOrCreate``) a ``local[2]`` session with the Kafka, S3A and
    PostgreSQL packages requested through ``spark.jars.packages``.

    :param app_name: Name of the Spark application.
    :param access_key: AWS S3 access key (optional).
    :param secret_key: AWS S3 secret key (optional).
    :param java_home: JDK home directory to export as JAVA_HOME (optional).
        Defaults to the JDK 1.8.0_321 location this notebook was written for.
    :return: Spark session object, or None if initialization failed
        (the error is logged).
    """
    try:
        # Set the desired JAVA_HOME
        if java_home is None:
            java_home = '/Library/Java/JavaVirtualMachines/jdk1.8.0_321.jdk/Contents/Home'
        os.environ['JAVA_HOME'] = java_home

        # This function is called more than once per kernel, so the PATH
        # entries are added idempotently. Final precedence matches the
        # original: Scala bin first, then the JDK bin, then the existing PATH.
        _prepend_to_path(f"{java_home}/bin")
        # Add the Scala path to the PATH environment variable
        # NOTE(review): this is a Scala 2.13 install while every Scala artifact
        # requested below is a _2.12 build; a 2.12/2.13 mix is a common cause of
        # `NoClassDefFoundError: scala/$less$colon$less` - confirm which Scala
        # version the installed PySpark ships with.
        _prepend_to_path("/opt/homebrew/opt/scala@2.13/bin")

        # A literal "$JAVA_HOME" inside a config value is not shell-expanded,
        # so the real directory is interpolated here.
        jvm_library_option = f"-Djava.library.path={java_home}/lib/server"

        # Initialize the Spark session builder
        spark_builder = SparkSession.builder \
            .appName(app_name) \
            .master("local[2]") \
            .config("spark.jars.packages",
                    "org.apache.kafka:kafka-clients:3.3.0,"
                    "org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.0,"
                    "org.apache.hadoop:hadoop-aws:3.2.0,"
                    "com.amazonaws:aws-java-sdk-bundle:1.11.375,"
                    "org.postgresql:postgresql:42.2.18,"
                    "org.apache.commons:commons-pool2:2.8.0,"
                    "org.scala-lang:scala-library:2.12.15") \
            .config("spark.driver.memory", "4g") \
            .config("spark.executor.memory", "4g") \
            .config("spark.driver.extraJavaOptions", jvm_library_option) \
            .config("spark.executor.extraJavaOptions", jvm_library_option)

        # Optionally set S3 credentials if provided (both keys are required)
        if access_key and secret_key:
            spark_builder = spark_builder \
                .config("spark.hadoop.fs.s3a.access.key", access_key) \
                .config("spark.hadoop.fs.s3a.secret.key", secret_key) \
                .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")

        # Build the Spark session
        spark = spark_builder.getOrCreate()

        # Set log level to WARN to reduce output noise
        spark.sparkContext.setLogLevel("WARN")
        logger.info('Spark session initialized successfully')
        return spark

    except Exception as e:
        # Callers check the return value, so failure is reported as None
        # rather than re-raised.
        logger.error(f"Spark session initialization failed. Error: {e}")
        return None


# Create (or reuse) the session with the configured S3 credentials and show it
# as the cell result. The value is None if initialization failed.
spark = initialize_spark_session("LocalTesting", access_key, secret_key)
spark

24/08/21 12:09:13 WARN Utils: Your hostname, Hamzas-MacBook-Pro-8665.local resolves to a loopback address: 127.0.0.1; using 10.50.49.133 instead (on interface en14)
24/08/21 12:09:13 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
:: loading settings :: url = jar:file:/usr/local/Homebrew/Caskroom/miniforge/base/envs/data_pipeline/lib/python3.9/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/hamzaharunamohammed/.ivy2/cache
The jars for the packages stored in: /Users/hamzaharunamohammed/.ivy2/jars
org.apache.kafka#kafka-clients added as a dependency
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
org.apache.hadoop#hadoop-aws added as a dependency
com.amazonaws#aws-java-sdk-bundle added as a dependency
org.postgresql#postgresql added as a dependency
org.apache.commons#commons-pool2 added as a dependency
org.scala-lang#scala-library added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-9095c9f6-3626-4173-9f73-eec67b0456de;1.0
	confs: [default]
	found org.apache.kafka#kafka-clients;3.3.0 in central
	found com.github.luben#zstd-jni;1.5.2-1 in central
	found org.lz4#lz4-java;1.8.0 in central
	found org.xerial.snappy#snappy-java;1.1.8.4 in central
	found org.slf4j#slf4j-api;1.7.36 in central
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.3.0 in central
	found org.apache.spark#spark-token-prov

24/08/21 12:09:15 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
2024-08-21 12:09:19,800:initialize_spark_session:INFO:Spark session initialized successfully


In [3]:
def get_streaming_dataframe(spark, brokers, topic):
    """
    Get a streaming dataframe from Kafka.

    Subscribes to ``topic`` and reads it from the earliest available offsets.

    :param spark: Initialized Spark session.
    :param brokers: Comma-separated list of Kafka brokers.
    :param topic: Kafka topic to subscribe to.
    :return: Dataframe object or None if there's an error (the error is logged).
    """
    try:
        # Only Kafka source options are set here; the former "delimiter"
        # option is a CSV option and does not apply to the Kafka source.
        reader = (
            spark.readStream
            .format("kafka")
            .option("kafka.bootstrap.servers", brokers)
            .option("subscribe", topic)
            .option("startingOffsets", "earliest")
        )
        df = reader.load()
        logger.info("Streaming dataframe fetched successfully")
        return df

    except Exception as e:
        # Without a source dataframe the pipeline cannot run, so this is
        # logged at ERROR level like a failed session initialization.
        logger.error(f"Failed to fetch streaming dataframe. Error: {e}")
        return None


def transform_streaming_data(df):
    """
    Parse the raw Kafka records into a flat dataframe of user fields.

    The ``value`` column is cast to a string, parsed as JSON against a fixed
    schema, and the parsed struct is expanded into one column per field.

    :param df: Initial dataframe with raw data.
    :return: Transformed dataframe.
    """
    # (field name, Spark type) pairs, in output column order.
    field_specs = [
        ("full_name", StringType()),
        ("gender", StringType()),
        ("location", StringType()),
        ("city", StringType()),
        ("country", StringType()),
        ("postcode", IntegerType()),
        ("latitude", FloatType()),
        ("longitude", FloatType()),
        ("email", StringType()),
    ]
    schema = StructType(
        [StructField(field_name, field_type, False) for field_name, field_type in field_specs]
    )

    raw_json = df.selectExpr("CAST(value AS STRING)")
    parsed = raw_json.select(from_json(col("value"), schema).alias("data"))
    transformed_df = parsed.select("data.*")

    logger.info("Streamed dataframe transformed successfully")
    return transformed_df


def initiate_streaming_to_bucket(df, path, checkpoint_location):
    """
    Start streaming the transformed data to ``path`` in parquet format.

    Blocks until the streaming query terminates. The query is always stopped
    on the way out, including when the wait is interrupted or raises.

    :param df: Transformed dataframe.
    :param path: Output path for the parquet files (e.g. an S3 bucket path).
    :param checkpoint_location: Checkpoint location for streaming.
    :return: None
    """
    logger.info("Initiating streaming process...")
    stream_query = (df.writeStream
                    .format("parquet")
                    .outputMode("append")
                    .option("path", path)
                    .option("checkpointLocation", checkpoint_location)
                    .start())
    try:
        stream_query.awaitTermination()
    finally:
        # If the wait is interrupted (e.g. the kernel is interrupted) the query
        # may still be active; stop it so the cell can be re-run.
        stream_query.stop()

In [4]:
# Run the pipeline end to end: Kafka source -> JSON parsing -> parquet sink.
# Each stage is skipped when the previous one returned a falsy value (None on failure).
if spark:
    df = get_streaming_dataframe(spark=spark, brokers=brokers, topic=topic)
    if df:
        transformed_df = transform_streaming_data(df=df)
        initiate_streaming_to_bucket(
            df=transformed_df,
            path=path,
            checkpoint_location=checkpoint_location,
        )


: java.lang.NoClassDefFoundError: scala/$less$colon$less
	at org.apache.spark.sql.kafka010.KafkaSourceProvider.org$apache$spark$sql$kafka010$KafkaSourceProvider$$validateStreamOptions(KafkaSourceProvider.scala:338)
	at org.apache.spark.sql.kafka010.KafkaSourceProvider.sourceSchema(KafkaSourceProvider.scala:71)
	at org.apache.spark.sql.execution.datasources.DataSource.sourceSchema(DataSource.scala:236)
	at org.apache.spark.sql.execution.datasources.DataSource.sourceInfo$lzycompute(DataSource.scala:118)
	at org.apache.spark.sql.execution.datasources.DataSource.sourceInfo(DataSource.scala:118)
	at org.apache.spark.sql.execution.streaming.StreamingRelation$.apply(StreamingRelation.scala:34)
	at org.apache.spark.sql.streaming.DataStreamReader.loadInternal(DataStreamReader.scala:168)
	at org.apache.spark.sql.streaming.DataStreamReader.load(DataStreamReader.scala:144)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAc

In [5]:
# Sanity check that the session works without any S3 credentials.
spark = initialize_spark_session(app_name="LocalTesting")

# Three sample (Name, Age) rows rendered as a table.
data = list(zip(("Alice", "Bob", "Cathy"), (34, 45, 29)))
df = spark.createDataFrame(data, schema=["Name", "Age"])
df.show()

2024-08-21 12:09:20,772:initialize_spark_session:INFO:Spark session initialized successfully
                                                                                

+-----+---+
| Name|Age|
+-----+---+
|Alice| 34|
|  Bob| 45|
|Cathy| 29|
+-----+---+

