In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Obtain the notebook's SparkSession (an already-running session is reused).
spark = (
    SparkSession.builder
    .appName("Testing PySpark Example")
    .getOrCreate()
)

In [None]:
from pyspark.sql import SparkSession

# Local Kafka connector JARs, handed to Spark as one comma-separated string.
jars = ",".join([
    "/opt/bitnami/spark/jars/kafka-clients-3.3.0.jar",
    "/opt/bitnami/spark/jars/spark-sql-kafka-0-10_2.12-3.3.0.jar",
    "/opt/bitnami/spark/jars/spark-streaming-kafka-0-10_2.12-3.3.0.jar",
    "/opt/bitnami/spark/jars/spark-token-provider-kafka-0-10_2.12-3.3.0.jar",
])

# Local-mode session with the JARs above attached and 4 shuffle partitions.
spark = (
    SparkSession.builder
    .appName("Streaming from Kafka")
    .config("spark.streaming.stopGracefullyOnShutdown", True)
    .config("spark.jars", jars)
    .config("spark.sql.shuffle.partitions", 4)
    .master("local[*]")
    .getOrCreate()
)

# Last expression of the cell: renders the session summary.
spark


In [None]:
# Kafka connection settings for this notebook.
kafka_bootstrap_servers = "kafka_broker_1:19092,kafka_broker_2:19093,kafka_broker_3:19094"
kafka_topic_name = "names_topic"

# Source options are collected in a dict because the keys contain dots.
kafka_source_options = {
    "kafka.bootstrap.servers": kafka_bootstrap_servers,
    "subscribe": kafka_topic_name,
    "startingOffsets": "latest",
}

# Streaming DataFrame over the subscribed topic, starting at the latest offsets.
orders_df = (
    spark.readStream
    .format("kafka")
    .options(**kafka_source_options)
    .load()
)

print("Printing Schema of orders_df: ")
orders_df.printSchema()

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import findspark
import time
import os

findspark.init()


if __name__ == "__main__":

    # Ask spark-submit to fetch the Kafka connector packages.
    # Coordinates must be written as group:artifact:version, and the list must
    # be separated from the trailing `pyspark-shell` token by a space.
    # NOTE(review): if a SparkSession is already running in this kernel,
    # getOrCreate() below presumably reuses it and these arguments have no
    # effect -- restart the kernel before running this cell; confirm.
    os.environ['PYSPARK_SUBMIT_ARGS'] = (
        '--packages '
        'org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.0,'
        'org.apache.spark:spark-token-provider-kafka-0-10_2.12:3.3.0 '
        'pyspark-shell'
    )

    print("Stream Data Processing Application Started ...")
    print(time.strftime("%Y-%m-%d %H:%M:%S"))

    # Local-mode session used by the streaming cells that follow.
    spark = SparkSession \
        .builder \
        .appName("PySpark Structured Streaming with Kafka and Message Format as JSON") \
        .master("local[*]") \
        .getOrCreate()

    # Only log errors so the console sink output stays readable.
    spark.sparkContext.setLogLevel("ERROR")


In [None]:
# Broker list and topic consumed by the streaming query.
kafka_topic_name = "names_topic"
kafka_bootstrap_servers = "kafka_broker_1:19092,kafka_broker_2:19093,kafka_broker_3:19094"

# Streaming reader subscribed to `kafka_topic_name`, starting from the
# latest offsets.
kafka_stream_reader = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", kafka_bootstrap_servers)
    .option("subscribe", kafka_topic_name)
    .option("startingOffsets", "latest")
)
orders_df = kafka_stream_reader.load()

print("Printing Schema of orders_df: ")
orders_df.printSchema()

In [None]:


# Keep the Kafka payload (cast to text) together with the record timestamp.
orders_df1 = orders_df.selectExpr("CAST(value AS STRING)", "timestamp")

# Schema of the JSON payload: every field is read as a string.
orders_schema = StructType([
    StructField(field_name, StringType())
    for field_name in (
        "order_id",
        "order_product_name",
        "order_card_type",
        "order_amount",
        "order_datetime",
        "order_country_name",
        "order_city_name",
        "order_ecommerce_website_name",
    )
])

# Parse the JSON text into a struct column named "orders".
orders_df2 = orders_df1.select(
    from_json(col("value"), orders_schema).alias("orders"),
    "timestamp",
)

# Flatten the struct so each order field becomes its own column.
orders_df3 = orders_df2.select("orders.*", "timestamp")
orders_df3.printSchema()

# Total order amount per (country, city).
orders_df4 = (
    orders_df3
    .groupBy("order_country_name", "order_city_name")
    .agg({'order_amount': 'sum'})
    .select(
        "order_country_name",
        "order_city_name",
        col("sum(order_amount)").alias("total_order_amount"),
    )
)

print("Printing Schema of orders_df4: ")
orders_df4.printSchema()

# Debug sink: print changed aggregate rows to the console every 5 seconds.
orders_agg_write_stream = (
    orders_df4.writeStream
    .trigger(processingTime='5 seconds')
    .outputMode("update")
    .option("truncate", "false")
    .format("console")
    .start()
)

# Waits here until the streaming query terminates.
orders_agg_write_stream.awaitTermination()
print("Stream Data Processing Application Completed.")

In [None]:
from pyspark.sql import SparkSession

# Define the Scala and Spark versions
scala_version = '2.12.15'  # This matches the Scala version used by Spark
spark_version = '3.3.0'    # Spark version from the provided configuration

# Maven artifact names carry only the Scala *binary* version (major.minor),
# e.g. spark-sql-kafka-0-10_2.12 -- using the full 2.12.15 yields a coordinate
# that cannot be resolved.
scala_binary_version = '.'.join(scala_version.split('.')[:2])

# Specify the packages needed for Kafka integration
# NOTE(review): kafka-clients is pinned to 3.2.0 here; confirm this is the
# client version intended for spark-sql-kafka at spark_version.
packages = [
    f'org.apache.spark:spark-sql-kafka-0-10_{scala_binary_version}:{spark_version}',
    'org.apache.kafka:kafka-clients:3.2.0'
]

# Create the SparkSession against the standalone master, asking Spark to
# resolve the packages above via spark.jars.packages.
spark = SparkSession.builder \
    .master("spark://172.19.0.3:7077") \
    .appName("kafka-example") \
    .config("spark.jars.packages", ",".join(packages)) \
    .getOrCreate()

# Show the status of the Spark session to confirm it's alive
print(spark)

In [None]:
# Define the Scala and Spark versions
scala_version = '2.12.15'  # This matches the Scala version used in the provided JAR file
spark_version = '3.3.0'  # Spark version from the provided configuration

# Maven artifact names carry only the Scala *binary* version (major.minor),
# e.g. spark-sql-kafka-0-10_2.12 -- using the full 2.12.15 yields a coordinate
# that cannot be resolved.
scala_binary_version = '.'.join(scala_version.split('.')[:2])

# Specify the packages needed for Kafka, Hadoop, AWS, and other dependencies
# NOTE(review): the hadoop-aws / aws-java-sdk-s3 versions should match the
# Hadoop build bundled with the cluster's Spark distribution -- confirm.
packages = [
    f'org.apache.spark:spark-sql-kafka-0-10_{scala_binary_version}:{spark_version}',  # Kafka integration
    'org.apache.kafka:kafka-clients:2.8.1',  # Kafka clients
    'org.apache.hadoop:hadoop-aws:3.2.0',  # Hadoop AWS integration
    'com.amazonaws:aws-java-sdk-s3:1.11.375',  # AWS SDK for S3
    'org.apache.commons:commons-pool2:2.8.0'  # Commons Pool 2
]

# Create the SparkSession against the standalone master, asking Spark to
# resolve the packages above via spark.jars.packages.
spark = SparkSession.builder \
    .master("spark://172.19.0.3:7077") \
    .appName("kafka-example") \
    .config("spark.jars.packages", ",".join(packages)) \
    .getOrCreate()

# Show the status of the Spark session to confirm it's alive
print(spark)