# KAFKA + PYSPARK: BATCH AND STREAMING READ/WRITE DEMO

# START ZOOKEEPER SERVICE
```
/usr/local/kafka/bin/zookeeper-server-start.sh /usr/local/kafka/config/zookeeper.properties

```
# START KAFKA BROKERS
```
/usr/local/kafka/bin/kafka-server-start.sh /usr/local/kafka/config/server1.properties
/usr/local/kafka/bin/kafka-server-start.sh /usr/local/kafka/config/server2.properties
/usr/local/kafka/bin/kafka-server-start.sh /usr/local/kafka/config/server3.properties
/usr/local/kafka/bin/kafka-server-start.sh /usr/local/kafka/config/server4.properties
```

# LIST TOPICS
```
/usr/local/kafka/bin/kafka-topics.sh --list --bootstrap-server localhost:9092
```

# DEPENDENCIES

In [1]:
import os
from pyspark.sql import SparkSession

# Folder that holds the Kafka connector JARs
jars_directory = "/usr/local/spark/jars/"

# File names of the JARs handed to Spark
jar_files = [
    "commons-pool2-2.11.1.jar",
    "kafka-clients-3.3.2.jar",
    "spark-sql-kafka-0-10_2.12-3.4.1.jar",
    "spark-token-provider-kafka-0-10_2.12-3.4.1.jar",
]

# Full paths joined with commas, the form passed to the "spark.jars" setting below
dependencies = ",".join(
    os.path.join(jars_directory, jar_name) for jar_name in jar_files
)

# Kafka connection settings
kafka_bootstrap_servers = "localhost:9092"
topic = "animals-topic-batch"


# Build (or reuse) the Spark session with the JARs attached
spark_session = (
    SparkSession.builder
    .appName("WriteKafkaAnimals")
    .config("spark.jars", dependencies)
    .getOrCreate()
)

24/01/03 01:44:41 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


# BATCH WRITING

In [2]:
def save_batch_data(spark_session, kafka_bootstrap_servers, topic, iterations=1):
    """Write a fixed sample of animals to a Kafka topic as batch writes.

    A DataFrame with the columns ``name`` and ``type`` is built once from a
    hard-coded list; it is then written ``iterations`` times, using the
    animal name as the record key and the animal type as the record value.

    Args:
        spark_session: session used to create the sample DataFrame.
        kafka_bootstrap_servers: value of the "kafka.bootstrap.servers" option.
        topic: Kafka topic that receives the records.
        iterations: number of times the whole sample is written (default 1).
    """
    columns = ["name", "type"]
    data = [
        ("zebra", "mammal"),
        ("koala", "marsupial"),
        ("cheetah", "feline"),
        ("dolphin", "mammal"),
        ("parrot", "bird"),
        ("rhino", "mammal"),
        ("panda", "mammal"),
        ("kangaroo", "marsupial"),
        ("panther", "feline"),
        ("chimpanzee", "primate"),
        ("hippo", "mammal"),
        ("eagle", "bird"),
        ("orangutan", "primate"),
        ("bear", "mammal"),
        ("owl", "bird"),
        ("polar bear", "mammal"),
        ("snake", "reptile"),
        ("hawk", "bird"),
        ("fox", "mammal"),
        ("turtle", "reptile"),
        ("swan", "bird"),
        ("jaguar", "feline"),
        ("seagull", "bird"),
        ("gazelle", "mammal"),
    ]

    df_animals = spark_session.createDataFrame(data, columns)

    # The Kafka writer is fed a "key" and a "value" column.
    records = df_animals.selectExpr("name as key", "type as value")

    for _ in range(iterations):
        writer = records.write.format("kafka")
        writer = writer.option("kafka.bootstrap.servers", kafka_bootstrap_servers)
        writer = writer.option("topic", topic)
        writer.save()

    # Summary line once every write has finished
    print(f"{iterations} Iterations, Data written to Kafka topic ({topic}).")

# Broker address and destination topic for the batch write
kafka_bootstrap_servers = "localhost:9092"
topic = "animals-topic-batch"
save_batch_data(
    spark_session=spark_session,
    kafka_bootstrap_servers=kafka_bootstrap_servers,
    topic=topic,
)

[Stage 0:>                                                        (0 + 16) / 16]

1 Iterations, Data written to Kafka topic (animals-topic-batch).


                                                                                

# BATCH READING

In [3]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

# Define the hexadecimal decoding function
@udf(returnType=StringType())
def decode_hex(value):
    """Turn a Kafka key or value into readable text.

    * ``None`` (a record without a key or value) stays ``None`` so the
      column holds a real null instead of the literal text "None".
    * ``str`` input is treated as hexadecimal text and decoded as UTF-8.
    * ``bytes``, ``bytearray`` and ``memoryview`` input is decoded as UTF-8.
    * Any other type, and any payload that is not valid hex or not valid
      UTF-8, falls back to ``str(value)``.

    Returns:
        The decoded string, the ``str()`` fallback, or ``None`` for null input.
    """
    if value is None:
        return None
    try:
        if isinstance(value, str):
            return bytes.fromhex(value).decode('utf-8')
        if isinstance(value, (bytes, bytearray, memoryview)):
            return bytes(value).decode('utf-8')
    except (ValueError, UnicodeDecodeError):
        # Not valid hex or not valid UTF-8: fall through to the raw representation.
        pass
    return str(value)
            
def read_batch_data(spark_session, kafka_bootstrap_servers, topic):
    """Read a Kafka topic as a batch and display its decoded content.

    The ``key`` and ``value`` columns are passed through ``decode_hex`` and
    the resulting DataFrame is shown without truncation.

    Failures are reported with ``print`` instead of being raised, so the
    notebook keeps running: a dedicated message is shown when the error text
    mentions ``UnknownTopicOrPartitionException``, a generic one otherwise.
    The Spark session is left running.

    Args:
        spark_session: session used to read from Kafka.
        kafka_bootstrap_servers: value of the "kafka.bootstrap.servers" option.
        topic: Kafka topic to subscribe to.
    """
    try:
        # Batch read of the subscribed topic
        df_kafka = (
            spark_session.read
            .format("kafka")
            .option("kafka.bootstrap.servers", kafka_bootstrap_servers)
            .option("subscribe", topic)
            .load()
        )

        # Replace key and value with their decoded text
        df_decoded = (
            df_kafka
            .withColumn('key', decode_hex('key'))
            .withColumn('value', decode_hex('value'))
        )

        df_decoded.show(truncate=False)

    except Exception as e:
        # Deliberately best-effort: report the problem and return.
        if "UnknownTopicOrPartitionException" in str(e):
            print(f"The topic '{topic}' does not exist in the Kafka cluster.")
        else:
            print(f"Unexpected error: {e}")

# Read back the topic that the batch write filled
topic = "animals-topic-batch"
read_batch_data(
    spark_session=spark_session,
    kafka_bootstrap_servers=kafka_bootstrap_servers,
    topic=topic,
)

24/01/03 01:44:48 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.
24/01/03 01:44:49 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894


+----------+---------+-------------------+---------+------+-----------------------+-------------+
|key       |value    |topic              |partition|offset|timestamp              |timestampType|
+----------+---------+-------------------+---------+------+-----------------------+-------------+
|koala     |marsupial|animals-topic-batch|0        |0     |2024-01-02 22:43:50.69 |0            |
|chimpanzee|primate  |animals-topic-batch|0        |1     |2024-01-02 22:43:50.69 |0            |
|bear      |mammal   |animals-topic-batch|0        |2     |2024-01-02 22:43:50.689|0            |
|parrot    |bird     |animals-topic-batch|0        |3     |2024-01-02 22:43:50.689|0            |
|hippo     |mammal   |animals-topic-batch|0        |4     |2024-01-02 22:43:50.69 |0            |
|seagull   |bird     |animals-topic-batch|0        |5     |2024-01-02 22:43:50.689|0            |
|orangutan |primate  |animals-topic-batch|0        |6     |2024-01-02 22:43:50.69 |0            |
|panda     |mammal  

24/01/03 01:44:50 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
24/01/03 01:44:50 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
24/01/03 01:44:50 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
24/01/03 01:44:50 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
24/01/03 01:44:50 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
24/01/03 01:44:50 WARN KafkaDataConsumer: KafkaDataConsumer is not running in Un

# STREAMING WRITING
## IT WILL BE LISTENING TO THE "animals-topic-batch" TOPIC
## WHEN NEW DATA ARRIVES, IT READS IT AND STORES IT IN THE "animals-topic-streaming" TOPIC
## BEFORE STORING, IT APPLIES THE TRANSFORMATIONS (VALUES ARE CONVERTED TO UPPERCASE)

In [4]:
from pyspark.sql.functions import expr

def read_streaming_data(spark_session, kafka_bootstrap_servers, input_topic, output_topic, checkpoint_location):
    """Start a streaming query that copies one Kafka topic into another.

    Records are read from ``input_topic`` starting at the earliest offsets,
    the key is cast to a string, the value is cast to a string and
    uppercased, and the result is appended to ``output_topic``.

    The function starts the query and returns right away; the query handle
    is not returned. It prints the types of the three objects it builds.

    Args:
        spark_session: session used to build the stream.
        kafka_bootstrap_servers: "kafka.bootstrap.servers" for source and sink.
        input_topic: topic the stream subscribes to.
        output_topic: topic the transformed records are written to.
        checkpoint_location: directory passed as "checkpointLocation".
    """
    # Read from Kafka in streaming mode
    kafkaStream = (
        spark_session.readStream
        .format("kafka")
        .option("kafka.bootstrap.servers", kafka_bootstrap_servers)
        .option("subscribe", input_topic)
        .option("startingOffsets", "earliest")
        # failOnDataLoss is an option of the Kafka source, so it belongs on
        # the reader; on the writer it was silently ignored.
        .option("failOnDataLoss", "false")
        .load()
    )

    # Key as text, value as uppercased text
    transformedStream = kafkaStream.selectExpr(
        "CAST(key AS STRING) as key",
        "UPPER(CAST(value AS STRING)) as value",
    )

    # Write to the output topic; only options the Kafka sink understands are set
    # (the console-only "truncate" option was dropped).
    query = (
        transformedStream.writeStream
        .outputMode("append")
        .format("kafka")
        .option("checkpointLocation", checkpoint_location)
        .option("kafka.bootstrap.servers", kafka_bootstrap_servers)
        .option("topic", output_topic)
        .start()
    )

    print('kafkaStream', type(kafkaStream))
    print('transformedStream', type(transformedStream))
    print('query', type(query))
# Broker address plus the source and destination topics of the stream
kafka_bootstrap_servers = "localhost:9092"
input_topic = "animals-topic-batch"
output_topic = "animals-topic-streaming"
# The streaming checkpoint is kept inside the Kafka directory
checkpoint_location = "/usr/local/kafka/checkpoint"

read_streaming_data(
    spark_session=spark_session,
    kafka_bootstrap_servers=kafka_bootstrap_servers,
    input_topic=input_topic,
    output_topic=output_topic,
    checkpoint_location=checkpoint_location,
)

kafkaStream <class 'pyspark.sql.dataframe.DataFrame'>
transformedStream <class 'pyspark.sql.dataframe.DataFrame'>
query <class 'pyspark.sql.streaming.query.StreamingQuery'>


24/01/03 01:44:50 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


## MASSIVE DATA INSERTION TO KAFKA FOR STREAMING READ
### COPY, PASTE, AND RUN THE FOLLOWING CODE IN ANOTHER NOTEBOOK TO OBSERVE STREAMING READING


```
import os
from pyspark.sql import SparkSession
from time import sleep

# Folder that holds the Kafka connector JARs
jars_directory = "/usr/local/spark/jars/"

# File names of the JARs handed to Spark
jar_files = [
    "commons-pool2-2.11.1.jar",
    "kafka-clients-3.3.2.jar",
    "spark-sql-kafka-0-10_2.12-3.4.1.jar",
    "spark-token-provider-kafka-0-10_2.12-3.4.1.jar",
]

# Full paths joined with commas, the form passed to the "spark.jars" setting below
dependencies = ",".join(
    os.path.join(jars_directory, jar_name) for jar_name in jar_files
)

# Kafka connection settings
kafka_bootstrap_servers = "localhost:9092"
topic = "animals-topic-batch"


# Build (or reuse) the Spark session with the JARs attached
spark_session = (
    SparkSession.builder
    .appName("WriteKafkaAnimals")
    .config("spark.jars", dependencies)
    .getOrCreate()
)


def save_batch_data(spark_session, kafka_bootstrap_servers, topic, iterations=1):
    """Write a fixed sample of animals to a Kafka topic, then stop the session.

    The sample (columns ``name`` and ``type``) is written ``iterations``
    times, with a 0.2 second pause before each write and a progress line
    after each one. The animal name is the record key, the type the value.

    The Spark session is always stopped before the function returns, even
    when a write fails, so a failed run does not leave the session open.

    Args:
        spark_session: session used to create the DataFrame; stopped on exit.
        kafka_bootstrap_servers: value of the "kafka.bootstrap.servers" option.
        topic: Kafka topic that receives the records.
        iterations: number of times the whole sample is written (default 1).
    """
    # Sample animal data
    data = [("zebra", "mammal"), ("koala", "marsupial"), ("cheetah", "feline"), ("dolphin", "mammal"),
            ("parrot", "bird"), ("rhino", "mammal"), ("panda", "mammal"), ("kangaroo", "marsupial"),
            ("panther", "feline"), ("chimpanzee", "primate"), ("hippo", "mammal"), ("eagle", "bird"),
            ("orangutan", "primate"), ("bear", "mammal"), ("owl", "bird"), ("polar bear", "mammal"),
            ("snake", "reptile"), ("hawk", "bird"), ("fox", "mammal"), ("turtle", "reptile"),
            ("swan", "bird"), ("jaguar", "feline"), ("seagull", "bird"), ("gazelle", "mammal")]

    columns = ["name", "type"]

    try:
        df_animals = spark_session.createDataFrame(data, columns)

        # The projection is the same on every iteration, so build it once.
        records = df_animals.selectExpr("name as key", "type as value")

        for iteration in range(iterations):
            # Short pause between writes (presumably so a streaming reader
            # receives the data spread over time).
            sleep(0.2)
            records.write \
                .format("kafka") \
                .option("kafka.bootstrap.servers", kafka_bootstrap_servers) \
                .option("topic", topic) \
                .save()
            print (f'Iteration {iteration}, completed!!!')

        # Summary line once every write has finished
        print(f"{iterations} Iterations, Data written to Kafka topic ({topic}).")
    finally:
        # Release the session whether or not the writes succeeded.
        spark_session.stop()

# Write the sample 100 times
save_batch_data(
    spark_session=spark_session,
    kafka_bootstrap_servers=kafka_bootstrap_servers,
    topic=topic,
    iterations=100,
)
```

# STREAMING READING

In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr

# Topic filled by the streaming writer, and the broker that serves it
topic = "animals-topic-streaming"
kafka_bootstrap_servers = "localhost:9092"

def read_streaming_data(spark_session, kafka_bootstrap_servers, topic):
    """Start a streaming query that prints a Kafka topic to the console.

    The topic is read from the earliest offsets, key and value are cast to
    strings, and every micro-batch is appended to the console sink.

    The function starts the query and returns right away without waiting
    for it and without stopping the Spark session; the query handle is not
    returned.

    Note: this definition replaces the five-argument ``read_streaming_data``
    defined earlier in the notebook (the Kafka-to-Kafka one). After this
    cell has run, that earlier version is no longer reachable by name.

    Args:
        spark_session: session used to build the stream.
        kafka_bootstrap_servers: value of the "kafka.bootstrap.servers" option.
        topic: Kafka topic to subscribe to.
    """
    # Read from Kafka in streaming mode
    kafkaStream = (
        spark_session.readStream
        .format("kafka")
        .option("kafka.bootstrap.servers", kafka_bootstrap_servers)
        .option("subscribe", topic)
        .option("startingOffsets", "earliest")
        .load()
    )

    # Show the content on the console as it arrives
    (
        kafkaStream.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
        .writeStream
        .outputMode("append")
        .format("console")
        .start()
    )

# Start printing the streaming topic to the console
read_streaming_data(
    spark_session=spark_session,
    kafka_bootstrap_servers=kafka_bootstrap_servers,
    topic=topic,
)

24/01/03 01:44:50 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-1964884b-4add-4b8f-9b91-1c4a61bf10be. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
24/01/03 01:44:50 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
24/01/03 01:44:50 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.
24/01/03 01:44:50 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.


-------------------------------------------
Batch: 0
-------------------------------------------
+----------+---------+
|       key|    value|
+----------+---------+
|     koala|marsupial|
|chimpanzee|  primate|
|      bear|   mammal|
|    parrot|     bird|
|     hippo|   mammal|
|   seagull|     bird|
| orangutan|  primate|
|     panda|   mammal|
|    turtle|  reptile|
|    jaguar|   feline|
|polar bear|   mammal|
|   dolphin|   mammal|
|     zebra|   mammal|
|     snake|  reptile|
|      swan|     bird|
|       fox|   mammal|
|      hawk|     bird|
|  kangaroo|marsupial|
|     rhino|   mammal|
|     eagle|     bird|
+----------+---------+
only showing top 20 rows

-------------------------------------------
Batch: 1
-------------------------------------------
+----------+---------+
|       key|    value|
+----------+---------+
|chimpanzee|  PRIMATE|
|   seagull|     BIRD|
|      bear|   MAMMAL|
|     koala|MARSUPIAL|
|    turtle|  REPTILE|
|       owl|     BIRD|
|   cheetah|   FELINE

# CLOSE SPARK SESSION

In [6]:
# The streaming cells start their queries without keeping a handle to them,
# so stop whatever is still active before shutting the session down.
for active_query in spark_session.streams.active:
    active_query.stop()

spark_session.stop()