# DEPENDENCIAS

In [1]:
import os

# Folder that holds the JAR files
jars_directory = "/usr/local/spark/jars/"

# File names of the JARs to hand to Spark
jar_files = [
    "commons-pool2-2.11.1.jar",
    "kafka-clients-3.3.2.jar",
    "spark-sql-kafka-0-10_2.12-3.4.1.jar",
    "spark-token-provider-kafka-0-10_2.12-3.4.1.jar",
]

# Prefix each file name with the folder and flatten everything into one
# comma-separated string (later passed as the "spark.jars" setting).
dependencies = ",".join(
    os.path.join(jars_directory, jar_name) for jar_name in jar_files
)

# LECTURA

In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

# Build (or reuse) the Spark session for the read job, handing it the
# comma-separated JAR list through the "spark.jars" setting.
spark = (
    SparkSession.builder
    .appName("ReadKafkaAnimals")
    .config("spark.jars", dependencies)
    .getOrCreate()
)

# Define the hexadecimal decoding function
@udf(returnType=StringType())
def decode_hex(value):
    """Turn a single key/value cell into readable text.

    Args:
        value: The cell to decode. Handled cases:
            * ``None`` - kept as null.
            * ``str`` - interpreted as a hexadecimal string and decoded as UTF-8.
            * ``bytes`` / ``bytearray`` / ``memoryview`` - decoded as UTF-8.
            * anything else - converted with ``str()``.

    Returns:
        The decoded text, ``None`` for a null input, or ``str(value)`` when
        the input is not valid hex / not valid UTF-8.
    """
    # Keep nulls as nulls instead of emitting the literal string "None".
    if value is None:
        return None
    try:
        if isinstance(value, str):
            return bytes.fromhex(value).decode('utf-8')
        elif isinstance(value, (bytes, bytearray, memoryview)):
            # Normalise every binary flavour to bytes before decoding, so a
            # plain bytes payload no longer falls through to "b'...'".
            return bytes(value).decode('utf-8')
        else:
            return str(value)
    except (ValueError, UnicodeDecodeError):
        # Not hex / not UTF-8: fall back to the plain string form.
        return str(value)

# Kafka connection settings
kafka_bootstrap_servers = "localhost:9092"
topic = "animals-topic"

# Batch-read the topic and display it; failures are reported with a printed
# message rather than re-raised, and the session is stopped either way.
try:
    # Load the topic's records through the Kafka source
    df_kafka = (
        spark.read
        .format("kafka")
        .option("kafka.bootstrap.servers", kafka_bootstrap_servers)
        .option("subscribe", topic)
        .load()
    )

    # Replace the key and value columns with their decoded text
    df_decoded = (
        df_kafka
        .withColumn('key', decode_hex('key'))
        .withColumn('value', decode_hex('value'))
    )

    # Print the decoded rows without truncating column contents
    df_decoded.show(truncate=False)

except Exception as exc:
    # A missing topic gets a dedicated message; everything else is generic
    if "UnknownTopicOrPartitionException" not in str(exc):
        print(f"Unexpected error: {exc}")
    else:
        print(f"The topic '{topic}' does not exist in the Kafka cluster.")

finally:
    # Always release the Spark session
    spark.stop()

24/01/02 05:27:19 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.
24/01/02 05:27:20 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
24/01/02 05:27:20 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
24/01/02 05:27:21 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
24/01/02 05:27:21 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
24/01/02 05:27:21 WARN KafkaDataConsumer: KafkaDataConsumer is not 

+--------+------+-------------+---------+------+-----------------------+-------------+
|key     |value |topic        |partition|offset|timestamp              |timestampType|
+--------+------+-------------+---------+------+-----------------------+-------------+
|tiger   |feline|animals-topic|1        |0     |2024-01-02 05:16:34.262|0            |
|tiger   |feline|animals-topic|1        |1     |2024-01-02 05:18:00.249|0            |
|tiger   |feline|animals-topic|1        |2     |2024-01-02 05:18:12.788|0            |
|tiger   |feline|animals-topic|1        |3     |2024-01-02 05:18:18.444|0            |
|tiger   |feline|animals-topic|1        |4     |2024-01-02 05:18:22.265|0            |
|tiger   |feline|animals-topic|1        |5     |2024-01-02 05:24:37.562|0            |
|tiger   |feline|animals-topic|1        |6     |2024-01-02 05:27:09.911|0            |
|lion    |feline|animals-topic|2        |0     |2024-01-02 05:16:34.263|0            |
|elephant|mammal|animals-topic|2        |1 

# ESCRITURA

In [3]:
from pyspark.sql import SparkSession
import os

# Configure Kafka connection
kafka_bootstrap_servers = "localhost:9092"
topic = "animals-topic"

# Create Spark session and add JARs
spark = (
    SparkSession.builder
    .appName("WriteKafkaAnimals")
    .config("spark.jars", dependencies)
    .getOrCreate()
)

# Everything that uses the session runs inside try/finally so the session is
# stopped even when building the DataFrame or writing to Kafka fails. Errors
# are not swallowed: they still propagate after the cleanup.
try:
    # Create a sample DataFrame with animal data
    data = [("lion", "feline"), ("elephant", "mammal"), ("tiger", "feline")]
    columns = ["name", "type"]

    df_animals = spark.createDataFrame(data, columns)

    # Write the DataFrame to the Kafka topic: the animal name becomes the
    # record key and its type becomes the record value.
    (
        df_animals.selectExpr("name as key", "type as value")
        .write
        .format("kafka")
        .option("kafka.bootstrap.servers", kafka_bootstrap_servers)
        .option("topic", topic)
        .save()
    )

    # Only reached when the write succeeded
    print("Data written to Kafka topic (Animals).")
finally:
    # Finally, stop the Spark session
    spark.stop()

                                                                                

Data written to Kafka topic (Animals).
