In [1]:
# import libraries
import json

from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.avro.functions import from_avro, to_avro
from pyspark.sql.functions import col, current_timestamp, struct, to_json, lit

In [18]:
def get_avro_schema(spark_df, schema_type: str, name: str, namespace: str):
    """
    Return the Avro schema (as a JSON string) for the passed in Spark dataframe.

    The type mapping covers the most commonly used types, and every mapped
    field is made nullable by emitting the union ``[<avro type>, "null"]``.

    Spark types are identified by their class name instead of
    ``str(dataType)``: the textual form is not stable across PySpark releases
    (older releases print ``StringType``, newer ones ``StringType()`` and
    ``ArrayType(StringType(), True)``), so a lookup keyed on that text
    silently stops matching and every field ends up in the fallback branch.

    Args:
        spark_df: Object exposing ``schema.fields``; each field must provide
            ``name`` and ``dataType`` (a Spark DataFrame satisfies this).
        schema_type: Value written to the schema's ``"type"`` key
            (e.g. ``"record"``).
        name: Value written to the schema's ``"name"`` key.
        namespace: Value written to the schema's ``"namespace"`` key.

    Returns:
        str: JSON document with the keys ``type``, ``namespace``, ``name``
        and ``fields``.

    Note:
        Types without a mapping (maps, structs, decimals, nested arrays, ...)
        fall back to ``str(dataType)``, which is generally NOT a valid Avro
        type; callers that need those types must extend the mapping.
    """

    schema_base = {"type": schema_type, "namespace": namespace, "name": name}

    # Keys are Spark type class names, values are Avro primitive types.
    primitive_mapping = {
        "StringType": "string",
        "LongType": "long",
        "IntegerType": "int",
        "BooleanType": "boolean",
        "FloatType": "float",
        "DoubleType": "double",
        # Emitted as a plain long (no logicalType), as in the original mapping.
        "TimestampType": "long",
    }

    def _avro_type(data_type):
        """Translate one Spark data type into its (nullable) Avro type."""
        type_name = type(data_type).__name__

        if type_name in primitive_mapping:
            return [primitive_mapping[type_name], "null"]

        if type_name == "ArrayType":
            element_name = type(data_type.elementType).__name__
            if element_name in primitive_mapping:
                # Both the array itself and its items are nullable.
                return [
                    {
                        "type": "array",
                        "items": [primitive_mapping[element_name], "null"],
                    },
                    "null",
                ]

        # Unmapped type: keep the historical fallback so callers still get
        # a field entry for every column.
        return str(data_type)

    schema_base["fields"] = [
        {"name": field.name, "type": _avro_type(field.dataType)}
        for field in spark_df.schema.fields
    ]

    return json.dumps(schema_base)
def generate_avro_schema_from_json(json_data):
    """
    Infer a flat Avro record schema from one decoded JSON object.

    Each top-level key becomes a nullable field whose Avro type is derived
    from the Python type of the sample value:

    * ``bool``  -> ``boolean``
    * ``int``   -> ``int`` when the value fits in 32 bits, otherwise ``long``
    * ``float`` -> ``double``
    * anything else (``str``, ``None``, lists, nested dicts, ...) -> ``string``

    Args:
        json_data: Mapping of field name to sample value (e.g. the result of
            ``json.loads`` on a single message).

    Returns:
        dict: Avro schema with ``type`` ``"record"``, ``name``
        ``"Default_schema"`` and one ``fields`` entry per key, in the
        mapping's iteration order. Unlike ``get_avro_schema`` this is a
        ``dict``, not a JSON string.

    Note:
        The schema is inferred from a single sample, so it reflects only the
        values present in that sample. Nested containers are typed as
        ``string`` and are not described structurally.
    """
    # Avro "int" is a 32-bit signed integer; larger values need "long".
    int32_min, int32_max = -(2 ** 31), 2 ** 31 - 1

    def avro_type_mapping(value):
        """Return the nullable Avro union for a single sample value."""
        # bool must be tested before int because bool is a subclass of int.
        if isinstance(value, bool):
            return ["null", "boolean"]
        if isinstance(value, int):
            if int32_min <= value <= int32_max:
                return ["null", "int"]
            # e.g. epoch-millisecond timestamps do not fit an Avro int.
            return ["null", "long"]
        if isinstance(value, float):
            return ["null", "double"]
        # str, None and every unrecognised type are represented as string.
        return ["null", "string"]

    avro_schema = {
        "type": "record",
        "name": "Default_schema",
        "fields": [
            {"name": key, "type": avro_type_mapping(value)}
            for key, value in json_data.items()
        ],
    }
    return avro_schema

In [2]:
# Create (or reuse) the Spark session; the extra JVM packages are resolved
# through spark.jars.packages as one comma-separated coordinate list.
spark = (
    SparkSession.builder.appName("delivery-data-from-sap-hana-to-kafka")
    .config(
        "spark.jars.packages",
        ",".join(
            (
                "org.postgresql:postgresql:42.7.1",
                "com.sap.cloud.db.jdbc:ngdbc:2.19.15",
                "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.0",
                "org.apache.spark:spark-avro_2.12:3.5.0",
                "org.apache.kafka:kafka-clients:3.5.0",
                "org.apache.spark:spark-streaming-kafka-0-10_2.12:3.5.0",
            )
        ),
    )
    .enableHiveSupport()
    .getOrCreate()
)

In [15]:
# Batch read of the Kafka topic "topic2" from the broker at broker:29092.
df = (
    spark.read.format("kafka")
    .option("kafka.bootstrap.servers", "broker:29092")
    .option("subscribe", "topic2")
    .load()
)

In [19]:
# Bring every row read from the topic back to the driver and display it.
# NOTE(review): collect() loads the full result into driver memory; that is
# fine for a small test topic, but prefer df.limit(n) or df.show() on larger ones.
df.collect()

[Row(key=None, value=bytearray(b'{"id":1,"name":"Paul","age":32,"address":"California                                        ","salary":20000.0}'), topic='topic2', partition=0, offset=0, timestamp=datetime.datetime(2023, 12, 20, 3, 36, 51, 554000), timestampType=0),
 Row(key=None, value=bytearray(b'{"id":2,"name":"Allen","age":25,"address":"Texas                                             ","salary":15000.0}'), topic='topic2', partition=0, offset=1, timestamp=datetime.datetime(2023, 12, 20, 3, 36, 51, 563000), timestampType=0),
 Row(key=None, value=bytearray(b'{"id":3,"name":"Teddy","age":23,"address":"Norway                                            ","salary":20000.0}'), topic='topic2', partition=0, offset=2, timestamp=datetime.datetime(2023, 12, 20, 3, 36, 51, 563000), timestampType=0),
 Row(key=None, value=bytearray(b'{"id":4,"name":"Mark","age":25,"address":"Rich-Mond                                         ","salary":65000.0}'), topic='topic2', partition=0, offset=3, timestamp=