In [1]:
# import libraries
import json
import os

from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.avro.functions import from_avro, to_avro
from pyspark.sql.functions import col, current_timestamp, struct, to_json, lit

In [2]:
# Build (or reuse) the Hive-enabled Spark session. The JDBC drivers and the
# Kafka connector are requested as a comma-separated list of Maven coordinates.
spark = (
    SparkSession.builder.appName("delivery-data-from-sap-hana-to-kafka")
    .config(
        "spark.jars.packages",
        ",".join(
            (
                "org.postgresql:postgresql:42.7.1",
                "com.sap.cloud.db.jdbc:ngdbc:2.19.15",
                "com.oracle.database.jdbc:ojdbc8:23.3.0.23.09",
                "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.0",
            )
        ),
    )
    .enableHiveSupport()
    .getOrCreate()
)

In [3]:
# Show the configuration of the running session. A bare SparkConf() is a new
# object populated from defaults / JVM system properties, so ask the active
# SparkContext for its effective configuration instead.
spark.sparkContext.getConf().getAll()

[('spark.jars',
  'file:///home/jovyan/.ivy2/jars/org.postgresql_postgresql-42.7.1.jar,file:///home/jovyan/.ivy2/jars/com.sap.cloud.db.jdbc_ngdbc-2.19.15.jar,file:///home/jovyan/.ivy2/jars/com.oracle.database.jdbc_ojdbc8-23.3.0.23.09.jar,file:///home/jovyan/.ivy2/jars/org.apache.spark_spark-sql-kafka-0-10_2.12-3.5.0.jar,file:///home/jovyan/.ivy2/jars/org.checkerframework_checker-qual-3.41.0.jar,file:///home/jovyan/.ivy2/jars/org.apache.spark_spark-token-provider-kafka-0-10_2.12-3.5.0.jar,file:///home/jovyan/.ivy2/jars/org.apache.kafka_kafka-clients-3.4.1.jar,file:///home/jovyan/.ivy2/jars/com.google.code.findbugs_jsr305-3.0.0.jar,file:///home/jovyan/.ivy2/jars/org.apache.commons_commons-pool2-2.11.1.jar,file:///home/jovyan/.ivy2/jars/org.apache.hadoop_hadoop-client-runtime-3.3.4.jar,file:///home/jovyan/.ivy2/jars/org.lz4_lz4-java-1.8.0.jar,file:///home/jovyan/.ivy2/jars/org.xerial.snappy_snappy-java-1.1.10.3.jar,file:///home/jovyan/.ivy2/jars/org.slf4j_slf4j-api-2.0.7.jar,file:///home/

In [4]:
# Set the log level of the active SparkContext to INFO.
spark.sparkContext.setLogLevel("INFO")

In [5]:
# Load the `company` table from PostgreSQL over JDBC.
# Credentials are read from the environment (POSTGRES_USER / POSTGRES_PASSWORD)
# so real secrets never have to be committed with the notebook; the fallbacks
# are the previous hard-coded local-development values.
df = (
    spark.read.format("jdbc")
    .option("driver", "org.postgresql.Driver")
    .option("url", "jdbc:postgresql://database:5432/database")
    .option("dbtable", "company")
    .option("user", os.environ.get("POSTGRES_USER", "postgres"))
    .option("password", os.environ.get("POSTGRES_PASSWORD", "postgres"))
    .load()
)

In [6]:
# Disabled alternative source: SAP HANA table SAPHANADB.CRCO read over JDBC.
# NOTE(review): the plaintext password that used to sit in this comment has
# been redacted; treat it as leaked and rotate it. If this cell is re-enabled,
# supply credentials at runtime (environment variable or secrets manager).
# df = (
#     spark.read.format("jdbc")
#     .option("driver", "com.sap.db.jdbc.Driver")
#     .option("url", "jdbc:sap://10.163.9.4:30041/HAQ")
#     .option("dbtable", "SAPHANADB.CRCO")
#     .option("user", "SYNAPSE_READ")
#     .option("password", "<redacted>")
#     .load()
# )

In [7]:
# Keep the source rows cached so the count below and the later Kafka write work
# from the same data instead of scanning the JDBC source twice (best effort:
# partitions evicted from the cache are recomputed from the source).
df.cache()

# count() is an eager action; run it once, up front, instead of hiding it
# inside the otherwise lazy column expressions.
source_row_count = df.count()

# Attach the ingestion metadata to every row in a single projection.
df_processed = df.withColumns(
    {
        # current_timestamp() is already a Column, so wrapping it in lit() was a no-op.
        "ingestion_time": current_timestamp(),
        "source_system": lit("sap"),
        "user_name": lit("gersonrs"),
        "ingestion_type": lit("spark"),
        "base_format": lit("table"),
        "rows_written": lit(source_row_count),
        # JSON description of the source schema, before the metadata columns are added.
        "schema": lit(df.schema.json()),
    }
)

In [8]:
# Print the column names, types and nullability of the enriched frame.
df_processed.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- address: string (nullable = true)
 |-- salary: float (nullable = true)
 |-- ingestion_time: timestamp (nullable = false)
 |-- source_system: string (nullable = false)
 |-- user_name: string (nullable = false)
 |-- ingestion_type: string (nullable = false)
 |-- base_format: string (nullable = false)
 |-- rows_written: integer (nullable = false)
 |-- schema: string (nullable = false)



In [9]:
# Row count of the enriched frame (count() is an action, so this executes the plan).
df_processed.count()

7

In [10]:
# Serialise each row to one JSON document and publish it as the Kafka record
# value on topic "topic2". to_json() already yields a string column, so the
# former CAST(value AS STRING) projection was redundant and has been dropped.
# No "key" column is selected, so records are produced without a key.
(
    df_processed.select(to_json(struct("*")).alias("value"))
    .write.format("kafka")
    .option("kafka.bootstrap.servers", "broker:29092")
    .option("topic", "topic2")
    .save()
)