In [None]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import random
import time

def setup_and_ingest_realtime_sales():
    """Configure Spark for watsonx.data / COS and append synthetic sales to a Delta table.

    The function:
      1. reads the metastore and object-storage credentials from the environment,
      2. rebuilds the Spark session with Hive-metastore, S3A and Delta settings,
      3. tries to register ``spark_catalog.bronze_ec.realtime_sales_new``
         (a failure is printed, not raised), and
      4. appends 5 batches of 10 synthetic rows to the Delta path, sleeping
         5 seconds after each batch.

    Environment variables:
        WXD_API_KEY: password sent with the ``ibmlhapikey`` metastore user (required).
        LH_S3_ACCESS_KEY: value for ``fs.s3a.access.key`` (required).
        LH_S3_SECRET_KEY: value for ``fs.s3a.secret.key`` (required).
        WXD_HMS_ENDPOINT: ``host:port`` placed after ``thrift://`` in the
            metastore URI (optional; the template placeholder is used when unset).

    Raises:
        RuntimeError: if a required environment variable is unset or empty.
        ValueError: if a batch needs more unused sales ids than remain in 1..1000.
    """
    # Function-scope import keeps this cell runnable on its own.
    import os

    # Secrets are read from the environment so they never live in the notebook.
    required_vars = ("WXD_API_KEY", "LH_S3_ACCESS_KEY", "LH_S3_SECRET_KEY")
    missing_vars = [name for name in required_vars if not os.environ.get(name)]
    if missing_vars:
        raise RuntimeError(
            "Missing required environment variable(s): " + ", ".join(missing_vars)
        )

    # Watsonx.data and COS credentials
    UsernameCPD = "ibmlhapikey"
    PasswordCPD = os.environ["WXD_API_KEY"]
    LH_S3_ACCESS_KEY = os.environ["LH_S3_ACCESS_KEY"]
    LH_S3_SECRET_KEY = os.environ["LH_S3_SECRET_KEY"]
    hms_endpoint = os.environ.get("WXD_HMS_ENDPOINT", "<YOUR WXD ENDPOINT>")

    # Single source of truth for the table location (used for DDL and writes).
    delta_table_path = "s3a://spark-dl/bronze_ec/realtime_sales_new/"

    # Initialize a Spark session only to obtain a copy of its configuration.
    spark = SparkSession.builder.appName('sparky').getOrCreate()
    sc = spark.sparkContext
    sc.setLogLevel("ERROR")
    conf = sc.getConf()

    # Settings are applied in insertion order.
    settings = {
        # Hive metastore connection (TLS + PLAIN auth)
        "spark.hive.metastore.uris": f"thrift://{hms_endpoint}",
        "spark.hive.metastore.use.SSL": "true",
        "spark.hive.metastore.truststore.type": "JKS",
        "spark.hive.metastore.truststore.path": "file:///opt/ibm/jdk/lib/security/cacerts",
        "spark.hive.metastore.truststore.password": "changeit",
        "spark.hive.metastore.client.auth.mode": "PLAIN",
        "spark.hive.metastore.client.plain.username": UsernameCPD,
        "spark.hive.metastore.client.plain.password": PasswordCPD,
        # Object storage access through S3A
        "fs.s3a.access.key": LH_S3_ACCESS_KEY,
        "fs.s3a.secret.key": LH_S3_SECRET_KEY,
        # https so the endpoint scheme agrees with connection.ssl.enabled below.
        "fs.s3a.endpoint": "https://s3.us-south.cloud-object-storage.appdomain.cloud",
        "fs.s3a.connection.ssl.enabled": "true",
        "fs.s3a.path.style.access": "true",
        "fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
        # Delta Lake integration
        "spark.jars.packages": "io.delta:delta-core_2.12:2.2.0",
        "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
        "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    }
    for key, value in settings.items():
        conf.set(key, value)

    # Stop the bootstrap context, then rebuild the session from the edited conf.
    sc.stop()
    spark = (SparkSession.builder.config(conf=conf).enableHiveSupport().getOrCreate())

    # Function to create the Delta Lake table
    def create_realtime_sales_table(spark):
        """Issue CREATE TABLE IF NOT EXISTS for the sales table; print any error."""
        try:
            spark.sql(f"""
                CREATE TABLE IF NOT EXISTS spark_catalog.bronze_ec.realtime_sales_new
                (sales_id BIGINT, product_id BIGINT, accessory_id BIGINT, customer_id BIGINT, sale_date TIMESTAMP, quantity BIGINT, total_amount DOUBLE)
                USING delta
                LOCATION '{delta_table_path}'
            """)
            print("Table realtime_sales_new created in bronze_ec database.")
        except Exception as e:
            # Best effort: ingestion below writes by path, so it is still attempted.
            print(f"Error creating table realtime_sales_new: {e}")

    # Function to generate synthetic sales data
    def generate_sales_data(existing_ids, count=10):
        """Return ``count`` synthetic sale tuples with ids not in ``existing_ids``.

        Each tuple is (sale_id, product_id, accessory_id, customer_id, quantity,
        total_amount). The chosen ids are added to ``existing_ids`` in place.

        Raises:
            ValueError: if fewer than ``count`` ids in 1..1000 are still unused
                (the previous retry loop would never terminate in that case).
        """
        products = list(range(201, 230))
        accessories = list(range(1, 51))
        customers = list(range(301, 311))

        free_ids = [candidate for candidate in range(1, 1001) if candidate not in existing_ids]
        if len(free_ids) < count:
            raise ValueError(
                f"Requested {count} new sales ids but only {len(free_ids)} remain in 1..1000"
            )

        sales_data = []
        for sale_id in random.sample(free_ids, count):
            quantity = random.randint(1, 5)
            total_amount = round(quantity * random.uniform(50, 500), 2)
            sales_data.append((
                sale_id,
                random.choice(products),
                random.choice(accessories),
                random.choice(customers),
                quantity,
                total_amount,
            ))
            existing_ids.add(sale_id)
        return sales_data

    # Real-time ingestion function
    def real_time_ingestion(spark):
        """Append 5 batches of generated rows to the Delta path, 5 s apart."""
        # NOTE(review): ids are unique only within this run; rows already stored
        # at the path are not consulted, so reruns may repeat a sales_id.
        existing_ids = set()
        columns = ["sales_id", "product_id", "accessory_id", "customer_id", "quantity", "total_amount"]
        for _ in range(5):  # Ingest only 5 times
            sales_data = generate_sales_data(existing_ids)
            df = spark.createDataFrame(sales_data, columns)
            df = df.withColumn("sale_date", F.current_timestamp())
            try:
                df.write.format("delta").mode("append").save(delta_table_path)
                print(f"Ingested {len(sales_data)} records into realtime_sales_new table.")
                # NOTE(review): show() runs the plan again, so the sale_date
                # printed here is presumably re-evaluated and may differ slightly
                # from the value written — confirm.
                df.show(truncate=False)
            except Exception as e:
                print(f"Error writing to Delta table: {e}")
            time.sleep(5)

        print("Finished ingesting sales data.")

    # Execution flow
    create_realtime_sales_table(spark)
    real_time_ingestion(spark)

# Main entry point: run the ingestion only when this code executes as the
# top-level module, not when it is imported from elsewhere.
if __name__ == "__main__":
    setup_and_ingest_realtime_sales()


Error creating table realtime_sales_new: org.apache.hadoop.hive.ql.metadata.HiveException: java.lang.RuntimeException: Unable to instantiate org.apache.hadoop.hive.ql.metadata.SessionHiveMetaStoreClient
Ingested 10 records into realtime_sales_new table.
+--------+----------+------------+-----------+--------+------------+--------------------------+
|sales_id|product_id|accessory_id|customer_id|quantity|total_amount|sale_date                 |
+--------+----------+------------+-----------+--------+------------+--------------------------+
|502     |207       |15          |307        |5       |1845.38     |2024-12-02 13:00:46.201909|
|972     |213       |26          |308        |3       |418.21      |2024-12-02 13:00:46.201909|
|748     |219       |36          |309        |3       |952.71      |2024-12-02 13:00:46.201909|
|83      |219       |12          |303        |5       |1796.94     |2024-12-02 13:00:46.201909|
|249     |211       |39          |304        |4       |1321.76     |2024-1

In [2]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import random
import time

def setup_and_ingest_realtime_sales():
    """Configure Spark for watsonx.data / COS and append synthetic sales to a Delta table.

    The function:
      1. reads the metastore and object-storage credentials from the environment,
      2. rebuilds the Spark session with Hive-metastore, S3A and Delta settings,
      3. tries to register ``spark_catalog.bronze_ec.realtime_sales_new``
         (a failure is printed, not raised), and
      4. appends 5 batches of 30 synthetic rows to the Delta path, sleeping
         5 seconds after each batch.

    Environment variables:
        WXD_API_KEY: password sent with the ``ibmlhapikey`` metastore user (required).
        LH_S3_ACCESS_KEY: value for ``fs.s3a.access.key`` (required).
        LH_S3_SECRET_KEY: value for ``fs.s3a.secret.key`` (required).
        WXD_HMS_ENDPOINT: ``host:port`` placed after ``thrift://`` in the
            metastore URI (optional; defaults to the instance used so far).

    Raises:
        RuntimeError: if a required environment variable is unset or empty.
        ValueError: if a batch needs more unused sales ids than remain in 1..1000.
    """
    # Function-scope import keeps this cell runnable on its own.
    import os

    # Secrets are read from the environment so they never live in the notebook.
    # NOTE: the keys that used to be written inline here should be treated as
    # exposed and rotated.
    required_vars = ("WXD_API_KEY", "LH_S3_ACCESS_KEY", "LH_S3_SECRET_KEY")
    missing_vars = [name for name in required_vars if not os.environ.get(name)]
    if missing_vars:
        raise RuntimeError(
            "Missing required environment variable(s): " + ", ".join(missing_vars)
        )

    # Watsonx.data and COS credentials
    UsernameCPD = "ibmlhapikey"
    PasswordCPD = os.environ["WXD_API_KEY"]
    LH_S3_ACCESS_KEY = os.environ["LH_S3_ACCESS_KEY"]
    LH_S3_SECRET_KEY = os.environ["LH_S3_SECRET_KEY"]
    hms_endpoint = os.environ.get(
        "WXD_HMS_ENDPOINT",
        "139f6d9a-165f-4bee-a5f7-d3810bdad5fa.cise77rd04nf1e5p5s20.lakehouse.appdomain.cloud:31774",
    )

    # Single source of truth for the table location (used for DDL and writes).
    delta_table_path = "s3a://spark-dl/bronze_ec/realtime_sales_new/"

    # Initialize a Spark session only to obtain a copy of its configuration.
    spark = SparkSession.builder.appName('sparky').getOrCreate()
    sc = spark.sparkContext
    sc.setLogLevel("ERROR")
    conf = sc.getConf()

    # Settings are applied in insertion order.
    settings = {
        # Hive metastore connection (TLS + PLAIN auth)
        "spark.hive.metastore.uris": f"thrift://{hms_endpoint}",
        "spark.hive.metastore.use.SSL": "true",
        "spark.hive.metastore.truststore.type": "JKS",
        "spark.hive.metastore.truststore.path": "file:///opt/ibm/jdk/lib/security/cacerts",
        "spark.hive.metastore.truststore.password": "changeit",
        "spark.hive.metastore.client.auth.mode": "PLAIN",
        "spark.hive.metastore.client.plain.username": UsernameCPD,
        "spark.hive.metastore.client.plain.password": PasswordCPD,
        # Object storage access through S3A
        "fs.s3a.access.key": LH_S3_ACCESS_KEY,
        "fs.s3a.secret.key": LH_S3_SECRET_KEY,
        # https so the endpoint scheme agrees with connection.ssl.enabled below.
        "fs.s3a.endpoint": "https://s3.us-south.cloud-object-storage.appdomain.cloud",
        "fs.s3a.connection.ssl.enabled": "true",
        "fs.s3a.path.style.access": "true",
        "fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
        # Delta Lake integration
        "spark.jars.packages": "io.delta:delta-core_2.12:2.2.0",
        "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
        "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    }
    for key, value in settings.items():
        conf.set(key, value)

    # Stop the bootstrap context, then rebuild the session from the edited conf.
    sc.stop()
    spark = (SparkSession.builder.config(conf=conf).enableHiveSupport().getOrCreate())

    # Function to create the Delta Lake table
    def create_realtime_sales_table(spark):
        """Issue CREATE TABLE IF NOT EXISTS for the sales table; print any error."""
        try:
            spark.sql(f"""
                CREATE TABLE IF NOT EXISTS spark_catalog.bronze_ec.realtime_sales_new
                (sales_id BIGINT, product_id BIGINT, accessory_id BIGINT, customer_id BIGINT, sale_date TIMESTAMP, quantity BIGINT, total_amount DOUBLE)
                USING delta
                LOCATION '{delta_table_path}'
            """)
            print("Table realtime_sales_new created in bronze_ec database.")
        except Exception as e:
            # Best effort: ingestion below writes by path, so it is still attempted.
            print(f"Error creating table realtime_sales_new: {e}")

    # Function to generate synthetic sales data
    def generate_sales_data(existing_ids, count=30):  # Generate 30 records at a time
        """Return ``count`` synthetic sale tuples with ids not in ``existing_ids``.

        Each tuple is (sale_id, product_id, accessory_id, customer_id, quantity,
        total_amount). The chosen ids are added to ``existing_ids`` in place.

        Raises:
            ValueError: if fewer than ``count`` ids in 1..1000 are still unused
                (the previous retry loop would never terminate in that case).
        """
        products = list(range(201, 231))
        accessories = list(range(1, 51))
        customers = list(range(301, 311))

        free_ids = [candidate for candidate in range(1, 1001) if candidate not in existing_ids]
        if len(free_ids) < count:
            raise ValueError(
                f"Requested {count} new sales ids but only {len(free_ids)} remain in 1..1000"
            )

        sales_data = []
        for sale_id in random.sample(free_ids, count):
            quantity = random.randint(1, 5)
            total_amount = round(quantity * random.uniform(50, 500), 2)
            sales_data.append((
                sale_id,
                random.choice(products),
                random.choice(accessories),
                random.choice(customers),
                quantity,
                total_amount,
            ))
            existing_ids.add(sale_id)
        return sales_data

    # Real-time ingestion function
    def real_time_ingestion(spark):
        """Append 5 batches of 30 generated rows to the Delta path, 5 s apart."""
        # NOTE(review): ids are unique only within this run; rows already stored
        # at the path are not consulted, so reruns may repeat a sales_id.
        existing_ids = set()
        columns = ["sales_id", "product_id", "accessory_id", "customer_id", "quantity", "total_amount"]

        for iteration in range(5):  # Loop to ingest records multiple times
            sales_data = generate_sales_data(existing_ids, count=30)
            df = spark.createDataFrame(sales_data, columns)
            df = df.withColumn("sale_date", F.current_timestamp())
            try:
                df.write.format("delta").mode("append").save(delta_table_path)
                print(f"Ingested {len(sales_data)} records into realtime_sales_new table on iteration {iteration + 1}.")
                # NOTE(review): show() runs the plan again, so the sale_date
                # printed here is presumably re-evaluated and may differ slightly
                # from the value written — confirm.
                df.show(truncate=False)
            except Exception as e:
                print(f"Error writing to Delta table: {e}")

            time.sleep(5)  # Sleep for 5 seconds before next iteration

        print("Finished ingesting sales data.")

    # Execution flow
    create_realtime_sales_table(spark)
    real_time_ingestion(spark)

# Main entry point: run the ingestion only when this code executes as the
# top-level module, not when it is imported from elsewhere.
if __name__ == "__main__":
    setup_and_ingest_realtime_sales()


Error creating table realtime_sales_new: org.apache.hadoop.hive.ql.metadata.HiveException: java.lang.RuntimeException: Unable to instantiate org.apache.hadoop.hive.ql.metadata.SessionHiveMetaStoreClient
Ingested 30 records into realtime_sales_new table on iteration 1.
+--------+----------+------------+-----------+--------+------------+--------------------------+
|sales_id|product_id|accessory_id|customer_id|quantity|total_amount|sale_date                 |
+--------+----------+------------+-----------+--------+------------+--------------------------+
|967     |212       |41          |309        |2       |559.76      |2024-12-02 13:04:10.392403|
|950     |218       |18          |309        |1       |348.04      |2024-12-02 13:04:10.392403|
|338     |210       |40          |307        |3       |1165.2      |2024-12-02 13:04:10.392403|
|470     |203       |43          |303        |3       |674.76      |2024-12-02 13:04:10.392403|
|225     |230       |29          |307        |3       |288.