# Data Streaming Pipline
using:
- Spark Streaming (3.1.2)
- Pyspark (3.1.2)
- Kafka (Confluent cloud)
- ElasticSearch (7)

In [None]:
!pip install Elasticsearch
!pip install confluent-kafka

In [1]:
from elasticsearch import Elasticsearch
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StringType, IntegerType, FloatType, MapType



### Creating Elatsicsearc index using Command line 

curl -X PUT "http://10.0.3.36:9200/events" -H 'Content-Type: application/json' -d'
  "mappings": {
    "properties": {
      "eventType": { "type": "keyword" },
      "customerId": { "type": "keyword" },
      "productId": { "type": "keyword" },,
      "timestamp": { "type": "keyword"},,
      "metadata": {  "type": "keyword"},
        "type": "object",
        "enabled": true",
      },"enabled": true
      "quantity": { "type": "integer" },
      "totalAmount": { "type": "float" },
      "paymentMethod": { "type": "keyword" },
      "recommendedProductId": { "type": "keyword" }
    } "recommendedProductId": { "type": "keyword" }
  } }
}'


## Test inserting to Elasticsearch index with python

In [19]:
# Connect to Elasticsearch
es = Elasticsearch(
    [{'scheme': 'http', 'host': '192.168.33.139', 'port': 9200}]
)

# Define sample data
sample_data = [
    {
        "eventType": "purchase",
        "customerId": "C123",
        "productId": "P456",
        "timestamp": "2024-07-27T12:34:56.789Z",
        "metadata": {
            "category": "electronics",
            "source": "website"
        },
        "quantity": 2,
        "totalAmount": 199.99,
        "paymentMethod": "credit_card",
        "recommendedProductId": "P789"
    },
    {
        "eventType": "view",
        "customerId": "C124",
        "productId": "P457",
        "timestamp": "2024-07-27T13:45:56.789Z",
        "metadata": {},
        "quantity": 1,
        "totalAmount": 0.0,
        "paymentMethod": "none",
        "recommendedProductId": "P790"
    }
]

# Index the sample data
for i, doc in enumerate(sample_data):
    es.index(index='events', id=i+1, document=doc)




## Test reading data from Elasticsearch index with python

In [None]:
# Connect to Elasticsearch
es = Elasticsearch(
    hosts=[
        {
            'host': '192.168.33.139',
            'port': 9200,
            'scheme': 'http'  # or 'https' if you are using SSL
        }
    ]
)

# Define a search query to retrieve all documents
query = {
    "query": {
        "match_all": {}
    }
}

# Execute the search query
response = es.search(index='events', body=query)

# Print the retrieved documents
for hit in response['hits']['hits']:
    print(hit['_source'])


## Test inserting to Elasticsearch index with Pyspark

In [20]:
# Create SparkSession
spark = SparkSession.builder \
    .appName("TestToElasticsearch") \
    .config("spark.hadoop.fs.defaultFS", "hdfs://localhost:9000") \
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.1.2,org.elasticsearch:elasticsearch-spark-30_2.12:7.15.2") \
    .getOrCreate()

# Define schema
schema = StructType() \
    .add("eventType", StringType()) \
    .add("customerId", StringType()) \
    .add("productId", StringType()) \
    .add("timestamp", StringType()) \
    .add("metadata", MapType(StringType(), StringType())) \
    .add("quantity", IntegerType()) \
    .add("totalAmount", FloatType()) \
    .add("paymentMethod", StringType())

# Create test data DataFrame
test_data = spark.createDataFrame([{
    'eventType': 'purchase',
    'customerId': '12345',
    'productId': '67890',
    'timestamp': '2024-07-27T11:44:45',
    'metadata': {'category': 'Books', 'source': 'Advertisement'},
    'quantity': 1,
    'totalAmount': 15.75,
    'paymentMethod': 'Credit Card'
}], schema)

# Elasticsearch configuration
es_write_conf = {
    "es.nodes": "192.168.33.139",
    "es.port": "9200",
    "es.index.auto.create": "true"
}

# Write data to Elasticsearch
test_data.write \
    .format("org.elasticsearch.spark.sql") \
    .options(**es_write_conf) \
    .mode("append") \
    .save("events")


## Reading data from Confluent Kafka Cloud and inserting it to Elasticsearch

In [3]:
# Define directories
outputDir = "hdfs://localhost:9000/user/itversity/stream_output"
checkpointDir = "hdfs://localhost:9000/user/itversity/stream_checkpoint"

# Create SparkSession
spark = SparkSession.builder \
    .appName("KafkaToElasticsearch") \
    .config("spark.hadoop.fs.defaultFS", "hdfs://localhost:9000") \
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.1.2,org.elasticsearch:elasticsearch-spark-30_2.12:7.15.2") \
    .getOrCreate()

# Define schema for the incoming JSON data
schema = StructType() \
    .add("eventType", StringType()) \
    .add("customerId", StringType()) \
    .add("productId", StringType()) \
    .add("timestamp", StringType()) \
    .add("metadata", MapType(StringType(), StringType())) \
    .add("quantity", IntegerType()) \
    .add("totalAmount", FloatType()) \
    .add("paymentMethod", StringType()) \
    .add("recommendedProductId", StringType(), True)  # Optional field

# Kafka connection details
bootstrap_servers = "pkc-921jm.us-east-2.aws.confluent.cloud:9092"
kafka_topic = "demo_topic"  # Your Kafka topic name
kafka_username = "6UX76KJG3N363E2H"
kafka_password = "dWAj0CPM50FkljxdqW1z/1yH2am/FQdb7MicZ01vr/F0Zu8WRFx6Fx0fDpXWrt06"

# Read data from Kafka topic as a streaming DataFrame
df = spark \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", bootstrap_servers) \
    .option("subscribe", kafka_topic) \
    .option("startingOffsets", "earliest") \
    .option("kafka.security.protocol", "SASL_SSL") \
    .option("kafka.sasl.mechanism", "PLAIN") \
    .option("kafka.sasl.jaas.config",
            f'org.apache.kafka.common.security.plain.PlainLoginModule required username="{kafka_username}" password="{kafka_password}";') \
    .load()

# Parse the JSON data
json_df = df.selectExpr("CAST(value AS STRING)") \
            .select(from_json("value", schema).alias("data")) \
            .select("data.*")


# Elasticsearch configuration
es_write_conf = {
    "es.nodes": "192.168.33.139",
    "es.port": "9200",
    "es.resource": "events/_doc",
    "es.nodes.wan.only": "true",
    "es.write.operation": "index",
    "es.index.auto.create": "true"
}


# Write to Elasticsearch
es_query = json_df.writeStream \
    .format("org.elasticsearch.spark.sql") \
    .options(**es_write_conf) \
    .option("checkpointLocation", checkpointDir) \
    .start()

# Await termination
es_query.awaitTermination()


KeyboardInterrupt: 

## Test count inserted records

In [10]:
import requests

response = requests.get("http://192.168.33.139:9200/events/_count")
print(response.text)

{"count":58,"_shards":{"total":1,"successful":1,"skipped":0,"failed":0}}


## Test Filter Recordrs

In [11]:
# Create a client instance with scheme, host, and port
es = Elasticsearch([{'scheme': 'http', 'host': '192.168.33.139', 'port': 9200}])  # Adjust scheme, host, and port as needed

# Define the search query
query = {
    "query": {
        "bool": {
            "should": [
                {
                    "bool": {
                        "must": [
                            { "match": { "eventType": "recommendationClick" }}
                        ]
                    }
                }
            ]
        }
    }
}

# Execute the search query
response = es.search(index="events", body=query)

# Print the search results
print(response)


{'took': 23, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 13, 'relation': 'eq'}, 'max_score': 1.4748477, 'hits': [{'_index': 'events', '_type': '_doc', '_id': 'c8GD9JABPle8ZzxyG1-O', '_score': 1.4748477, '_source': {'eventType': 'recommendationClick', 'customerId': '94706', 'productId': '8327', 'timestamp': '2024-07-27T17:05:11', 'metadata': {}, 'recommendedProductId': '2335'}}, {'_index': 'events', '_type': '_doc', '_id': 'dcGD9JABPle8ZzxyG1-O', '_score': 1.4748477, '_source': {'eventType': 'recommendationClick', 'customerId': '40976', 'productId': '4564', 'timestamp': '2024-07-27T17:05:13', 'metadata': {}, 'recommendedProductId': '6915'}}, {'_index': 'events', '_type': '_doc', '_id': 'Z8GC9JABPle8Zzxy01_9', '_score': 1.4748477, '_source': {'eventType': 'recommendationClick', 'customerId': '14487', 'productId': '6253', 'timestamp': '2024-07-27T17:04:54', 'metadata': {}, 'recommendedProductId': '4391'}}, {'_index':



In [12]:
spark.stop()

## Run With Spark Submit

`spark-submit --master local[1] --packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.1.2,org.elasticsearch:elasticsearch-spark-30_2.12:7.15.2 data-pipeline/src/main/java/com/example/sparkToELastic.py
`