In [15]:
!pip install elasticsearch==7

Defaulting to user installation because normal site-packages is not writeable


In [12]:
!pip uninstall elasticsearch --yes

Found existing installation: elasticsearch 8.14.0
Uninstalling elasticsearch-8.14.0:
  Successfully uninstalled elasticsearch-8.14.0


In [4]:
!pip install pyspark

Defaulting to user installation because normal site-packages is not writeable
Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
     |████████████████████████████████| 317.0 MB 35 kB/s              
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting py4j==0.10.9.7
  Downloading py4j-0.10.9.7-py2.py3-none-any.whl (200 kB)
     |████████████████████████████████| 200 kB 43.7 MB/s            
[?25hUsing legacy 'setup.py install' for pyspark, since package 'wheel' is not installed.
Installing collected packages: py4j, pyspark
    Running setup.py install for pyspark ... [?25ldone
[?25hSuccessfully installed py4j-0.10.9.7 pyspark-3.5.1


In [19]:
from elasticsearch import Elasticsearch, exceptions
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StringType, IntegerType, FloatType, MapType

## Test inserting to Elasticsearch index with Pyspark

In [23]:
# Elasticsearch configuration
es_host = "10.0.3.36"
es_port = 9200  # Ensure this is an integer
es_scheme = "http"
es_index = "test"

# Create Elasticsearch client
es = Elasticsearch([{'host': es_host, 'port': es_port, 'scheme': es_scheme}])

# Function to create index if not exists
def create_index_if_not_exists(es_client, index_name):
    try:
        if not es_client.indices.exists(index=index_name):
            print(f"Index '{index_name}' does not exist. Creating index...")
            es_client.indices.create(index=index_name, body={
                "mappings": {
                    "properties": {
                        "eventType": {"type": "text"},
                        "customerId": {"type": "text"},
                        "productId": {"type": "text"},
                        "timestamp": {"type": "date"},
                        "metadata": {"type": "object"},
                        "quantity": {"type": "integer"},
                        "totalAmount": {"type": "float"},
                        "paymentMethod": {"type": "text"}
                    }
                }
            }, headers={"Content-Type": "application/json"})  # Set correct header
        else:
            print(f"Index '{index_name}' already exists.")
    except exceptions.ApiError as e:
        print(f"Error creating index: {e}")

# Create SparkSession
spark = SparkSession.builder \
    .appName("TestToElasticsearch") \
    .config("spark.jars", "s3://aws-emr-studio-381492251123-eu-central-1/elasticsearch-spark-30_2.12-8.0.0.jar") \
    .getOrCreate()

# Create the index if it does not exist
create_index_if_not_exists(es, es_index)

# Define schema
schema = StructType() \
    .add("eventType", StringType()) \
    .add("customerId", StringType()) \
    .add("productId", StringType()) \
    .add("timestamp", StringType()) \
    .add("metadata", MapType(StringType(), StringType())) \
    .add("quantity", IntegerType()) \
    .add("totalAmount", FloatType()) \
    .add("paymentMethod", StringType())

# Create test data DataFrame
test_data = spark.createDataFrame([{
    'eventType': 'purchase',
    'customerId': '12345',
    'productId': '67890',
    'timestamp': '2024-07-27T11:44:45',
    'metadata': {'category': 'Books', 'source': 'Advertisement'},
    'quantity': 1,
    'totalAmount': 15.75,
    'paymentMethod': 'Credit Card'
}], schema)

# Elasticsearch configuration for Spark
es_write_conf = {
    "es.nodes": es_host,
    "es.port": str(es_port),  # Convert port to string for Spark configuration
    "es.index.auto.create": "true"  # Ensure that Spark does not attempt to auto-create the index
}

# Write data to Elasticsearch
test_data.write \
    .format("org.elasticsearch.spark.sql") \
    .options(**es_write_conf) \
    .mode("append") \
    .save(es_index)

Index 'test' does not exist. Creating index...
Error creating index: ApiError(406, 'Content-Type header [application/vnd.elasticsearch+json; compatible-with=8] is not supported', 'Content-Type header [application/vnd.elasticsearch+json; compatible-with=8] is not supported')


  es_client.indices.create(index=index_name, body={


In [25]:
!curl -X GET "http://10.0.3.36:9200/test/_search?pretty"

{
  "took" : 3,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 1,
      "relation" : "eq"
    },
    "max_score" : 1.0,
    "hits" : [
      {
        "_index" : "test",
        "_type" : "_doc",
        "_id" : "titjGJEBk8nbZW4mPCn0",
        "_score" : 1.0,
        "_source" : {
          "eventType" : "purchase",
          "customerId" : "12345",
          "productId" : "67890",
          "timestamp" : "2024-07-27T11:44:45",
          "metadata" : {
            "category" : "Books",
            "source" : "Advertisement"
          },
          "quantity" : 1,
          "totalAmount" : 15.75,
          "paymentMethod" : "Credit Card"
        }
      }
    ]
  }
}


## Reading data from Confluent Kafka Cloud and inserting it to Elasticsearch

### Creating Elatsicsearc index using Command line 

curl -X PUT "http://10.0.3.36:9200/events" -H 'Content-Type: application/json' -d'
{
  "mappings": {
    "properties": {
      "eventType": { "type": "keyword" },
      "customerId": { "type": "keyword" },
      "productId": { "type": "keyword" },
      "timestamp": { "type": "date" },
      "metadata": { 
        "type": "object",
        "enabled": true
      },
      "quantity": { "type": "integer" },
      "totalAmount": { "type": "float" },
      "paymentMethod": { "type": "keyword" },
      "recommendedProductId": { "type": "keyword" }
    }
  }
}'
