In [5]:
import time, json
from kafka import KafkaProducer
from data_generator.fakedata import create_fakeuser
from faker import Faker
from datetime import datetime
from pprint import pprint

#### 1. List Kafka Topics

In [6]:
from kafka import KafkaConsumer, KafkaAdminClient
from kafka import TopicPartition, OffsetAndMetadata
from kafka.admin import NewPartitions, NewTopic

# Kafka broker addresses (host:port) handed to the clients below.
bootstrap_servers = [
    "kafka1:19091",
    "kafka2:19092",
    "kafka3:19093",
]

# Admin client bound to those brokers.
admin_client = KafkaAdminClient(bootstrap_servers=bootstrap_servers)

In [14]:
# Display the topic names reported by the admin client.
admin_client.list_topics()

['connect_offsets',
 'demo',
 'my_status_topic',
 '__consumer_offsets',
 'user',
 'connect_configs']

In [29]:
# Fetch the topic names inside this cell so it does not depend on a
# `topic_list` variable left over from an earlier (now missing) cell.
topic_list = admin_client.list_topics()

# Substring match: every topic whose name contains "my" is selected.
topics_to_delete = [topic for topic in topic_list if "my" in topic]

# Last expression of the cell, so the delete response is displayed.
admin_client.delete_topics(topics_to_delete)

DeleteTopicsResponse_v3(throttle_time_ms=0, topic_error_codes=[(topic='mysqldemo', error_code=0), (topic='my_status_topic', error_code=0)])

In [11]:
# Delete the "demo" topic; the response is displayed as the cell output.
admin_client.delete_topics(["demo"])

DeleteTopicsResponse_v3(throttle_time_ms=0, topic_error_codes=[(topic='demo', error_code=0)])

#### 2. Create Kafka Topic

In [13]:
# Create the topic
topic_name = "demo"
num_partitions = 4

# Keyword arguments that are unpacked into NewTopic below.
new_topic_config = dict(
    name=topic_name,
    num_partitions=num_partitions,
    replication_factor=1,
)
topics_to_create = [NewTopic(**new_topic_config)]
admin_client.create_topics(topics_to_create)

CreateTopicsResponse_v3(throttle_time_ms=0, topic_errors=[(topic='demo', error_code=0, error_message=None)])

In [46]:
# Display the topic names again to inspect the current topic set.
admin_client.list_topics()

['dataops.dataops.fakeuser',
 'connect-configs',
 'connect-status',
 'connect-offsets',
 '__consumer_offsets',
 'dataops.dataops.tips',
 'fake',
 'dataops.dataops.dataops']

#### 3. Consumer Object

In [8]:
# Options passed to the consumer in addition to the topic name.
consumer_options = {
    "bootstrap_servers": bootstrap_servers,
    "auto_offset_reset": "earliest",
}
consumer = KafkaConsumer(topic_name, **consumer_options)

# Display the partition ids the consumer reports for the topic.
consumer.partitions_for_topic(topic_name)

#### 4. Inspect Partition and Offset Info for the Topic

In [12]:
# Report the end offset of every partition of `topic_name` and their sum.
#
# end_offsets() is used instead of seek_to_end() + position() because the
# consumer above only *subscribes* to the topic: its partitions are not
# assigned until it polls, and seek_to_end() rejects unassigned partitions.
# end_offsets() also leaves the consumer's read position untouched.

# partitions_for_topic() returns None for an unknown topic; treat that as
# "no partitions" instead of failing on iteration.
partition_ids = sorted(consumer.partitions_for_topic(topic_name) or [])
topic_partitions = [TopicPartition(topic_name, partition) for partition in partition_ids]

# Single lookup for all partitions: {TopicPartition: end offset}.
end_offsets = consumer.end_offsets(topic_partitions) if topic_partitions else {}

total_offset = 0
for tp in topic_partitions:
    partition = tp.partition
    offset = end_offsets[tp]
    print(f"Offset for topic '{topic_name}', partition {partition}: {offset}")
    total_offset += offset

print(f"Total offset for topic '{topic_name}': {total_offset}")

Offset for topic 'demo1', partition 0: 0
Offset for topic 'demo1', partition 1: 0
Offset for topic 'demo1', partition 2: 0
Offset for topic 'demo1', partition 3: 0
Offset for topic 'demo1', partition 4: 0
Total offset for topic 'demo1': 0


#### 5. Kafka Producer

In [13]:
# Send ORDER_LIMIT fake user records to `topic_name`, one per second.
producer = KafkaProducer(
    bootstrap_servers=bootstrap_servers
)

ORDER_LIMIT = 100
try:
    for i in range(1, ORDER_LIMIT + 1):
        data = create_fakeuser()

        # send() only queues the record; delivery happens in the background.
        producer.send(topic_name, json.dumps(data).encode("utf-8"))
        print("=="*30)
        print(data)
        print(f">>>>>>>>>>>  {i} MESSAGE SENT  <<<<<<<<<<<<")
        time.sleep(1)
finally:
    # Runs on normal completion and on interruption (e.g. stopping the cell),
    # so records still sitting in the producer buffer are delivered.
    producer.flush()

{'name': 'Robert Sanchez', 'ssn': '503-99-0536', 'job': 'Chartered legal executive (England and Wales)', 'residence': '5923 Wright Parks Apt. 104\nSnyderton, PA 31900', 'blood_group': 'B-', 'sex': 'M', 'birthdate': '19460317', 'uuid': 'FUEWCPtKejjtrsVSs5CJPR', 'timestamp': '2024-05-13 12:54:40'}
>>>>>>>>>>>  1 MESSAGE SENT  <<<<<<<<<<<<
{'name': 'Matthew Warner', 'ssn': '482-94-1558', 'job': 'Advertising account planner', 'residence': '41437 York Canyon Suite 241\nEast Amandaborough, ME 82337', 'blood_group': 'B-', 'sex': 'M', 'birthdate': '19850320', 'uuid': 'F2J9WQVPwx5HZU3UbUZFdc', 'timestamp': '2024-05-13 12:54:41'}
>>>>>>>>>>>  2 MESSAGE SENT  <<<<<<<<<<<<
{'name': 'Brianna Vance', 'ssn': '067-73-2326', 'job': 'Development worker, community', 'residence': '86558 Deborah Grove Suite 806\nBrentfort, MD 24583', 'blood_group': 'AB+', 'sex': 'F', 'birthdate': '19410925', 'uuid': '5y8ytEiEhgEsYicBh74TAK', 'timestamp': '2024-05-13 12:54:42'}
>>>>>>>>>>>  3 MESSAGE SENT  <<<<<<<<<<<<
{'na

KeyboardInterrupt: 

In [37]:
from datetime import datetime

topic_name = "my_status_topic"
consumer = KafkaConsumer(
    topic_name,
    auto_offset_reset='earliest',
    bootstrap_servers=bootstrap_servers,
)

# Read and print messages from the consumer. The loop blocks waiting for new
# messages, so the cell runs until it is interrupted.
for msg in consumer:
    try:
        raw = json.loads(msg.value)
        # ts_ms is treated as epoch milliseconds and shown as a local-time string.
        raw["ts_ms"] = datetime.fromtimestamp(raw["ts_ms"] / 1000).strftime('%Y-%m-%d %H:%M:%S')
        pprint(raw)
        print("--"*50)
    except (ValueError, KeyError, TypeError, OverflowError, OSError):
        # Best effort: skip messages that are not JSON, have no usable
        # "ts_ms" field, or have an empty value. Unlike a bare `except`,
        # this lets KeyboardInterrupt and unexpected errors propagate.
        continue

KeyboardInterrupt: 

In [29]:
# Create the Spark Session
from pyspark.sql import SparkSession
from pyspark.sql import types as T
from pyspark.sql import window as W
from pyspark.sql import functions as F

# Spark configuration entries applied to the session builder, in this order.
spark_options = {
    "spark.streaming.stopGracefullyOnShutdown": True,
    'spark.jars.packages': 'org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.0',
    "spark.driver.extraClassPath": "./jdbc/mysql-connector-j-8.4.0.jar",
    "spark.sql.shuffle.partitions": 8,
}

session_builder = SparkSession.builder.appName("Streaming from Kafka")
for option_name, option_value in spark_options.items():
    session_builder = session_builder.config(option_name, option_value)

# Local master using all available cores; reuses an existing session if one is active.
spark = session_builder.master("local[*]").getOrCreate()

In [32]:
# Batch-read the Kafka topic into a DataFrame, starting at the earliest offsets.
# The broker list is derived from `bootstrap_servers` so the Spark reader and
# the kafka-python clients always target the same cluster.
kdf = (
    spark.read
    .format("kafka")
    .option("kafka.bootstrap.servers", ",".join(bootstrap_servers))
    .option("subscribe", topic_name)
    .option("startingOffsets", "earliest")
    .load()
)

# Display how many partitions the resulting DataFrame's RDD has.
kdf.rdd.getNumPartitions()

1

In [33]:
# Print the first rows of the raw Kafka DataFrame (long values are truncated).
kdf.show()

+----+--------------------+--------------------+---------+------+--------------------+-------------+
| key|               value|               topic|partition|offset|           timestamp|timestampType|
+----+--------------------+--------------------+---------+------+--------------------+-------------+
|null|[7B 22 62 65 66 6...|dataops.dataops.d...|        0|     0|2024-05-14 12:57:...|            0|
|null|[7B 22 62 65 66 6...|dataops.dataops.d...|        0|     1|2024-05-14 12:57:...|            0|
|null|[7B 22 62 65 66 6...|dataops.dataops.d...|        0|     2|2024-05-14 12:57:...|            0|
|null|[7B 22 62 65 66 6...|dataops.dataops.d...|        0|     3|2024-05-14 13:46:...|            0|
|null|[7B 22 62 65 66 6...|dataops.dataops.d...|        0|     4|2024-05-14 13:46:...|            0|
|null|[7B 22 62 65 66 6...|dataops.dataops.d...|        0|     5|2024-05-14 13:46:...|            0|
|null|[7B 22 62 65 66 6...|dataops.dataops.d...|        0|     6|2024-05-14 13:50:...|     

In [34]:
# Replace the binary `value` column with its string representation.
value_as_string = F.col("value").cast("string")
value_df = kdf.withColumn("value", value_as_string)

# Show full column contents instead of truncating them.
value_df.show(truncate=False)

+----+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------+---------+------+-----------------------+-------------+
|key |value                                                                                                            

In [25]:


# Parse the first row's `value` string as JSON and pretty-print it.
first_row = value_df.first()
first_payload = json.loads(first_row.value)
pprint(first_payload)

{'after': {'birthdate': 19810916,
           'blood_group': 'AB+',
           'index': 0,
           'job': 'Psychologist, prison and probation services',
           'name': 'James Murray',
           'residence': 'PSC 1544, Box 3161\nAPO AP 52676',
           'sex': 'M',
           'ssn': '741-53-6179',
           'uuid': 'Cb7TYKQGDVZrB84FoUywnE'},
 'before': None,
 'op': 'r',
 'source': {'connector': 'mysql',
            'db': 'dataops',
            'file': 'binlog.000004',
            'gtid': None,
            'name': 'dataops',
            'pos': 156,
            'query': None,
            'row': 0,
            'sequence': None,
            'server_id': 0,
            'snapshot': 'first_in_data_collection',
            'table': 'fakeuser',
            'thread': None,
            'ts_ms': 1715691422000,
            'version': '2.2.1.Final'},
 'transaction': None,
 'ts_ms': 1715691422237}


In [1]:
from pprint import pprint

# Sample device event used to illustrate a nested JSON payload.
aa = {
    "eventId": "e3cb26d3-41b2-49a2-84f3-0156ed8d7502",
    "eventOffset": 10001,
    "eventPublisher": "device",
    "customerId": "CI00103",
    "data": {
        "devices": [
            {"deviceId": "D001", "temperature": 15, "measure": "C", "status": "ERROR"},
            {"deviceId": "D002", "temperature": 16, "measure": "C", "status": "SUCCESS"},
        ]
    },
    "eventTime": "2023-01-05 11:13:53.643364",
}

pprint(aa)

{'customerId': 'CI00103',
 'data': {'devices': [{'deviceId': 'D001',
                       'measure': 'C',
                       'status': 'ERROR',
                       'temperature': 15},
                      {'deviceId': 'D002',
                       'measure': 'C',
                       'status': 'SUCCESS',
                       'temperature': 16}]},
 'eventId': 'e3cb26d3-41b2-49a2-84f3-0156ed8d7502',
 'eventOffset': 10001,
 'eventPublisher': 'device',
 'eventTime': '2023-01-05 11:13:53.643364'}
