In [1]:
# pip install fastavro

import getpass

import psycopg2
from confluent_kafka import SerializingProducer
from confluent_kafka.error import ValueSerializationError
from confluent_kafka.schema_registry import SchemaRegistryClient
from confluent_kafka.schema_registry.avro import AvroSerializer
from confluent_kafka.serialization import StringSerializer
from sqlalchemy import create_engine, text

def delivery_report(err, msg):
    """
    Print the outcome of a single message delivery.

    Intended to be passed as the ``on_delivery`` callback of ``produce()``.

    Args:
        err (KafkaError): The error that occurred, or None on success.
        msg (Message): The message that was produced or failed.

    Note:
        Inside the callback, Message.key() and Message.value() hold the
        binary form produced by the configured serializers, not the original
        objects handed to produce(). To get the original objects here, use a
        bound callback or a lambda that carries them along.
    """
    record_key = msg.key()
    if err is None:
        # Success: report where the record landed.
        print('User record {} successfully produced to {} [{}] at offset {}'.format(
            record_key, msg.topic(), msg.partition(), msg.offset()))
    else:
        # Failure: report the key together with the error.
        print("Delivery failed for User record {}: {}".format(record_key, err))

# Create a Schema Registry reference (client)
sr = SchemaRegistryClient({"url": 'http://localhost:8081'})

# First version of the Order schema: user_id is a string.
# FYI: https://avro.apache.org/docs/current/spec.html
schema_str = """{
    "name": "Order",
    "type": "record",
    "fields": [
        {
            "name": "order_id",
            "type": "long"
        },
        {
            "name": "product_id",
            "type": "long"
        },
        {
            "name": "user_id",
            "type": "string"
        }
    ]
}"""

# Create an Avro Serializer, connecting the Schema Registry and the schema definition.
# Both arguments are passed by keyword so the call does not depend on their
# positional order, which is easy to get backwards (and is believed to have
# differed between confluent-kafka releases -- TODO confirm for the installed version).
avro_serializer = AvroSerializer(schema_registry_client=sr, schema_str=schema_str)

# Producer configuration: also including how we serialize values (can also serialize keys)
producer_conf = {
    'bootstrap.servers': "localhost:19092,localhost:29092",
    'value.serializer': avro_serializer,
}

# Create the producer
producer = SerializingProducer(producer_conf)


In [2]:
# user_id is an int here, but the schema declares it as a string.
sample = {
    "order_id": 100,
    "product_id": 20,
    "user_id": 1,
}

# Error: user_id should be string.
# The failure is the point of this cell, so it is caught and displayed instead
# of being left to abort a "Restart & Run All".
try:
    producer.produce(topic="sales_schema", value=sample, on_delivery=delivery_report)
except ValueSerializationError as exc:
    print(f"{type(exc).__name__}: {exc}")

ValueSerializationError: KafkaError{code=_VALUE_SERIALIZATION,val=-161,str="must be string on field user_id"}

In [3]:
# Same record as before, now with user_id as a string to match the schema.
sample = {
    "order_id": 100,
    "product_id": 20,
    "user_id": "1",
}

producer.produce(topic="sales_schema", value=sample, on_delivery=delivery_report)

# produce() only enqueues the message. flush() blocks until the queue is
# drained and serves the delivery_report callback; without it the callback
# never runs and the message may never leave the local queue.
producer.flush()

# OK --> now schema is actually registered

In [4]:
# The schema changes: user_id is dropped and replaced by customer_id, a long.
# Note that customer_id is declared WITHOUT a default value here; the cells
# below try to produce with this schema.

schema_str = """{
    "name": "Order",
    "type": "record",
    "fields": [
        {
            "name": "order_id",
            "type": "long"
        },
        {
            "name": "product_id",
            "type": "long"
        },
        {
            "name": "customer_id",
            "type": "long"
        }
    ]
}"""

In [5]:
# Rebuild the serializer and the producer so they pick up the current schema_str.
# Keyword arguments keep the AvroSerializer call independent of the positional
# order of its first two parameters.
avro_serializer = AvroSerializer(schema_registry_client=sr, schema_str=schema_str)

producer_conf = {
    'bootstrap.servers': "localhost:19092,localhost:29092",
    'value.serializer': avro_serializer,
}

producer = SerializingProducer(producer_conf)

In [6]:
# Connection settings for the Postgres database.
# These are placeholders: fill them in for your environment and do not commit
# real credentials into the notebook.
host = ""
port = ""
db = ""
user = ""
pw = ""

engine = create_engine(f'postgresql+psycopg2://{user}:{pw}@{host}:{port}/{db}')

# Fetch all rows while the connection is still open. `result` is iterated in a
# later cell, after this `with` block has already closed the connection, so it
# must hold materialized rows rather than a live cursor-backed result.
with engine.connect() as conn:
    result = conn.execute(
        text("SELECT order_id, product_id, customer_id FROM practice.sales_order")
    ).fetchall()
    

In [7]:
# This is expected to fail on the first record: serializing it tries to
# register the current schema, which the registry rejects. The error is caught
# and displayed so the demonstration does not abort a "Restart & Run All".
try:
    for r in result:
        event = dict(r.items())
        producer.produce(topic="sales_schema", value=event, on_delivery=delivery_report)
except ValueSerializationError as exc:
    print(f"{type(exc).__name__}: {exc}")

# Error: our new schema is incompatible with old schema
# (default compatibility is BACKWARD compatible)
# It is OK to: 
# * Delete fields
# * Add optional fields


ValueSerializationError: KafkaError{code=_VALUE_SERIALIZATION,val=-161,str="Schema being registered is incompatible with an earlier schema for subject "sales_schema-value", details: [Incompatibility{type:READER_FIELD_MISSING_DEFAULT_VALUE, location:/fields/2, message:customer_id, reader:{"type":"record","name":"Order","fields":[{"name":"order_id","type":"long"},{"name":"product_id","type":"long"},{"name":"customer_id","type":"long"}]}, writer:{"type":"record","name":"Order","fields":[{"name":"order_id","type":"long"},{"name":"product_id","type":"long"},{"name":"user_id","type":"string"}]}}] (HTTP status code 409, SR code 409)"}

In [12]:
# So we delete the user_id and add optional field customer_id
# ("optional" here means the field carries a default value, -1).

schema_str = """{
    "name": "Order",
    "type": "record",
    "fields": [
        {
            "name": "order_id",
            "type": "long"
        },
        {
            "name": "product_id",
            "type": "long"
        },
        {
            "name": "customer_id",
            "type": "long",
            "default": -1
        }
    ]
}"""

# We need to create a new serializer and new producer with this new schema.
# Keyword arguments keep the AvroSerializer call independent of the positional
# order of its first two parameters.

avro_serializer = AvroSerializer(schema_registry_client=sr, schema_str=schema_str)

producer_conf = {
    'bootstrap.servers': "localhost:19092,localhost:29092",
    'value.serializer': avro_serializer,
}

producer = SerializingProducer(producer_conf)


In [13]:
# now everything works

# Read the rows while the connection is still open and turn each one into a
# plain dict. Pairing the result's column names with the row values avoids
# Row.items(), which is not available on every SQLAlchemy version.
with engine.connect() as conn:
    result = conn.execute(text("SELECT order_id, product_id, customer_id FROM practice.sales_order"))
    columns = list(result.keys())
    events = [dict(zip(columns, row)) for row in result]

for event in events:
    # Serve delivery callbacks for messages sent earlier so the local queue keeps draining.
    producer.poll(0)
    producer.produce(topic="sales_schema", value=event, on_delivery=delivery_report)

# produce() is asynchronous: block until everything queued has been delivered
# and every delivery_report callback has run.
producer.flush()

# you can validate from terminal by creating a simple consumer:
# docker exec --interactive --tty schema-registry kafka-avro-console-consumer --bootstrap-server broker-1:9092  --key-deserializer org.apache.kafka.common.serialization.StringDeserializer --topic sales_schema --from-beginning
# you should see the first message with user_id and the subsequent messages with customer_id

In [29]:
# Every subject currently registered in the Schema Registry.
subjects = sr.get_subjects()
print(subjects)

# Version numbers registered under the topic's value subject.
versions = sr.get_versions("sales_schema-value")
print(versions)

# Look up version 1 of that subject and show its schema type and definition.
old_schema = sr.get_version("sales_schema-value", 1)
registered = old_schema.schema
print(registered.schema_type)
print(registered.schema_str)

['sales-value', 'sales3-value', 'sales_schema-value']
[1, 2]
AVRO
{"type":"record","name":"Order","fields":[{"name":"order_id","type":"long"},{"name":"product_id","type":"long"},{"name":"user_id","type":"string"}]}


In [None]:
# Reading material:
# https://medium.com/slalom-technology/introduction-to-schema-registry-in-kafka-915ccf06b902
# https://docs.confluent.io/platform/current/schema-registry/avro.html#compatibility-types