Commit

Create and clean up kafka topics manually to reduce memory burden for Adala server (#107)
matt-bernstein committed May 21, 2024
1 parent 33b2a9b commit d163a69
Showing 7 changed files with 103 additions and 31 deletions.
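At a high level, the commit stops relying on the broker's topic auto-creation (now disabled in docker-compose.yml) and instead has the parent Celery task create the per-job input/output topics up front and delete them once the job completes, so short-lived topics no longer accumulate on the Kafka broker. A minimal sketch of the new lifecycle, using the helpers added to server/utils.py below (run_streaming_job is an illustrative name only; the real logic lives in streaming_parent_task, which also updates Celery state and polls the child jobs):

from server.utils import (
    get_input_topic_name,
    get_output_topic_name,
    ensure_topic,
    delete_topic,
)

def run_streaming_job(parent_job_id: str):
    # Illustrative wrapper, not part of the commit; mirrors streaming_parent_task
    input_topic = get_input_topic_name(parent_job_id)    # "adala-input-<job_id>"
    output_topic = get_output_topic_name(parent_job_id)  # "adala-output-<job_id>"

    # Create the per-job topics explicitly, with retention.ms taken from settings
    ensure_topic(input_topic)
    ensure_topic(output_topic)

    # ... submit process_file_streaming and process_streaming_output, wait for both ...

    # Clean up so short-lived per-job topics do not pile up on the broker
    delete_topic(input_topic)
    delete_topic(output_topic)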
4 changes: 3 additions & 1 deletion docker-compose.yml
@@ -20,6 +20,7 @@ services:
- ALLOW_PLAINTEXT_LISTENER=yes
- KAFKA_CFG_NODE_ID=1
- KAFKA_KRAFT_CLUSTER_ID=MkU3OEVBNTcwNTJENDM2Qk
- KAFKA_CFG_AUTO_CREATE_TOPICS_ENABLE=false
app:
build:
context: .
@@ -30,8 +31,9 @@ services:
redis:
condition: service_healthy
environment:
- KAFKA_BOOTSTRAP_SERVERS=kafka:9093
- REDIS_URL=redis://redis:6379/0
- KAFKA_BOOTSTRAP_SERVERS=kafka:9093 # TODO pull from .env
- KAFKA_RETENTION_MS=180000 # TODO pull from .env
command:
["poetry", "run", "uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"]
worker:
17 changes: 16 additions & 1 deletion poetry.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
@@ -36,6 +36,7 @@ celery = {version = "^5.3.6", extras = ["redis"]}
uvicorn = "*"
pydantic-settings = "^2.2.1"
label-studio-sdk = "^0.0.32"
kafka-python = "^2.0.2"

[tool.poetry.dev-dependencies]
pytest = "^7.4.3"
3 changes: 3 additions & 0 deletions server/.env.example
@@ -1 +1,4 @@
KAFKA_BOOTSTRAP_SERVERS="localhost:9093"

# this value is only for local dev. In our deployments, it is not set here, but in another place: https://github.com/HumanSignal/infra/pull/67
KAFKA_RETENTION_MS=180000 # 3 minutes
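Both variables are read by the pydantic-settings Settings class extended in server/utils.py below. A small sketch of how they surface at runtime, assuming the local-dev values from this file:

from server.utils import Settings

settings = Settings()  # loads server/.env via pydantic-settings
print(settings.kafka_bootstrap_servers)  # "localhost:9093"
print(settings.kafka_retention_ms)       # 180000 (3 minutes)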
4 changes: 2 additions & 2 deletions server/app.py
@@ -25,7 +25,7 @@
process_streaming_output,
streaming_parent_task,
)
from utils import get_input_topic, Settings
from utils import get_input_topic_name, Settings
from server.handlers.result_handlers import ResultHandler


@@ -230,7 +230,7 @@ async def submit_batch(batch: BatchData):
Response: Generic response indicating status of request
"""

topic = get_input_topic(batch.job_id)
topic = get_input_topic_name(batch.job_id)
producer = AIOKafkaProducer(
bootstrap_servers=settings.kafka_bootstrap_servers,
value_serializer=lambda v: json.dumps(v).encode("utf-8"),
56 changes: 33 additions & 23 deletions server/tasks/process_file.py
@@ -11,7 +11,13 @@
from aiokafka.errors import UnknownTopicOrPartitionError
from celery import Celery, states
from celery.exceptions import Ignore
from server.utils import get_input_topic, get_output_topic, Settings
from server.utils import (
get_input_topic_name,
get_output_topic_name,
ensure_topic,
delete_topic,
Settings,
)
from server.handlers.result_handlers import ResultHandler


@@ -54,21 +60,29 @@ def streaming_parent_task(
# Parent job ID is used for input/output topic names
parent_job_id = self.request.id

# Override kafka_bootstrap_servers with value from settings
# create kafka topics
input_topic_name = get_input_topic_name(parent_job_id)
ensure_topic(input_topic_name)
output_topic_name = get_output_topic_name(parent_job_id)
ensure_topic(output_topic_name)

# Override default agent kafka settings
settings = Settings()
agent.environment.kafka_bootstrap_servers = settings.kafka_bootstrap_servers
agent.environment.kafka_input_topic = input_topic_name
agent.environment.kafka_output_topic = output_topic_name

inference_task = process_file_streaming
logger.info(f"Submitting task {inference_task.name} with agent {agent}")
input_result = inference_task.delay(agent=agent, parent_job_id=parent_job_id)
input_result = inference_task.delay(agent=agent)
input_job_id = input_result.id
logger.info(f"Task {inference_task.name} submitted with job_id {input_job_id}")

result_handler_task = process_streaming_output
logger.info(f"Submitting task {result_handler_task.name}")
output_result = result_handler_task.delay(
input_job_id=input_job_id,
parent_job_id=parent_job_id,
output_topic_name=output_topic_name,
result_handler=result_handler,
batch_size=batch_size,
)
@@ -95,6 +109,10 @@ def streaming_parent_task(
):
time.sleep(1)

# clean up kafka topics
delete_topic(input_topic_name)
delete_topic(output_topic_name)

logger.info("Both input and output jobs complete")

# Update parent task status to SUCCESS and pass metadata again
@@ -109,45 +127,40 @@ def streaming_parent_task(
raise Ignore()


@app.task(
name="process_file_streaming", track_started=True, bind=True, serializer="pickle"
)
def process_file_streaming(self, agent: Agent, parent_job_id: str):
# Set input and output topics using parent job ID
agent.environment.kafka_input_topic = get_input_topic(parent_job_id)
agent.environment.kafka_output_topic = get_output_topic(parent_job_id)
@app.task(name="process_file_streaming", track_started=True, serializer="pickle")
def process_file_streaming(agent: Agent):
# agent's kafka_bootstrap_servers and kafka topics should be set in parent task

# Run the agent
asyncio.run(agent.arun())


async def async_process_streaming_output(
input_job_id: str,
parent_job_id: str,
output_topic_name,
result_handler: ResultHandler,
batch_size: int,
):
logger.info(f"Polling for results {parent_job_id=}")
logger.info(f"Polling for results {output_topic_name=}")

topic = get_output_topic(parent_job_id)
settings = Settings()

# Retry to workaround race condition of topic creation
retries = 5
while retries > 0:
try:
consumer = AIOKafkaConsumer(
topic,
output_topic_name,
bootstrap_servers=settings.kafka_bootstrap_servers,
value_deserializer=lambda v: json.loads(v.decode("utf-8")),
auto_offset_reset="earliest",
)
await consumer.start()
logger.info(f"consumer started {parent_job_id=}")
logger.info(f"consumer started {output_topic_name=}")
break
except UnknownTopicOrPartitionError as e:
logger.error(msg=e)
logger.info(f"Retrying to create consumer with topic {topic}")
logger.info(f"Retrying to create consumer with topic {output_topic_name}")

await consumer.stop()
retries -= 1
@@ -183,20 +196,17 @@ async def async_process_streaming_output(
await consumer.stop()


@app.task(
name="process_streaming_output", track_started=True, bind=True, serializer="pickle"
)
@app.task(name="process_streaming_output", track_started=True, serializer="pickle")
def process_streaming_output(
self,
input_job_id: str,
parent_job_id: str,
output_topic_name: str,
result_handler: ResultHandler,
batch_size: int,
):
try:
asyncio.run(
async_process_streaming_output(
input_job_id, parent_job_id, result_handler, batch_size
input_job_id, output_topic_name, result_handler, batch_size
)
)
except Exception as e:
49 changes: 45 additions & 4 deletions server/utils.py
@@ -1,6 +1,8 @@
from pydantic_settings import BaseSettings, SettingsConfigDict
from typing import List, Union
from pathlib import Path
from kafka.admin import KafkaAdminClient, NewTopic
from kafka.errors import TopicAlreadyExistsError


class Settings(BaseSettings):
@@ -10,16 +12,55 @@ class Settings(BaseSettings):
"""

kafka_bootstrap_servers: Union[str, List[str]]
kafka_retention_ms: int

model_config = SettingsConfigDict(
# have to use an absolute path here so celery workers can find it
env_file=(Path(__file__).parent / ".env"),
)


def get_input_topic(job_id: str):
return f"adala-input-{job_id}"
def get_input_topic_name(job_id: str):
topic_name = f"adala-input-{job_id}"

return topic_name

def get_output_topic(job_id: str):
return f"adala-output-{job_id}"

def get_output_topic_name(job_id: str):
topic_name = f"adala-output-{job_id}"

return topic_name


def ensure_topic(topic_name: str):
settings = Settings()
bootstrap_servers = settings.kafka_bootstrap_servers
retention_ms = settings.kafka_retention_ms

admin_client = KafkaAdminClient(
bootstrap_servers=bootstrap_servers, client_id="topic_creator"
)

topic = NewTopic(
name=topic_name,
num_partitions=1,
replication_factor=1,
topic_configs={"retention.ms": str(retention_ms)},
)

try:
admin_client.create_topics(new_topics=[topic])
except TopicAlreadyExistsError:
# we shouldn't hit this case when KAFKA_CFG_AUTO_CREATE_TOPICS_ENABLE=false unless there is a legitimate name collision, so should raise here after testing
pass


def delete_topic(topic_name: str):
settings = Settings()
bootstrap_servers = settings.kafka_bootstrap_servers

admin_client = KafkaAdminClient(
bootstrap_servers=bootstrap_servers, client_id="topic_deleter"
)

admin_client.delete_topics(topics=[topic_name])
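As a quick way to confirm the retention override took effect on a topic created by ensure_topic, something like the following should work; it assumes kafka-python's ConfigResource/ConfigResourceType admin API and is not part of this commit:

from kafka.admin import KafkaAdminClient, ConfigResource, ConfigResourceType

admin = KafkaAdminClient(
    bootstrap_servers="localhost:9093", client_id="topic_inspector"
)
# Describe a topic created by ensure_topic; the returned configs should
# include retention.ms=180000
resource = ConfigResource(ConfigResourceType.TOPIC, "adala-input-<job_id>")
print(admin.describe_configs(config_resources=[resource]))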
