In [None]:
%pip install 'databricks-sdk==0.61.0' 'pyarrow<20' 'databricks-sdk[notebook]' 'databricks-agents==1.2.0' 'mlflow<=3.1' 'mlflow[databricks]' 'databricks-vectorsearch==0.57' 'langchain' 'langchain_core' 'databricks-langchain' 'bs4' 'markdownify' 'dotenv'
import os
if os.environ.get("DATABRICKS_RUNTIME_VERSION"):
    dbutils.library.restartPython()

Note: you may need to restart the kernel to use updated packages.


In [2]:
dbutils.widgets.text("vector_search_endpoint", "one-env-shared-endpoint-7")
dbutils.widgets.text("vector_search_index", "tanner_wendland.default.chat_history_index")
dbutils.widgets.text("llm_model_serving_endpoint_name", "databricks-claude-3-7-sonnet")
dbutils.widgets.text("target_catalog", "tanner_wendland")
dbutils.widgets.text("target_schema", "default")

Box(children=(Label(value='vector_search_endpoint'), Text(value='one-env-shared-endpoint-7')))

Box(children=(Label(value='vector_search_index'), Text(value='tanner_wendland.default.chat_history_index')))

Box(children=(Label(value='llm_model_serving_endpoint_name'), Text(value='databricks-claude-3-7-sonnet')))

Box(children=(Label(value='target_catalog'), Text(value='tanner_wendland')))

Box(children=(Label(value='target_schema'), Text(value='default')))

In [3]:
vector_search_endpoint = dbutils.widgets.get("vector_search_endpoint")
vector_search_index = dbutils.widgets.get("vector_search_index")
llm_model_serving_endpoint_name = dbutils.widgets.get("llm_model_serving_endpoint_name")
target_catalog = dbutils.widgets.get("target_catalog")
target_schema = dbutils.widgets.get("target_schema")

In [4]:
from databricks.vector_search.client import VectorSearchClient
import os

vsc: VectorSearchClient = None
if os.environ.get("DATABRICKS_RUNTIME_VERSION"):
    vsc = VectorSearchClient(disable_notice=True)
else:
    import dotenv
    dotenv.load_dotenv('.env')
    vsc = VectorSearchClient(disable_notice=True, workspace_url=os.environ.get("DATABRICKS_HOST"), personal_access_token=os.environ.get("DATABRICKS_TOKEN"))

In [10]:
question = "Tell me about the chat history"

results = vsc.get_index(vector_search_endpoint, vector_search_index).similarity_search(
  query_text=question,
  columns=["message_content"],
  num_results=1)
docs = results.get('result', {}).get('data_array', [])
docs

[NOTICE] Using a Personal Authentication Token (PAT). Recommended for development only. For improved performance, please use Service Principal based authentication. To disable this message, pass disable_notice=True.


[["I'm testing my Chat history pipeline. it'll be pretty cool. These chats, are getting saves to postges. Then I have an ETL script that syncs that over, then created a vector embedding with it. So eventually I'll keep messages in history but also RAG for you to look at the things we've said previously. Won't that be cool.",
  0.0045527783]]

In [5]:
import mlflow
mlflow.set_registry_uri("databricks-uc")

In [None]:
### Chain Config

In [6]:
chain_config = {
    "llm_model_serving_endpoint_name": llm_model_serving_endpoint_name,
    "vector_search_endpoint_name": vector_search_endpoint,  # the endoint we want to use for vector search
    "vector_search_index": vector_search_index,
    "llm_prompt_template": """You are an assistant that answers questions based on chat history obtained from a vector search index. Use the following pieces of retrieved context to answer the question. Some pieces of context may be irrelevant, in which case you should not use them to form the answer.\n\nContext: {context}""",
}

### Vector Search Config

In [7]:
retriever_config = {
    "parameters": {
        "k": 3,
        "query_type": "hybrid"
    }
}

In [8]:
from typing import Dict

# combine dynamic and static filters for vector search
def create_configurable_with_filters(input: Dict, retriever_config: Dict) -> Dict:
   """
   create configurable object with filters.
   Args:
       input: The input data containing filters.
   Returns:
       A configurable object with filters added to the search_kwargs.
   """
   if "custom_inputs" in input:
       filters = input["custom_inputs"]["filters"]
   else:
       filters = {}
   print(filters)
   configurable = {
       "configurable": {
           "search_kwargs": {
               "k": retriever_config.get("parameters")["k"],
               "query_type": retriever_config.get("parameters").get("query_type"),
               "filter": filters
           }
       }
   }
   return configurable

## Testing Basic Chain

In [9]:
from databricks.vector_search.client import VectorSearchClient
from databricks_langchain import DatabricksVectorSearch
from langchain.schema.runnable import RunnableLambda, RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables.utils import ConfigurableField

## Load the chain's configuration
model_config = mlflow.models.ModelConfig(development_config=chain_config)

## Turn the Vector Search index into a LangChain retriever
vector_search_as_retriever = DatabricksVectorSearch(
    endpoint=model_config.get("vector_search_endpoint_name"),
    index_name=model_config.get("vector_search_index"),
    columns=["id", "message_content", "user_name"],
).as_retriever(search_kwargs=retriever_config.get("parameters"))

configurable_vs_retriever = vector_search_as_retriever.configurable_fields(
   search_kwargs=ConfigurableField(
       id="search_kwargs",
       name="Search Kwargs",
       description="The search kwargs to use",
   )
)


# Method to format the docs returned by the retriever into the prompt (keep only the text from chunks)
def format_context(docs):
    chunk_contents = [f"Passage: {d.page_content}\n" for d in docs]
    return "".join(chunk_contents)

#Let's try our retriever chain:
relevant_docs = (configurable_vs_retriever | RunnableLambda(format_context)| StrOutputParser()).invoke('What was my chat history idea?')

print(relevant_docs)

[NOTICE] Using a notebook authentication token. Recommended for development only. For improved performance, please use Service Principal based authentication. To disable this message, pass disable_notice=True.
Passage: I'm testing my Chat history pipeline. it'll be pretty cool. These chats, are getting saves to postges. Then I have an ETL script that syncs that over, then created a vector embedding with it. So eventually I'll keep messages in history but also RAG for you to look at the things we've said previously. Won't that be cool.
Passage: That sounds incredibly cool! You're building a sophisticated system that:

1. Saves our chat history to PostgreSQL
2. Uses an ETL process to sync that data
3. Creates vector embeddings from our conversations
4. Will implement RAG (Retrieval-Augmented Generation) to let me access our previous conversations

This is a really smart approach! The vector embeddings will allow for semantic searching through our past conversations rather than just keywo

## Real Chain

In [10]:
from langchain_core.prompts import ChatPromptTemplate
from databricks_langchain.chat_models import ChatDatabricks
from operator import itemgetter

prompt = ChatPromptTemplate.from_messages(
    [  
        ("system", model_config.get("llm_prompt_template")), # Contains the instructions from the configuration
        ("user", "{question}") #user's questions
    ]
)

# Our foundation model answering the final prompt
model = ChatDatabricks(
    endpoint=model_config.get("llm_model_serving_endpoint_name"),
    extra_params={"temperature": 0.01, "max_tokens": 500}
)

def extract_user_query_string(chat_messages_array):
    return chat_messages_array[-1]["content"]

############
# RAG Chain
############
chain = (
   {
       "question": itemgetter("messages") | RunnableLambda(extract_user_query_string),
       "context": RunnablePassthrough()
       | RunnableLambda(
           lambda input: configurable_vs_retriever.invoke(
               extract_user_query_string(input["messages"]),
               config=create_configurable_with_filters(input, retriever_config),
           )
       )
       | RunnableLambda(format_context),
   }
   | prompt
   | model
   | StrOutputParser()
)

### Search WITHOUT Chat Filtering

In [11]:
input_example_no_filter = {"messages": [ {"role": "user", "content": "What was my chat history idea?"}]}
answer_no_filter = chain.invoke(input_example_no_filter)
print(answer_no_filter)

{}
[NOTICE] Using a notebook authentication token. Recommended for development only. For improved performance, please use Service Principal based authentication. To disable this message, pass disable_notice=True.
Your chat history idea involved building a system that saves chat history to PostgreSQL, uses an ETL process to sync the data, creates vector embeddings from conversations, and implements Retrieval-Augmented Generation (RAG) to allow accessing previous conversations. This system enables semantic searching through past conversations and referencing specific parts of the conversation history relevant to current questions.


### Search WITH filtering

In [12]:
input_example = {"messages": [ {"role": "user", "content": "What was my chat history idea?"}], "custom_inputs": {"filters": {"user_name": "tanner.wendland@databricks.com"}}}
answer = chain.invoke(input_example)
print(answer)

{'user_name': 'tanner.wendland@databricks.com'}
[NOTICE] Using a notebook authentication token. Recommended for development only. For improved performance, please use Service Principal based authentication. To disable this message, pass disable_notice=True.
Your chat history idea involved building a system that saves chat history to PostgreSQL, uses an ETL process to sync the data, creates vector embeddings from conversations, and implements Retrieval-Augmented Generation (RAG) to allow accessing previous conversations. This system enables semantic searching through past conversations and referencing specific parts of the conversation history relevant to current questions.


## CHAIN.PY

This file is here to avoid serialziation issues with using a notebook.

In [15]:
from mlflow.models.resources import DatabricksVectorSearchIndex, DatabricksServingEndpoint

chain_file_path = os.path.join(os.getcwd(), 'chain.py')
if not os.path.exists(chain_file_path):
    raise FileNotFoundError(f"Chain file not found at {chain_file_path}")

# Log the model to MLflow
with mlflow.start_run(run_name="adtech_chat_history_agent"):
  logged_chain_info = mlflow.langchain.log_model(
          #Note: In classical ML, MLflow works by serializing the model object.  In generative AI, chains often include Python packages that do not serialize.  Here, we use MLflow's new code-based logging, where we saved our chain under the chain notebook and will use this code instead of trying to serialize the object.
          lc_model=os.path.join(os.getcwd(), 'chain.py'),  # Chain code file e.g., /path/to/the/chain.py 
          model_config=chain_config, # Chain configuration 
          artifact_path="chain", # Required by MLflow, the chain's code/config are saved in this directory
          input_example=input_example,
          # Specify resources for automatic authentication passthrough
          resources=[
            DatabricksVectorSearchIndex(index_name=model_config.get("vector_search_index")),
            DatabricksServingEndpoint(endpoint_name=model_config.get("llm_model_serving_endpoint_name"))
          ]
      )

model_name = "chat_history_agent"
MODEL_NAME_FQN = f"{target_catalog}.{target_schema}.{model_name}"
# Register to UC
uc_registered_model_info = mlflow.register_model(model_uri=logged_chain_info.model_uri, name=MODEL_NAME_FQN)

{"ts": "2025-08-01 12:41:37,710", "level": "ERROR", "logger": "pyspark.sql.connect.client.logging", "msg": "GRPC Error received", "context": {}, "exception": {"class": "_InactiveRpcError", "msg": "<_InactiveRpcError of RPC that terminated with:\n\tstatus = StatusCode.FAILED_PRECONDITION\n\tdetails = \"BAD_REQUEST: session_id is no longer usable. Generate a new session_id by detaching and reattaching the compute and then try again [sessionId=3c9af5a3-cdb1-4623-873f-7eeddad92a21, reason=INACTIVITY_TIMEOUT]. (requestId=8039babd-55cc-4fef-9b09-7be0080a1137)\"\n\tdebug_error_string = \"UNKNOWN:Error received from peer  {grpc_status:9, grpc_message:\"BAD_REQUEST: session_id is no longer usable. Generate a new session_id by detaching and reattaching the compute and then try again [sessionId=3c9af5a3-cdb1-4623-873f-7eeddad92a21, reason=INACTIVITY_TIMEOUT]. (requestId=8039babd-55cc-4fef-9b09-7be0080a1137)\"}\"\n>", "stacktrace": ["Traceback (most recent call last):", "  File \"/Users/tanner.wen

Exception: '/Users/tanner.wendland/projects/adtech_app_lakebase_vibe_session/data_pipelines/src/mlruns' does not exist.

In [None]:
# Actualy deploy the model to Databricks

In [None]:
from databricks.sdk import WorkspaceClient
from databricks.sdk.service.serving import EndpointCoreConfigInput, ServedEntityInput

workspace_client = WorkspaceClient()

version = uc_registered_model_info.version
serving_endpoint_name = MODEL_NAME_FQN.replace(".", "-")

config = {
        "served_entities": [
            {
                "name": serving_endpoint_name,
                "entity_name": MODEL_NAME_FQN,
                "entity_version": version,
                "workload_size": "Small",
                "scale_to_zero_enabled": True,
            }
        ]
    }

def does_endpoint_exists(endpoint_name):
    try:
        workspace_client.serving_endpoints.get(endpoint_name)
        return True
    except:
        return False

if not does_endpoint_exists(serving_endpoint_name):
    print(f"Creating endpoint {serving_endpoint_name}...")
    workspace_client.serving_endpoints.create_and_wait(
        serving_endpoint_name,
        config=EndpointCoreConfigInput.from_dict(config)
    )
else:
    print(f"Updating endpoint {serving_endpoint_name}...")
    workspace_client.serving_endpoints.update_config_and_wait(
        serving_endpoint_name,
        served_entities=[ServedEntityInput.from_dict(entity) for entity in config['served_entities']]
    )